1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * MMU support
9  *
10  * Copyright (C) 2006 Qumranet, Inc.
11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12  *
13  * Authors:
14  *   Yaniv Kamay  <[email protected]>
15  *   Avi Kivity   <[email protected]>
16  */
17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19 #include "irq.h"
20 #include "ioapic.h"
21 #include "mmu.h"
22 #include "mmu_internal.h"
23 #include "tdp_mmu.h"
24 #include "x86.h"
25 #include "kvm_cache_regs.h"
26 #include "smm.h"
27 #include "kvm_emulate.h"
28 #include "page_track.h"
29 #include "cpuid.h"
30 #include "spte.h"
31
32 #include <linux/kvm_host.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/mm.h>
36 #include <linux/highmem.h>
37 #include <linux/moduleparam.h>
38 #include <linux/export.h>
39 #include <linux/swap.h>
40 #include <linux/hugetlb.h>
41 #include <linux/compiler.h>
42 #include <linux/srcu.h>
43 #include <linux/slab.h>
44 #include <linux/sched/signal.h>
45 #include <linux/uaccess.h>
46 #include <linux/hash.h>
47 #include <linux/kern_levels.h>
48 #include <linux/kstrtox.h>
49 #include <linux/kthread.h>
50 #include <linux/wordpart.h>
51
52 #include <asm/page.h>
53 #include <asm/memtype.h>
54 #include <asm/cmpxchg.h>
55 #include <asm/io.h>
56 #include <asm/set_memory.h>
57 #include <asm/spec-ctrl.h>
58 #include <asm/vmx.h>
59
60 #include "trace.h"
61
62 static bool nx_hugepage_mitigation_hard_disabled;
63
64 int __read_mostly nx_huge_pages = -1;
65 static uint __read_mostly nx_huge_pages_recovery_period_ms;
66 #ifdef CONFIG_PREEMPT_RT
67 /* Recovery can cause latency spikes; disable it for PREEMPT_RT. */
68 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
69 #else
70 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
71 #endif
72
73 static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
74 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
75 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
76
77 static const struct kernel_param_ops nx_huge_pages_ops = {
78         .set = set_nx_huge_pages,
79         .get = get_nx_huge_pages,
80 };
81
82 static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
83         .set = set_nx_huge_pages_recovery_param,
84         .get = param_get_uint,
85 };
86
87 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
88 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
89 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
90                 &nx_huge_pages_recovery_ratio, 0644);
91 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
92 module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
93                 &nx_huge_pages_recovery_period_ms, 0644);
94 __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
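/*
 * For reference: because these parameters are registered via module_param_cb()
 * with mode 0644, they can typically be read and written at runtime through
 * sysfs (assuming the standard "kvm" module parameter layout), e.g.:
 *
 *   echo 0  > /sys/module/kvm/parameters/nx_huge_pages
 *   echo 30 > /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio
 *   cat       /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms
 */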
95
96 static bool __read_mostly force_flush_and_sync_on_reuse;
97 module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
98
99 /*
100  * When this variable is set to true, it enables Two-Dimensional Paging (TDP),
101  * where the hardware walks two page tables:
102  * 1. the guest-virtual to guest-physical translation
103  * 2. while doing 1., it also walks the guest-physical to host-physical translation
104  * If the hardware supports that, we don't need to do shadow paging.
105  */
106 bool tdp_enabled = false;
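/*
 * A sketch of the two-dimensional walk when TDP is enabled: every
 * guest-physical address produced while the hardware walks the guest's own
 * page tables (CR3 -> ... -> final GPA) is itself translated through the TDP
 * tables (EPT on VMX, NPT on SVM) to a host-physical address.  KVM then only
 * has to maintain the GPA->HPA tables instead of shadowing the guest's
 * GVA->GPA tables.
 */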
107
108 static bool __ro_after_init tdp_mmu_allowed;
109
110 #ifdef CONFIG_X86_64
111 bool __read_mostly tdp_mmu_enabled = true;
112 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
113 #endif
114
115 static int max_huge_page_level __read_mostly;
116 static int tdp_root_level __read_mostly;
117 static int max_tdp_level __read_mostly;
118
119 #define PTE_PREFETCH_NUM                8
120
121 #include <trace/events/kvm.h>
122
123 /* make pte_list_desc fit well in cache lines */
124 #define PTE_LIST_EXT 14
125
126 /*
127  * struct pte_list_desc is the core data structure used to implement a custom
128  * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
129  * given GFN when used in the context of rmaps.  Using a custom list allows KVM
130  * to optimize for the common case where many GFNs will have at most a handful
131  * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
132  * memory footprint, which in turn improves runtime performance by exploiting
133  * cache locality.
134  *
135  * A list is comprised of one or more pte_list_desc objects (descriptors).
136  * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
137  * is full and a new SPTE needs to be added, a new descriptor is allocated and
138  * becomes the head of the list.  This means that, by definition, all tail
139  * descriptors are full.
140  *
141  * Note, the metadata fields are deliberately placed at the start of the
142  * structure to optimize the cacheline layout; accessing the descriptor will
143  * touch only a single cacheline so long as @spte_count <= 6 (or if only the
144  * descriptor's metadata is accessed).
145  */
146 struct pte_list_desc {
147         struct pte_list_desc *more;
148         /* The number of PTEs stored in _this_ descriptor. */
149         u32 spte_count;
150         /* The number of PTEs stored in all tails of this descriptor. */
151         u32 tail_count;
152         u64 *sptes[PTE_LIST_EXT];
153 };
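/*
 * A worked example of the packing described above (illustrative numbers
 * only): if a gfn is mapped by 20 SPTEs, the rmap chain consists of two
 * descriptors:
 *
 *   head: spte_count = 6,  tail_count = 14, more -> tail
 *   tail: spte_count = 14, tail_count = 0,  more -> NULL
 *
 * i.e. the list length is head->spte_count + head->tail_count = 20.  On a
 * 64-byte cacheline, the 16 bytes of metadata plus sptes[0..5] fit in the
 * first line, which is where the "@spte_count <= 6" note above comes from.
 */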
154
155 struct kvm_shadow_walk_iterator {
156         u64 addr;
157         hpa_t shadow_addr;
158         u64 *sptep;
159         int level;
160         unsigned index;
161 };
162
163 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
164         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
165                                          (_root), (_addr));                \
166              shadow_walk_okay(&(_walker));                                 \
167              shadow_walk_next(&(_walker)))
168
169 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
170         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
171              shadow_walk_okay(&(_walker));                      \
172              shadow_walk_next(&(_walker)))
173
174 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
175         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
176              shadow_walk_okay(&(_walker)) &&                            \
177                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
178              __shadow_walk_next(&(_walker), spte))
179
180 static struct kmem_cache *pte_list_desc_cache;
181 struct kmem_cache *mmu_page_header_cache;
182
183 static void mmu_spte_set(u64 *sptep, u64 spte);
184
185 struct kvm_mmu_role_regs {
186         const unsigned long cr0;
187         const unsigned long cr4;
188         const u64 efer;
189 };
190
191 #define CREATE_TRACE_POINTS
192 #include "mmutrace.h"
193
194 /*
195  * Yes, lots of underscores.  They're a hint that you probably shouldn't be
196  * reading from the role_regs.  Once the root_role is constructed, it becomes
197  * the single source of truth for the MMU's state.
198  */
199 #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)                   \
200 static inline bool __maybe_unused                                       \
201 ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)             \
202 {                                                                       \
203         return !!(regs->reg & flag);                                    \
204 }
205 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
206 BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
207 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
208 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
209 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
210 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
211 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
212 BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
213 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
214 BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
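/*
 * For reference, the accessor generated by, e.g.,
 * BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP) above is roughly:
 *
 *   static inline bool __maybe_unused
 *   ____is_cr4_smep(const struct kvm_mmu_role_regs *regs)
 *   {
 *           return !!(regs->cr4 & X86_CR4_SMEP);
 *   }
 */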
215
216 /*
217  * The MMU itself (with a valid role) is the single source of truth for the
218  * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
219  * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
220  * and the vCPU may be incorrect/irrelevant.
221  */
222 #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)         \
223 static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)        \
224 {                                                               \
225         return !!(mmu->cpu_role. base_or_ext . reg##_##name);   \
226 }
227 BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
228 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
229 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
230 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
231 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
232 BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
233 BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
234 BUILD_MMU_ROLE_ACCESSOR(ext,  efer, lma);
235
236 static inline bool is_cr0_pg(struct kvm_mmu *mmu)
237 {
238         return mmu->cpu_role.base.level > 0;
239 }
240
241 static inline bool is_cr4_pae(struct kvm_mmu *mmu)
242 {
243         return !mmu->cpu_role.base.has_4_byte_gpte;
244 }
245
246 static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
247 {
248         struct kvm_mmu_role_regs regs = {
249                 .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
250                 .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
251                 .efer = vcpu->arch.efer,
252         };
253
254         return regs;
255 }
256
257 static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
258 {
259         return kvm_read_cr3(vcpu);
260 }
261
262 static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
263                                                   struct kvm_mmu *mmu)
264 {
265         if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
266                 return kvm_read_cr3(vcpu);
267
268         return mmu->get_guest_pgd(vcpu);
269 }
270
271 static inline bool kvm_available_flush_remote_tlbs_range(void)
272 {
273 #if IS_ENABLED(CONFIG_HYPERV)
274         return kvm_x86_ops.flush_remote_tlbs_range;
275 #else
276         return false;
277 #endif
278 }
279
280 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
281
282 /* Flush the range of guest memory mapped by the given SPTE. */
283 static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
284 {
285         struct kvm_mmu_page *sp = sptep_to_sp(sptep);
286         gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
287
288         kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
289 }
290
291 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
292                            unsigned int access)
293 {
294         u64 spte = make_mmio_spte(vcpu, gfn, access);
295
296         trace_mark_mmio_spte(sptep, gfn, spte);
297         mmu_spte_set(sptep, spte);
298 }
299
300 static gfn_t get_mmio_spte_gfn(u64 spte)
301 {
302         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
303
304         gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
305                & shadow_nonpresent_or_rsvd_mask;
306
307         return gpa >> PAGE_SHIFT;
308 }
309
310 static unsigned get_mmio_spte_access(u64 spte)
311 {
312         return spte & shadow_mmio_access_mask;
313 }
314
315 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
316 {
317         u64 kvm_gen, spte_gen, gen;
318
319         gen = kvm_vcpu_memslots(vcpu)->generation;
320         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
321                 return false;
322
323         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
324         spte_gen = get_mmio_spte_generation(spte);
325
326         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
327         return likely(kvm_gen == spte_gen);
328 }
329
330 static int is_cpuid_PSE36(void)
331 {
332         return 1;
333 }
334
335 #ifdef CONFIG_X86_64
336 static void __set_spte(u64 *sptep, u64 spte)
337 {
338         KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
339         WRITE_ONCE(*sptep, spte);
340 }
341
342 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
343 {
344         KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
345         WRITE_ONCE(*sptep, spte);
346 }
347
348 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
349 {
350         KVM_MMU_WARN_ON(is_ept_ve_possible(spte));
351         return xchg(sptep, spte);
352 }
353
354 static u64 __get_spte_lockless(u64 *sptep)
355 {
356         return READ_ONCE(*sptep);
357 }
358 #else
359 union split_spte {
360         struct {
361                 u32 spte_low;
362                 u32 spte_high;
363         };
364         u64 spte;
365 };
366
367 static void count_spte_clear(u64 *sptep, u64 spte)
368 {
369         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
370
371         if (is_shadow_present_pte(spte))
372                 return;
373
374         /* Ensure the spte is completely set before we increase the count */
375         smp_wmb();
376         sp->clear_spte_count++;
377 }
378
379 static void __set_spte(u64 *sptep, u64 spte)
380 {
381         union split_spte *ssptep, sspte;
382
383         ssptep = (union split_spte *)sptep;
384         sspte = (union split_spte)spte;
385
386         ssptep->spte_high = sspte.spte_high;
387
388         /*
389          * If we map the spte from nonpresent to present, we should store
390          * the high bits first, then set the present bit, so the CPU cannot
391          * fetch this spte while we are setting it.
392          */
393         smp_wmb();
394
395         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
396 }
397
398 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
399 {
400         union split_spte *ssptep, sspte;
401
402         ssptep = (union split_spte *)sptep;
403         sspte = (union split_spte)spte;
404
405         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
406
407         /*
408          * If we map the spte from present to nonpresent, we should clear the
409          * present bit first to avoid the vCPU fetching the old high bits.
410          */
411         smp_wmb();
412
413         ssptep->spte_high = sspte.spte_high;
414         count_spte_clear(sptep, spte);
415 }
416
417 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
418 {
419         union split_spte *ssptep, sspte, orig;
420
421         ssptep = (union split_spte *)sptep;
422         sspte = (union split_spte)spte;
423
424         /* xchg acts as a barrier before the setting of the high bits */
425         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
426         orig.spte_high = ssptep->spte_high;
427         ssptep->spte_high = sspte.spte_high;
428         count_spte_clear(sptep, spte);
429
430         return orig.spte;
431 }
432
433 /*
434  * The idea of using this lightweight way to get the spte on x86_32 comes
435  * from gup_get_pte (mm/gup.c).
436  *
437  * An spte TLB flush may be pending, because flushes are coalesced and
438  * we are running outside of the MMU lock.  Therefore
439  * we need to protect against in-progress updates of the spte.
440  *
441  * Reading the spte while an update is in progress may get the old value
442  * for the high part of the spte.  The race is fine for a present->non-present
443  * change (because the high part of the spte is ignored for non-present spte),
444  * but for a present->present change we must reread the spte.
445  *
446  * All such changes are done in two steps (present->non-present and
447  * non-present->present), hence it is enough to count the number of
448  * present->non-present updates: if it changed while reading the spte,
449  * we might have hit the race.  This is done using clear_spte_count.
450  */
451 static u64 __get_spte_lockless(u64 *sptep)
452 {
453         struct kvm_mmu_page *sp =  sptep_to_sp(sptep);
454         union split_spte spte, *orig = (union split_spte *)sptep;
455         int count;
456
457 retry:
458         count = sp->clear_spte_count;
459         smp_rmb();
460
461         spte.spte_low = orig->spte_low;
462         smp_rmb();
463
464         spte.spte_high = orig->spte_high;
465         smp_rmb();
466
467         if (unlikely(spte.spte_low != orig->spte_low ||
468               count != sp->clear_spte_count))
469                 goto retry;
470
471         return spte.spte;
472 }
473 #endif
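/*
 * A sketch of the torn read that clear_spte_count guards against on 32-bit
 * hosts (the interleaving is illustrative):
 *
 *   reader: __get_spte_lockless()      writer: __update_clear_spte_fast()
 *     count = sp->clear_spte_count
 *     read spte_low (old, present)
 *                                        clear spte_low (present -> not)
 *                                        smp_wmb(); write spte_high
 *                                        sp->clear_spte_count++
 *     read spte_high (new value)
 *     re-check spte_low and count -> changed, so retry
 */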
474
475 /* Rules for using mmu_spte_set:
476  * Set the sptep from nonpresent to present.
477  * Note: the sptep being assigned *must* be either not present
478  * or in a state where the hardware will not attempt to update
479  * the spte.
480  */
481 static void mmu_spte_set(u64 *sptep, u64 new_spte)
482 {
483         WARN_ON_ONCE(is_shadow_present_pte(*sptep));
484         __set_spte(sptep, new_spte);
485 }
486
487 /* Rules for using mmu_spte_update:
488  * Update the state bits; the mapped pfn is not changed.
489  *
490  * Returns true if the TLB needs to be flushed.
491  */
492 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
493 {
494         u64 old_spte = *sptep;
495
496         WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
497         check_spte_writable_invariants(new_spte);
498
499         if (!is_shadow_present_pte(old_spte)) {
500                 mmu_spte_set(sptep, new_spte);
501                 return false;
502         }
503
504         if (!spte_has_volatile_bits(old_spte))
505                 __update_clear_spte_fast(sptep, new_spte);
506         else
507                 old_spte = __update_clear_spte_slow(sptep, new_spte);
508
509         WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
510                      spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
511
512         return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
513 }
514
515 /*
516  * Rules for using mmu_spte_clear_track_bits:
517  * It sets the sptep from present to nonpresent, and tracks the
518  * state bits; it is used to clear the last-level sptep.
519  * Returns the old PTE.
520  */
521 static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
522 {
523         u64 old_spte = *sptep;
524         int level = sptep_to_sp(sptep)->role.level;
525
526         if (!is_shadow_present_pte(old_spte) ||
527             !spte_has_volatile_bits(old_spte))
528                 __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
529         else
530                 old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
531
532         if (!is_shadow_present_pte(old_spte))
533                 return old_spte;
534
535         kvm_update_page_stats(kvm, level, -1);
536         return old_spte;
537 }
538
539 /*
540  * Rules for using mmu_spte_clear_no_track:
541  * Directly clear the spte without caring about the state bits of the sptep;
542  * it is used to set the upper level spte.
543  */
544 static void mmu_spte_clear_no_track(u64 *sptep)
545 {
546         __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
547 }
548
549 static u64 mmu_spte_get_lockless(u64 *sptep)
550 {
551         return __get_spte_lockless(sptep);
552 }
553
554 static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
555 {
556         return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
557 }
558
559 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
560 {
561         if (is_tdp_mmu_active(vcpu)) {
562                 kvm_tdp_mmu_walk_lockless_begin();
563         } else {
564                 /*
565                  * Prevent page table teardown by making any free-er wait during
566                  * kvm_flush_remote_tlbs() IPI to all active vcpus.
567                  */
568                 local_irq_disable();
569
570                 /*
571                  * Make sure a following spte read is not reordered ahead of the write
572                  * to vcpu->mode.
573                  */
574                 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
575         }
576 }
577
578 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
579 {
580         if (is_tdp_mmu_active(vcpu)) {
581                 kvm_tdp_mmu_walk_lockless_end();
582         } else {
583                 /*
584                  * Make sure the write to vcpu->mode is not reordered in front of
585                  * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
586                  * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
587                  */
588                 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
589                 local_irq_enable();
590         }
591 }
592
593 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
594 {
595         int r;
596
597         /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
598         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
599                                        1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
600         if (r)
601                 return r;
602         r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
603                                        PT64_ROOT_MAX_LEVEL);
604         if (r)
605                 return r;
606         if (maybe_indirect) {
607                 r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
608                                                PT64_ROOT_MAX_LEVEL);
609                 if (r)
610                         return r;
611         }
612         return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
613                                           PT64_ROOT_MAX_LEVEL);
614 }
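/*
 * For a rough sense of scale (assuming PT64_ROOT_MAX_LEVEL == 5 and the
 * PTE_PREFETCH_NUM == 8 defined above): the pte_list_desc cache is topped up
 * to 1 + 5 + 8 = 14 entries per fault, while the shadow page, shadowed-info
 * and page-header caches each hold up to 5 entries, one per level of a
 * potential walk.
 */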
615
616 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
617 {
618         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
619         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
620         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
621         kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
622 }
623
624 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
625 {
626         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
627 }
628
629 static bool sp_has_gptes(struct kvm_mmu_page *sp);
630
631 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
632 {
633         if (sp->role.passthrough)
634                 return sp->gfn;
635
636         if (sp->shadowed_translation)
637                 return sp->shadowed_translation[index] >> PAGE_SHIFT;
638
639         return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS));
640 }
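/*
 * Example of the index -> gfn calculation above (illustrative values): for a
 * level-2 (2MiB) shadow page with sp->gfn == 0x40000, entry index 3 covers
 * gfns starting at
 *
 *   0x40000 + (3 << ((2 - 1) * SPTE_LEVEL_BITS)) = 0x40000 + (3 << 9) = 0x40600
 *
 * since each of its 512 entries maps a 512-gfn (2MiB) region.
 */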
641
642 /*
643  * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
644  * that the SPTE itself may have more constrained access permissions than
645  * what the guest enforces. For example, a guest may create an executable
646  * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
647  */
648 static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
649 {
650         if (sp->shadowed_translation)
651                 return sp->shadowed_translation[index] & ACC_ALL;
652
653         /*
654          * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
655          * KVM is not shadowing any guest page tables, so the "guest access
656          * permissions" are just ACC_ALL.
657          *
658          * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
659          * is shadowing a guest huge page with small pages, the guest access
660          * permissions being shadowed are the access permissions of the huge
661          * page.
662          *
663          * In both cases, sp->role.access contains the correct access bits.
664          */
665         return sp->role.access;
666 }
667
668 static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
669                                          gfn_t gfn, unsigned int access)
670 {
671         if (sp->shadowed_translation) {
672                 sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
673                 return;
674         }
675
676         WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
677                   "access mismatch under %s page %llx (expected %u, got %u)\n",
678                   sp->role.passthrough ? "passthrough" : "direct",
679                   sp->gfn, kvm_mmu_page_get_access(sp, index), access);
680
681         WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
682                   "gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
683                   sp->role.passthrough ? "passthrough" : "direct",
684                   sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
685 }
686
687 static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
688                                     unsigned int access)
689 {
690         gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
691
692         kvm_mmu_page_set_translation(sp, index, gfn, access);
693 }
694
695 /*
696  * Return the pointer to the large page information for a given gfn,
697  * handling slots that are not large page aligned.
698  */
699 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
700                 const struct kvm_memory_slot *slot, int level)
701 {
702         unsigned long idx;
703
704         idx = gfn_to_index(gfn, slot->base_gfn, level);
705         return &slot->arch.lpage_info[level - 2][idx];
706 }
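/*
 * lpage_info_slot() relies on gfn_to_index() subtracting the slot's base
 * *after* shifting, which is what makes unaligned slots work.  For example
 * (illustrative values), at the 2MiB level (512 gfns per huge page), a slot
 * with base_gfn == 0x180 and gfn == 0x450 gives
 *
 *   idx = (0x450 >> 9) - (0x180 >> 9) = 2 - 0 = 2
 *
 * even though gfn - base_gfn == 0x2d0 is not a multiple of 512.
 */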
707
708 /*
709  * The most significant bit in disallow_lpage tracks whether or not memory
710  * attributes are mixed, i.e. not identical for all gfns at the current level.
711  * The lower order bits are used to refcount other cases where a hugepage is
712  * disallowed, e.g. if KVM has shadowed a page table at the gfn.
713  */
714 #define KVM_LPAGE_MIXED_FLAG    BIT(31)
715
716 static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
717                                             gfn_t gfn, int count)
718 {
719         struct kvm_lpage_info *linfo;
720         int old, i;
721
722         for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
723                 linfo = lpage_info_slot(gfn, slot, i);
724
725                 old = linfo->disallow_lpage;
726                 linfo->disallow_lpage += count;
727                 WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
728         }
729 }
730
731 void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
732 {
733         update_gfn_disallow_lpage_count(slot, gfn, 1);
734 }
735
736 void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
737 {
738         update_gfn_disallow_lpage_count(slot, gfn, -1);
739 }
740
741 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
742 {
743         struct kvm_memslots *slots;
744         struct kvm_memory_slot *slot;
745         gfn_t gfn;
746
747         kvm->arch.indirect_shadow_pages++;
748         /*
749          * Ensure indirect_shadow_pages is elevated prior to re-reading guest
750          * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight
751          * emulated writes are visible before re-reading guest PTEs, or that
752          * an emulated write will see the elevated count and acquire mmu_lock
753          * to update SPTEs.  Pairs with the smp_mb() in kvm_mmu_track_write().
754          */
755         smp_mb();
756
757         gfn = sp->gfn;
758         slots = kvm_memslots_for_spte_role(kvm, sp->role);
759         slot = __gfn_to_memslot(slots, gfn);
760
761         /* The non-leaf shadow pages are kept readonly via write tracking. */
762         if (sp->role.level > PG_LEVEL_4K)
763                 return __kvm_write_track_add_gfn(kvm, slot, gfn);
764
765         kvm_mmu_gfn_disallow_lpage(slot, gfn);
766
767         if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
768                 kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
769 }
770
771 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
772 {
773         /*
774          * If it's possible to replace the shadow page with an NX huge page,
775          * i.e. if the shadow page is the only thing currently preventing KVM
776          * from using a huge page, add the shadow page to the list of "to be
777          * zapped for NX recovery" pages.  Note, the shadow page can already be
778          * on the list if KVM is reusing an existing shadow page, i.e. if KVM
779          * links a shadow page at multiple points.
780          */
781         if (!list_empty(&sp->possible_nx_huge_page_link))
782                 return;
783
784         ++kvm->stat.nx_lpage_splits;
785         list_add_tail(&sp->possible_nx_huge_page_link,
786                       &kvm->arch.possible_nx_huge_pages);
787 }
788
789 static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
790                                  bool nx_huge_page_possible)
791 {
792         sp->nx_huge_page_disallowed = true;
793
794         if (nx_huge_page_possible)
795                 track_possible_nx_huge_page(kvm, sp);
796 }
797
798 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
799 {
800         struct kvm_memslots *slots;
801         struct kvm_memory_slot *slot;
802         gfn_t gfn;
803
804         kvm->arch.indirect_shadow_pages--;
805         gfn = sp->gfn;
806         slots = kvm_memslots_for_spte_role(kvm, sp->role);
807         slot = __gfn_to_memslot(slots, gfn);
808         if (sp->role.level > PG_LEVEL_4K)
809                 return __kvm_write_track_remove_gfn(kvm, slot, gfn);
810
811         kvm_mmu_gfn_allow_lpage(slot, gfn);
812 }
813
814 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
815 {
816         if (list_empty(&sp->possible_nx_huge_page_link))
817                 return;
818
819         --kvm->stat.nx_lpage_splits;
820         list_del_init(&sp->possible_nx_huge_page_link);
821 }
822
823 static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
824 {
825         sp->nx_huge_page_disallowed = false;
826
827         untrack_possible_nx_huge_page(kvm, sp);
828 }
829
830 static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
831                                                            gfn_t gfn,
832                                                            bool no_dirty_log)
833 {
834         struct kvm_memory_slot *slot;
835
836         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
837         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
838                 return NULL;
839         if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
840                 return NULL;
841
842         return slot;
843 }
844
845 /*
846  * About rmap_head encoding:
847  *
848  * If the bit zero of rmap_head->val is clear, then it points to the only spte
849  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
850  * pte_list_desc containing more mappings.
851  */
852 #define KVM_RMAP_MANY   BIT(0)
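/*
 * Concretely: a gfn tracked by a single spte stores the sptep pointer
 * directly in rmap_head->val; since sptes are 8-byte aligned, bit 0 is
 * naturally clear.  As soon as a second spte is added, pte_list_add() below
 * allocates a pte_list_desc and stores
 *
 *   rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
 *
 * so readers test bit 0 first and mask it off to recover the descriptor.
 */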
853
854 /*
855  * Returns the number of pointers in the rmap chain, not counting the new one.
856  */
857 static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
858                         struct kvm_rmap_head *rmap_head)
859 {
860         struct pte_list_desc *desc;
861         int count = 0;
862
863         if (!rmap_head->val) {
864                 rmap_head->val = (unsigned long)spte;
865         } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
866                 desc = kvm_mmu_memory_cache_alloc(cache);
867                 desc->sptes[0] = (u64 *)rmap_head->val;
868                 desc->sptes[1] = spte;
869                 desc->spte_count = 2;
870                 desc->tail_count = 0;
871                 rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
872                 ++count;
873         } else {
874                 desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
875                 count = desc->tail_count + desc->spte_count;
876
877                 /*
878                  * If the previous head is full, allocate a new head descriptor
879                  * as tail descriptors are always kept full.
880                  */
881                 if (desc->spte_count == PTE_LIST_EXT) {
882                         desc = kvm_mmu_memory_cache_alloc(cache);
883                         desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
884                         desc->spte_count = 0;
885                         desc->tail_count = count;
886                         rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
887                 }
888                 desc->sptes[desc->spte_count++] = spte;
889         }
890         return count;
891 }
892
893 static void pte_list_desc_remove_entry(struct kvm *kvm,
894                                        struct kvm_rmap_head *rmap_head,
895                                        struct pte_list_desc *desc, int i)
896 {
897         struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
898         int j = head_desc->spte_count - 1;
899
900         /*
901          * The head descriptor should never be empty.  A new head is added only
902          * when adding an entry and the previous head is full, and heads are
903          * removed (this flow) when they become empty.
904          */
905         KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
906
907         /*
908          * Replace the to-be-freed SPTE with the last valid entry from the head
909          * descriptor to ensure that tail descriptors are full at all times.
910          * Note, this also means that tail_count is stable for each descriptor.
911          */
912         desc->sptes[i] = head_desc->sptes[j];
913         head_desc->sptes[j] = NULL;
914         head_desc->spte_count--;
915         if (head_desc->spte_count)
916                 return;
917
918         /*
919          * The head descriptor is empty.  If there are no tail descriptors,
920          * nullify the rmap head to mark the list as empty, else point the rmap
921          * head at the next descriptor, i.e. the new head.
922          */
923         if (!head_desc->more)
924                 rmap_head->val = 0;
925         else
926                 rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
927         mmu_free_pte_list_desc(head_desc);
928 }
929
930 static void pte_list_remove(struct kvm *kvm, u64 *spte,
931                             struct kvm_rmap_head *rmap_head)
932 {
933         struct pte_list_desc *desc;
934         int i;
935
936         if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
937                 return;
938
939         if (!(rmap_head->val & KVM_RMAP_MANY)) {
940                 if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
941                         return;
942
943                 rmap_head->val = 0;
944         } else {
945                 desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
946                 while (desc) {
947                         for (i = 0; i < desc->spte_count; ++i) {
948                                 if (desc->sptes[i] == spte) {
949                                         pte_list_desc_remove_entry(kvm, rmap_head,
950                                                                    desc, i);
951                                         return;
952                                 }
953                         }
954                         desc = desc->more;
955                 }
956
957                 KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
958         }
959 }
960
961 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
962                                   struct kvm_rmap_head *rmap_head, u64 *sptep)
963 {
964         mmu_spte_clear_track_bits(kvm, sptep);
965         pte_list_remove(kvm, sptep, rmap_head);
966 }
967
968 /* Return true if at least one SPTE was zapped, false otherwise */
969 static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
970                                    struct kvm_rmap_head *rmap_head)
971 {
972         struct pte_list_desc *desc, *next;
973         int i;
974
975         if (!rmap_head->val)
976                 return false;
977
978         if (!(rmap_head->val & KVM_RMAP_MANY)) {
979                 mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
980                 goto out;
981         }
982
983         desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
984
985         for (; desc; desc = next) {
986                 for (i = 0; i < desc->spte_count; i++)
987                         mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
988                 next = desc->more;
989                 mmu_free_pte_list_desc(desc);
990         }
991 out:
992         /* rmap_head is meaningless now, remember to reset it */
993         rmap_head->val = 0;
994         return true;
995 }
996
997 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
998 {
999         struct pte_list_desc *desc;
1000
1001         if (!rmap_head->val)
1002                 return 0;
1003         else if (!(rmap_head->val & KVM_RMAP_MANY))
1004                 return 1;
1005
1006         desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
1007         return desc->tail_count + desc->spte_count;
1008 }
1009
1010 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
1011                                          const struct kvm_memory_slot *slot)
1012 {
1013         unsigned long idx;
1014
1015         idx = gfn_to_index(gfn, slot->base_gfn, level);
1016         return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1017 }
1018
1019 static void rmap_remove(struct kvm *kvm, u64 *spte)
1020 {
1021         struct kvm_memslots *slots;
1022         struct kvm_memory_slot *slot;
1023         struct kvm_mmu_page *sp;
1024         gfn_t gfn;
1025         struct kvm_rmap_head *rmap_head;
1026
1027         sp = sptep_to_sp(spte);
1028         gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte));
1029
1030         /*
1031          * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1032          * so we have to determine which memslots to use based on context
1033          * information in sp->role.
1034          */
1035         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1036
1037         slot = __gfn_to_memslot(slots, gfn);
1038         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1039
1040         pte_list_remove(kvm, spte, rmap_head);
1041 }
1042
1043 /*
1044  * Used by the following functions to iterate through the sptes linked by a
1045  * rmap.  All fields are private and not assumed to be used outside.
1046  */
1047 struct rmap_iterator {
1048         /* private fields */
1049         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1050         int pos;                        /* index of the sptep */
1051 };
1052
1053 /*
1054  * Iteration must be started by this function.  This should also be used after
1055  * removing/dropping sptes from the rmap link because in such cases the
1056  * information in the iterator may not be valid.
1057  *
1058  * Returns sptep if found, NULL otherwise.
1059  */
1060 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1061                            struct rmap_iterator *iter)
1062 {
1063         u64 *sptep;
1064
1065         if (!rmap_head->val)
1066                 return NULL;
1067
1068         if (!(rmap_head->val & KVM_RMAP_MANY)) {
1069                 iter->desc = NULL;
1070                 sptep = (u64 *)rmap_head->val;
1071                 goto out;
1072         }
1073
1074         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
1075         iter->pos = 0;
1076         sptep = iter->desc->sptes[iter->pos];
1077 out:
1078         BUG_ON(!is_shadow_present_pte(*sptep));
1079         return sptep;
1080 }
1081
1082 /*
1083  * Must be used with a valid iterator: e.g. after rmap_get_first().
1084  *
1085  * Returns sptep if found, NULL otherwise.
1086  */
1087 static u64 *rmap_get_next(struct rmap_iterator *iter)
1088 {
1089         u64 *sptep;
1090
1091         if (iter->desc) {
1092                 if (iter->pos < PTE_LIST_EXT - 1) {
1093                         ++iter->pos;
1094                         sptep = iter->desc->sptes[iter->pos];
1095                         if (sptep)
1096                                 goto out;
1097                 }
1098
1099                 iter->desc = iter->desc->more;
1100
1101                 if (iter->desc) {
1102                         iter->pos = 0;
1103                         /* desc->sptes[0] cannot be NULL */
1104                         sptep = iter->desc->sptes[iter->pos];
1105                         goto out;
1106                 }
1107         }
1108
1109         return NULL;
1110 out:
1111         BUG_ON(!is_shadow_present_pte(*sptep));
1112         return sptep;
1113 }
1114
1115 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1116         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1117              _spte_; _spte_ = rmap_get_next(_iter_))
1118
1119 static void drop_spte(struct kvm *kvm, u64 *sptep)
1120 {
1121         u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1122
1123         if (is_shadow_present_pte(old_spte))
1124                 rmap_remove(kvm, sptep);
1125 }
1126
1127 static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
1128 {
1129         struct kvm_mmu_page *sp;
1130
1131         sp = sptep_to_sp(sptep);
1132         WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
1133
1134         drop_spte(kvm, sptep);
1135
1136         if (flush)
1137                 kvm_flush_remote_tlbs_sptep(kvm, sptep);
1138 }
1139
1140 /*
1141  * Write-protect the specified @sptep.  @pt_protect indicates whether the
1142  * spte write-protection is caused by protecting a shadow page table.
1143  *
1144  * Note: write protection differs between dirty logging and spte
1145  * protection:
1146  * - for dirty logging, the spte can be set to writable at any time if
1147  *   its dirty bitmap is properly set.
1148  * - for spte protection, the spte can be writable only after unsync-ing
1149  *   the shadow page.
1150  *
1151  * Return true if the TLB needs to be flushed.
1152  */
1153 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1154 {
1155         u64 spte = *sptep;
1156
1157         if (!is_writable_pte(spte) &&
1158             !(pt_protect && is_mmu_writable_spte(spte)))
1159                 return false;
1160
1161         if (pt_protect)
1162                 spte &= ~shadow_mmu_writable_mask;
1163         spte = spte & ~PT_WRITABLE_MASK;
1164
1165         return mmu_spte_update(sptep, spte);
1166 }
1167
1168 static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1169                                bool pt_protect)
1170 {
1171         u64 *sptep;
1172         struct rmap_iterator iter;
1173         bool flush = false;
1174
1175         for_each_rmap_spte(rmap_head, &iter, sptep)
1176                 flush |= spte_write_protect(sptep, pt_protect);
1177
1178         return flush;
1179 }
1180
1181 static bool spte_clear_dirty(u64 *sptep)
1182 {
1183         u64 spte = *sptep;
1184
1185         KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
1186         spte &= ~shadow_dirty_mask;
1187         return mmu_spte_update(sptep, spte);
1188 }
1189
1190 /*
1191  * Gets the GFN ready for another round of dirty logging by clearing the
1192  *      - D bit on ad-enabled SPTEs, and
1193  *      - W bit on ad-disabled SPTEs.
1194  * Returns true iff any D or W bits were cleared.
1195  */
1196 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1197                                const struct kvm_memory_slot *slot)
1198 {
1199         u64 *sptep;
1200         struct rmap_iterator iter;
1201         bool flush = false;
1202
1203         for_each_rmap_spte(rmap_head, &iter, sptep)
1204                 if (spte_ad_need_write_protect(*sptep))
1205                         flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
1206                                                     (unsigned long *)sptep);
1207                 else
1208                         flush |= spte_clear_dirty(sptep);
1209
1210         return flush;
1211 }
1212
1213 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1214                                      struct kvm_memory_slot *slot,
1215                                      gfn_t gfn_offset, unsigned long mask)
1216 {
1217         struct kvm_rmap_head *rmap_head;
1218
1219         if (tdp_mmu_enabled)
1220                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1221                                 slot->base_gfn + gfn_offset, mask, true);
1222
1223         if (!kvm_memslots_have_rmaps(kvm))
1224                 return;
1225
1226         while (mask) {
1227                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1228                                         PG_LEVEL_4K, slot);
1229                 rmap_write_protect(rmap_head, false);
1230
1231                 /* clear the first set bit */
1232                 mask &= mask - 1;
1233         }
1234 }
1235
1236 static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1237                                          struct kvm_memory_slot *slot,
1238                                          gfn_t gfn_offset, unsigned long mask)
1239 {
1240         struct kvm_rmap_head *rmap_head;
1241
1242         if (tdp_mmu_enabled)
1243                 kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1244                                 slot->base_gfn + gfn_offset, mask, false);
1245
1246         if (!kvm_memslots_have_rmaps(kvm))
1247                 return;
1248
1249         while (mask) {
1250                 rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1251                                         PG_LEVEL_4K, slot);
1252                 __rmap_clear_dirty(kvm, rmap_head, slot);
1253
1254                 /* clear the first set bit */
1255                 mask &= mask - 1;
1256         }
1257 }
1258
1259 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1260                                 struct kvm_memory_slot *slot,
1261                                 gfn_t gfn_offset, unsigned long mask)
1262 {
1263         /*
1264          * If the slot was assumed to be "initially all dirty", write-protect
1265          * huge pages to ensure they are split to 4KiB on the first write (KVM
1266          * dirty logs at 4KiB granularity). If eager page splitting is enabled,
1267          * immediately try to split huge pages, e.g. so that vCPUs don't get
1268          * saddled with the cost of splitting.
1269          *
1270          * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1271  * of the memslot has no such restriction, so the range can cross two large
1272          * pages.
1273          */
1274         if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1275                 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1276                 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
1277
1278                 if (READ_ONCE(eager_page_split))
1279                         kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
1280
1281                 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
1282
1283                 /* Cross two large pages? */
1284                 if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1285                     ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1286                         kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
1287                                                        PG_LEVEL_2M);
1288         }
1289
1290         /*
1291          * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
1292          * mask.  If PML is enabled and the GFN doesn't need to be write-
1293          * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
1294          * Otherwise clear the Writable bit.
1295          *
1296          * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
1297  * enabled, but it chooses between clearing the Dirty bit and the Writable
1298          * bit based on the context.
1299          */
1300         if (kvm_x86_ops.cpu_dirty_log_size)
1301                 kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1302         else
1303                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1304 }
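/*
 * A small worked example of the mask handling above (illustrative values):
 * with gfn_offset == 64 and mask == 0b1010, the dirty gfns are base_gfn + 65
 * and base_gfn + 67, so start == base_gfn + 65 (__ffs) and end == base_gfn +
 * 67 (__fls).  The per-4KiB loops in kvm_mmu_write_protect_pt_masked() and
 * kvm_mmu_clear_dirty_pt_masked() then consume one mask bit per iteration via
 * "mask &= mask - 1".
 */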
1305
1306 int kvm_cpu_dirty_log_size(void)
1307 {
1308         return kvm_x86_ops.cpu_dirty_log_size;
1309 }
1310
1311 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1312                                     struct kvm_memory_slot *slot, u64 gfn,
1313                                     int min_level)
1314 {
1315         struct kvm_rmap_head *rmap_head;
1316         int i;
1317         bool write_protected = false;
1318
1319         if (kvm_memslots_have_rmaps(kvm)) {
1320                 for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1321                         rmap_head = gfn_to_rmap(gfn, i, slot);
1322                         write_protected |= rmap_write_protect(rmap_head, true);
1323                 }
1324         }
1325
1326         if (tdp_mmu_enabled)
1327                 write_protected |=
1328                         kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1329
1330         return write_protected;
1331 }
1332
1333 static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1334 {
1335         struct kvm_memory_slot *slot;
1336
1337         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1338         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
1339 }
1340
1341 static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1342                          const struct kvm_memory_slot *slot)
1343 {
1344         return kvm_zap_all_rmap_sptes(kvm, rmap_head);
1345 }
1346
1347 struct slot_rmap_walk_iterator {
1348         /* input fields. */
1349         const struct kvm_memory_slot *slot;
1350         gfn_t start_gfn;
1351         gfn_t end_gfn;
1352         int start_level;
1353         int end_level;
1354
1355         /* output fields. */
1356         gfn_t gfn;
1357         struct kvm_rmap_head *rmap;
1358         int level;
1359
1360         /* private field. */
1361         struct kvm_rmap_head *end_rmap;
1362 };
1363
1364 static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
1365                                  int level)
1366 {
1367         iterator->level = level;
1368         iterator->gfn = iterator->start_gfn;
1369         iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
1370         iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
1371 }
1372
1373 static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1374                                 const struct kvm_memory_slot *slot,
1375                                 int start_level, int end_level,
1376                                 gfn_t start_gfn, gfn_t end_gfn)
1377 {
1378         iterator->slot = slot;
1379         iterator->start_level = start_level;
1380         iterator->end_level = end_level;
1381         iterator->start_gfn = start_gfn;
1382         iterator->end_gfn = end_gfn;
1383
1384         rmap_walk_init_level(iterator, iterator->start_level);
1385 }
1386
1387 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1388 {
1389         return !!iterator->rmap;
1390 }
1391
1392 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1393 {
1394         while (++iterator->rmap <= iterator->end_rmap) {
1395                 iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
1396
1397                 if (iterator->rmap->val)
1398                         return;
1399         }
1400
1401         if (++iterator->level > iterator->end_level) {
1402                 iterator->rmap = NULL;
1403                 return;
1404         }
1405
1406         rmap_walk_init_level(iterator, iterator->level);
1407 }
1408
1409 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1410            _start_gfn, _end_gfn, _iter_)                                \
1411         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1412                                  _end_level_, _start_gfn, _end_gfn);    \
1413              slot_rmap_walk_okay(_iter_);                               \
1414              slot_rmap_walk_next(_iter_))
1415
1416 /* The return value indicates whether a TLB flush on all vCPUs is needed. */
1417 typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
1418                                     struct kvm_rmap_head *rmap_head,
1419                                     const struct kvm_memory_slot *slot);
1420
1421 static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
1422                                               const struct kvm_memory_slot *slot,
1423                                               slot_rmaps_handler fn,
1424                                               int start_level, int end_level,
1425                                               gfn_t start_gfn, gfn_t end_gfn,
1426                                               bool can_yield, bool flush_on_yield,
1427                                               bool flush)
1428 {
1429         struct slot_rmap_walk_iterator iterator;
1430
1431         lockdep_assert_held_write(&kvm->mmu_lock);
1432
1433         for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
1434                         end_gfn, &iterator) {
1435                 if (iterator.rmap)
1436                         flush |= fn(kvm, iterator.rmap, slot);
1437
1438                 if (!can_yield)
1439                         continue;
1440
1441                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
1442                         if (flush && flush_on_yield) {
1443                                 kvm_flush_remote_tlbs_range(kvm, start_gfn,
1444                                                             iterator.gfn - start_gfn + 1);
1445                                 flush = false;
1446                         }
1447                         cond_resched_rwlock_write(&kvm->mmu_lock);
1448                 }
1449         }
1450
1451         return flush;
1452 }
1453
1454 static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
1455                                             const struct kvm_memory_slot *slot,
1456                                             slot_rmaps_handler fn,
1457                                             int start_level, int end_level,
1458                                             bool flush_on_yield)
1459 {
1460         return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
1461                                  slot->base_gfn, slot->base_gfn + slot->npages - 1,
1462                                  true, flush_on_yield, false);
1463 }
1464
1465 static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
1466                                                const struct kvm_memory_slot *slot,
1467                                                slot_rmaps_handler fn,
1468                                                bool flush_on_yield)
1469 {
1470         return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
1471 }
1472
1473 static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
1474                                      const struct kvm_memory_slot *slot,
1475                                      gfn_t start, gfn_t end, bool can_yield,
1476                                      bool flush)
1477 {
1478         return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
1479                                  PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1480                                  start, end - 1, can_yield, true, flush);
1481 }
1482
1483 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1484 {
1485         bool flush = false;
1486
1487         /*
1488          * To prevent races with vCPUs faulting in a gfn using stale data,
1489          * zapping a gfn range must be protected by mmu_invalidate_in_progress
1490          * (and mmu_invalidate_seq).  The only exception is memslot deletion;
1491          * in that case, SRCU synchronization ensures that SPTEs are zapped
1492          * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the
1493          * invalid slot.
1494          */
1495         lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
1496                             lockdep_is_held(&kvm->slots_lock));
1497
1498         if (kvm_memslots_have_rmaps(kvm))
1499                 flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
1500                                                  range->start, range->end,
1501                                                  range->may_block, flush);
1502
1503         if (tdp_mmu_enabled)
1504                 flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1505
1506         if (kvm_x86_ops.set_apic_access_page_addr &&
1507             range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
1508                 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
1509
1510         return flush;
1511 }
1512
1513 #define RMAP_RECYCLE_THRESHOLD 1000
1514
1515 static void __rmap_add(struct kvm *kvm,
1516                        struct kvm_mmu_memory_cache *cache,
1517                        const struct kvm_memory_slot *slot,
1518                        u64 *spte, gfn_t gfn, unsigned int access)
1519 {
1520         struct kvm_mmu_page *sp;
1521         struct kvm_rmap_head *rmap_head;
1522         int rmap_count;
1523
1524         sp = sptep_to_sp(spte);
1525         kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access);
1526         kvm_update_page_stats(kvm, sp->role.level, 1);
1527
1528         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
1529         rmap_count = pte_list_add(cache, spte, rmap_head);
1530
1531         if (rmap_count > kvm->stat.max_mmu_rmap_size)
1532                 kvm->stat.max_mmu_rmap_size = rmap_count;
1533         if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1534                 kvm_zap_all_rmap_sptes(kvm, rmap_head);
1535                 kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
1536         }
1537 }
1538
1539 static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
1540                      u64 *spte, gfn_t gfn, unsigned int access)
1541 {
1542         struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
1543
1544         __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
1545 }
1546
1547 static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
1548                                    struct kvm_gfn_range *range, bool test_only)
1549 {
1550         struct slot_rmap_walk_iterator iterator;
1551         struct rmap_iterator iter;
1552         bool young = false;
1553         u64 *sptep;
1554
1555         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1556                                  range->start, range->end - 1, &iterator) {
1557                 for_each_rmap_spte(iterator.rmap, &iter, sptep) {
1558                         u64 spte = *sptep;
1559
1560                         if (!is_accessed_spte(spte))
1561                                 continue;
1562
1563                         if (test_only)
1564                                 return true;
1565
1566                         if (spte_ad_enabled(spte)) {
1567                                 clear_bit((ffs(shadow_accessed_mask) - 1),
1568                                         (unsigned long *)sptep);
1569                         } else {
1570                                 /*
1571                                  * WARN if mmu_spte_update() signals the need
1572                                  * for a TLB flush, as Access tracking a SPTE
1573                                  * should never trigger an _immediate_ flush.
1574                                  */
1575                                 spte = mark_spte_for_access_track(spte);
1576                                 WARN_ON_ONCE(mmu_spte_update(sptep, spte));
1577                         }
1578                         young = true;
1579                 }
1580         }
1581         return young;
1582 }
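/*
 * Illustrative sketch (not compiled into KVM): why the Accessed bit is cleared
 * via ffs() above.  ffs() returns a 1-based bit position, so ffs(mask) - 1 is
 * the 0-based index that clear_bit() expects.  The mask values in the comments
 * are the usual x86 layouts and are given only as examples.
 */
#if 0
static void example_clear_accessed_bit(u64 *sptep)
{
	/* e.g. shadow_accessed_mask == 1ull << 5 (legacy/NPT) -> index 5 */
	/* e.g. shadow_accessed_mask == 1ull << 8 (EPT)        -> index 8 */
	clear_bit(ffs(shadow_accessed_mask) - 1, (unsigned long *)sptep);
}
#endif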
1583
1584 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1585 {
1586         bool young = false;
1587
1588         if (kvm_memslots_have_rmaps(kvm))
1589                 young = kvm_rmap_age_gfn_range(kvm, range, false);
1590
1591         if (tdp_mmu_enabled)
1592                 young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
1593
1594         return young;
1595 }
1596
1597 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1598 {
1599         bool young = false;
1600
1601         if (kvm_memslots_have_rmaps(kvm))
1602                 young = kvm_rmap_age_gfn_range(kvm, range, true);
1603
1604         if (tdp_mmu_enabled)
1605                 young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
1606
1607         return young;
1608 }
1609
1610 static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
1611 {
1612 #ifdef CONFIG_KVM_PROVE_MMU
1613         int i;
1614
1615         for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
1616                 if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
1617                         pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
1618                                            sp->spt[i], &sp->spt[i],
1619                                            kvm_mmu_page_get_gfn(sp, i));
1620         }
1621 #endif
1622 }
1623
1624 static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1625 {
1626         kvm->arch.n_used_mmu_pages++;
1627         kvm_account_pgtable_pages((void *)sp->spt, +1);
1628 }
1629
1630 static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1631 {
1632         kvm->arch.n_used_mmu_pages--;
1633         kvm_account_pgtable_pages((void *)sp->spt, -1);
1634 }
1635
1636 static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
1637 {
1638         kvm_mmu_check_sptes_at_free(sp);
1639
1640         hlist_del(&sp->hash_link);
1641         list_del(&sp->link);
1642         free_page((unsigned long)sp->spt);
1643         free_page((unsigned long)sp->shadowed_translation);
1644         kmem_cache_free(mmu_page_header_cache, sp);
1645 }
1646
1647 static unsigned kvm_page_table_hashfn(gfn_t gfn)
1648 {
1649         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
1650 }
1651
1652 static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
1653                                     struct kvm_mmu_page *sp, u64 *parent_pte)
1654 {
1655         if (!parent_pte)
1656                 return;
1657
1658         pte_list_add(cache, parent_pte, &sp->parent_ptes);
1659 }
1660
1661 static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1662                                        u64 *parent_pte)
1663 {
1664         pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
1665 }
1666
1667 static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1668                             u64 *parent_pte)
1669 {
1670         mmu_page_remove_parent_pte(kvm, sp, parent_pte);
1671         mmu_spte_clear_no_track(parent_pte);
1672 }
1673
1674 static void mark_unsync(u64 *spte);
1675 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1676 {
1677         u64 *sptep;
1678         struct rmap_iterator iter;
1679
1680         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1681                 mark_unsync(sptep);
1682         }
1683 }
1684
1685 static void mark_unsync(u64 *spte)
1686 {
1687         struct kvm_mmu_page *sp;
1688
1689         sp = sptep_to_sp(spte);
1690         if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
1691                 return;
1692         if (sp->unsync_children++)
1693                 return;
1694         kvm_mmu_mark_parents_unsync(sp);
1695 }
1696
1697 #define KVM_PAGE_ARRAY_NR 16
1698
1699 struct kvm_mmu_pages {
1700         struct mmu_page_and_offset {
1701                 struct kvm_mmu_page *sp;
1702                 unsigned int idx;
1703         } page[KVM_PAGE_ARRAY_NR];
1704         unsigned int nr;
1705 };
1706
1707 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
1708                          int idx)
1709 {
1710         int i;
1711
1712         if (sp->unsync)
1713                         for (i = 0; i < pvec->nr; i++)
1714                         if (pvec->page[i].sp == sp)
1715                                 return 0;
1716
1717         pvec->page[pvec->nr].sp = sp;
1718         pvec->page[pvec->nr].idx = idx;
1719         pvec->nr++;
1720         return (pvec->nr == KVM_PAGE_ARRAY_NR);
1721 }
1722
1723 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
1724 {
1725         --sp->unsync_children;
1726         WARN_ON_ONCE((int)sp->unsync_children < 0);
1727         __clear_bit(idx, sp->unsync_child_bitmap);
1728 }
1729
1730 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1731                            struct kvm_mmu_pages *pvec)
1732 {
1733         int i, ret, nr_unsync_leaf = 0;
1734
1735         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
1736                 struct kvm_mmu_page *child;
1737                 u64 ent = sp->spt[i];
1738
1739                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
1740                         clear_unsync_child_bit(sp, i);
1741                         continue;
1742                 }
1743
1744                 child = spte_to_child_sp(ent);
1745
1746                 if (child->unsync_children) {
1747                         if (mmu_pages_add(pvec, child, i))
1748                                 return -ENOSPC;
1749
1750                         ret = __mmu_unsync_walk(child, pvec);
1751                         if (!ret) {
1752                                 clear_unsync_child_bit(sp, i);
1753                                 continue;
1754                         } else if (ret > 0) {
1755                                 nr_unsync_leaf += ret;
1756                         } else
1757                                 return ret;
1758                 } else if (child->unsync) {
1759                         nr_unsync_leaf++;
1760                         if (mmu_pages_add(pvec, child, i))
1761                                 return -ENOSPC;
1762                 } else
1763                         clear_unsync_child_bit(sp, i);
1764         }
1765
1766         return nr_unsync_leaf;
1767 }
1768
1769 #define INVALID_INDEX (-1)
1770
1771 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1772                            struct kvm_mmu_pages *pvec)
1773 {
1774         pvec->nr = 0;
1775         if (!sp->unsync_children)
1776                 return 0;
1777
1778         mmu_pages_add(pvec, sp, INVALID_INDEX);
1779         return __mmu_unsync_walk(sp, pvec);
1780 }
1781
1782 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1783 {
1784         WARN_ON_ONCE(!sp->unsync);
1785         trace_kvm_mmu_sync_page(sp);
1786         sp->unsync = 0;
1787         --kvm->stat.mmu_unsync;
1788 }
1789
1790 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1791                                      struct list_head *invalid_list);
1792 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1793                                     struct list_head *invalid_list);
1794
1795 static bool sp_has_gptes(struct kvm_mmu_page *sp)
1796 {
1797         if (sp->role.direct)
1798                 return false;
1799
1800         if (sp->role.passthrough)
1801                 return false;
1802
1803         return true;
1804 }
1805
1806 #define for_each_valid_sp(_kvm, _sp, _list)                             \
1807         hlist_for_each_entry(_sp, _list, hash_link)                     \
1808                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
1809                 } else
1810
1811 #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)               \
1812         for_each_valid_sp(_kvm, _sp,                                    \
1813           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])     \
1814                 if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
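/*
 * Illustrative sketch (not compiled into KVM): typical use of the lookup
 * macro above.  Callers hold mmu_lock for write; "example_count_gfn_sps" is a
 * hypothetical helper, not an existing KVM function.
 */
#if 0
static int example_count_gfn_sps(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	int nr = 0;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn)
		nr++;
	return nr;
}
#endif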
1815
1816 static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1817 {
1818         union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
1819
1820         /*
1821          * Ignore various flags when verifying that it's safe to sync a shadow
1822          * page using the current MMU context.
1823          *
1824          *  - level: not part of the overall MMU role and will never match as the MMU's
1825          *           level tracks the root level
1826          *  - access: updated based on the new guest PTE
1827          *  - quadrant: not part of the overall MMU role (similar to level)
1828          */
1829         const union kvm_mmu_page_role sync_role_ign = {
1830                 .level = 0xf,
1831                 .access = 0x7,
1832                 .quadrant = 0x3,
1833                 .passthrough = 0x1,
1834         };
1835
1836         /*
1837          * Direct pages can never be unsync, and KVM should never attempt to
1838          * sync a shadow page for a different MMU context, e.g. if the role
1839          * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
1840          * reserved bits checks will be wrong, etc...
1841          */
1842         if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
1843                          (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
1844                 return false;
1845
1846         return true;
1847 }
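/*
 * Illustrative sketch (not compiled into KVM): the role check above compares
 * the packed union word.  Setting a field to its all-ones value in
 * sync_role_ign sets exactly that field's bits in .word, so masking the XOR
 * with ~sync_role_ign.word compares every field except the ignored ones.
 * "example_roles_match" is a hypothetical helper for illustration only.
 */
#if 0
static bool example_roles_match(union kvm_mmu_page_role a,
				union kvm_mmu_page_role b,
				union kvm_mmu_page_role ignored)
{
	return !((a.word ^ b.word) & ~ignored.word);
}
#endif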
1848
1849 static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
1850 {
1851         /* sp->spt[i] still holds the initial value from shadow page table allocation. */
1852         if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
1853                 return 0;
1854
1855         return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
1856 }
1857
1858 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1859 {
1860         int flush = 0;
1861         int i;
1862
1863         if (!kvm_sync_page_check(vcpu, sp))
1864                 return -1;
1865
1866         for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
1867                 int ret = kvm_sync_spte(vcpu, sp, i);
1868
1869                 if (ret < -1)
1870                         return -1;
1871                 flush |= ret;
1872         }
1873
1874         /*
1875          * Note, any flush is purely for KVM's correctness, e.g. when dropping
1876          * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
1877          * unmap or dirty logging event doesn't fail to flush.  The guest is
1878          * responsible for flushing the TLB to ensure any changes in protection
1879          * bits are recognized, i.e. until the guest flushes or page faults on
1880          * a relevant address, KVM is architecturally allowed to let vCPUs use
1881          * cached translations with the old protection bits.
1882          */
1883         return flush;
1884 }
1885
1886 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1887                          struct list_head *invalid_list)
1888 {
1889         int ret = __kvm_sync_page(vcpu, sp);
1890
1891         if (ret < 0)
1892                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1893         return ret;
1894 }
1895
1896 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1897                                         struct list_head *invalid_list,
1898                                         bool remote_flush)
1899 {
1900         if (!remote_flush && list_empty(invalid_list))
1901                 return false;
1902
1903         if (!list_empty(invalid_list))
1904                 kvm_mmu_commit_zap_page(kvm, invalid_list);
1905         else
1906                 kvm_flush_remote_tlbs(kvm);
1907         return true;
1908 }
1909
1910 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
1911 {
1912         if (sp->role.invalid)
1913                 return true;
1914
1915         /* TDP MMU pages do not use the MMU generation. */
1916         return !is_tdp_mmu_page(sp) &&
1917                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
1918 }
1919
1920 struct mmu_page_path {
1921         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
1922         unsigned int idx[PT64_ROOT_MAX_LEVEL];
1923 };
1924
1925 #define for_each_sp(pvec, sp, parents, i)                       \
1926                 for (i = mmu_pages_first(&pvec, &parents);      \
1927                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
1928                         i = mmu_pages_next(&pvec, &parents, i))
1929
1930 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
1931                           struct mmu_page_path *parents,
1932                           int i)
1933 {
1934         int n;
1935
1936         for (n = i+1; n < pvec->nr; n++) {
1937                 struct kvm_mmu_page *sp = pvec->page[n].sp;
1938                 unsigned idx = pvec->page[n].idx;
1939                 int level = sp->role.level;
1940
1941                 parents->idx[level-1] = idx;
1942                 if (level == PG_LEVEL_4K)
1943                         break;
1944
1945                 parents->parent[level-2] = sp;
1946         }
1947
1948         return n;
1949 }
1950
1951 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
1952                            struct mmu_page_path *parents)
1953 {
1954         struct kvm_mmu_page *sp;
1955         int level;
1956
1957         if (pvec->nr == 0)
1958                 return 0;
1959
1960         WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
1961
1962         sp = pvec->page[0].sp;
1963         level = sp->role.level;
1964         WARN_ON_ONCE(level == PG_LEVEL_4K);
1965
1966         parents->parent[level-2] = sp;
1967
1968         /* Also set up a sentinel.  Further entries in pvec are all
1969          * children of sp, so this element is never overwritten.
1970          */
1971         parents->parent[level-1] = NULL;
1972         return mmu_pages_next(pvec, parents, 0);
1973 }
1974
1975 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
1976 {
1977         struct kvm_mmu_page *sp;
1978         unsigned int level = 0;
1979
1980         do {
1981                 unsigned int idx = parents->idx[level];
1982                 sp = parents->parent[level];
1983                 if (!sp)
1984                         return;
1985
1986                 WARN_ON_ONCE(idx == INVALID_INDEX);
1987                 clear_unsync_child_bit(sp, idx);
1988                 level++;
1989         } while (!sp->unsync_children);
1990 }
1991
1992 static int mmu_sync_children(struct kvm_vcpu *vcpu,
1993                              struct kvm_mmu_page *parent, bool can_yield)
1994 {
1995         int i;
1996         struct kvm_mmu_page *sp;
1997         struct mmu_page_path parents;
1998         struct kvm_mmu_pages pages;
1999         LIST_HEAD(invalid_list);
2000         bool flush = false;
2001
2002         while (mmu_unsync_walk(parent, &pages)) {
2003                 bool protected = false;
2004
2005                 for_each_sp(pages, sp, parents, i)
2006                         protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn);
2007
2008                 if (protected) {
2009                         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
2010                         flush = false;
2011                 }
2012
2013                 for_each_sp(pages, sp, parents, i) {
2014                         kvm_unlink_unsync_page(vcpu->kvm, sp);
2015                         flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
2016                         mmu_pages_clear_parents(&parents);
2017                 }
2018                 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
2019                         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2020                         if (!can_yield) {
2021                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2022                                 return -EINTR;
2023                         }
2024
2025                         cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2026                         flush = false;
2027                 }
2028         }
2029
2030         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
2031         return 0;
2032 }
2033
2034 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2035 {
2036         atomic_set(&sp->write_flooding_count, 0);
2037 }
2038
2039 static void clear_sp_write_flooding_count(u64 *spte)
2040 {
2041         __clear_sp_write_flooding_count(sptep_to_sp(spte));
2042 }
2043
2044 /*
2045  * The vCPU is required when finding indirect shadow pages; the shadow
2046  * page may already exist and syncing it needs the vCPU pointer in
2047  * order to read guest page tables.  Direct shadow pages are never
2048  * unsync, thus @vcpu can be NULL if @role.direct is true.
2049  */
2050 static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
2051                                                      struct kvm_vcpu *vcpu,
2052                                                      gfn_t gfn,
2053                                                      struct hlist_head *sp_list,
2054                                                      union kvm_mmu_page_role role)
2055 {
2056         struct kvm_mmu_page *sp;
2057         int ret;
2058         int collisions = 0;
2059         LIST_HEAD(invalid_list);
2060
2061         for_each_valid_sp(kvm, sp, sp_list) {
2062                 if (sp->gfn != gfn) {
2063                         collisions++;
2064                         continue;
2065                 }
2066
2067                 if (sp->role.word != role.word) {
2068                         /*
2069                          * If the guest is creating an upper-level page, zap
2070                          * unsync pages for the same gfn.  While it's possible
2071                          * the guest is using recursive page tables, in all
2072                          * likelihood the guest has stopped using the unsync
2073                          * page and is installing a completely unrelated page.
2074                          * Unsync pages must not be left as is, because the new
2075                          * upper-level page will be write-protected.
2076                          */
2077                         if (role.level > PG_LEVEL_4K && sp->unsync)
2078                                 kvm_mmu_prepare_zap_page(kvm, sp,
2079                                                          &invalid_list);
2080                         continue;
2081                 }
2082
2083                 /* unsync and write-flooding only apply to indirect SPs. */
2084                 if (sp->role.direct)
2085                         goto out;
2086
2087                 if (sp->unsync) {
2088                         if (KVM_BUG_ON(!vcpu, kvm))
2089                                 break;
2090
2091                         /*
2092                          * The page is good, but is stale.  kvm_sync_page does
2093                          * get the latest guest state, but (unlike mmu_unsync_children)
2094                          * it doesn't write-protect the page or mark it synchronized!
2095                          * This way the validity of the mapping is ensured, but the
2096                          * overhead of write protection is not incurred until the
2097                          * guest invalidates the TLB mapping.  This allows multiple
2098                          * SPs for a single gfn to be unsync.
2099                          *
2100                          * If the sync fails, the page is zapped.  If so, break
2101                          * in order to rebuild it.
2102                          */
2103                         ret = kvm_sync_page(vcpu, sp, &invalid_list);
2104                         if (ret < 0)
2105                                 break;
2106
2107                         WARN_ON_ONCE(!list_empty(&invalid_list));
2108                         if (ret > 0)
2109                                 kvm_flush_remote_tlbs(kvm);
2110                 }
2111
2112                 __clear_sp_write_flooding_count(sp);
2113
2114                 goto out;
2115         }
2116
2117         sp = NULL;
2118         ++kvm->stat.mmu_cache_miss;
2119
2120 out:
2121         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2122
2123         if (collisions > kvm->stat.max_mmu_page_hash_collisions)
2124                 kvm->stat.max_mmu_page_hash_collisions = collisions;
2125         return sp;
2126 }
2127
2128 /* Caches used when allocating a new shadow page. */
2129 struct shadow_page_caches {
2130         struct kvm_mmu_memory_cache *page_header_cache;
2131         struct kvm_mmu_memory_cache *shadow_page_cache;
2132         struct kvm_mmu_memory_cache *shadowed_info_cache;
2133 };
2134
2135 static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
2136                                                       struct shadow_page_caches *caches,
2137                                                       gfn_t gfn,
2138                                                       struct hlist_head *sp_list,
2139                                                       union kvm_mmu_page_role role)
2140 {
2141         struct kvm_mmu_page *sp;
2142
2143         sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache);
2144         sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache);
2145         if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL)
2146                 sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache);
2147
2148         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2149
2150         INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
2151
2152         /*
2153          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2154          * depends on valid pages being added to the head of the list.  See
2155          * comments in kvm_zap_obsolete_pages().
2156          */
2157         sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2158         list_add(&sp->link, &kvm->arch.active_mmu_pages);
2159         kvm_account_mmu_page(kvm, sp);
2160
2161         sp->gfn = gfn;
2162         sp->role = role;
2163         hlist_add_head(&sp->hash_link, sp_list);
2164         if (sp_has_gptes(sp))
2165                 account_shadowed(kvm, sp);
2166
2167         return sp;
2168 }
2169
2170 /* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */
2171 static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
2172                                                       struct kvm_vcpu *vcpu,
2173                                                       struct shadow_page_caches *caches,
2174                                                       gfn_t gfn,
2175                                                       union kvm_mmu_page_role role)
2176 {
2177         struct hlist_head *sp_list;
2178         struct kvm_mmu_page *sp;
2179         bool created = false;
2180
2181         sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2182
2183         sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
2184         if (!sp) {
2185                 created = true;
2186                 sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
2187         }
2188
2189         trace_kvm_mmu_get_page(sp, created);
2190         return sp;
2191 }
2192
2193 static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
2194                                                     gfn_t gfn,
2195                                                     union kvm_mmu_page_role role)
2196 {
2197         struct shadow_page_caches caches = {
2198                 .page_header_cache = &vcpu->arch.mmu_page_header_cache,
2199                 .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
2200                 .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2201         };
2202
2203         return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
2204 }
2205
2206 static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2207                                                   unsigned int access)
2208 {
2209         struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2210         union kvm_mmu_page_role role;
2211
2212         role = parent_sp->role;
2213         role.level--;
2214         role.access = access;
2215         role.direct = direct;
2216         role.passthrough = 0;
2217
2218         /*
2219          * If the guest has 4-byte PTEs then that means it's using 32-bit,
2220          * 2-level, non-PAE paging. KVM shadows such guests with PAE paging
2221          * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
2222          * shadow each guest page table with multiple shadow page tables, which
2223          * requires extra bookkeeping in the role.
2224          *
2225          * Specifically, to shadow the guest's page directory (which covers a
2226          * 4GiB address space), KVM uses 4 PAE page directories, each mapping
2227          * 1GiB of the address space. @role.quadrant encodes which quarter of
2228          * the address space each maps.
2229          *
2230          * To shadow the guest's page tables (which each map a 4MiB region), KVM
2231          * uses 2 PAE page tables, each mapping a 2MiB region. For these,
2232          * @role.quadrant encodes which half of the region they map.
2233          *
2234          * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
2235          * consumes bits 29:21.  To consume bits 31:30, KVM uses 4 shadow
2236          * PDPTEs; those 4 PAE page directories are pre-allocated and their
2237          * quadrant is assigned in mmu_alloc_root().   A 4-byte PTE consumes
2238          * bits 21:12, while an 8-byte PTE consumes bits 20:12.  To consume
2239          * bit 21 in the PTE (the child here), KVM propagates that bit to the
2240          * quadrant, i.e. sets quadrant to '0' or '1'.  The parent 8-byte PDE
2241          * covers bit 21 (see above), thus the quadrant is calculated from the
2242          * _least_ significant bit of the PDE index.
2243          */
2244         if (role.has_4_byte_gpte) {
2245                 WARN_ON_ONCE(role.level != PG_LEVEL_4K);
2246                 role.quadrant = spte_index(sptep) & 1;
2247         }
2248
2249         return role;
2250 }
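/*
 * Illustrative sketch (not compiled into KVM): a worked example of the
 * quadrant encoding described above for a 32-bit, non-PAE guest.  The shadow
 * (8-byte) PDE index covers GVA bits 29:21; its least significant bit is GVA
 * bit 21, which the 4-byte guest PTE index covers but the 8-byte shadow PTE
 * index does not, so that bit becomes the child's quadrant.  The helper name
 * is hypothetical.
 */
#if 0
static unsigned int example_child_pt_quadrant(gva_t gva)
{
	unsigned int shadow_pde_index = (gva >> 21) & 0x1ff;	/* GVA bits 29:21 */

	/* Matches "spte_index(sptep) & 1" in kvm_mmu_child_role(). */
	return shadow_pde_index & 1;				/* GVA bit 21 */
}
#endif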
2251
2252 static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
2253                                                  u64 *sptep, gfn_t gfn,
2254                                                  bool direct, unsigned int access)
2255 {
2256         union kvm_mmu_page_role role;
2257
2258         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
2259                 return ERR_PTR(-EEXIST);
2260
2261         role = kvm_mmu_child_role(sptep, direct, access);
2262         return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2263 }
2264
2265 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2266                                         struct kvm_vcpu *vcpu, hpa_t root,
2267                                         u64 addr)
2268 {
2269         iterator->addr = addr;
2270         iterator->shadow_addr = root;
2271         iterator->level = vcpu->arch.mmu->root_role.level;
2272
2273         if (iterator->level >= PT64_ROOT_4LEVEL &&
2274             vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2275             !vcpu->arch.mmu->root_role.direct)
2276                 iterator->level = PT32E_ROOT_LEVEL;
2277
2278         if (iterator->level == PT32E_ROOT_LEVEL) {
2279                 /*
2280                  * prev_root is currently only used for 64-bit hosts. So only
2281                  * the active root_hpa is valid here.
2282                  */
2283                 BUG_ON(root != vcpu->arch.mmu->root.hpa);
2284
2285                 iterator->shadow_addr
2286                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2287                 iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2288                 --iterator->level;
2289                 if (!iterator->shadow_addr)
2290                         iterator->level = 0;
2291         }
2292 }
2293
2294 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2295                              struct kvm_vcpu *vcpu, u64 addr)
2296 {
2297         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa,
2298                                     addr);
2299 }
2300
2301 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2302 {
2303         if (iterator->level < PG_LEVEL_4K)
2304                 return false;
2305
2306         iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2307         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2308         return true;
2309 }
2310
2311 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2312                                u64 spte)
2313 {
2314         if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
2315                 iterator->level = 0;
2316                 return;
2317         }
2318
2319         iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2320         --iterator->level;
2321 }
2322
2323 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2324 {
2325         __shadow_walk_next(iterator, *iterator->sptep);
2326 }
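/*
 * Illustrative sketch (not compiled into KVM): walking the shadow page tables
 * for an address with the iterator above.  A real walker runs under mmu_lock
 * (or inside walk_shadow_page_lockless_begin/end); locking is omitted here.
 */
#if 0
static u64 example_walk_shadow(struct kvm_vcpu *vcpu, u64 addr)
{
	struct kvm_shadow_walk_iterator it;
	u64 spte = 0;

	for (shadow_walk_init(&it, vcpu, addr); shadow_walk_okay(&it);
	     shadow_walk_next(&it)) {
		spte = *it.sptep;
		/* Stop at the first non-present or leaf SPTE. */
		if (!is_shadow_present_pte(spte) || is_last_spte(spte, it.level))
			break;
	}
	return spte;
}
#endif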
2327
2328 static void __link_shadow_page(struct kvm *kvm,
2329                                struct kvm_mmu_memory_cache *cache, u64 *sptep,
2330                                struct kvm_mmu_page *sp, bool flush)
2331 {
2332         u64 spte;
2333
2334         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2335
2336         /*
2337          * If an SPTE is present already, it must be a leaf and therefore
2338          * a large one.  Drop it, and flush the TLB if needed, before
2339          * installing sp.
2340          */
2341         if (is_shadow_present_pte(*sptep))
2342                 drop_large_spte(kvm, sptep, flush);
2343
2344         spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
2345
2346         mmu_spte_set(sptep, spte);
2347
2348         mmu_page_add_parent_pte(cache, sp, sptep);
2349
2350         /*
2351          * The non-direct sub-pagetable must be updated before linking.  For
2352          * L1 sp, the pagetable is updated via kvm_sync_page() in
2353          * kvm_mmu_find_shadow_page() without write-protecting the gfn,
2354          * so sp->unsync can be true or false.  For higher level non-direct
2355          * sp, the pagetable is updated/synced via mmu_sync_children() in
2356          * FNAME(fetch)(), so sp->unsync_children can only be false.
2357          * WARN_ON_ONCE() if anything happens unexpectedly.
2358          */
2359         if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
2360                 mark_unsync(sptep);
2361 }
2362
2363 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2364                              struct kvm_mmu_page *sp)
2365 {
2366         __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true);
2367 }
2368
2369 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2370                                    unsigned direct_access)
2371 {
2372         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2373                 struct kvm_mmu_page *child;
2374
2375                 /*
2376                  * For a direct sp, if the guest PTE's dirty bit changed
2377                  * from clean to dirty, it would corrupt the sp's access,
2378                  * i.e. allow writes through a read-only sp.  Update the
2379                  * SPTE at this point to get a new sp with the correct
2380                  * access.
2381                  */
2382                 child = spte_to_child_sp(*sptep);
2383                 if (child->role.access == direct_access)
2384                         return;
2385
2386                 drop_parent_pte(vcpu->kvm, child, sptep);
2387                 kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
2388         }
2389 }
2390
2391 /* Returns the number of zapped non-leaf child shadow pages. */
2392 static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2393                             u64 *spte, struct list_head *invalid_list)
2394 {
2395         u64 pte;
2396         struct kvm_mmu_page *child;
2397
2398         pte = *spte;
2399         if (is_shadow_present_pte(pte)) {
2400                 if (is_last_spte(pte, sp->role.level)) {
2401                         drop_spte(kvm, spte);
2402                 } else {
2403                         child = spte_to_child_sp(pte);
2404                         drop_parent_pte(kvm, child, spte);
2405
2406                         /*
2407                          * Recursively zap nested TDP SPs, parentless SPs are
2408                          * unlikely to be used again in the near future.  This
2409                          * avoids retaining a large number of stale nested SPs.
2410                          */
2411                         if (tdp_enabled && invalid_list &&
2412                             child->role.guest_mode && !child->parent_ptes.val)
2413                                 return kvm_mmu_prepare_zap_page(kvm, child,
2414                                                                 invalid_list);
2415                 }
2416         } else if (is_mmio_spte(kvm, pte)) {
2417                 mmu_spte_clear_no_track(spte);
2418         }
2419         return 0;
2420 }
2421
2422 static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2423                                         struct kvm_mmu_page *sp,
2424                                         struct list_head *invalid_list)
2425 {
2426         int zapped = 0;
2427         unsigned i;
2428
2429         for (i = 0; i < SPTE_ENT_PER_PAGE; ++i)
2430                 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
2431
2432         return zapped;
2433 }
2434
2435 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2436 {
2437         u64 *sptep;
2438         struct rmap_iterator iter;
2439
2440         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2441                 drop_parent_pte(kvm, sp, sptep);
2442 }
2443
2444 static int mmu_zap_unsync_children(struct kvm *kvm,
2445                                    struct kvm_mmu_page *parent,
2446                                    struct list_head *invalid_list)
2447 {
2448         int i, zapped = 0;
2449         struct mmu_page_path parents;
2450         struct kvm_mmu_pages pages;
2451
2452         if (parent->role.level == PG_LEVEL_4K)
2453                 return 0;
2454
2455         while (mmu_unsync_walk(parent, &pages)) {
2456                 struct kvm_mmu_page *sp;
2457
2458                 for_each_sp(pages, sp, parents, i) {
2459                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2460                         mmu_pages_clear_parents(&parents);
2461                         zapped++;
2462                 }
2463         }
2464
2465         return zapped;
2466 }
2467
2468 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2469                                        struct kvm_mmu_page *sp,
2470                                        struct list_head *invalid_list,
2471                                        int *nr_zapped)
2472 {
2473         bool list_unstable, zapped_root = false;
2474
2475         lockdep_assert_held_write(&kvm->mmu_lock);
2476         trace_kvm_mmu_prepare_zap_page(sp);
2477         ++kvm->stat.mmu_shadow_zapped;
2478         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2479         *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2480         kvm_mmu_unlink_parents(kvm, sp);
2481
2482         /* Zapping children means active_mmu_pages has become unstable. */
2483         list_unstable = *nr_zapped;
2484
2485         if (!sp->role.invalid && sp_has_gptes(sp))
2486                 unaccount_shadowed(kvm, sp);
2487
2488         if (sp->unsync)
2489                 kvm_unlink_unsync_page(kvm, sp);
2490         if (!sp->root_count) {
2491                 /* Count self */
2492                 (*nr_zapped)++;
2493
2494                 /*
2495                  * Already invalid pages (previously active roots) are not on
2496                  * the active page list.  See list_del() in the "else" case of
2497                  * !sp->root_count.
2498                  */
2499                 if (sp->role.invalid)
2500                         list_add(&sp->link, invalid_list);
2501                 else
2502                         list_move(&sp->link, invalid_list);
2503                 kvm_unaccount_mmu_page(kvm, sp);
2504         } else {
2505                 /*
2506                  * Remove the active root from the active page list, the root
2507                  * will be explicitly freed when the root_count hits zero.
2508                  */
2509                 list_del(&sp->link);
2510
2511                 /*
2512                  * Obsolete pages cannot be used on any vCPUs, see the comment
2513                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2514                  * treats invalid shadow pages as being obsolete.
2515                  */
2516                 zapped_root = !is_obsolete_sp(kvm, sp);
2517         }
2518
2519         if (sp->nx_huge_page_disallowed)
2520                 unaccount_nx_huge_page(kvm, sp);
2521
2522         sp->role.invalid = 1;
2523
2524         /*
2525          * Make the request to free obsolete roots after marking the root
2526          * invalid, otherwise other vCPUs may not see it as invalid.
2527          */
2528         if (zapped_root)
2529                 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2530         return list_unstable;
2531 }
2532
2533 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2534                                      struct list_head *invalid_list)
2535 {
2536         int nr_zapped;
2537
2538         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2539         return nr_zapped;
2540 }
2541
2542 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2543                                     struct list_head *invalid_list)
2544 {
2545         struct kvm_mmu_page *sp, *nsp;
2546
2547         if (list_empty(invalid_list))
2548                 return;
2549
2550         /*
2551          * We need to make sure everyone sees our modifications to
2552          * the page tables and sees changes to vcpu->mode here. The barrier
2553          * in kvm_flush_remote_tlbs() achieves this. This pairs
2554          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2555          *
2556          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2557          * guest mode and/or lockless shadow page table walks.
2558          */
2559         kvm_flush_remote_tlbs(kvm);
2560
2561         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2562                 WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
2563                 kvm_mmu_free_shadow_page(sp);
2564         }
2565 }
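/*
 * Illustrative sketch (not compiled into KVM): the two-phase zap pattern used
 * throughout this file.  Pages are first queued on a local invalid_list by
 * kvm_mmu_prepare_zap_page() while mmu_lock is held for write, then freed in
 * one batch, after a single remote TLB flush, by kvm_mmu_commit_zap_page().
 * "example_zap_one" is a hypothetical wrapper for illustration only.
 */
#if 0
static void example_zap_one(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	LIST_HEAD(invalid_list);

	write_lock(&kvm->mmu_lock);
	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	write_unlock(&kvm->mmu_lock);
}
#endif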
2566
2567 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2568                                                   unsigned long nr_to_zap)
2569 {
2570         unsigned long total_zapped = 0;
2571         struct kvm_mmu_page *sp, *tmp;
2572         LIST_HEAD(invalid_list);
2573         bool unstable;
2574         int nr_zapped;
2575
2576         if (list_empty(&kvm->arch.active_mmu_pages))
2577                 return 0;
2578
2579 restart:
2580         list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2581                 /*
2582                  * Don't zap active root pages, the page itself can't be freed
2583                  * and zapping it will just force vCPUs to realloc and reload.
2584                  */
2585                 if (sp->root_count)
2586                         continue;
2587
2588                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
2589                                                       &nr_zapped);
2590                 total_zapped += nr_zapped;
2591                 if (total_zapped >= nr_to_zap)
2592                         break;
2593
2594                 if (unstable)
2595                         goto restart;
2596         }
2597
2598         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2599
2600         kvm->stat.mmu_recycled += total_zapped;
2601         return total_zapped;
2602 }
2603
2604 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2605 {
2606         if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2607                 return kvm->arch.n_max_mmu_pages -
2608                         kvm->arch.n_used_mmu_pages;
2609
2610         return 0;
2611 }
2612
2613 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2614 {
2615         unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
2616
2617         if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2618                 return 0;
2619
2620         kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
2621
2622         /*
2623          * Note, this check is intentionally soft, it only guarantees that one
2624          * page is available, while the caller may end up allocating as many as
2625          * four pages, e.g. for PAE roots or for 5-level paging.  Temporarily
2626          * exceeding the (arbitrary by default) limit will not harm the host,
2627          * being too aggressive may unnecessarily kill the guest, and getting an
2628          * exact count is far more trouble than it's worth, especially in the
2629          * page fault paths.
2630          */
2631         if (!kvm_mmu_available_pages(vcpu->kvm))
2632                 return -ENOSPC;
2633         return 0;
2634 }
2635
2636 /*
2637  * Change the number of MMU pages allocated to the VM.
2638  * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
2639  */
2640 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2641 {
2642         write_lock(&kvm->mmu_lock);
2643
2644         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2645                 kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
2646                                                   goal_nr_mmu_pages);
2647
2648                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2649         }
2650
2651         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2652
2653         write_unlock(&kvm->mmu_lock);
2654 }
2655
2656 bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
2657                                        bool always_retry)
2658 {
2659         struct kvm *kvm = vcpu->kvm;
2660         LIST_HEAD(invalid_list);
2661         struct kvm_mmu_page *sp;
2662         gpa_t gpa = cr2_or_gpa;
2663         bool r = false;
2664
2665         /*
2666          * Bail early if there aren't any write-protected shadow pages to avoid
2667          * unnecessarily taking mmu_lock, e.g. if the gfn is write-tracked
2668          * by a third party.  Reading indirect_shadow_pages without holding
2669          * mmu_lock is safe, as this is purely an optimization, i.e. a false
2670          * positive is benign, and a false negative will simply result in KVM
2671          * skipping the unprotect+retry path, which is also an optimization.
2672          */
2673         if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
2674                 goto out;
2675
2676         if (!vcpu->arch.mmu->root_role.direct) {
2677                 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
2678                 if (gpa == INVALID_GPA)
2679                         goto out;
2680         }
2681
2682         write_lock(&kvm->mmu_lock);
2683         for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
2684                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2685
2686         /*
2687          * Snapshot the result before zapping, as zapping will remove all list
2688          * entries, i.e. checking the list later would yield a false negative.
2689          */
2690         r = !list_empty(&invalid_list);
2691         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2692         write_unlock(&kvm->mmu_lock);
2693
2694 out:
2695         if (r || always_retry) {
2696                 vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
2697                 vcpu->arch.last_retry_addr = cr2_or_gpa;
2698         }
2699         return r;
2700 }
2701
2702 static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2703 {
2704         trace_kvm_mmu_unsync_page(sp);
2705         ++kvm->stat.mmu_unsync;
2706         sp->unsync = 1;
2707
2708         kvm_mmu_mark_parents_unsync(sp);
2709 }
2710
2711 /*
2712  * Attempt to unsync any shadow pages that can be reached by the specified gfn;
2713  * KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
2714  * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2715  * be write-protected.
2716  */
2717 int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2718                             gfn_t gfn, bool synchronizing, bool prefetch)
2719 {
2720         struct kvm_mmu_page *sp;
2721         bool locked = false;
2722
2723         /*
2724          * Force write-protection if the page is being tracked.  Note, the page
2725          * track machinery is used to write-protect upper-level shadow pages,
2726          * i.e. this guards the role.level == 4K assertion below!
2727          */
2728         if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
2729                 return -EPERM;
2730
2731         /*
2732          * The page is not write-tracked, mark existing shadow pages unsync
2733          * unless KVM is synchronizing an unsync SP.  In that case, KVM must
2734          * complete emulation of the guest TLB flush before allowing shadow
2735          * pages to become unsync (writable by the guest).
2736          */
2737         for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2738                 if (synchronizing)
2739                         return -EPERM;
2740
2741                 if (sp->unsync)
2742                         continue;
2743
2744                 if (prefetch)
2745                         return -EEXIST;
2746
2747                 /*
2748                  * TDP MMU page faults require an additional spinlock as they
2749                  * run with mmu_lock held for read, not write, and the unsync
2750                  * logic is not thread-safe.  Take the spinlock regardless of
2751                  * the MMU type to avoid extra conditionals/parameters; there's
2752                  * no meaningful penalty if mmu_lock is held for write.
2753                  */
2754                 if (!locked) {
2755                         locked = true;
2756                         spin_lock(&kvm->arch.mmu_unsync_pages_lock);
2757
2758                         /*
2759                          * Recheck after taking the spinlock, a different vCPU
2760                          * may have since marked the page unsync.  A false
2761                          * negative on the unprotected check above is not
2762                          * possible as clearing sp->unsync _must_ hold mmu_lock
2763                          * for write, i.e. unsync cannot transition from 1->0
2764                          * while this CPU holds mmu_lock for read (or write).
2765                          */
2766                         if (READ_ONCE(sp->unsync))
2767                                 continue;
2768                 }
2769
2770                 WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
2771                 kvm_unsync_page(kvm, sp);
2772         }
2773         if (locked)
2774                 spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
2775
2776         /*
2777          * We need to ensure that the marking of unsync pages is visible
2778          * before the SPTE is updated to allow writes because
2779          * kvm_mmu_sync_roots() checks the unsync flags without holding
2780          * the MMU lock and so can race with this. If the SPTE was updated
2781          * before the page had been marked as unsync-ed, something like the
2782          * following could happen:
2783          *
2784          * CPU 1                    CPU 2
2785          * ---------------------------------------------------------------------
2786          * 1.2 Host updates SPTE
2787          *     to be writable
2788          *                      2.1 Guest writes a GPTE for GVA X.
2789          *                          (GPTE being in the guest page table shadowed
2790          *                           by the SP from CPU 1.)
2791          *                          This reads SPTE during the page table walk.
2792          *                          Since SPTE.W is read as 1, there is no
2793          *                          fault.
2794          *
2795          *                      2.2 Guest issues TLB flush.
2796          *                          That causes a VM Exit.
2797          *
2798          *                      2.3 Walking of unsync pages sees sp->unsync is
2799          *                          false and skips the page.
2800          *
2801          *                      2.4 Guest accesses GVA X.
2802          *                          Since the mapping in the SP was not updated,
2803          *                          the old mapping for GVA X incorrectly
2804          *                          gets used.
2805          * 1.1 Host marks SP
2806          *     as unsync
2807          *     (sp->unsync = true)
2808          *
2809          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2810          * the situation in 2.4 does not arise.  It pairs with the read barrier
2811          * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2812          */
2813         smp_wmb();
2814
2815         return 0;
2816 }
2817
2818 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
2819                         u64 *sptep, unsigned int pte_access, gfn_t gfn,
2820                         kvm_pfn_t pfn, struct kvm_page_fault *fault)
2821 {
2822         struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2823         int level = sp->role.level;
2824         int was_rmapped = 0;
2825         int ret = RET_PF_FIXED;
2826         bool flush = false;
2827         bool wrprot;
2828         u64 spte;
2829
2830         /* Prefetching always gets a writable pfn.  */
2831         bool host_writable = !fault || fault->map_writable;
2832         bool prefetch = !fault || fault->prefetch;
2833         bool write_fault = fault && fault->write;
2834
2835         if (unlikely(is_noslot_pfn(pfn))) {
2836                 vcpu->stat.pf_mmio_spte_created++;
2837                 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
2838                 return RET_PF_EMULATE;
2839         }
2840
2841         if (is_shadow_present_pte(*sptep)) {
2842                 if (prefetch)
2843                         return RET_PF_SPURIOUS;
2844
2845                 /*
2846                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
2847                  * the parent of the now unreachable PTE.
2848                  */
2849                 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) {
2850                         struct kvm_mmu_page *child;
2851                         u64 pte = *sptep;
2852
2853                         child = spte_to_child_sp(pte);
2854                         drop_parent_pte(vcpu->kvm, child, sptep);
2855                         flush = true;
2856                 } else if (pfn != spte_to_pfn(*sptep)) {
2857                         drop_spte(vcpu->kvm, sptep);
2858                         flush = true;
2859                 } else
2860                         was_rmapped = 1;
2861         }
2862
2863         wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
2864                            false, host_writable, &spte);
2865
2866         if (*sptep == spte) {
2867                 ret = RET_PF_SPURIOUS;
2868         } else {
2869                 flush |= mmu_spte_update(sptep, spte);
2870                 trace_kvm_mmu_set_spte(level, gfn, sptep);
2871         }
2872
2873         if (wrprot && write_fault)
2874                 ret = RET_PF_WRITE_PROTECTED;
2875
2876         if (flush)
2877                 kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
2878
2879         if (!was_rmapped) {
2880                 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2881                 rmap_add(vcpu, slot, sptep, gfn, pte_access);
2882         } else {
2883                 /* Already rmapped but the pte_access bits may have changed. */
2884                 kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access);
2885         }
2886
2887         return ret;
2888 }
2889
2890 static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep,
2891                                    int nr_pages, unsigned int access)
2892 {
2893         struct page *pages[PTE_PREFETCH_NUM];
2894         struct kvm_memory_slot *slot;
2895         int i;
2896
2897         if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM))
2898                 return false;
2899
2900         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
2901         if (!slot)
2902                 return false;
2903
2904         nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages);
2905         if (nr_pages <= 0)
2906                 return false;
2907
2908         for (i = 0; i < nr_pages; i++, gfn++, sptep++) {
2909                 mmu_set_spte(vcpu, slot, sptep, access, gfn,
2910                              page_to_pfn(pages[i]), NULL);
2911
2912                 /*
2913                  * KVM always prefetches writable pages from the primary MMU,
2914                  * and KVM can make its SPTE writable in the fast page fault
2915                  * handler, without notifying the primary MMU.  Mark pages/folios dirty
2916                  * now to ensure file data is written back if it ends up being
2917                  * written by the guest.  Because KVM's prefetching GUPs
2918                  * writable PTEs, the probability of unnecessary writeback is
2919                  * extremely low.
2920                  */
2921                 kvm_release_page_dirty(pages[i]);
2922         }
2923
2924         return true;
2925 }
2926
2927 static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2928                                      struct kvm_mmu_page *sp,
2929                                      u64 *start, u64 *end)
2930 {
2931         gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start));
2932         unsigned int access = sp->role.access;
2933
2934         return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access);
2935 }
2936
2937 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2938                                   struct kvm_mmu_page *sp, u64 *sptep)
2939 {
2940         u64 *spte, *start = NULL;
2941         int i;
2942
2943         WARN_ON_ONCE(!sp->role.direct);
2944
2945         i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
2946         spte = sp->spt + i;
2947
2948         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2949                 if (is_shadow_present_pte(*spte) || spte == sptep) {
2950                         if (!start)
2951                                 continue;
2952                         if (!direct_pte_prefetch_many(vcpu, sp, start, spte))
2953                                 return;
2954
2955                         start = NULL;
2956                 } else if (!start)
2957                         start = spte;
2958         }
2959         if (start)
2960                 direct_pte_prefetch_many(vcpu, sp, start, spte);
2961 }
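
/*
 * Worked example for the window computed in __direct_pte_prefetch() (a
 * sketch assuming PTE_PREFETCH_NUM == 8): for spte_index(sptep) == 13,
 *
 *      i = 13 & ~(8 - 1) = 8
 *
 * so the scan covers sp->spt[8..15].  Present SPTEs and the faulting SPTE
 * itself end a run, and each maximal run of non-present SPTEs is handed to
 * direct_pte_prefetch_many() as the half-open range [start, spte).
 */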
2962
2963 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2964 {
2965         struct kvm_mmu_page *sp;
2966
2967         sp = sptep_to_sp(sptep);
2968
2969         /*
2970          * Without accessed bits, there's no way to distinguish between
2971          * actually accessed translations and prefetched ones, so disable pte
2972          * prefetch if accessed bits aren't available.
2973          */
2974         if (sp_ad_disabled(sp))
2975                 return;
2976
2977         if (sp->role.level > PG_LEVEL_4K)
2978                 return;
2979
2980         /*
2981          * If addresses are being invalidated, skip prefetching to avoid
2982          * accidentally prefetching those addresses.
2983          */
2984         if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
2985                 return;
2986
2987         __direct_pte_prefetch(vcpu, sp, sptep);
2988 }
2989
2990 /*
2991  * Lookup the mapping level for @gfn in the current mm.
2992  *
2993  * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
2994  * consumer to be tied into KVM's handlers for MMU notifier events!
2995  *
2996  * There are several ways to safely use this helper:
2997  *
2998  * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
2999  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
3000  *   lookup, but it does need to be held while checking the MMU notifier.
3001  *
3002  * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
3003  *   event for the hva.  This can be done by explicitly checking the MMU notifier
3004  *   or by ensuring that KVM already has a valid mapping that covers the hva.
3005  *
3006  * - Do not use the result to install new mappings, e.g. use the host mapping
3007  *   level only to decide whether or not to zap an entry.  In this case, it's
3008  *   not required to hold mmu_lock (though it's highly likely the caller will
3009  *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
3010  *
3011  * Note!  The lookup can still race with modifications to host page tables, but
3012  * the above "rules" ensure KVM will not _consume_ the result of the walk if a
3013  * race with the primary MMU occurs.
3014  */
3015 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
3016                                   const struct kvm_memory_slot *slot)
3017 {
3018         int level = PG_LEVEL_4K;
3019         unsigned long hva;
3020         unsigned long flags;
3021         pgd_t pgd;
3022         p4d_t p4d;
3023         pud_t pud;
3024         pmd_t pmd;
3025
3026         /*
3027          * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
3028          * is not solely for performance, it's also necessary to avoid the
3029          * "writable" check in __gfn_to_hva_many(), which will always fail on
3030          * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
3031          * page fault steps have already verified the guest isn't writing a
3032          * read-only memslot.
3033          */
3034         hva = __gfn_to_hva_memslot(slot, gfn);
3035
3036         /*
3037          * Disable IRQs to prevent concurrent tear down of host page tables,
3038          * e.g. if the primary MMU promotes a P*D to a huge page and then frees
3039          * the original page table.
3040          */
3041         local_irq_save(flags);
3042
3043         /*
3044          * Read each entry once.  As above, a non-leaf entry can be promoted to
3045          * a huge page _during_ this walk.  Re-reading the entry could send the
3046          * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
3047          * value) and then p*d_offset() walks into the target huge page instead
3048          * of the old page table (sees the new value).
3049          */
3050         pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
3051         if (pgd_none(pgd))
3052                 goto out;
3053
3054         p4d = READ_ONCE(*p4d_offset(&pgd, hva));
3055         if (p4d_none(p4d) || !p4d_present(p4d))
3056                 goto out;
3057
3058         pud = READ_ONCE(*pud_offset(&p4d, hva));
3059         if (pud_none(pud) || !pud_present(pud))
3060                 goto out;
3061
3062         if (pud_leaf(pud)) {
3063                 level = PG_LEVEL_1G;
3064                 goto out;
3065         }
3066
3067         pmd = READ_ONCE(*pmd_offset(&pud, hva));
3068         if (pmd_none(pmd) || !pmd_present(pmd))
3069                 goto out;
3070
3071         if (pmd_leaf(pmd))
3072                 level = PG_LEVEL_2M;
3073
3074 out:
3075         local_irq_restore(flags);
3076         return level;
3077 }
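
/*
 * Hedged usage sketch for the first rule in the contract above (the caller
 * and the mmu_seq snapshot are hypothetical, not code from this file): the
 * level is looked up without mmu_lock and re-validated against the MMU
 * notifier before being consumed.
 *
 *      level = host_pfn_mapping_level(kvm, gfn, slot);
 *
 *      write_lock(&kvm->mmu_lock);
 *      if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
 *              write_unlock(&kvm->mmu_lock);
 *              return;         // retry the fault, the hva may have changed
 *      }
 *      // 'level' may now be consumed while mmu_lock is held
 *      write_unlock(&kvm->mmu_lock);
 */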
3078
3079 static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
3080                                        const struct kvm_memory_slot *slot,
3081                                        gfn_t gfn, int max_level, bool is_private)
3082 {
3083         struct kvm_lpage_info *linfo;
3084         int host_level;
3085
3086         max_level = min(max_level, max_huge_page_level);
3087         for ( ; max_level > PG_LEVEL_4K; max_level--) {
3088                 linfo = lpage_info_slot(gfn, slot, max_level);
3089                 if (!linfo->disallow_lpage)
3090                         break;
3091         }
3092
3093         if (is_private)
3094                 return max_level;
3095
3096         if (max_level == PG_LEVEL_4K)
3097                 return PG_LEVEL_4K;
3098
3099         host_level = host_pfn_mapping_level(kvm, gfn, slot);
3100         return min(host_level, max_level);
3101 }
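
/*
 * Worked example (illustrative values): if lpage_info allows a 1G mapping
 * for the gfn but the primary MMU maps the backing hva with only a 2M PMD,
 * host_pfn_mapping_level() returns PG_LEVEL_2M and the result is
 * min(PG_LEVEL_2M, PG_LEVEL_1G) == PG_LEVEL_2M.  For private gfns the host
 * userspace mapping is irrelevant, so only the lpage_info walk caps the
 * level.
 */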
3102
3103 int kvm_mmu_max_mapping_level(struct kvm *kvm,
3104                               const struct kvm_memory_slot *slot, gfn_t gfn)
3105 {
3106         bool is_private = kvm_slot_can_be_private(slot) &&
3107                           kvm_mem_is_private(kvm, gfn);
3108
3109         return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
3110 }
3111
3112 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3113 {
3114         struct kvm_memory_slot *slot = fault->slot;
3115         kvm_pfn_t mask;
3116
3117         fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3118
3119         if (unlikely(fault->max_level == PG_LEVEL_4K))
3120                 return;
3121
3122         if (is_error_noslot_pfn(fault->pfn))
3123                 return;
3124
3125         if (kvm_slot_dirty_track_enabled(slot))
3126                 return;
3127
3128         /*
3129          * Enforce the iTLB multihit workaround after capturing the requested
3130          * level, which will be used to do precise, accurate accounting.
3131          */
3132         fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
3133                                                        fault->gfn, fault->max_level,
3134                                                        fault->is_private);
3135         if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
3136                 return;
3137
3138         /*
3139          * mmu_invalidate_retry() was successful and mmu_lock is held, so
3140          * the pmd can't be split from under us.
3141          */
3142         fault->goal_level = fault->req_level;
3143         mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
3144         VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3145         fault->pfn &= ~mask;
3146 }
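
/*
 * Worked example for the alignment above (illustrative numbers): with
 * fault->goal_level == PG_LEVEL_2M, KVM_PAGES_PER_HPAGE() is 512 and
 * mask == 0x1ff.  For gfn == 0x10345 and pfn == 0xabd45 the low 9 bits of
 * both are 0x145 (as the VM_BUG_ON() asserts), and
 *
 *      pfn &= ~0x1ff;          // pfn == 0xabc00
 *
 * i.e. the pfn is rounded down to the 2M-aligned frame backing the whole
 * huge page.
 */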
3147
3148 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
3149 {
3150         if (cur_level > PG_LEVEL_4K &&
3151             cur_level == fault->goal_level &&
3152             is_shadow_present_pte(spte) &&
3153             !is_large_pte(spte) &&
3154             spte_to_child_sp(spte)->nx_huge_page_disallowed) {
3155                 /*
3156                  * A small SPTE exists for this pfn, but FNAME(fetch),
3157                  * direct_map(), or kvm_tdp_mmu_map() would like to create a
3158                  * large PTE instead: just force them to go down another level,
3159                  * patching the next 9 bits of the address back into the pfn
3160                  * for them.
3161                  */
3162                 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
3163                                 KVM_PAGES_PER_HPAGE(cur_level - 1);
3164                 fault->pfn |= fault->gfn & page_mask;
3165                 fault->goal_level--;
3166         }
3167 }
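
/*
 * Worked example (illustrative): for cur_level == PG_LEVEL_2M,
 *
 *      page_mask = 512 - 1 = 0x1ff
 *
 * which is exactly the set of gfn bits that kvm_mmu_hugepage_adjust()
 * cleared from the pfn.  ORing them back in and decrementing goal_level to
 * PG_LEVEL_4K makes the walk install a 4K translation for the precise gfn
 * instead of the disallowed huge page.
 */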
3168
3169 static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3170 {
3171         struct kvm_shadow_walk_iterator it;
3172         struct kvm_mmu_page *sp;
3173         int ret;
3174         gfn_t base_gfn = fault->gfn;
3175
3176         kvm_mmu_hugepage_adjust(vcpu, fault);
3177
3178         trace_kvm_mmu_spte_requested(fault);
3179         for_each_shadow_entry(vcpu, fault->addr, it) {
3180                 /*
3181                  * We cannot overwrite existing page tables with an NX
3182                  * large page, as the leaf could be executable.
3183                  */
3184                 if (fault->nx_huge_page_workaround_enabled)
3185                         disallowed_hugepage_adjust(fault, *it.sptep, it.level);
3186
3187                 base_gfn = gfn_round_for_level(fault->gfn, it.level);
3188                 if (it.level == fault->goal_level)
3189                         break;
3190
3191                 sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
3192                 if (sp == ERR_PTR(-EEXIST))
3193                         continue;
3194
3195                 link_shadow_page(vcpu, it.sptep, sp);
3196                 if (fault->huge_page_disallowed)
3197                         account_nx_huge_page(vcpu->kvm, sp,
3198                                              fault->req_level >= it.level);
3199         }
3200
3201         if (WARN_ON_ONCE(it.level != fault->goal_level))
3202                 return -EFAULT;
3203
3204         ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
3205                            base_gfn, fault->pfn, fault);
3206         if (ret == RET_PF_SPURIOUS)
3207                 return ret;
3208
3209         direct_pte_prefetch(vcpu, it.sptep);
3210         return ret;
3211 }
3212
3213 static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
3214 {
3215         unsigned long hva = gfn_to_hva_memslot(slot, gfn);
3216
3217         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
3218 }
3219
3220 static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3221 {
3222         if (is_sigpending_pfn(fault->pfn)) {
3223                 kvm_handle_signal_exit(vcpu);
3224                 return -EINTR;
3225         }
3226
3227         /*
3228          * Do not cache the mmio info caused by writing the readonly gfn
3229          * into the spte, otherwise a read access on the readonly gfn can
3230          * also cause an mmio page fault and be treated as mmio access.
3231          */
3232         if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
3233                 return RET_PF_EMULATE;
3234
3235         if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
3236                 kvm_send_hwpoison_signal(fault->slot, fault->gfn);
3237                 return RET_PF_RETRY;
3238         }
3239
3240         return -EFAULT;
3241 }
3242
3243 static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
3244                                    struct kvm_page_fault *fault,
3245                                    unsigned int access)
3246 {
3247         gva_t gva = fault->is_tdp ? 0 : fault->addr;
3248
3249         if (fault->is_private) {
3250                 kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
3251                 return -EFAULT;
3252         }
3253
3254         vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3255                              access & shadow_mmio_access_mask);
3256
3257         fault->slot = NULL;
3258         fault->pfn = KVM_PFN_NOSLOT;
3259         fault->map_writable = false;
3260
3261         /*
3262          * If MMIO caching is disabled, emulate immediately without
3263          * touching the shadow page tables as attempting to install an
3264          * MMIO SPTE will just be an expensive nop.
3265          */
3266         if (unlikely(!enable_mmio_caching))
3267                 return RET_PF_EMULATE;
3268
3269         /*
3270          * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR;
3271          * any guest that generates such gfns is running nested and is being
3272          * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
3273          * only if L1's MAXPHYADDR is inaccurate with respect to the
3274          * hardware's).
3275          */
3276         if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
3277                 return RET_PF_EMULATE;
3278
3279         return RET_PF_CONTINUE;
3280 }
3281
3282 static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
3283 {
3284         /*
3285          * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3286          * reach the common page fault handler if the SPTE has an invalid MMIO
3287          * generation number.  Refreshing the MMIO generation needs to go down
3288          * the slow path.  Note, EPT Misconfigs do NOT set the PRESENT flag!
3289          */
3290         if (fault->rsvd)
3291                 return false;
3292
3293         /*
3294          * For hardware-protected VMs, certain conditions like attempting to
3295          * perform a write to a page which is not in the state that the guest
3296          * expects it to be in can result in a nested/extended #PF. In this
3297          * case, the below code might misconstrue this situation as being the
3298          * result of a write-protected access, and treat it as a spurious case
3299          * rather than taking any action to satisfy the real source of the #PF
3300          * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
3301          * guest spinning on a #PF indefinitely, so don't attempt the fast path
3302          * in this case.
3303          *
3304          * Note that the kvm_mem_is_private() check might race with an
3305          * attribute update, but this will either result in the guest spinning
3306          * on RET_PF_SPURIOUS until the update completes, or an actual spurious
3307          * case might go down the slow path. Either case will resolve itself.
3308          */
3309         if (kvm->arch.has_private_mem &&
3310             fault->is_private != kvm_mem_is_private(kvm, fault->gfn))
3311                 return false;
3312
3313         /*
3314          * #PF can be fast if:
3315          *
3316          * 1. The shadow page table entry is not present and A/D bits are
3317          *    disabled _by KVM_, which could mean that the fault is potentially
3318          *    caused by access tracking (if enabled).  If A/D bits are enabled
3319          *    by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3320          *    bits for L2 and employ access tracking, but the fast page fault
3321          *    mechanism only supports direct MMUs.
3322          * 2. The shadow page table entry is present, the access is a write,
3323          *    and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3324          *    the fault was caused by a write-protection violation.  If the
3325          *    SPTE is MMU-writable (determined later), the fault can be fixed
3326          *    by setting the Writable bit, which can be done out of mmu_lock.
3327          */
3328         if (!fault->present)
3329                 return !kvm_ad_enabled;
3330
3331         /*
3332          * Note, instruction fetches and writes are mutually exclusive, ignore
3333          * the "exec" flag.
3334          */
3335         return fault->write;
3336 }
3337
3338 /*
3339  * Returns true if the SPTE was fixed successfully. Otherwise,
3340  * someone else modified the SPTE from its original value.
3341  */
3342 static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
3343                                     struct kvm_page_fault *fault,
3344                                     u64 *sptep, u64 old_spte, u64 new_spte)
3345 {
3346         /*
3347          * Theoretically we could also set the dirty bit (and flush the TLB)
3348          * here in order to eliminate unnecessary PML logging. See comments in
3349          * set_spte. But fast_page_fault is very unlikely to happen with PML
3350          * enabled, so we do not do this. This might result in the same GPA
3351          * being logged in the PML buffer again when the write really happens,
3352          * and eventually in mark_page_dirty being called twice. But that does
3353          * no harm. This also avoids the TLB flush needed after setting the
3354          * dirty bit, so non-PML cases won't be impacted.
3355          *
3356          * Compare with make_spte() where instead shadow_dirty_mask is set.
3357          */
3358         if (!try_cmpxchg64(sptep, &old_spte, new_spte))
3359                 return false;
3360
3361         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
3362                 mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
3363
3364         return true;
3365 }
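
/*
 * For reference, a simplified sketch of what the try_cmpxchg64() above does
 * atomically (ignoring the LOCK prefix and memory-ordering details):
 *
 *      if (*sptep == old_spte) {
 *              *sptep = new_spte;
 *              return true;
 *      }
 *      old_spte = *sptep;      // written back through &old_spte
 *      return false;
 *
 * A false return means another task changed the SPTE first, and the fast
 * path must re-read the SPTE or give up.
 */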
3366
3367 /*
3368  * Returns the last level spte pointer of the shadow page walk for the given
3369  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
3370  * walk could be performed, returns NULL and *spte does not contain valid data.
3371  *
3372  * Contract:
3373  *  - Must be called between walk_shadow_page_lockless_{begin,end}.
3374  *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
3375  */
3376 static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
3377 {
3378         struct kvm_shadow_walk_iterator iterator;
3379         u64 old_spte;
3380         u64 *sptep = NULL;
3381
3382         for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3383                 sptep = iterator.sptep;
3384                 *spte = old_spte;
3385         }
3386
3387         return sptep;
3388 }
3389
3390 /*
3391  * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3392  */
3393 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
3394 {
3395         struct kvm_mmu_page *sp;
3396         int ret = RET_PF_INVALID;
3397         u64 spte;
3398         u64 *sptep;
3399         uint retry_count = 0;
3400
3401         if (!page_fault_can_be_fast(vcpu->kvm, fault))
3402                 return ret;
3403
3404         walk_shadow_page_lockless_begin(vcpu);
3405
3406         do {
3407                 u64 new_spte;
3408
3409                 if (tdp_mmu_enabled)
3410                         sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte);
3411                 else
3412                         sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
3413
3414                 /*
3415                  * It's entirely possible for the mapping to have been zapped
3416                  * by a different task, but the root page should always be
3417                  * available as the vCPU holds a reference to its root(s).
3418                  */
3419                 if (WARN_ON_ONCE(!sptep))
3420                         spte = FROZEN_SPTE;
3421
3422                 if (!is_shadow_present_pte(spte))
3423                         break;
3424
3425                 sp = sptep_to_sp(sptep);
3426                 if (!is_last_spte(spte, sp->role.level))
3427                         break;
3428
3429                 /*
3430                  * Check whether the memory access that caused the fault would
3431                  * still cause it if it were to be performed right now. If not,
3432                  * then this is a spurious fault caused by TLB lazily flushed,
3433                  * or some other CPU has already fixed the PTE after the
3434                  * current CPU took the fault.
3435                  *
3436                  * Need not check the access of upper level table entries since
3437                  * they are always ACC_ALL.
3438                  */
3439                 if (is_access_allowed(fault, spte)) {
3440                         ret = RET_PF_SPURIOUS;
3441                         break;
3442                 }
3443
3444                 new_spte = spte;
3445
3446                 /*
3447                  * KVM only supports fixing page faults outside of MMU lock for
3448                  * direct MMUs, nested MMUs are always indirect, and KVM always
3449                  * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
3450                  * enabled, the SPTE can't be an access-tracked SPTE.
3451                  */
3452                 if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
3453                         new_spte = restore_acc_track_spte(new_spte) |
3454                                    shadow_accessed_mask;
3455
3456                 /*
3457                  * To keep things simple, only SPTEs that are MMU-writable can
3458                  * be made fully writable outside of mmu_lock, e.g. only SPTEs
3459                  * that were write-protected for dirty-logging or access
3460                  * tracking are handled here.  Don't bother checking if the
3461                  * SPTE is writable to prioritize running with A/D bits enabled.
3462                  * The is_access_allowed() check above handles the common case
3463                  * of the fault being spurious, and the SPTE is known to be
3464                  * shadow-present, i.e. except for access tracking restoration
3465                  * making the new SPTE writable, the check is wasteful.
3466                  */
3467                 if (fault->write && is_mmu_writable_spte(spte)) {
3468                         new_spte |= PT_WRITABLE_MASK;
3469
3470                         /*
3471                          * Do not fix write-permission on the large spte when
3472                          * dirty logging is enabled. Since we only dirty the
3473                          * first page into the dirty-bitmap in
3474                          * fast_pf_fix_direct_spte(), other pages are missed
3475                          * if the slot has dirty logging enabled.
3476                          *
3477                          * Instead, we let the slow page fault path create a
3478                          * normal spte to fix the access.
3479                          */
3480                         if (sp->role.level > PG_LEVEL_4K &&
3481                             kvm_slot_dirty_track_enabled(fault->slot))
3482                                 break;
3483                 }
3484
3485                 /* Verify that the fault can be handled in the fast path */
3486                 if (new_spte == spte ||
3487                     !is_access_allowed(fault, new_spte))
3488                         break;
3489
3490                 /*
3491                  * Currently, fast page fault only works for direct mapping
3492                  * since the gfn is not stable for indirect shadow page. See
3493                  * Documentation/virt/kvm/locking.rst to get more detail.
3494                  */
3495                 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
3496                         ret = RET_PF_FIXED;
3497                         break;
3498                 }
3499
3500                 if (++retry_count > 4) {
3501                         pr_warn_once("Fast #PF retrying more than 4 times.\n");
3502                         break;
3503                 }
3504
3505         } while (true);
3506
3507         trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
3508         walk_shadow_page_lockless_end(vcpu);
3509
3510         if (ret != RET_PF_INVALID)
3511                 vcpu->stat.pf_fast++;
3512
3513         return ret;
3514 }
3515
3516 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3517                                struct list_head *invalid_list)
3518 {
3519         struct kvm_mmu_page *sp;
3520
3521         if (!VALID_PAGE(*root_hpa))
3522                 return;
3523
3524         sp = root_to_sp(*root_hpa);
3525         if (WARN_ON_ONCE(!sp))
3526                 return;
3527
3528         if (is_tdp_mmu_page(sp)) {
3529                 lockdep_assert_held_read(&kvm->mmu_lock);
3530                 kvm_tdp_mmu_put_root(kvm, sp);
3531         } else {
3532                 lockdep_assert_held_write(&kvm->mmu_lock);
3533                 if (!--sp->root_count && sp->role.invalid)
3534                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3535         }
3536
3537         *root_hpa = INVALID_PAGE;
3538 }
3539
3540 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3541 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
3542                         ulong roots_to_free)
3543 {
3544         bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
3545         int i;
3546         LIST_HEAD(invalid_list);
3547         bool free_active_root;
3548
3549         WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
3550
3551         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3552
3553         /* Before acquiring the MMU lock, see if we need to do any real work. */
3554         free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3555                 && VALID_PAGE(mmu->root.hpa);
3556
3557         if (!free_active_root) {
3558                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3559                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3560                             VALID_PAGE(mmu->prev_roots[i].hpa))
3561                                 break;
3562
3563                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3564                         return;
3565         }
3566
3567         if (is_tdp_mmu)
3568                 read_lock(&kvm->mmu_lock);
3569         else
3570                 write_lock(&kvm->mmu_lock);
3571
3572         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3573                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3574                         mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
3575                                            &invalid_list);
3576
3577         if (free_active_root) {
3578                 if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
3579                         /* Nothing to cleanup for dummy roots. */
3580                 } else if (root_to_sp(mmu->root.hpa)) {
3581                         mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
3582                 } else if (mmu->pae_root) {
3583                         for (i = 0; i < 4; ++i) {
3584                                 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3585                                         continue;
3586
3587                                 mmu_free_root_page(kvm, &mmu->pae_root[i],
3588                                                    &invalid_list);
3589                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3590                         }
3591                 }
3592                 mmu->root.hpa = INVALID_PAGE;
3593                 mmu->root.pgd = 0;
3594         }
3595
3596         if (is_tdp_mmu) {
3597                 read_unlock(&kvm->mmu_lock);
3598                 WARN_ON_ONCE(!list_empty(&invalid_list));
3599         } else {
3600                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
3601                 write_unlock(&kvm->mmu_lock);
3602         }
3603 }
3604 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
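
/*
 * Hedged usage sketch (hypothetical caller): roots_to_free is a bitmask of
 * KVM_MMU_ROOT_* flags, e.g. to drop only the active root and cached
 * previous root 1:
 *
 *      kvm_mmu_free_roots(kvm, mmu,
 *                         KVM_MMU_ROOT_CURRENT | KVM_MMU_ROOT_PREVIOUS(1));
 *
 * whereas KVM_MMU_ROOTS_ALL frees the active root and every prev_roots[]
 * entry.
 */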
3605
3606 void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
3607 {
3608         unsigned long roots_to_free = 0;
3609         struct kvm_mmu_page *sp;
3610         hpa_t root_hpa;
3611         int i;
3612
3613         /*
3614          * This should not be called while L2 is active; L2 can't invalidate
3615          * _only_ its own roots, e.g. INVVPID unconditionally exits.
3616          */
3617         WARN_ON_ONCE(mmu->root_role.guest_mode);
3618
3619         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3620                 root_hpa = mmu->prev_roots[i].hpa;
3621                 if (!VALID_PAGE(root_hpa))
3622                         continue;
3623
3624                 sp = root_to_sp(root_hpa);
3625                 if (!sp || sp->role.guest_mode)
3626                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
3627         }
3628
3629         kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3630 }
3631 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3632
3633 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
3634                             u8 level)
3635 {
3636         union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
3637         struct kvm_mmu_page *sp;
3638
3639         role.level = level;
3640         role.quadrant = quadrant;
3641
3642         WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3643         WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
3644
3645         sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
3646         ++sp->root_count;
3647
3648         return __pa(sp->spt);
3649 }
3650
3651 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3652 {
3653         struct kvm_mmu *mmu = vcpu->arch.mmu;
3654         u8 shadow_root_level = mmu->root_role.level;
3655         hpa_t root;
3656         unsigned i;
3657         int r;
3658
3659         if (tdp_mmu_enabled)
3660                 return kvm_tdp_mmu_alloc_root(vcpu);
3661
3662         write_lock(&vcpu->kvm->mmu_lock);
3663         r = make_mmu_pages_available(vcpu);
3664         if (r < 0)
3665                 goto out_unlock;
3666
3667         if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3668                 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
3669                 mmu->root.hpa = root;
3670         } else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3671                 if (WARN_ON_ONCE(!mmu->pae_root)) {
3672                         r = -EIO;
3673                         goto out_unlock;
3674                 }
3675
3676                 for (i = 0; i < 4; ++i) {
3677                         WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3678
3679                         root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0,
3680                                               PT32_ROOT_LEVEL);
3681                         mmu->pae_root[i] = root | PT_PRESENT_MASK |
3682                                            shadow_me_value;
3683                 }
3684                 mmu->root.hpa = __pa(mmu->pae_root);
3685         } else {
3686                 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level);
3687                 r = -EIO;
3688                 goto out_unlock;
3689         }
3690
3691         /* root.pgd is ignored for direct MMUs. */
3692         mmu->root.pgd = 0;
3693 out_unlock:
3694         write_unlock(&vcpu->kvm->mmu_lock);
3695         return r;
3696 }
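
/*
 * Worked example for the PAE root gfns above: with PAGE_SHIFT == 12,
 * i << (30 - PAGE_SHIFT) == i << 18, so the four PAE page directories are
 * rooted at gfns 0x0, 0x40000, 0x80000 and 0xc0000, i.e. each one covers a
 * 1 GiB (2^18 page) slice of the guest physical address space.
 */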
3697
3698 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3699 {
3700         struct kvm_memslots *slots;
3701         struct kvm_memory_slot *slot;
3702         int r = 0, i, bkt;
3703
3704         /*
3705          * Check if this is the first shadow root being allocated before
3706          * taking the lock.
3707          */
3708         if (kvm_shadow_root_allocated(kvm))
3709                 return 0;
3710
3711         mutex_lock(&kvm->slots_arch_lock);
3712
3713         /* Recheck, under the lock, whether this is the first shadow root. */
3714         if (kvm_shadow_root_allocated(kvm))
3715                 goto out_unlock;
3716
3717         /*
3718          * Check if anything actually needs to be allocated, e.g. all metadata
3719          * will be allocated upfront if TDP is disabled.
3720          */
3721         if (kvm_memslots_have_rmaps(kvm) &&
3722             kvm_page_track_write_tracking_enabled(kvm))
3723                 goto out_success;
3724
3725         for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
3726                 slots = __kvm_memslots(kvm, i);
3727                 kvm_for_each_memslot(slot, bkt, slots) {
3728                         /*
3729                          * Both of these functions are no-ops if the target is
3730                          * already allocated, so unconditionally calling both
3731                          * is safe.  Intentionally do NOT free allocations on
3732                          * failure to avoid having to track which allocations
3733                          * were made now versus when the memslot was created.
3734                          * The metadata is guaranteed to be freed when the slot
3735                          * is freed, and will be kept/used if userspace retries
3736                          * KVM_RUN instead of killing the VM.
3737                          */
3738                         r = memslot_rmap_alloc(slot, slot->npages);
3739                         if (r)
3740                                 goto out_unlock;
3741                         r = kvm_page_track_write_tracking_alloc(slot);
3742                         if (r)
3743                                 goto out_unlock;
3744                 }
3745         }
3746
3747         /*
3748          * Ensure that shadow_root_allocated becomes true strictly after
3749          * all the related pointers are set.
3750          */
3751 out_success:
3752         smp_store_release(&kvm->arch.shadow_root_allocated, true);
3753
3754 out_unlock:
3755         mutex_unlock(&kvm->slots_arch_lock);
3756         return r;
3757 }
3758
3759 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3760 {
3761         struct kvm_mmu *mmu = vcpu->arch.mmu;
3762         u64 pdptrs[4], pm_mask;
3763         gfn_t root_gfn, root_pgd;
3764         int quadrant, i, r;
3765         hpa_t root;
3766
3767         root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
3768         root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
3769
3770         if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
3771                 mmu->root.hpa = kvm_mmu_get_dummy_root();
3772                 return 0;
3773         }
3774
3775         /*
3776          * On SVM, reading PDPTRs might access guest memory, which might fault
3777          * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
3778          */
3779         if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3780                 for (i = 0; i < 4; ++i) {
3781                         pdptrs[i] = mmu->get_pdptr(vcpu, i);
3782                         if (!(pdptrs[i] & PT_PRESENT_MASK))
3783                                 continue;
3784
3785                         if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
3786                                 pdptrs[i] = 0;
3787                 }
3788         }
3789
3790         r = mmu_first_shadow_root_alloc(vcpu->kvm);
3791         if (r)
3792                 return r;
3793
3794         write_lock(&vcpu->kvm->mmu_lock);
3795         r = make_mmu_pages_available(vcpu);
3796         if (r < 0)
3797                 goto out_unlock;
3798
3799         /*
3800          * Do we shadow a long mode page table? If so we need to
3801          * write-protect the guests page table root.
3802          */
3803         if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3804                 root = mmu_alloc_root(vcpu, root_gfn, 0,
3805                                       mmu->root_role.level);
3806                 mmu->root.hpa = root;
3807                 goto set_root_pgd;
3808         }
3809
3810         if (WARN_ON_ONCE(!mmu->pae_root)) {
3811                 r = -EIO;
3812                 goto out_unlock;
3813         }
3814
3815         /*
3816          * We shadow a 32-bit page table. This may be a legacy 2-level
3817          * or a PAE 3-level page table. In either case we need to be aware that
3818          * the shadow page table may be a PAE or a long mode page table.
3819          */
3820         pm_mask = PT_PRESENT_MASK | shadow_me_value;
3821         if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
3822                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3823
3824                 if (WARN_ON_ONCE(!mmu->pml4_root)) {
3825                         r = -EIO;
3826                         goto out_unlock;
3827                 }
3828                 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
3829
3830                 if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3831                         if (WARN_ON_ONCE(!mmu->pml5_root)) {
3832                                 r = -EIO;
3833                                 goto out_unlock;
3834                         }
3835                         mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
3836                 }
3837         }
3838
3839         for (i = 0; i < 4; ++i) {
3840                 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3841
3842                 if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3843                         if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3844                                 mmu->pae_root[i] = INVALID_PAE_ROOT;
3845                                 continue;
3846                         }
3847                         root_gfn = pdptrs[i] >> PAGE_SHIFT;
3848                 }
3849
3850                 /*
3851                  * If shadowing 32-bit non-PAE page tables, each PAE page
3852                  * directory maps one quarter of the guest's non-PAE page
3853                  * directory. Otherwise each PAE page directory shadows one guest
3854                  * PAE page directory, so the quadrant should be 0.
3855                  */
3856                 quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
3857
3858                 root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
3859                 mmu->pae_root[i] = root | pm_mask;
3860         }
3861
3862         if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3863                 mmu->root.hpa = __pa(mmu->pml5_root);
3864         else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3865                 mmu->root.hpa = __pa(mmu->pml4_root);
3866         else
3867                 mmu->root.hpa = __pa(mmu->pae_root);
3868
3869 set_root_pgd:
3870         mmu->root.pgd = root_pgd;
3871 out_unlock:
3872         write_unlock(&vcpu->kvm->mmu_lock);
3873
3874         return r;
3875 }
3876
3877 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3878 {
3879         struct kvm_mmu *mmu = vcpu->arch.mmu;
3880         bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3881         u64 *pml5_root = NULL;
3882         u64 *pml4_root = NULL;
3883         u64 *pae_root;
3884
3885         /*
3886          * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3887          * tables are allocated and initialized at root creation as there is no
3888          * equivalent level in the guest's NPT to shadow.  Allocate the tables
3889          * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3890          */
3891         if (mmu->root_role.direct ||
3892             mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
3893             mmu->root_role.level < PT64_ROOT_4LEVEL)
3894                 return 0;
3895
3896         /*
3897          * NPT, the only paging mode that uses this horror, uses a fixed number
3898          * of levels for the shadow page tables, e.g. all MMUs are 4-level or
3899          * all MMUs are 5-level.  Thus, this can safely require that pml5_root
3900          * is allocated if the other roots are valid and pml5 is needed, as any
3901          * prior MMU would also have required pml5.
3902          */
3903         if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
3904                 return 0;
3905
3906         /*
3907          * The special roots should always be allocated in concert.  Yell and
3908          * bail if KVM ends up in a state where only one of the roots is valid.
3909          */
3910         if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
3911                          (need_pml5 && mmu->pml5_root)))
3912                 return -EIO;
3913
3914         /*
3915          * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3916          * doesn't need to be decrypted.
3917          */
3918         pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3919         if (!pae_root)
3920                 return -ENOMEM;
3921
3922 #ifdef CONFIG_X86_64
3923         pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3924         if (!pml4_root)
3925                 goto err_pml4;
3926
3927         if (need_pml5) {
3928                 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3929                 if (!pml5_root)
3930                         goto err_pml5;
3931         }
3932 #endif
3933
3934         mmu->pae_root = pae_root;
3935         mmu->pml4_root = pml4_root;
3936         mmu->pml5_root = pml5_root;
3937
3938         return 0;
3939
3940 #ifdef CONFIG_X86_64
3941 err_pml5:
3942         free_page((unsigned long)pml4_root);
3943 err_pml4:
3944         free_page((unsigned long)pae_root);
3945         return -ENOMEM;
3946 #endif
3947 }
3948
3949 static bool is_unsync_root(hpa_t root)
3950 {
3951         struct kvm_mmu_page *sp;
3952
3953         if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
3954                 return false;
3955
3956         /*
3957          * The read barrier orders the CPU's read of SPTE.W during the page table
3958          * walk before the reads of sp->unsync/sp->unsync_children here.
3959          *
3960          * Even if another CPU was marking the SP as unsync-ed simultaneously,
3961          * any guest page table changes are not guaranteed to be visible anyway
3962          * until this VCPU issues a TLB flush strictly after those changes are
3963          * made.  We only need to ensure that the other CPU sets these flags
3964          * before any actual changes to the page tables are made.  The comments
3965          * in mmu_try_to_unsync_pages() describe what could go wrong if this
3966          * requirement isn't satisfied.
3967          */
3968         smp_rmb();
3969         sp = root_to_sp(root);
3970
3971         /*
3972          * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
3973          * PDPTEs for a given PAE root need to be synchronized individually.
3974          */
3975         if (WARN_ON_ONCE(!sp))
3976                 return false;
3977
3978         if (sp->unsync || sp->unsync_children)
3979                 return true;
3980
3981         return false;
3982 }
3983
3984 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3985 {
3986         int i;
3987         struct kvm_mmu_page *sp;
3988
3989         if (vcpu->arch.mmu->root_role.direct)
3990                 return;
3991
3992         if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
3993                 return;
3994
3995         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3996
3997         if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3998                 hpa_t root = vcpu->arch.mmu->root.hpa;
3999
4000                 if (!is_unsync_root(root))
4001                         return;
4002
4003                 sp = root_to_sp(root);
4004
4005                 write_lock(&vcpu->kvm->mmu_lock);
4006                 mmu_sync_children(vcpu, sp, true);
4007                 write_unlock(&vcpu->kvm->mmu_lock);
4008                 return;
4009         }
4010
4011         write_lock(&vcpu->kvm->mmu_lock);
4012
4013         for (i = 0; i < 4; ++i) {
4014                 hpa_t root = vcpu->arch.mmu->pae_root[i];
4015
4016                 if (IS_VALID_PAE_ROOT(root)) {
4017                         sp = spte_to_child_sp(root);
4018                         mmu_sync_children(vcpu, sp, true);
4019                 }
4020         }
4021
4022         write_unlock(&vcpu->kvm->mmu_lock);
4023 }
4024
4025 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
4026 {
4027         unsigned long roots_to_free = 0;
4028         int i;
4029
4030         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4031                 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
4032                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
4033
4034         /* sync prev_roots by simply freeing them */
4035         kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
4036 }
4037
4038 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4039                                   gpa_t vaddr, u64 access,
4040                                   struct x86_exception *exception)
4041 {
4042         if (exception)
4043                 exception->error_code = 0;
4044         return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
4045 }
4046
4047 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4048 {
4049         /*
4050          * A nested guest cannot use the MMIO cache if it is using nested
4051          * page tables, because cr2 is a nGPA while the cache stores GPAs.
4052          */
4053         if (mmu_is_nested(vcpu))
4054                 return false;
4055
4056         if (direct)
4057                 return vcpu_match_mmio_gpa(vcpu, addr);
4058
4059         return vcpu_match_mmio_gva(vcpu, addr);
4060 }
4061
4062 /*
4063  * Return the level of the lowest level SPTE added to sptes.
4064  * That SPTE may be non-present.
4065  *
4066  * Must be called between walk_shadow_page_lockless_{begin,end}.
4067  */
4068 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
4069 {
4070         struct kvm_shadow_walk_iterator iterator;
4071         int leaf = -1;
4072         u64 spte;
4073
4074         for (shadow_walk_init(&iterator, vcpu, addr),
4075              *root_level = iterator.level;
4076              shadow_walk_okay(&iterator);
4077              __shadow_walk_next(&iterator, spte)) {
4078                 leaf = iterator.level;
4079                 spte = mmu_spte_get_lockless(iterator.sptep);
4080
4081                 sptes[leaf] = spte;
4082         }
4083
4084         return leaf;
4085 }
4086
4087 static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
4088                               int *root_level)
4089 {
4090         int leaf;
4091
4092         walk_shadow_page_lockless_begin(vcpu);
4093
4094         if (is_tdp_mmu_active(vcpu))
4095                 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level);
4096         else
4097                 leaf = get_walk(vcpu, addr, sptes, root_level);
4098
4099         walk_shadow_page_lockless_end(vcpu);
4100         return leaf;
4101 }
4102
4103 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */
4104 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4105 {
4106         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
4107         struct rsvd_bits_validate *rsvd_check;
4108         int root, leaf, level;
4109         bool reserved = false;
4110
4111         leaf = get_sptes_lockless(vcpu, addr, sptes, &root);
4112         if (unlikely(leaf < 0)) {
4113                 *sptep = 0ull;
4114                 return reserved;
4115         }
4116
4117         *sptep = sptes[leaf];
4118
4119         /*
4120          * Skip reserved bits checks on the terminal leaf if it's not a valid
4121          * SPTE.  Note, this also (intentionally) skips MMIO SPTEs, which, by
4122          * design, always have reserved bits set.  The purpose of the checks is
4123          * to detect reserved bits on non-MMIO SPTEs, i.e. buggy SPTEs.
4124          */
4125         if (!is_shadow_present_pte(sptes[leaf]))
4126                 leaf++;
4127
4128         rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4129
4130         for (level = root; level >= leaf; level--)
4131                 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
4132
4133         if (reserved) {
4134                 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4135                        __func__, addr);
4136                 for (level = root; level >= leaf; level--)
4137                         pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4138                                sptes[level], level,
4139                                get_rsvd_bits(rsvd_check, sptes[level], level));
4140         }
4141
4142         return reserved;
4143 }
4144
4145 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4146 {
4147         u64 spte;
4148         bool reserved;
4149
4150         if (mmio_info_in_cache(vcpu, addr, direct))
4151                 return RET_PF_EMULATE;
4152
4153         reserved = get_mmio_spte(vcpu, addr, &spte);
4154         if (WARN_ON_ONCE(reserved))
4155                 return -EINVAL;
4156
4157         if (is_mmio_spte(vcpu->kvm, spte)) {
4158                 gfn_t gfn = get_mmio_spte_gfn(spte);
4159                 unsigned int access = get_mmio_spte_access(spte);
4160
4161                 if (!check_mmio_spte(vcpu, spte))
4162                         return RET_PF_INVALID;
4163
4164                 if (direct)
4165                         addr = 0;
4166
4167                 trace_handle_mmio_page_fault(addr, gfn, access);
4168                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4169                 return RET_PF_EMULATE;
4170         }
4171
4172         /*
4173          * If the page table is zapped by other CPUs, let the CPU fault again
4174          * on the address.
4175          */
4176         return RET_PF_RETRY;
4177 }
4178
4179 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4180                                          struct kvm_page_fault *fault)
4181 {
4182         if (unlikely(fault->rsvd))
4183                 return false;
4184
4185         if (!fault->present || !fault->write)
4186                 return false;
4187
4188         /*
4189          * The guest is writing a page that is write-tracked, which cannot
4190          * be fixed by the page fault handler.
4191          */
4192         if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
4193                 return true;
4194
4195         return false;
4196 }
4197
4198 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4199 {
4200         struct kvm_shadow_walk_iterator iterator;
4201         u64 spte;
4202
4203         walk_shadow_page_lockless_begin(vcpu);
4204         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
4205                 clear_sp_write_flooding_count(iterator.sptep);
4206         walk_shadow_page_lockless_end(vcpu);
4207 }
4208
4209 static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4210 {
4211         /* make sure the token value is not 0 */
4212         u32 id = vcpu->arch.apf.id;
4213
4214         if (id << 12 == 0)
4215                 vcpu->arch.apf.id = 1;
4216
4217         return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4218 }
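
/*
 * Worked example (illustrative values): for vcpu_id == 3 and
 * vcpu->arch.apf.id == 7 the token is (7 << 12) | 3 == 0x7003.  Because id
 * is a u32, "id << 12 == 0" whenever the low 20 bits of id are zero (id is
 * 0 initially, or the counter has wrapped around), in which case id is
 * reset to 1 so the generated token can never be 0.
 */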
4219
4220 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
4221                                     struct kvm_page_fault *fault)
4222 {
4223         struct kvm_arch_async_pf arch;
4224
4225         arch.token = alloc_apf_token(vcpu);
4226         arch.gfn = fault->gfn;
4227         arch.error_code = fault->error_code;
4228         arch.direct_map = vcpu->arch.mmu->root_role.direct;
4229         arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
4230
4231         return kvm_setup_async_pf(vcpu, fault->addr,
4232                                   kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
4233 }
4234
4235 void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
4236 {
4237         int r;
4238
4239         if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
4240                 return;
4241
4242         if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
4243               work->wakeup_all)
4244                 return;
4245
4246         r = kvm_mmu_reload(vcpu);
4247         if (unlikely(r))
4248                 return;
4249
4250         if (!vcpu->arch.mmu->root_role.direct &&
4251               work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
4252                 return;
4253
4254         r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
4255                                   true, NULL, NULL);
4256
4257         /*
4258          * Account fixed page faults, otherwise they'll never be counted, but
4259          * ignore stats for all other return types.  Page-ready "faults" aren't
4260          * truly spurious and never trigger emulation.
4261          */
4262         if (r == RET_PF_FIXED)
4263                 vcpu->stat.pf_fixed++;
4264 }
4265
4266 static inline u8 kvm_max_level_for_order(int order)
4267 {
4268         BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
4269
4270         KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
4271                         order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
4272                         order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
4273
4274         if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
4275                 return PG_LEVEL_1G;
4276
4277         if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
4278                 return PG_LEVEL_2M;
4279
4280         return PG_LEVEL_4K;
4281 }
4282
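/*
 * Editor's illustrative sketch, not kernel code: with 4KiB base pages,
 * KVM_HPAGE_GFN_SHIFT() is 0/9/18 for PG_LEVEL_4K/2M/1G, so e.g. an
 * order-9 guest_memfd folio maps at most at 2M and anything below order 9
 * stays at 4K.  The hypothetical helper below restates the cascade above
 * with those literal shifts and levels.
 */
static inline int example_order_to_level(int order)
{
        if (order >= 18)
                return 3;       /* PG_LEVEL_1G */
        if (order >= 9)
                return 2;       /* PG_LEVEL_2M */
        return 1;               /* PG_LEVEL_4K */
}
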
4283 static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
4284                                         u8 max_level, int gmem_order)
4285 {
4286         u8 req_max_level;
4287
4288         if (max_level == PG_LEVEL_4K)
4289                 return PG_LEVEL_4K;
4290
4291         max_level = min(kvm_max_level_for_order(gmem_order), max_level);
4292         if (max_level == PG_LEVEL_4K)
4293                 return PG_LEVEL_4K;
4294
4295         req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
4296         if (req_max_level)
4297                 max_level = min(max_level, req_max_level);
4298
4299         return max_level;
4300 }
4301
4302 static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
4303                                       struct kvm_page_fault *fault, int r)
4304 {
4305         kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page,
4306                                  r == RET_PF_RETRY, fault->map_writable);
4307 }
4308
4309 static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
4310                                        struct kvm_page_fault *fault)
4311 {
4312         int max_order, r;
4313
4314         if (!kvm_slot_can_be_private(fault->slot)) {
4315                 kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4316                 return -EFAULT;
4317         }
4318
4319         r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
4320                              &fault->refcounted_page, &max_order);
4321         if (r) {
4322                 kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4323                 return r;
4324         }
4325
4326         fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
4327         fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
4328                                                          fault->max_level, max_order);
4329
4330         return RET_PF_CONTINUE;
4331 }
4332
4333 static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
4334                                  struct kvm_page_fault *fault)
4335 {
4336         unsigned int foll = fault->write ? FOLL_WRITE : 0;
4337
4338         if (fault->is_private)
4339                 return kvm_mmu_faultin_pfn_private(vcpu, fault);
4340
4341         foll |= FOLL_NOWAIT;
4342         fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
4343                                        &fault->map_writable, &fault->refcounted_page);
4344
4345         /*
4346          * If resolving the page failed because I/O is needed to fault-in the
4347          * page, then either set up an asynchronous #PF to do the I/O, or if
4348          * doing an async #PF isn't possible, retry with I/O allowed.  All
4349          * other failures are terminal, i.e. retrying won't help.
4350          */
4351         if (fault->pfn != KVM_PFN_ERR_NEEDS_IO)
4352                 return RET_PF_CONTINUE;
4353
4354         if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
4355                 trace_kvm_try_async_get_page(fault->addr, fault->gfn);
4356                 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
4357                         trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
4358                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4359                         return RET_PF_RETRY;
4360                 } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
4361                         return RET_PF_RETRY;
4362                 }
4363         }
4364
4365         /*
4366          * Allow gup to bail on pending non-fatal signals when it's also allowed
4367          * to wait for IO.  Note, gup always bails if it is unable to quickly
4368          * get a page and a fatal signal, i.e. SIGKILL, is pending.
4369          */
4370         foll |= FOLL_INTERRUPTIBLE;
4371         foll &= ~FOLL_NOWAIT;
4372         fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
4373                                        &fault->map_writable, &fault->refcounted_page);
4374
4375         return RET_PF_CONTINUE;
4376 }
4377
4378 static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
4379                                struct kvm_page_fault *fault, unsigned int access)
4380 {
4381         struct kvm_memory_slot *slot = fault->slot;
4382         int ret;
4383
4384         /*
4385          * Note that the mmu_invalidate_seq also serves to detect a concurrent
4386          * change in attributes.  is_page_fault_stale() will detect an
4387          * invalidation related to fault->gfn and resume the guest without
4388          * installing a mapping in the page tables.
4389          */
4390         fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4391         smp_rmb();
4392
4393         /*
4394          * Now that we have a snapshot of mmu_invalidate_seq we can check for a
4395          * private vs. shared mismatch.
4396          */
4397         if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
4398                 kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4399                 return -EFAULT;
4400         }
4401
4402         if (unlikely(!slot))
4403                 return kvm_handle_noslot_fault(vcpu, fault, access);
4404
4405         /*
4406          * Retry the page fault if the gfn hit a memslot that is being deleted
4407          * or moved.  This ensures any existing SPTEs for the old memslot will
4408          * be zapped before KVM inserts a new MMIO SPTE for the gfn.
4409          */
4410         if (slot->flags & KVM_MEMSLOT_INVALID)
4411                 return RET_PF_RETRY;
4412
4413         if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
4414                 /*
4415                  * Don't map L1's APIC access page into L2, KVM doesn't support
4416                  * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
4417                  * i.e. the access needs to be emulated.  Emulating access to
4418                  * L1's APIC is also correct if L1 is accelerating L2's own
4419                  * virtual APIC, but for some reason L1 also maps _L1's_ APIC
4420                  * into L2.  Note, vcpu_is_mmio_gpa() always treats access to
4421                  * the APIC as MMIO.  Allow an MMIO SPTE to be created, as KVM
4422                  * uses different roots for L1 vs. L2, i.e. there is no danger
4423                  * of breaking APICv/AVIC for L1.
4424                  */
4425                 if (is_guest_mode(vcpu))
4426                         return kvm_handle_noslot_fault(vcpu, fault, access);
4427
4428                 /*
4429                  * If the APIC access page exists but is disabled, go directly
4430                  * to emulation without caching the MMIO access or creating a
4431                  * MMIO SPTE.  That way the cache doesn't need to be purged
4432                  * when the AVIC is re-enabled.
4433                  */
4434                 if (!kvm_apicv_activated(vcpu->kvm))
4435                         return RET_PF_EMULATE;
4436         }
4437
4438         /*
4439          * Check for a relevant mmu_notifier invalidation event before getting
4440          * the pfn from the primary MMU, and before acquiring mmu_lock.
4441          *
4442          * For mmu_lock, if there is an in-progress invalidation and the kernel
4443          * allows preemption, the invalidation task may drop mmu_lock and yield
4444          * in response to mmu_lock being contended, which is *very* counter-
4445          * productive as this vCPU can't actually make forward progress until
4446          * the invalidation completes.
4447          *
4448          * Retrying now can also avoid unnecessary lock contention in the primary
4449          * MMU, as the primary MMU doesn't necessarily hold a single lock for
4450          * the duration of the invalidation, i.e. faulting in a conflicting pfn
4451          * can cause the invalidation to take longer by holding locks that are
4452          * needed to complete the invalidation.
4453          *
4454          * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
4455          * will never yield mmu_lock in response to contention, as this vCPU is
4456          * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
4457          * to detect retry guarantees the worst case latency for the vCPU.
4458          */
4459         if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
4460                 return RET_PF_RETRY;
4461
4462         ret = __kvm_mmu_faultin_pfn(vcpu, fault);
4463         if (ret != RET_PF_CONTINUE)
4464                 return ret;
4465
4466         if (unlikely(is_error_pfn(fault->pfn)))
4467                 return kvm_handle_error_pfn(vcpu, fault);
4468
4469         if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
4470                 return kvm_handle_noslot_fault(vcpu, fault, access);
4471
4472         /*
4473          * Check again for a relevant mmu_notifier invalidation event purely to
4474          * avoid contending mmu_lock.  Most invalidations will be detected by
4475          * the previous check, but checking is extremely cheap relative to the
4476          * overall cost of failing to detect the invalidation until after
4477          * mmu_lock is acquired.
4478          */
4479         if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
4480                 kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY);
4481                 return RET_PF_RETRY;
4482         }
4483
4484         return RET_PF_CONTINUE;
4485 }
4486
4487 /*
4488  * Returns true if the page fault is stale and needs to be retried, i.e. if the
4489  * root was invalidated by a memslot update or a relevant mmu_notifier fired.
4490  */
4491 static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4492                                 struct kvm_page_fault *fault)
4493 {
4494         struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
4495
4496         /* Special roots, e.g. pae_root, are not backed by shadow pages. */
4497         if (sp && is_obsolete_sp(vcpu->kvm, sp))
4498                 return true;
4499
4500         /*
4501          * Roots without an associated shadow page are considered invalid if
4502          * there is a pending request to free obsolete roots.  The request is
4503          * only a hint that the current root _may_ be obsolete and needs to be
4504          * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4505          * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4506          * to reload even if no vCPU is actively using the root.
4507          */
4508         if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4509                 return true;
4510
4511         /*
4512          * Check for a relevant mmu_notifier invalidation event one last time
4513          * now that mmu_lock is held, as the "unsafe" checks performed without
4514          * holding mmu_lock can get false negatives.
4515          */
4516         return fault->slot &&
4517                mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
4518 }
4519
4520 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4521 {
4522         int r;
4523
4524         /* Dummy roots are used only for shadowing bad guest roots. */
4525         if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
4526                 return RET_PF_RETRY;
4527
4528         if (page_fault_handle_page_track(vcpu, fault))
4529                 return RET_PF_WRITE_PROTECTED;
4530
4531         r = fast_page_fault(vcpu, fault);
4532         if (r != RET_PF_INVALID)
4533                 return r;
4534
4535         r = mmu_topup_memory_caches(vcpu, false);
4536         if (r)
4537                 return r;
4538
4539         r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
4540         if (r != RET_PF_CONTINUE)
4541                 return r;
4542
4543         r = RET_PF_RETRY;
4544         write_lock(&vcpu->kvm->mmu_lock);
4545
4546         if (is_page_fault_stale(vcpu, fault))
4547                 goto out_unlock;
4548
4549         r = make_mmu_pages_available(vcpu);
4550         if (r)
4551                 goto out_unlock;
4552
4553         r = direct_map(vcpu, fault);
4554
4555 out_unlock:
4556         kvm_mmu_finish_page_fault(vcpu, fault, r);
4557         write_unlock(&vcpu->kvm->mmu_lock);
4558         return r;
4559 }
4560
4561 static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4562                                 struct kvm_page_fault *fault)
4563 {
4564         /* This path builds a PAE page table, so 2MB pages are the largest we can map. */
4565         fault->max_level = PG_LEVEL_2M;
4566         return direct_page_fault(vcpu, fault);
4567 }
4568
4569 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4570                                 u64 fault_address, char *insn, int insn_len)
4571 {
4572         int r = 1;
4573         u32 flags = vcpu->arch.apf.host_apf_flags;
4574
4575 #ifndef CONFIG_X86_64
4576         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4577         if (WARN_ON_ONCE(fault_address >> 32))
4578                 return -EFAULT;
4579 #endif
4580         /*
4581          * Legacy #PF exceptions only have a 32-bit error code.  Simply drop the
4582          * upper bits as KVM doesn't use them for #PF (because they are never
4583          * set), and to ensure there are no collisions with KVM-defined bits.
4584          */
4585         if (WARN_ON_ONCE(error_code >> 32))
4586                 error_code = lower_32_bits(error_code);
4587
4588         /*
4589          * Restrict KVM-defined flags to bits 63:32 so that it's impossible for
4590          * them to conflict with #PF error codes, which are limited to 32 bits.
4591          */
4592         BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
4593
4594         vcpu->arch.l1tf_flush_l1d = true;
4595         if (!flags) {
4596                 trace_kvm_page_fault(vcpu, fault_address, error_code);
4597
4598                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4599                                 insn_len);
4600         } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4601                 vcpu->arch.apf.host_apf_flags = 0;
4602                 local_irq_disable();
4603                 kvm_async_pf_task_wait_schedule(fault_address);
4604                 local_irq_enable();
4605         } else {
4606                 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
4607         }
4608
4609         return r;
4610 }
4611 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4612
4613 #ifdef CONFIG_X86_64
4614 static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
4615                                   struct kvm_page_fault *fault)
4616 {
4617         int r;
4618
4619         if (page_fault_handle_page_track(vcpu, fault))
4620                 return RET_PF_WRITE_PROTECTED;
4621
4622         r = fast_page_fault(vcpu, fault);
4623         if (r != RET_PF_INVALID)
4624                 return r;
4625
4626         r = mmu_topup_memory_caches(vcpu, false);
4627         if (r)
4628                 return r;
4629
4630         r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
4631         if (r != RET_PF_CONTINUE)
4632                 return r;
4633
4634         r = RET_PF_RETRY;
4635         read_lock(&vcpu->kvm->mmu_lock);
4636
4637         if (is_page_fault_stale(vcpu, fault))
4638                 goto out_unlock;
4639
4640         r = kvm_tdp_mmu_map(vcpu, fault);
4641
4642 out_unlock:
4643         kvm_mmu_finish_page_fault(vcpu, fault, r);
4644         read_unlock(&vcpu->kvm->mmu_lock);
4645         return r;
4646 }
4647 #endif
4648
4649 bool kvm_mmu_may_ignore_guest_pat(void)
4650 {
4651         /*
4652          * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM
4653          * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
4654          * honor the memtype from the guest's PAT so that guest accesses to
4655          * memory that is DMA'd aren't cached against the guest's wishes.  As a
4656          * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
4657          * KVM _always_ ignores guest PAT (when EPT is enabled).
4658          */
4659         return shadow_memtype_mask;
4660 }
4661
4662 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
4663 {
4664 #ifdef CONFIG_X86_64
4665         if (tdp_mmu_enabled)
4666                 return kvm_tdp_mmu_page_fault(vcpu, fault);
4667 #endif
4668
4669         return direct_page_fault(vcpu, fault);
4670 }
4671
4672 static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
4673                             u8 *level)
4674 {
4675         int r;
4676
4677         /*
4678          * Restrict to TDP page fault, since that's the only case where the MMU
4679          * is indexed by GPA.
4680          */
4681         if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
4682                 return -EOPNOTSUPP;
4683
4684         do {
4685                 if (signal_pending(current))
4686                         return -EINTR;
4687                 cond_resched();
4688                 r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
4689         } while (r == RET_PF_RETRY);
4690
4691         if (r < 0)
4692                 return r;
4693
4694         switch (r) {
4695         case RET_PF_FIXED:
4696         case RET_PF_SPURIOUS:
4697         case RET_PF_WRITE_PROTECTED:
4698                 return 0;
4699
4700         case RET_PF_EMULATE:
4701                 return -ENOENT;
4702
4703         case RET_PF_RETRY:
4704         case RET_PF_CONTINUE:
4705         case RET_PF_INVALID:
4706         default:
4707                 WARN_ONCE(1, "could not fix page fault during prefault");
4708                 return -EIO;
4709         }
4710 }
4711
4712 long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
4713                                     struct kvm_pre_fault_memory *range)
4714 {
4715         u64 error_code = PFERR_GUEST_FINAL_MASK;
4716         u8 level = PG_LEVEL_4K;
4717         u64 end;
4718         int r;
4719
4720         if (!vcpu->kvm->arch.pre_fault_allowed)
4721                 return -EOPNOTSUPP;
4722
4723         /*
4724          * reload is efficient when called repeatedly, so we can do it on
4725          * every iteration.
4726          */
4727         r = kvm_mmu_reload(vcpu);
4728         if (r)
4729                 return r;
4730
4731         if (kvm_arch_has_private_mem(vcpu->kvm) &&
4732             kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
4733                 error_code |= PFERR_PRIVATE_ACCESS;
4734
4735         /*
4736          * Shadow paging uses GVA for kvm page fault, so restrict to
4737          * two-dimensional paging.
4738          */
4739         r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
4740         if (r < 0)
4741                 return r;
4742
4743         /*
4744          * If the mapping that covers range->gpa can use a huge page, it
4745          * may start below it or end after range->gpa + range->size.
4746          */
4747         end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level);
4748         return min(range->size, end - range->gpa);
4749 }
4750
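/*
 * Editor's illustrative sketch, not kernel code: the chunk returned by
 * kvm_arch_vcpu_pre_fault_memory() is capped at the end of the huge page
 * that covers range->gpa.  E.g. gpa = 0x201000 mapped by a 2M page gives
 * end = 0x400000, so at most 0x1ff000 bytes are reported for that call.
 * The helper below is hypothetical and mirrors only the final arithmetic.
 */
static inline u64 example_prefault_chunk(u64 gpa, u64 size, u64 hpage_size)
{
        u64 end = (gpa & ~(hpage_size - 1)) + hpage_size;

        return min(size, end - gpa);
}
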
4751 static void nonpaging_init_context(struct kvm_mmu *context)
4752 {
4753         context->page_fault = nonpaging_page_fault;
4754         context->gva_to_gpa = nonpaging_gva_to_gpa;
4755         context->sync_spte = NULL;
4756 }
4757
4758 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4759                                   union kvm_mmu_page_role role)
4760 {
4761         struct kvm_mmu_page *sp;
4762
4763         if (!VALID_PAGE(root->hpa))
4764                 return false;
4765
4766         if (!role.direct && pgd != root->pgd)
4767                 return false;
4768
4769         sp = root_to_sp(root->hpa);
4770         if (WARN_ON_ONCE(!sp))
4771                 return false;
4772
4773         return role.word == sp->role.word;
4774 }
4775
4776 /*
4777  * Find out if a previously cached root matching the new pgd/role is available,
4778  * and insert the current root as the MRU in the cache.
4779  * If a matching root is found, it is assigned to kvm_mmu->root and
4780  * true is returned.
4781  * If no match is found, kvm_mmu->root is left invalid, the LRU root is
4782  * evicted to make room for the current root, and false is returned.
4783  */
4784 static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
4785                                               gpa_t new_pgd,
4786                                               union kvm_mmu_page_role new_role)
4787 {
4788         uint i;
4789
4790         if (is_root_usable(&mmu->root, new_pgd, new_role))
4791                 return true;
4792
4793         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4794                 /*
4795                  * The swaps end up rotating the cache like this:
4796                  *   C   0 1 2 3   (on entry to the function)
4797                  *   0   C 1 2 3
4798                  *   1   C 0 2 3
4799                  *   2   C 0 1 3
4800                  *   3   C 0 1 2   (on exit from the loop)
4801                  */
4802                 swap(mmu->root, mmu->prev_roots[i]);
4803                 if (is_root_usable(&mmu->root, new_pgd, new_role))
4804                         return true;
4805         }
4806
4807         kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4808         return false;
4809 }
4810
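/*
 * Editor's illustrative sketch, not kernel code: a minimal model of the
 * swap()-based rotation documented in the comment above, using plain
 * integers and assuming a cache of 4 previous roots.
 */
static inline bool example_rotate_roots(int *current, int cached[4], int wanted)
{
        int i;

        if (*current == wanted)
                return true;

        for (i = 0; i < 4; i++) {
                swap(*current, cached[i]);
                if (*current == wanted)
                        return true;
        }

        /* No usable root was found; the caller would free *current, as above. */
        return false;
}
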
4811 /*
4812  * Find out if a previously cached root matching the new pgd/role is available.
4813  * On entry, mmu->root is invalid.
4814  * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4815  * of the cache becomes invalid, and true is returned.
4816  * If no match is found, kvm_mmu->root is left invalid and false is returned.
4817  */
4818 static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
4819                                              gpa_t new_pgd,
4820                                              union kvm_mmu_page_role new_role)
4821 {
4822         uint i;
4823
4824         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4825                 if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
4826                         goto hit;
4827
4828         return false;
4829
4830 hit:
4831         swap(mmu->root, mmu->prev_roots[i]);
4832         /* Bubble up the remaining roots.  */
4833         for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
4834                 mmu->prev_roots[i] = mmu->prev_roots[i + 1];
4835         mmu->prev_roots[i].hpa = INVALID_PAGE;
4836         return true;
4837 }
4838
4839 static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
4840                             gpa_t new_pgd, union kvm_mmu_page_role new_role)
4841 {
4842         /*
4843          * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
4844          * avoid having to deal with PDPTEs and other complexities.
4845          */
4846         if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
4847                 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4848
4849         if (VALID_PAGE(mmu->root.hpa))
4850                 return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4851         else
4852                 return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4853 }
4854
4855 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4856 {
4857         struct kvm_mmu *mmu = vcpu->arch.mmu;
4858         union kvm_mmu_page_role new_role = mmu->root_role;
4859
4860         /*
4861          * Return immediately if no usable root was found, kvm_mmu_reload()
4862          * will establish a valid root prior to the next VM-Enter.
4863          */
4864         if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
4865                 return;
4866
4867         /*
4868          * It's possible that the cached previous root page is obsolete because
4869          * of a change in the MMU generation number. However, changing the
4870          * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4871          * which will free the root set here and allocate a new one.
4872          */
4873         kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4874
4875         if (force_flush_and_sync_on_reuse) {
4876                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4877                 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4878         }
4879
4880         /*
4881          * The last MMIO access's GVA and GPA are cached in the VCPU. When
4882          * switching to a new CR3, that GVA->GPA mapping may no longer be
4883          * valid. So clear any cached MMIO info even when we don't need to sync
4884          * the shadow page tables.
4885          */
4886         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4887
4888         /*
4889          * If this is a direct root page, it doesn't have a write flooding
4890          * count. Otherwise, clear the write flooding count.
4891          */
4892         if (!new_role.direct) {
4893                 struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
4894
4895                 if (!WARN_ON_ONCE(!sp))
4896                         __clear_sp_write_flooding_count(sp);
4897         }
4898 }
4899 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4900
4901 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4902                            unsigned int access)
4903 {
4904         if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
4905                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4906                         mmu_spte_clear_no_track(sptep);
4907                         return true;
4908                 }
4909
4910                 mark_mmio_spte(vcpu, sptep, gfn, access);
4911                 return true;
4912         }
4913
4914         return false;
4915 }
4916
4917 #define PTTYPE_EPT 18 /* arbitrary */
4918 #define PTTYPE PTTYPE_EPT
4919 #include "paging_tmpl.h"
4920 #undef PTTYPE
4921
4922 #define PTTYPE 64
4923 #include "paging_tmpl.h"
4924 #undef PTTYPE
4925
4926 #define PTTYPE 32
4927 #include "paging_tmpl.h"
4928 #undef PTTYPE
4929
4930 static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4931                                     u64 pa_bits_rsvd, int level, bool nx,
4932                                     bool gbpages, bool pse, bool amd)
4933 {
4934         u64 gbpages_bit_rsvd = 0;
4935         u64 nonleaf_bit8_rsvd = 0;
4936         u64 high_bits_rsvd;
4937
4938         rsvd_check->bad_mt_xwr = 0;
4939
4940         if (!gbpages)
4941                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4942
4943         if (level == PT32E_ROOT_LEVEL)
4944                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
4945         else
4946                 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
4947
4948         /* Note, NX doesn't exist in PDPTEs, this is handled below. */
4949         if (!nx)
4950                 high_bits_rsvd |= rsvd_bits(63, 63);
4951
4952         /*
4953          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4954          * leaf entries) on AMD CPUs only.
4955          */
4956         if (amd)
4957                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4958
4959         switch (level) {
4960         case PT32_ROOT_LEVEL:
4961                 /* no rsvd bits for 2 level 4K page table entries */
4962                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4963                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4964                 rsvd_check->rsvd_bits_mask[1][0] =
4965                         rsvd_check->rsvd_bits_mask[0][0];
4966
4967                 if (!pse) {
4968                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4969                         break;
4970                 }
4971
4972                 if (is_cpuid_PSE36())
4973                         /* 36-bit PSE 4MB page */
4974                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4975                 else
4976                         /* 32-bit PSE 4MB page */
4977                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4978                 break;
4979         case PT32E_ROOT_LEVEL:
4980                 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
4981                                                    high_bits_rsvd |
4982                                                    rsvd_bits(5, 8) |
4983                                                    rsvd_bits(1, 2);     /* PDPTE */
4984                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;      /* PDE */
4985                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;      /* PTE */
4986                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
4987                                                    rsvd_bits(13, 20);   /* large page */
4988                 rsvd_check->rsvd_bits_mask[1][0] =
4989                         rsvd_check->rsvd_bits_mask[0][0];
4990                 break;
4991         case PT64_ROOT_5LEVEL:
4992                 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
4993                                                    nonleaf_bit8_rsvd |
4994                                                    rsvd_bits(7, 7);
4995                 rsvd_check->rsvd_bits_mask[1][4] =
4996                         rsvd_check->rsvd_bits_mask[0][4];
4997                 fallthrough;
4998         case PT64_ROOT_4LEVEL:
4999                 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
5000                                                    nonleaf_bit8_rsvd |
5001                                                    rsvd_bits(7, 7);
5002                 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
5003                                                    gbpages_bit_rsvd;
5004                 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
5005                 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
5006                 rsvd_check->rsvd_bits_mask[1][3] =
5007                         rsvd_check->rsvd_bits_mask[0][3];
5008                 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
5009                                                    gbpages_bit_rsvd |
5010                                                    rsvd_bits(13, 29);
5011                 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
5012                                                    rsvd_bits(13, 20); /* large page */
5013                 rsvd_check->rsvd_bits_mask[1][0] =
5014                         rsvd_check->rsvd_bits_mask[0][0];
5015                 break;
5016         }
5017 }
5018
5019 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
5020                                         struct kvm_mmu *context)
5021 {
5022         __reset_rsvds_bits_mask(&context->guest_rsvd_check,
5023                                 vcpu->arch.reserved_gpa_bits,
5024                                 context->cpu_role.base.level, is_efer_nx(context),
5025                                 guest_can_use(vcpu, X86_FEATURE_GBPAGES),
5026                                 is_cr4_pse(context),
5027                                 guest_cpuid_is_amd_compatible(vcpu));
5028 }
5029
5030 static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
5031                                         u64 pa_bits_rsvd, bool execonly,
5032                                         int huge_page_level)
5033 {
5034         u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
5035         u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
5036         u64 bad_mt_xwr;
5037
5038         if (huge_page_level < PG_LEVEL_1G)
5039                 large_1g_rsvd = rsvd_bits(7, 7);
5040         if (huge_page_level < PG_LEVEL_2M)
5041                 large_2m_rsvd = rsvd_bits(7, 7);
5042
5043         rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
5044         rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
5045         rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
5046         rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
5047         rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
5048
5049         /* large page */
5050         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
5051         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
5052         rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
5053         rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
5054         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
5055
5056         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
5057         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
5058         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
5059         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
5060         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
5061         if (!execonly) {
5062                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
5063                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
5064         }
5065         rsvd_check->bad_mt_xwr = bad_mt_xwr;
5066 }
5067
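/*
 * Editor's illustrative sketch, not kernel code: bad_mt_xwr is indexed by
 * the low 6 bits of an EPT SPTE, i.e. byte = memtype (bits 5:3) and bit
 * within that byte = the XWR combination (bits 2:0); a set bit marks an
 * illegal combination.  A hypothetical check mirroring how the mask is
 * consumed:
 */
static inline bool example_ept_mt_xwr_is_bad(u64 bad_mt_xwr, u64 spte)
{
        return bad_mt_xwr & BIT_ULL(spte & 0x3f);
}
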
5068 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
5069                 struct kvm_mmu *context, bool execonly, int huge_page_level)
5070 {
5071         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
5072                                     vcpu->arch.reserved_gpa_bits, execonly,
5073                                     huge_page_level);
5074 }
5075
5076 static inline u64 reserved_hpa_bits(void)
5077 {
5078         return rsvd_bits(kvm_host.maxphyaddr, 63);
5079 }
5080
5081 /*
5082  * The host page table is the shadow page table for the page table in the
5083  * guest (or an AMD nested guest); its MMU features completely follow the
5084  * features in the guest.
5085  */
5086 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
5087                                         struct kvm_mmu *context)
5088 {
5089         /* @amd adds a check on bit 8 of SPTEs, which KVM shouldn't use anyway. */
5090         bool is_amd = true;
5091         /* KVM doesn't use 2-level page tables for the shadow MMU. */
5092         bool is_pse = false;
5093         struct rsvd_bits_validate *shadow_zero_check;
5094         int i;
5095
5096         WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
5097
5098         shadow_zero_check = &context->shadow_zero_check;
5099         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
5100                                 context->root_role.level,
5101                                 context->root_role.efer_nx,
5102                                 guest_can_use(vcpu, X86_FEATURE_GBPAGES),
5103                                 is_pse, is_amd);
5104
5105         if (!shadow_me_mask)
5106                 return;
5107
5108         for (i = context->root_role.level; --i >= 0;) {
5109                 /*
5110                  * So far shadow_me_value is a constant during KVM's lifetime.
5111                  * Bits in shadow_me_value are allowed to be set.
5112                  * Bits in shadow_me_mask but not in shadow_me_value are
5113                  * not allowed to be set.
5114                  */
5115                 shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
5116                 shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
5117                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
5118                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
5119         }
5120
5121 }
5122
5123 static inline bool boot_cpu_is_amd(void)
5124 {
5125         WARN_ON_ONCE(!tdp_enabled);
5126         return shadow_x_mask == 0;
5127 }
5128
5129 /*
5130  * The direct page table on the host uses as many MMU features as
5131  * possible; however, KVM currently does not do execution-protection.
5132  */
5133 static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
5134 {
5135         struct rsvd_bits_validate *shadow_zero_check;
5136         int i;
5137
5138         shadow_zero_check = &context->shadow_zero_check;
5139
5140         if (boot_cpu_is_amd())
5141                 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
5142                                         context->root_role.level, true,
5143                                         boot_cpu_has(X86_FEATURE_GBPAGES),
5144                                         false, true);
5145         else
5146                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
5147                                             reserved_hpa_bits(), false,
5148                                             max_huge_page_level);
5149
5150         if (!shadow_me_mask)
5151                 return;
5152
5153         for (i = context->root_role.level; --i >= 0;) {
5154                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
5155                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
5156         }
5157 }
5158
5159 /*
5160  * Same as the comments in reset_shadow_zero_bits_mask(), except this is
5161  * the shadow page table for an Intel nested (EPT) guest.
5162  */
5163 static void
5164 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
5165 {
5166         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
5167                                     reserved_hpa_bits(), execonly,
5168                                     max_huge_page_level);
5169 }
5170
5171 #define BYTE_MASK(access) \
5172         ((1 & (access) ? 2 : 0) | \
5173          (2 & (access) ? 4 : 0) | \
5174          (3 & (access) ? 8 : 0) | \
5175          (4 & (access) ? 16 : 0) | \
5176          (5 & (access) ? 32 : 0) | \
5177          (6 & (access) ? 64 : 0) | \
5178          (7 & (access) ? 128 : 0))
5179
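/*
 * Editor's illustrative note, not kernel code: BYTE_MASK() expands one
 * access bit into an 8-bit map indexed by an SPTE's UWX combination, with
 * bit N set when combination N includes that right.  Assuming the usual
 * ACC_* encoding (X=1, W=2, U=4):
 */
static_assert(BYTE_MASK(1) == 0xaa);    /* exec: combinations 1, 3, 5, 7 */
static_assert(BYTE_MASK(2) == 0xcc);    /* write: combinations 2, 3, 6, 7 */
static_assert(BYTE_MASK(4) == 0xf0);    /* user: combinations 4, 5, 6, 7 */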
5180
5181 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
5182 {
5183         unsigned byte;
5184
5185         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
5186         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
5187         const u8 u = BYTE_MASK(ACC_USER_MASK);
5188
5189         bool cr4_smep = is_cr4_smep(mmu);
5190         bool cr4_smap = is_cr4_smap(mmu);
5191         bool cr0_wp = is_cr0_wp(mmu);
5192         bool efer_nx = is_efer_nx(mmu);
5193
5194         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
5195                 unsigned pfec = byte << 1;
5196
5197                 /*
5198                  * Each "*f" variable has a 1 bit for each UWX value
5199                  * that causes a fault with the given PFEC.
5200                  */
5201
5202                 /* Faults from writes to non-writable pages */
5203                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
5204                 /* Faults from user mode accesses to supervisor pages */
5205                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
5206                 /* Faults from fetches of non-executable pages */
5207                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
5208                 /* Faults from kernel mode fetches of user pages */
5209                 u8 smepf = 0;
5210                 /* Faults from kernel mode accesses of user pages */
5211                 u8 smapf = 0;
5212
5213                 if (!ept) {
5214                         /* Faults from kernel mode accesses to user pages */
5215                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
5216
5217                         /* Not really needed: !nx will cause pte.nx to fault */
5218                         if (!efer_nx)
5219                                 ff = 0;
5220
5221                         /* Allow supervisor writes if !cr0.wp */
5222                         if (!cr0_wp)
5223                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
5224
5225                         /* Disallow supervisor fetches of user code if cr4.smep */
5226                         if (cr4_smep)
5227                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
5228
5229                         /*
5230                          * SMAP:kernel-mode data accesses from user-mode
5231                          * mappings should fault. A fault is considered
5232                          * as a SMAP violation if all of the following
5233                          * conditions are true:
5234                          *   - X86_CR4_SMAP is set in CR4
5235                          *   - A user page is accessed
5236                          *   - The access is not a fetch
5237                          *   - The access is supervisor mode
5238                          *   - If implicit supervisor access or X86_EFLAGS_AC is clear
5239                          *
5240                          * Here, we cover the first four conditions.
5241                          * The fifth is computed dynamically in permission_fault();
5242                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
5243                          * *not* subject to SMAP restrictions.
5244                          */
5245                         if (cr4_smap)
5246                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
5247                 }
5248
5249                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
5250         }
5251 }
5252
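/*
 * Editor's illustrative sketch, not kernel code: an access faults when the
 * bit for the PTE's UWX combination is set in the permission byte selected
 * by the error code (pfec >> 1).  This hypothetical helper mirrors only the
 * core lookup; the real permission_fault() also folds SMAP/implicit-access
 * state into the index.
 */
static inline bool example_access_faults(const struct kvm_mmu *mmu,
                                         unsigned int pfec, unsigned int pte_uwx)
{
        return (mmu->permissions[pfec >> 1] >> pte_uwx) & 1;
}
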
5253 /*
5254 * PKU is an additional mechanism by which the paging controls access to
5255 * user-mode addresses based on the value in the PKRU register.  Protection
5256 * key violations are reported through a bit in the page fault error code.
5257 * Unlike other bits of the error code, the PK bit is not known at the
5258 * call site of e.g. gva_to_gpa; it must be computed directly in
5259 * permission_fault based on two bits of PKRU, on some machine state (CR4,
5260 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
5261 *
5262 * In particular the following conditions come from the error code, the
5263 * page tables and the machine state:
5264 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
5265 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
5266 * - PK is always zero if U=0 in the page tables
5267 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
5268 *
5269 * The PKRU bitmask caches the result of these four conditions.  The error
5270 * code (minus the P bit) and the page table's U bit form an index into the
5271 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
5272 * with the two bits of the PKRU register corresponding to the protection key.
5273 * For the first three conditions above the bits will be 00, thus masking
5274 * away both AD and WD.  For all reads or if the last condition holds, WD
5275 * only will be masked away.
5276 */
5277 static void update_pkru_bitmask(struct kvm_mmu *mmu)
5278 {
5279         unsigned bit;
5280         bool wp;
5281
5282         mmu->pkru_mask = 0;
5283
5284         if (!is_cr4_pke(mmu))
5285                 return;
5286
5287         wp = is_cr0_wp(mmu);
5288
5289         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
5290                 unsigned pfec, pkey_bits;
5291                 bool check_pkey, check_write, ff, uf, wf, pte_user;
5292
5293                 pfec = bit << 1;
5294                 ff = pfec & PFERR_FETCH_MASK;
5295                 uf = pfec & PFERR_USER_MASK;
5296                 wf = pfec & PFERR_WRITE_MASK;
5297
5298                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
5299                 pte_user = pfec & PFERR_RSVD_MASK;
5300
5301                 /*
5302                  * Only need to check the access which is not an
5303                  * instruction fetch and is to a user page.
5304                  */
5305                 check_pkey = (!ff && pte_user);
5306                 /*
5307                  * write access is controlled by PKRU if it is a
5308                  * user access or CR0.WP = 1.
5309                  */
5310                 check_write = check_pkey && wf && (uf || wp);
5311
5312                 /* PKRU.AD stops both read and write access. */
5313                 pkey_bits = !!check_pkey;
5314                 /* PKRU.WD stops write access. */
5315                 pkey_bits |= (!!check_write) << 1;
5316
5317                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
5318         }
5319 }
5320
5321 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
5322                                         struct kvm_mmu *mmu)
5323 {
5324         if (!is_cr0_pg(mmu))
5325                 return;
5326
5327         reset_guest_rsvds_bits_mask(vcpu, mmu);
5328         update_permission_bitmask(mmu, false);
5329         update_pkru_bitmask(mmu);
5330 }
5331
5332 static void paging64_init_context(struct kvm_mmu *context)
5333 {
5334         context->page_fault = paging64_page_fault;
5335         context->gva_to_gpa = paging64_gva_to_gpa;
5336         context->sync_spte = paging64_sync_spte;
5337 }
5338
5339 static void paging32_init_context(struct kvm_mmu *context)
5340 {
5341         context->page_fault = paging32_page_fault;
5342         context->gva_to_gpa = paging32_gva_to_gpa;
5343         context->sync_spte = paging32_sync_spte;
5344 }
5345
5346 static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
5347                                             const struct kvm_mmu_role_regs *regs)
5348 {
5349         union kvm_cpu_role role = {0};
5350
5351         role.base.access = ACC_ALL;
5352         role.base.smm = is_smm(vcpu);
5353         role.base.guest_mode = is_guest_mode(vcpu);
5354         role.ext.valid = 1;
5355
5356         if (!____is_cr0_pg(regs)) {
5357                 role.base.direct = 1;
5358                 return role;
5359         }
5360
5361         role.base.efer_nx = ____is_efer_nx(regs);
5362         role.base.cr0_wp = ____is_cr0_wp(regs);
5363         role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
5364         role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
5365         role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
5366
5367         if (____is_efer_lma(regs))
5368                 role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
5369                                                         : PT64_ROOT_4LEVEL;
5370         else if (____is_cr4_pae(regs))
5371                 role.base.level = PT32E_ROOT_LEVEL;
5372         else
5373                 role.base.level = PT32_ROOT_LEVEL;
5374
5375         role.ext.cr4_smep = ____is_cr4_smep(regs);
5376         role.ext.cr4_smap = ____is_cr4_smap(regs);
5377         role.ext.cr4_pse = ____is_cr4_pse(regs);
5378
5379         /* PKEY and LA57 are active iff long mode is active. */
5380         role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
5381         role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
5382         role.ext.efer_lma = ____is_efer_lma(regs);
5383         return role;
5384 }
5385
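/*
 * Editor's illustrative sketch, not kernel code: the guest paging mode
 * selects the root level exactly as in the cascade above.  A hypothetical
 * restatement with literal levels:
 */
static inline int example_guest_root_level(bool lma, bool la57, bool pae)
{
        if (lma)
                return la57 ? 5 : 4;    /* PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL */
        return pae ? 3 : 2;             /* PT32E_ROOT_LEVEL : PT32_ROOT_LEVEL */
}
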
5386 void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
5387                                         struct kvm_mmu *mmu)
5388 {
5389         const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
5390
5391         BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
5392         BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
5393
5394         if (is_cr0_wp(mmu) == cr0_wp)
5395                 return;
5396
5397         mmu->cpu_role.base.cr0_wp = cr0_wp;
5398         reset_guest_paging_metadata(vcpu, mmu);
5399 }
5400
5401 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
5402 {
5403         /* tdp_root_level is architecture forced level, use it if nonzero */
5404         if (tdp_root_level)
5405                 return tdp_root_level;
5406
5407         /* Use 5-level TDP if and only if it's useful/necessary. */
5408         if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
5409                 return 4;
5410
5411         return max_tdp_level;
5412 }
5413
5414 u8 kvm_mmu_get_max_tdp_level(void)
5415 {
5416         return tdp_root_level ? tdp_root_level : max_tdp_level;
5417 }
5418
5419 static union kvm_mmu_page_role
5420 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
5421                                 union kvm_cpu_role cpu_role)
5422 {
5423         union kvm_mmu_page_role role = {0};
5424
5425         role.access = ACC_ALL;
5426         role.cr0_wp = true;
5427         role.efer_nx = true;
5428         role.smm = cpu_role.base.smm;
5429         role.guest_mode = cpu_role.base.guest_mode;
5430         role.ad_disabled = !kvm_ad_enabled;
5431         role.level = kvm_mmu_get_tdp_level(vcpu);
5432         role.direct = true;
5433         role.has_4_byte_gpte = false;
5434
5435         return role;
5436 }
5437
5438 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
5439                              union kvm_cpu_role cpu_role)
5440 {
5441         struct kvm_mmu *context = &vcpu->arch.root_mmu;
5442         union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
5443
5444         if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5445             root_role.word == context->root_role.word)
5446                 return;
5447
5448         context->cpu_role.as_u64 = cpu_role.as_u64;
5449         context->root_role.word = root_role.word;
5450         context->page_fault = kvm_tdp_page_fault;
5451         context->sync_spte = NULL;
5452         context->get_guest_pgd = get_guest_cr3;
5453         context->get_pdptr = kvm_pdptr_read;
5454         context->inject_page_fault = kvm_inject_page_fault;
5455
5456         if (!is_cr0_pg(context))
5457                 context->gva_to_gpa = nonpaging_gva_to_gpa;
5458         else if (is_cr4_pae(context))
5459                 context->gva_to_gpa = paging64_gva_to_gpa;
5460         else
5461                 context->gva_to_gpa = paging32_gva_to_gpa;
5462
5463         reset_guest_paging_metadata(vcpu, context);
5464         reset_tdp_shadow_zero_bits_mask(context);
5465 }
5466
5467 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
5468                                     union kvm_cpu_role cpu_role,
5469                                     union kvm_mmu_page_role root_role)
5470 {
5471         if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5472             root_role.word == context->root_role.word)
5473                 return;
5474
5475         context->cpu_role.as_u64 = cpu_role.as_u64;
5476         context->root_role.word = root_role.word;
5477
5478         if (!is_cr0_pg(context))
5479                 nonpaging_init_context(context);
5480         else if (is_cr4_pae(context))
5481                 paging64_init_context(context);
5482         else
5483                 paging32_init_context(context);
5484
5485         reset_guest_paging_metadata(vcpu, context);
5486         reset_shadow_zero_bits_mask(vcpu, context);
5487 }
5488
5489 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
5490                                 union kvm_cpu_role cpu_role)
5491 {
5492         struct kvm_mmu *context = &vcpu->arch.root_mmu;
5493         union kvm_mmu_page_role root_role;
5494
5495         root_role = cpu_role.base;
5496
5497         /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
5498         root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
5499
5500         /*
5501          * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5502          * KVM uses NX when TDP is disabled to handle a variety of scenarios,
5503          * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5504          * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5505          * The iTLB multi-hit workaround can be toggled at any time, so assume
5506          * NX can be used by any non-nested shadow MMU to avoid having to reset
5507          * MMU contexts.
5508          */
5509         root_role.efer_nx = true;
5510
5511         shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5512 }
5513
5514 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
5515                              unsigned long cr4, u64 efer, gpa_t nested_cr3)
5516 {
5517         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5518         struct kvm_mmu_role_regs regs = {
5519                 .cr0 = cr0,
5520                 .cr4 = cr4 & ~X86_CR4_PKE,
5521                 .efer = efer,
5522         };
5523         union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5524         union kvm_mmu_page_role root_role;
5525
5526         /* NPT requires CR0.PG=1. */
5527         WARN_ON_ONCE(cpu_role.base.direct);
5528
5529         root_role = cpu_role.base;
5530         root_role.level = kvm_mmu_get_tdp_level(vcpu);
5531         if (root_role.level == PT64_ROOT_5LEVEL &&
5532             cpu_role.base.level == PT64_ROOT_4LEVEL)
5533                 root_role.passthrough = 1;
5534
5535         shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5536         kvm_mmu_new_pgd(vcpu, nested_cr3);
5537 }
5538 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
5539
5540 static union kvm_cpu_role
5541 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5542                                    bool execonly, u8 level)
5543 {
5544         union kvm_cpu_role role = {0};
5545
5546         /*
5547          * KVM does not support SMM transfer monitors, and consequently does not
5548          * support the "entry to SMM" control either.  role.base.smm is always 0.
5549          */
5550         WARN_ON_ONCE(is_smm(vcpu));
5551         role.base.level = level;
5552         role.base.has_4_byte_gpte = false;
5553         role.base.direct = false;
5554         role.base.ad_disabled = !accessed_dirty;
5555         role.base.guest_mode = true;
5556         role.base.access = ACC_ALL;
5557
5558         role.ext.word = 0;
5559         role.ext.execonly = execonly;
5560         role.ext.valid = 1;
5561
5562         return role;
5563 }
5564
5565 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5566                              int huge_page_level, bool accessed_dirty,
5567                              gpa_t new_eptp)
5568 {
5569         struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5570         u8 level = vmx_eptp_page_walk_level(new_eptp);
5571         union kvm_cpu_role new_mode =
5572                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5573                                                    execonly, level);
5574
5575         if (new_mode.as_u64 != context->cpu_role.as_u64) {
5576                 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
5577                 context->cpu_role.as_u64 = new_mode.as_u64;
5578                 context->root_role.word = new_mode.base.word;
5579
5580                 context->page_fault = ept_page_fault;
5581                 context->gva_to_gpa = ept_gva_to_gpa;
5582                 context->sync_spte = ept_sync_spte;
5583
5584                 update_permission_bitmask(context, true);
5585                 context->pkru_mask = 0;
5586                 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5587                 reset_ept_shadow_zero_bits_mask(context, execonly);
5588         }
5589
5590         kvm_mmu_new_pgd(vcpu, new_eptp);
5591 }
5592 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5593
5594 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5595                              union kvm_cpu_role cpu_role)
5596 {
5597         struct kvm_mmu *context = &vcpu->arch.root_mmu;
5598
5599         kvm_init_shadow_mmu(vcpu, cpu_role);
5600
5601         context->get_guest_pgd     = get_guest_cr3;
5602         context->get_pdptr         = kvm_pdptr_read;
5603         context->inject_page_fault = kvm_inject_page_fault;
5604 }
5605
5606 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5607                                 union kvm_cpu_role new_mode)
5608 {
5609         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5610
5611         if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5612                 return;
5613
5614         g_context->cpu_role.as_u64   = new_mode.as_u64;
5615         g_context->get_guest_pgd     = get_guest_cr3;
5616         g_context->get_pdptr         = kvm_pdptr_read;
5617         g_context->inject_page_fault = kvm_inject_page_fault;
5618
5619         /*
5620          * L2 page tables are never shadowed, so there is no need to sync
5621          * SPTEs.
5622          */
5623         g_context->sync_spte         = NULL;
5624
5625         /*
5626          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5627          * L1's nested page tables (e.g. EPT12). The nested translation
5628          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5629          * L2's page tables as the first level of translation and L1's
5630          * nested page tables as the second level of translation. Basically
5631          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5632          */
5633         if (!is_paging(vcpu))
5634                 g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5635         else if (is_long_mode(vcpu))
5636                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5637         else if (is_pae(vcpu))
5638                 g_context->gva_to_gpa = paging64_gva_to_gpa;
5639         else
5640                 g_context->gva_to_gpa = paging32_gva_to_gpa;
5641
5642         reset_guest_paging_metadata(vcpu, g_context);
5643 }
5644
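/*
 * Initialize whichever MMU applies to the vCPU's current configuration: the
 * nested MMU when L2 is running behind L1's EPT/NPT, the TDP MMU when
 * hardware two-dimensional paging is enabled, or the shadow MMU otherwise.
 */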
5645 void kvm_init_mmu(struct kvm_vcpu *vcpu)
5646 {
5647         struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5648         union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
5649
5650         if (mmu_is_nested(vcpu))
5651                 init_kvm_nested_mmu(vcpu, cpu_role);
5652         else if (tdp_enabled)
5653                 init_kvm_tdp_mmu(vcpu, cpu_role);
5654         else
5655                 init_kvm_softmmu(vcpu, cpu_role);
5656 }
5657 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5658
5659 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5660 {
5661         /*
5662          * Invalidate all MMU roles to force them to reinitialize as CPUID
5663          * information is factored into reserved bit calculations.
5664          *
5665          * Correctly handling multiple vCPU models (with respect to paging and
5666          * physical address properties) in a single VM would require tracking
5667          * all relevant CPUID information in kvm_mmu_page_role. That is very
5668          * undesirable as it would increase the memory requirements for
5669          * gfn_write_track (see struct kvm_mmu_page_role comments).  For now
5670          * that problem is swept under the rug; KVM's CPUID API is horrific and
5671          * it's all but impossible to solve it without introducing a new API.
5672          */
5673         vcpu->arch.root_mmu.root_role.invalid = 1;
5674         vcpu->arch.guest_mmu.root_role.invalid = 1;
5675         vcpu->arch.nested_mmu.root_role.invalid = 1;
5676         vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
5677         vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
5678         vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
5679         kvm_mmu_reset_context(vcpu);
5680
5681         /*
5682          * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5683          * kvm_arch_vcpu_ioctl().
5684          */
5685         KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
5686 }
5687
5688 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5689 {
5690         kvm_mmu_unload(vcpu);
5691         kvm_init_mmu(vcpu);
5692 }
5693 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5694
5695 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5696 {
5697         int r;
5698
5699         r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
5700         if (r)
5701                 goto out;
5702         r = mmu_alloc_special_roots(vcpu);
5703         if (r)
5704                 goto out;
5705         if (vcpu->arch.mmu->root_role.direct)
5706                 r = mmu_alloc_direct_roots(vcpu);
5707         else
5708                 r = mmu_alloc_shadow_roots(vcpu);
5709         if (r)
5710                 goto out;
5711
5712         kvm_mmu_sync_roots(vcpu);
5713
5714         kvm_mmu_load_pgd(vcpu);
5715
5716         /*
5717          * Flush any TLB entries for the new root, as the provenance of the root
5718          * is unknown.  Even if KVM ensures there are no stale TLB entries
5719          * for a freed root, in theory another hypervisor could have left
5720          * stale entries.  Flushing on alloc also allows KVM to skip the TLB
5721          * flush when freeing a root (see kvm_tdp_mmu_put_root()).
5722          */
5723         kvm_x86_call(flush_tlb_current)(vcpu);
5724 out:
5725         return r;
5726 }
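/*
 * A minimal sketch of how callers are expected to reach this: the reload
 * helper in mmu.h only invokes kvm_mmu_load() when the current root is
 * invalid, so the full (re)load runs only after kvm_mmu_unload(), a context
 * reset, or after an obsolete/invalid root was freed.  Roughly:
 *
 *	static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 *	{
 *		if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE))
 *			return 0;
 *		return kvm_mmu_load(vcpu);
 *	}
 *
 * (Sketch only; the in-tree helper may differ in detail.)
 */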
5727
5728 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5729 {
5730         struct kvm *kvm = vcpu->kvm;
5731
5732         kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5733         WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5734         kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5735         WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5736         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5737 }
5738
5739 static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5740 {
5741         struct kvm_mmu_page *sp;
5742
5743         if (!VALID_PAGE(root_hpa))
5744                 return false;
5745
5746         /*
5747          * When freeing obsolete roots, treat roots as obsolete if they don't
5748          * have an associated shadow page, as it's impossible to determine if
5749          * such roots are fresh or stale.  This does mean KVM will get false
5750          * positives and free roots that don't strictly need to be freed, but
5751          * such false positives are relatively rare:
5752          *
5753          *  (a) only PAE paging and nested NPT have roots without shadow pages
5754          *      (or any shadow paging flavor with a dummy root, see note below)
5755          *  (b) a memslot update obsoletes _all_ roots (and forces remote reloads)
5756          *  (c) KVM doesn't track previous roots for PAE paging, and the guest
5757          *      is unlikely to zap an in-use PGD.
5758          *
5759          * Note!  Dummy roots are unique in that they are obsoleted by memslot
5760          * _creation_!  See also FNAME(fetch).
5761          */
5762         sp = root_to_sp(root_hpa);
5763         return !sp || is_obsolete_sp(kvm, sp);
5764 }
5765
5766 static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
5767 {
5768         unsigned long roots_to_free = 0;
5769         int i;
5770
5771         if (is_obsolete_root(kvm, mmu->root.hpa))
5772                 roots_to_free |= KVM_MMU_ROOT_CURRENT;
5773
5774         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5775                 if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
5776                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
5777         }
5778
5779         if (roots_to_free)
5780                 kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5781 }
5782
5783 void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5784 {
5785         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu);
5786         __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu);
5787 }
5788
5789 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5790                                     int *bytes)
5791 {
5792         u64 gentry = 0;
5793         int r;
5794
5795         /*
5796          * Assume that the pte write is on a page table of the same type
5797          * as the current vcpu paging mode since we update the sptes only
5798          * when they have the same mode.
5799          */
5800         if (is_pae(vcpu) && *bytes == 4) {
5801                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5802                 *gpa &= ~(gpa_t)7;
5803                 *bytes = 8;
5804         }
5805
5806         if (*bytes == 4 || *bytes == 8) {
5807                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5808                 if (r)
5809                         gentry = 0;
5810         }
5811
5812         return gentry;
5813 }
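/*
 * Worked example for the PAE case above: a 32-bit guest updates a 64-bit
 * gpte in two 4-byte halves.  For a 4-byte write at gpa 0x1004, the gpa is
 * rounded down to 0x1000 and *bytes is widened to 8, so the whole (possibly
 * half-updated) gpte is re-read atomically instead of emulating a torn
 * 4-byte update.
 */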
5814
5815 /*
5816  * If we're seeing too many writes to a page, it may no longer be a page table,
5817  * or we may be forking, in which case it is better to unmap the page.
5818  */
5819 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5820 {
5821         /*
5822          * Skip write-flooding detection for SPs whose level is PG_LEVEL_4K, as
5823          * such SPs can become unsync, in which case the guest page is not write-protected.
5824          */
5825         if (sp->role.level == PG_LEVEL_4K)
5826                 return false;
5827
5828         atomic_inc(&sp->write_flooding_count);
5829         return atomic_read(&sp->write_flooding_count) >= 3;
5830 }
5831
5832 /*
5833  * Misaligned accesses are too much trouble to fix up; also, they usually
5834  * indicate a page is not used as a page table.
5835  */
5836 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5837                                     int bytes)
5838 {
5839         unsigned offset, pte_size, misaligned;
5840
5841         offset = offset_in_page(gpa);
5842         pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
5843
5844         /*
5845          * Sometimes the OS only writes a single byte of the pte to update status
5846          * bits; for example, Linux's clear_bit() uses an andb instruction.
5847          */
5848         if (!(offset & (pte_size - 1)) && bytes == 1)
5849                 return false;
5850
5851         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5852         misaligned |= bytes < 4;
5853
5854         return misaligned;
5855 }
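/*
 * Worked examples for the checks above with 8-byte gptes (pte_size = 8):
 *
 *   offset = 0x10, bytes = 8: (0x10 ^ 0x17) & ~7 == 0         -> aligned
 *   offset = 0x14, bytes = 8: (0x14 ^ 0x1b) & ~7 == 0x8       -> misaligned
 *   offset = 0x10, bytes = 2: bytes < 4                       -> misaligned
 *
 * I.e. any write that straddles a gpte boundary, or that is smaller than
 * 4 bytes (other than the pte-aligned 1-byte status update special-cased
 * above), is treated as misaligned.
 */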
5856
5857 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5858 {
5859         unsigned page_offset, quadrant;
5860         u64 *spte;
5861         int level;
5862
5863         page_offset = offset_in_page(gpa);
5864         level = sp->role.level;
5865         *nspte = 1;
5866         if (sp->role.has_4_byte_gpte) {
5867                 page_offset <<= 1;      /* 32->64 */
5868                 /*
5869                  * A 32-bit pde maps 4MB while the shadow pdes map
5870                  * only 2MB.  So we need to double the offset again
5871                  * and zap two pdes instead of one.
5872                  */
5873                 if (level == PT32_ROOT_LEVEL) {
5874                         page_offset &= ~7; /* kill rounding error */
5875                         page_offset <<= 1;
5876                         *nspte = 2;
5877                 }
5878                 quadrant = page_offset >> PAGE_SHIFT;
5879                 page_offset &= ~PAGE_MASK;
5880                 if (quadrant != sp->role.quadrant)
5881                         return NULL;
5882         }
5883
5884         spte = &sp->spt[page_offset / sizeof(*spte)];
5885         return spte;
5886 }
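/*
 * Worked example for the 4-byte-gpte path above: a guest write at offset
 * 0x804 of its page table hits gpte index 0x201.  Doubling the offset
 * (0x804 -> 0x1008) maps it onto 8-byte shadow entries; the quadrant
 * (0x1008 >> PAGE_SHIFT == 1) selects which shadow page covering the guest
 * page holds the entry, and the remaining offset (0x1008 & ~PAGE_MASK == 0x8)
 * picks spte index 1 within it.
 */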
5887
5888 void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
5889                          int bytes)
5890 {
5891         gfn_t gfn = gpa >> PAGE_SHIFT;
5892         struct kvm_mmu_page *sp;
5893         LIST_HEAD(invalid_list);
5894         u64 entry, gentry, *spte;
5895         int npte;
5896         bool flush = false;
5897
5898         /*
5899          * When emulating guest writes, ensure the written value is visible to
5900          * any task that is handling page faults before checking whether or not
5901          * KVM is shadowing a guest PTE.  This ensures either KVM will create
5902          * the correct SPTE in the page fault handler, or this task will see
5903          * a non-zero indirect_shadow_pages.  Pairs with the smp_mb() in
5904          * account_shadowed().
5905          */
5906         smp_mb();
5907         if (!vcpu->kvm->arch.indirect_shadow_pages)
5908                 return;
5909
5910         write_lock(&vcpu->kvm->mmu_lock);
5911
5912         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5913
5914         ++vcpu->kvm->stat.mmu_pte_write;
5915
5916         for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
5917                 if (detect_write_misaligned(sp, gpa, bytes) ||
5918                       detect_write_flooding(sp)) {
5919                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5920                         ++vcpu->kvm->stat.mmu_flooded;
5921                         continue;
5922                 }
5923
5924                 spte = get_written_sptes(sp, gpa, &npte);
5925                 if (!spte)
5926                         continue;
5927
5928                 while (npte--) {
5929                         entry = *spte;
5930                         mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
5931                         if (gentry && sp->role.level != PG_LEVEL_4K)
5932                                 ++vcpu->kvm->stat.mmu_pde_zapped;
5933                         if (is_shadow_present_pte(entry))
5934                                 flush = true;
5935                         ++spte;
5936                 }
5937         }
5938         kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
5939         write_unlock(&vcpu->kvm->mmu_lock);
5940 }
5941
5942 static bool is_write_to_guest_page_table(u64 error_code)
5943 {
5944         const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
5945
5946         return (error_code & mask) == mask;
5947 }
5948
5949 static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
5950                                        u64 error_code, int *emulation_type)
5951 {
5952         bool direct = vcpu->arch.mmu->root_role.direct;
5953
5954         /*
5955          * Do not try to unprotect and retry if the vCPU re-faulted on the same
5956          * RIP with the same address that was previously unprotected, as doing
5957          * so will likely put the vCPU into an infinite loop.  E.g. if the vCPU uses
5958          * a non-page-table modifying instruction on the PDE that points to the
5959          * instruction, then unprotecting the gfn will unmap the instruction's
5960          * code, i.e. make it impossible for the instruction to ever complete.
5961          */
5962         if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
5963             vcpu->arch.last_retry_addr == cr2_or_gpa)
5964                 return RET_PF_EMULATE;
5965
5966         /*
5967          * Reset the unprotect+retry values that guard against infinite loops.
5968          * The values will be refreshed if KVM explicitly unprotects a gfn and
5969          * retries; in all other cases it's safe to retry in the future even if
5970          * the next page fault happens on the same RIP+address.
5971          */
5972         vcpu->arch.last_retry_eip = 0;
5973         vcpu->arch.last_retry_addr = 0;
5974
5975         /*
5976          * It should be impossible to reach this point with an MMIO cache hit,
5977          * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
5978          * writable memslot, and creating a memslot should invalidate the MMIO
5979          * cache by way of changing the memslot generation.  WARN and disallow
5980          * retry if MMIO is detected, as retrying MMIO emulation is pointless
5981          * and could put the vCPU into an infinite loop because the processor
5982          * will keep faulting on the non-existent MMIO address.
5983          */
5984         if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
5985                 return RET_PF_EMULATE;
5986
5987         /*
5988          * Before emulating the instruction, check to see if the access was due
5989          * to a read-only violation while the CPU was walking non-nested NPT
5990          * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
5991          * If L1 is sharing (a subset of) its page tables with L2, e.g. by
5992          * having nCR3 share lower level page tables with hCR3, then when KVM
5993          * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
5994          * unknowingly write-protecting L1's guest page tables, which KVM isn't
5995          * shadowing.
5996          *
5997          * Because the CPU (by default) walks NPT page tables using a write
5998          * access (to ensure the CPU can do A/D updates), page walks in L1 can
5999          * trigger write faults for the above case even when L1 isn't modifying
6000          * PTEs.  As a result, KVM will unnecessarily emulate (or at least, try
6001          * to emulate) an excessive number of L1 instructions; because L1's MMU
6002          * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
6003          * and thus no need to emulate in order to guarantee forward progress.
6004          *
6005          * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
6006          * proceed without triggering emulation.  If one or more shadow pages
6007          * was zapped, skip emulation and resume L1 to let it natively execute
6008          * the instruction.  If no shadow pages were zapped, then the write-
6009          * fault is due to something else entirely, i.e. KVM needs to emulate,
6010          * as resuming the guest will put it into an infinite loop.
6011          *
6012          * Note, this code also applies to Intel CPUs, even though it is *very*
6013          * unlikely that an L1 will share its page tables (IA32/PAE/paging64
6014          * format) with L2's page tables (EPT format).
6015          *
6016          * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
6017          * unprotect the gfn and retry if an event is awaiting reinjection.  If
6018          * KVM emulates multiple instructions before completing event injection,
6019          * the event could be delayed beyond what is architecturally allowed,
6020          * e.g. KVM could inject an IRQ after the TPR has been raised.
6021          */
6022         if (((direct && is_write_to_guest_page_table(error_code)) ||
6023              (!direct && kvm_event_needs_reinjection(vcpu))) &&
6024             kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
6025                 return RET_PF_RETRY;
6026
6027         /*
6028          * The gfn is write-protected, but if KVM detects it's emulating an
6029          * instruction that is unlikely to be used to modify page tables, or if
6030          * emulation fails, KVM can try to unprotect the gfn and let the CPU
6031          * re-execute the instruction that caused the page fault.  Do not allow
6032          * retrying an instruction from a nested guest as KVM is only explicitly
6033          * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
6034          * going to magically fix whatever issue caused L2 to fail.
6035          */
6036         if (!is_guest_mode(vcpu))
6037                 *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
6038
6039         return RET_PF_EMULATE;
6040 }
6041
6042 int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
6043                        void *insn, int insn_len)
6044 {
6045         int r, emulation_type = EMULTYPE_PF;
6046         bool direct = vcpu->arch.mmu->root_role.direct;
6047
6048         if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
6049                 return RET_PF_RETRY;
6050
6051         /*
6052          * Except for reserved faults (emulated MMIO is shared-only), set the
6053          * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
6054          * current attributes, which are the source of truth for such VMs.  Note,
6055          * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
6056          * currently support nested virtualization (among many other things)
6057          * for software-protected VMs.
6058          */
6059         if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
6060             !(error_code & PFERR_RSVD_MASK) &&
6061             vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
6062             kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
6063                 error_code |= PFERR_PRIVATE_ACCESS;
6064
6065         r = RET_PF_INVALID;
6066         if (unlikely(error_code & PFERR_RSVD_MASK)) {
6067                 if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
6068                         return -EFAULT;
6069
6070                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
6071                 if (r == RET_PF_EMULATE)
6072                         goto emulate;
6073         }
6074
6075         if (r == RET_PF_INVALID) {
6076                 vcpu->stat.pf_taken++;
6077
6078                 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
6079                                           &emulation_type, NULL);
6080                 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
6081                         return -EIO;
6082         }
6083
6084         if (r < 0)
6085                 return r;
6086
6087         if (r == RET_PF_WRITE_PROTECTED)
6088                 r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
6089                                                 &emulation_type);
6090
6091         if (r == RET_PF_FIXED)
6092                 vcpu->stat.pf_fixed++;
6093         else if (r == RET_PF_EMULATE)
6094                 vcpu->stat.pf_emulate++;
6095         else if (r == RET_PF_SPURIOUS)
6096                 vcpu->stat.pf_spurious++;
6097
6098         if (r != RET_PF_EMULATE)
6099                 return 1;
6100
6101 emulate:
6102         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
6103                                        insn_len);
6104 }
6105 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
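/*
 * A rough sketch of the expected call from a vendor exit handler, which
 * derives a PFERR_* error code from the hardware exit information and
 * forwards the faulting GPA (illustrative only):
 *
 *	return kvm_mmu_page_fault(vcpu, fault_gpa, error_code, NULL, 0);
 *
 * Passing insn/insn_len as NULL/0 lets the emulator fetch the instruction
 * itself if emulation ends up being required.
 */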
6106
6107 void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg)
6108 {
6109         u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
6110         int root_level, leaf, level;
6111
6112         leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level);
6113         if (unlikely(leaf < 0))
6114                 return;
6115
6116         pr_err("%s %llx", msg, gpa);
6117         for (level = root_level; level >= leaf; level--)
6118                 pr_cont(", spte[%d] = 0x%llx", level, sptes[level]);
6119         pr_cont("\n");
6120 }
6121 EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes);
6122
6123 static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
6124                                       u64 addr, hpa_t root_hpa)
6125 {
6126         struct kvm_shadow_walk_iterator iterator;
6127
6128         vcpu_clear_mmio_info(vcpu, addr);
6129
6130         /*
6131          * Walking and synchronizing SPTEs both assume they are operating in
6132          * the context of the current MMU, and would need to be reworked if
6133          * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
6134          */
6135         if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
6136                 return;
6137
6138         if (!VALID_PAGE(root_hpa))
6139                 return;
6140
6141         write_lock(&vcpu->kvm->mmu_lock);
6142         for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
6143                 struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
6144
6145                 if (sp->unsync) {
6146                         int ret = kvm_sync_spte(vcpu, sp, iterator.index);
6147
6148                         if (ret < 0)
6149                                 mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
6150                         if (ret)
6151                                 kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
6152                 }
6153
6154                 if (!sp->unsync_children)
6155                         break;
6156         }
6157         write_unlock(&vcpu->kvm->mmu_lock);
6158 }
6159
6160 void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
6161                              u64 addr, unsigned long roots)
6162 {
6163         int i;
6164
6165         WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
6166
6167         /* It's actually a GPA for vcpu->arch.guest_mmu.  */
6168         if (mmu != &vcpu->arch.guest_mmu) {
6169                 /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
6170                 if (is_noncanonical_invlpg_address(addr, vcpu))
6171                         return;
6172
6173                 kvm_x86_call(flush_tlb_gva)(vcpu, addr);
6174         }
6175
6176         if (!mmu->sync_spte)
6177                 return;
6178
6179         if (roots & KVM_MMU_ROOT_CURRENT)
6180                 __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
6181
6182         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
6183                 if (roots & KVM_MMU_ROOT_PREVIOUS(i))
6184                         __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
6185         }
6186 }
6187 EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
6188
6189 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
6190 {
6191         /*
6192          * INVLPG is required to invalidate any global mappings for the VA,
6193          * irrespective of PCID.  Blindly sync all roots as it would take
6194          * roughly the same amount of work/time to determine whether any of the
6195          * previous roots have a global mapping.
6196          *
6197          * Mappings not reachable via the current or previous cached roots will
6198          * be synced when switching to that new cr3, so nothing needs to be
6199          * done here for them.
6200          */
6201         kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
6202         ++vcpu->stat.invlpg;
6203 }
6204 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
6205
6206
6207 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
6208 {
6209         struct kvm_mmu *mmu = vcpu->arch.mmu;
6210         unsigned long roots = 0;
6211         uint i;
6212
6213         if (pcid == kvm_get_active_pcid(vcpu))
6214                 roots |= KVM_MMU_ROOT_CURRENT;
6215
6216         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
6217                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
6218                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
6219                         roots |= KVM_MMU_ROOT_PREVIOUS(i);
6220         }
6221
6222         if (roots)
6223                 kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
6224         ++vcpu->stat.invlpg;
6225
6226         /*
6227          * Mappings not reachable via the current cr3 or the prev_roots will be
6228          * synced when switching to that cr3, so nothing needs to be done here
6229          * for them.
6230          */
6231 }
6232
6233 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
6234                        int tdp_max_root_level, int tdp_huge_page_level)
6235 {
6236         tdp_enabled = enable_tdp;
6237         tdp_root_level = tdp_forced_root_level;
6238         max_tdp_level = tdp_max_root_level;
6239
6240 #ifdef CONFIG_X86_64
6241         tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
6242 #endif
6243         /*
6244          * max_huge_page_level reflects KVM's MMU capabilities irrespective
6245          * of kernel support, e.g. KVM may be capable of using 1GB pages when
6246          * the kernel is not.  But, KVM never creates a page size greater than
6247          * what is used by the kernel for any given HVA, i.e. the kernel's
6248          * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
6249          */
6250         if (tdp_enabled)
6251                 max_huge_page_level = tdp_huge_page_level;
6252         else if (boot_cpu_has(X86_FEATURE_GBPAGES))
6253                 max_huge_page_level = PG_LEVEL_1G;
6254         else
6255                 max_huge_page_level = PG_LEVEL_2M;
6256 }
6257 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
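/*
 * A minimal sketch of the expected call from vendor hardware setup; the
 * argument values below are illustrative only:
 *
 *	kvm_configure_mmu(enable_tdp, 0, max_tdp_level, PG_LEVEL_1G);
 *
 * Passing 0 for tdp_forced_root_level lets kvm_mmu_get_tdp_level() choose
 * the root level per vCPU from max_tdp_level and the vCPU's MAXPHYADDR.
 */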
6258
6259 static void free_mmu_pages(struct kvm_mmu *mmu)
6260 {
6261         if (!tdp_enabled && mmu->pae_root)
6262                 set_memory_encrypted((unsigned long)mmu->pae_root, 1);
6263         free_page((unsigned long)mmu->pae_root);
6264         free_page((unsigned long)mmu->pml4_root);
6265         free_page((unsigned long)mmu->pml5_root);
6266 }
6267
6268 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
6269 {
6270         struct page *page;
6271         int i;
6272
6273         mmu->root.hpa = INVALID_PAGE;
6274         mmu->root.pgd = 0;
6275         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
6276                 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
6277
6278         /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
6279         if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
6280                 return 0;
6281
6282         /*
6283          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
6284          * while the PDP table is a per-vCPU construct that's allocated at MMU
6285          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
6286          * x86_64.  Therefore we need to allocate the PDP table in the first
6287          * 4GB of memory, which happens to fit the DMA32 zone.  TDP paging
6288          * generally doesn't use PAE paging and can skip allocating the PDP
6289          * table.  The main exception, handled here, is SVM's 32-bit NPT.  The
6290          * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
6291          * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
6292          */
6293         if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
6294                 return 0;
6295
6296         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
6297         if (!page)
6298                 return -ENOMEM;
6299
6300         mmu->pae_root = page_address(page);
6301
6302         /*
6303          * CR3 is only 32 bits when PAE paging is used, thus it's impossible to
6304          * get the CPU to treat the PDPTEs as encrypted.  Decrypt the page so
6305          * that KVM's writes and the CPU's reads get along.  Note, this is
6306          * only necessary when using shadow paging, as 64-bit NPT can get at
6307          * the C-bit even when shadowing 32-bit NPT, and SME isn't supported
6308          * by 32-bit kernels (when KVM itself uses 32-bit NPT).
6309          */
6310         if (!tdp_enabled)
6311                 set_memory_decrypted((unsigned long)mmu->pae_root, 1);
6312         else
6313                 WARN_ON_ONCE(shadow_me_value);
6314
6315         for (i = 0; i < 4; ++i)
6316                 mmu->pae_root[i] = INVALID_PAE_ROOT;
6317
6318         return 0;
6319 }
6320
6321 int kvm_mmu_create(struct kvm_vcpu *vcpu)
6322 {
6323         int ret;
6324
6325         vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
6326         vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
6327
6328         vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
6329         vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
6330
6331         vcpu->arch.mmu_shadow_page_cache.init_value =
6332                 SHADOW_NONPRESENT_VALUE;
6333         if (!vcpu->arch.mmu_shadow_page_cache.init_value)
6334                 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
6335
6336         vcpu->arch.mmu = &vcpu->arch.root_mmu;
6337         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
6338
6339         ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
6340         if (ret)
6341                 return ret;
6342
6343         ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
6344         if (ret)
6345                 goto fail_allocate_root;
6346
6347         return ret;
6348  fail_allocate_root:
6349         free_mmu_pages(&vcpu->arch.guest_mmu);
6350         return ret;
6351 }
6352
6353 #define BATCH_ZAP_PAGES 10
6354 static void kvm_zap_obsolete_pages(struct kvm *kvm)
6355 {
6356         struct kvm_mmu_page *sp, *node;
6357         int nr_zapped, batch = 0;
6358         LIST_HEAD(invalid_list);
6359         bool unstable;
6360
6361         lockdep_assert_held(&kvm->slots_lock);
6362
6363 restart:
6364         list_for_each_entry_safe_reverse(sp, node,
6365               &kvm->arch.active_mmu_pages, link) {
6366                 /*
6367                  * No obsolete valid page exists before a newly created page
6368                  * since active_mmu_pages is a FIFO list.
6369                  */
6370                 if (!is_obsolete_sp(kvm, sp))
6371                         break;
6372
6373                 /*
6374                  * Invalid pages should never land back on the list of active
6375                  * pages.  Skip the bogus page, otherwise we'll get stuck in an
6376                  * infinite loop if the page gets put back on the list (again).
6377                  */
6378                 if (WARN_ON_ONCE(sp->role.invalid))
6379                         continue;
6380
6381                 /*
6382                  * No need to flush the TLB since we're only zapping shadow
6383          * pages with an obsolete generation number and all vCPUs have
6384                  * loaded a new root, i.e. the shadow pages being zapped cannot
6385                  * be in active use by the guest.
6386                  */
6387                 if (batch >= BATCH_ZAP_PAGES &&
6388                     cond_resched_rwlock_write(&kvm->mmu_lock)) {
6389                         batch = 0;
6390                         goto restart;
6391                 }
6392
6393                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
6394                                 &invalid_list, &nr_zapped);
6395                 batch += nr_zapped;
6396
6397                 if (unstable)
6398                         goto restart;
6399         }
6400
6401         /*
6402          * Kick all vCPUs (via remote TLB flush) before freeing the page tables
6403          * to ensure KVM is not in the middle of a lockless shadow page table
6404          * walk, which may reference the pages.  The remote TLB flush itself is
6405          * not required and is simply a convenient way to kick vCPUs as needed.
6406          * KVM performs a local TLB flush when allocating a new root (see
6407          * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are
6408          * running with an obsolete MMU.
6409          */
6410         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6411 }
6412
6413 /*
6414  * Fast-invalidate all shadow pages, using a lock-break technique to zap
6415  * obsolete pages.
6416  *
6417  * This is required when a memslot is being deleted or the VM is being
6418  * destroyed; in those cases, KVM must ensure the MMU does not use any
6419  * resource of the slot being deleted (or, on VM destruction, of any slot)
6420  * after this function returns.
6421  */
6422 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
6423 {
6424         lockdep_assert_held(&kvm->slots_lock);
6425
6426         write_lock(&kvm->mmu_lock);
6427         trace_kvm_mmu_zap_all_fast(kvm);
6428
6429         /*
6430          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
6431          * held for the entire duration of zapping obsolete pages, it's
6432          * impossible for there to be multiple invalid generations associated
6433          * with *valid* shadow pages at any given time, i.e. there is exactly
6434          * one valid generation and (at most) one invalid generation.
6435          */
6436         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
6437
6438         /*
6439          * In order to ensure all vCPUs drop their soon-to-be invalid roots,
6440          * invalidating TDP MMU roots must be done while holding mmu_lock for
6441          * write and in the same critical section as making the reload request,
6442          * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
6443          */
6444         if (tdp_mmu_enabled)
6445                 kvm_tdp_mmu_invalidate_all_roots(kvm);
6446
6447         /*
6448          * Notify all vcpus to reload their shadow page tables and flush their
6449          * TLBs.  All vcpus will then switch to the new shadow page tables with
6450          * the new mmu_valid_gen.
6451          *
6452          * Note: this must be done under the protection of mmu_lock; otherwise,
6453          * a vcpu could purge a shadow page but miss the TLB flush.
6454          */
6455         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
6456
6457         kvm_zap_obsolete_pages(kvm);
6458
6459         write_unlock(&kvm->mmu_lock);
6460
6461         /*
6462          * Zap the invalidated TDP MMU roots; all SPTEs must be dropped before
6463          * returning to the caller, e.g. if the zap is in response to a memslot
6464          * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
6465          * associated with the deleted memslot once the update completes.
6466          * Deferring the zap until the final reference to the root is put would
6467          * lead to use-after-free.
6468          */
6469         if (tdp_mmu_enabled)
6470                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
6471 }
6472
6473 void kvm_mmu_init_vm(struct kvm *kvm)
6474 {
6475         kvm->arch.shadow_mmio_value = shadow_mmio_value;
6476         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6477         INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
6478         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
6479
6480         if (tdp_mmu_enabled)
6481                 kvm_mmu_init_tdp_mmu(kvm);
6482
6483         kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
6484         kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
6485
6486         kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
6487
6488         kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
6489         kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
6490 }
6491
6492 static void mmu_free_vm_memory_caches(struct kvm *kvm)
6493 {
6494         kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
6495         kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
6496         kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
6497 }
6498
6499 void kvm_mmu_uninit_vm(struct kvm *kvm)
6500 {
6501         if (tdp_mmu_enabled)
6502                 kvm_mmu_uninit_tdp_mmu(kvm);
6503
6504         mmu_free_vm_memory_caches(kvm);
6505 }
6506
6507 static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6508 {
6509         const struct kvm_memory_slot *memslot;
6510         struct kvm_memslots *slots;
6511         struct kvm_memslot_iter iter;
6512         bool flush = false;
6513         gfn_t start, end;
6514         int i;
6515
6516         if (!kvm_memslots_have_rmaps(kvm))
6517                 return flush;
6518
6519         for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
6520                 slots = __kvm_memslots(kvm, i);
6521
6522                 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6523                         memslot = iter.slot;
6524                         start = max(gfn_start, memslot->base_gfn);
6525                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
6526                         if (WARN_ON_ONCE(start >= end))
6527                                 continue;
6528
6529                         flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
6530                                                          end, true, flush);
6531                 }
6532         }
6533
6534         return flush;
6535 }
6536
6537 /*
6538  * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
6539  * (not including it)
6540  */
6541 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6542 {
6543         bool flush;
6544
6545         if (WARN_ON_ONCE(gfn_end <= gfn_start))
6546                 return;
6547
6548         write_lock(&kvm->mmu_lock);
6549
6550         kvm_mmu_invalidate_begin(kvm);
6551
6552         kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end);
6553
6554         flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
6555
6556         if (tdp_mmu_enabled)
6557                 flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
6558
6559         if (flush)
6560                 kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
6561
6562         kvm_mmu_invalidate_end(kvm);
6563
6564         write_unlock(&kvm->mmu_lock);
6565 }
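/*
 * Callers are expected to use kvm_zap_gfn_range() when the effective
 * attributes of a GFN range may have changed underneath existing SPTEs
 * (e.g. guest memory-type changes), so that subsequent faults rebuild the
 * mappings with up-to-date attributes.
 */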
6566
6567 static bool slot_rmap_write_protect(struct kvm *kvm,
6568                                     struct kvm_rmap_head *rmap_head,
6569                                     const struct kvm_memory_slot *slot)
6570 {
6571         return rmap_write_protect(rmap_head, false);
6572 }
6573
6574 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
6575                                       const struct kvm_memory_slot *memslot,
6576                                       int start_level)
6577 {
6578         if (kvm_memslots_have_rmaps(kvm)) {
6579                 write_lock(&kvm->mmu_lock);
6580                 walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
6581                                 start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
6582                 write_unlock(&kvm->mmu_lock);
6583         }
6584
6585         if (tdp_mmu_enabled) {
6586                 read_lock(&kvm->mmu_lock);
6587                 kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
6588                 read_unlock(&kvm->mmu_lock);
6589         }
6590 }
6591
6592 static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
6593 {
6594         return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
6595 }
6596
6597 static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6598 {
6599         if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
6600                 return true;
6601
6602         /*
6603          * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
6604          * to split a single huge page. Calculating how many are actually needed
6605          * is possible but not worth the complexity.
6606          */
6607         return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
6608                need_topup(&kvm->arch.split_page_header_cache, 1) ||
6609                need_topup(&kvm->arch.split_shadow_page_cache, 1);
6610 }
6611
6612 static int topup_split_caches(struct kvm *kvm)
6613 {
6614         /*
6615          * Allocating rmap list entries when splitting huge pages for nested
6616          * MMUs is uncommon as KVM needs to use a list if and only if there is
6617          * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
6618          * aliased by multiple L2 gfns and/or from multiple nested roots with
6619          * different roles.  Aliasing gfns when using TDP is atypical for VMMs;
6620          * a few gfns are often aliased during boot, e.g. when remapping BIOS,
6621          * but aliasing rarely occurs post-boot or for many gfns.  If there is
6622          * only one rmap entry, rmap->val points directly at that one entry and
6623          * doesn't need to allocate a list.  Buffer the cache by the default
6624          * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
6625          * encounters an aliased gfn or two.
6626          */
6627         const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6628                              KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
6629         int r;
6630
6631         lockdep_assert_held(&kvm->slots_lock);
6632
6633         r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
6634                                          SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6635         if (r)
6636                 return r;
6637
6638         r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1);
6639         if (r)
6640                 return r;
6641
6642         return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
6643 }
6644
6645 static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
6646 {
6647         struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6648         struct shadow_page_caches caches = {};
6649         union kvm_mmu_page_role role;
6650         unsigned int access;
6651         gfn_t gfn;
6652
6653         gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6654         access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
6655
6656         /*
6657          * Note, huge page splitting always uses direct shadow pages, regardless
6658          * of whether the huge page itself is mapped by a direct or indirect
6659          * shadow page, since the huge page region itself is being directly
6660          * mapped with smaller pages.
6661          */
6662         role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
6663
6664         /* Direct SPs do not require a shadowed_info_cache. */
6665         caches.page_header_cache = &kvm->arch.split_page_header_cache;
6666         caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6667
6668         /* Safe to pass NULL for vCPU since requesting a direct SP. */
6669         return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
6670 }
6671
6672 static void shadow_mmu_split_huge_page(struct kvm *kvm,
6673                                        const struct kvm_memory_slot *slot,
6674                                        u64 *huge_sptep)
6675
6676 {
6677         struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6678         u64 huge_spte = READ_ONCE(*huge_sptep);
6679         struct kvm_mmu_page *sp;
6680         bool flush = false;
6681         u64 *sptep, spte;
6682         gfn_t gfn;
6683         int index;
6684
6685         sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6686
6687         for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
6688                 sptep = &sp->spt[index];
6689                 gfn = kvm_mmu_page_get_gfn(sp, index);
6690
6691                 /*
6692                  * The SP may already have populated SPTEs, e.g. if this huge
6693                  * page is aliased by multiple sptes with the same access
6694                  * permissions. These entries are guaranteed to map the same
6695                  * gfn-to-pfn translation since the SP is direct, so no need to
6696                  * modify them.
6697                  *
6698                  * However, if a given SPTE points to a lower level page table,
6699                  * that lower level page table may only be partially populated.
6700                  * Installing such SPTEs would effectively unmap a portion of the
6701                  * huge page. Unmapping guest memory always requires a TLB flush
6702                  * since a subsequent operation on the unmapped regions would
6703                  * fail to detect the need to flush.
6704                  */
6705                 if (is_shadow_present_pte(*sptep)) {
6706                         flush |= !is_last_spte(*sptep, sp->role.level);
6707                         continue;
6708                 }
6709
6710                 spte = make_small_spte(kvm, huge_spte, sp->role, index);
6711                 mmu_spte_set(sptep, spte);
6712                 __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
6713         }
6714
6715         __link_shadow_page(kvm, cache, huge_sptep, sp, flush);
6716 }
6717
6718 static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6719                                           const struct kvm_memory_slot *slot,
6720                                           u64 *huge_sptep)
6721 {
6722         struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
6723         int level, r = 0;
6724         gfn_t gfn;
6725         u64 spte;
6726
6727         /* Grab information for the tracepoint before dropping the MMU lock. */
6728         gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
6729         level = huge_sp->role.level;
6730         spte = *huge_sptep;
6731
6732         if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6733                 r = -ENOSPC;
6734                 goto out;
6735         }
6736
6737         if (need_topup_split_caches_or_resched(kvm)) {
6738                 write_unlock(&kvm->mmu_lock);
6739                 cond_resched();
6740                 /*
6741                  * If the topup succeeds, return -EAGAIN to indicate that the
6742                  * rmap iterator should be restarted because the MMU lock was
6743                  * dropped.
6744                  */
6745                 r = topup_split_caches(kvm) ?: -EAGAIN;
6746                 write_lock(&kvm->mmu_lock);
6747                 goto out;
6748         }
6749
6750         shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6751
6752 out:
6753         trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
6754         return r;
6755 }
6756
6757 static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6758                                             struct kvm_rmap_head *rmap_head,
6759                                             const struct kvm_memory_slot *slot)
6760 {
6761         struct rmap_iterator iter;
6762         struct kvm_mmu_page *sp;
6763         u64 *huge_sptep;
6764         int r;
6765
6766 restart:
6767         for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6768                 sp = sptep_to_sp(huge_sptep);
6769
6770                 /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */
6771                 if (WARN_ON_ONCE(!sp->role.guest_mode))
6772                         continue;
6773
6774                 /* The rmaps should never contain non-leaf SPTEs. */
6775                 if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6776                         continue;
6777
6778                 /* SPs with level > PG_LEVEL_4K should never be unsync. */
6779                 if (WARN_ON_ONCE(sp->unsync))
6780                         continue;
6781
6782                 /* Don't bother splitting huge pages on invalid SPs. */
6783                 if (sp->role.invalid)
6784                         continue;
6785
6786                 r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6787
6788                 /*
6789                  * The split succeeded or needs to be retried because the MMU
6790                  * lock was dropped. Either way, restart the iterator to get it
6791                  * back into a consistent state.
6792                  */
6793                 if (!r || r == -EAGAIN)
6794                         goto restart;
6795
6796                 /* The split failed and shouldn't be retried (e.g. -ENOMEM). */
6797                 break;
6798         }
6799
6800         return false;
6801 }
6802
6803 static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6804                                                 const struct kvm_memory_slot *slot,
6805                                                 gfn_t start, gfn_t end,
6806                                                 int target_level)
6807 {
6808         int level;
6809
6810         /*
6811          * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
6812          * down to the target level. This ensures pages are recursively split
6813          * all the way to the target level. There's no need to split pages
6814          * already at the target level.
6815          */
6816         for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
6817                 __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
6818                                   level, level, start, end - 1, true, true, false);
6819 }
6820
6821 /* Must be called with the mmu_lock held in write-mode. */
6822 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6823                                    const struct kvm_memory_slot *memslot,
6824                                    u64 start, u64 end,
6825                                    int target_level)
6826 {
6827         if (!tdp_mmu_enabled)
6828                 return;
6829
6830         if (kvm_memslots_have_rmaps(kvm))
6831                 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6832
6833         kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
6834
6835         /*
6836          * A TLB flush is unnecessary at this point for the same reasons as in
6837          * kvm_mmu_slot_try_split_huge_pages().
6838          */
6839 }
6840
6841 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
6842                                         const struct kvm_memory_slot *memslot,
6843                                         int target_level)
6844 {
6845         u64 start = memslot->base_gfn;
6846         u64 end = start + memslot->npages;
6847
6848         if (!tdp_mmu_enabled)
6849                 return;
6850
6851         if (kvm_memslots_have_rmaps(kvm)) {
6852                 write_lock(&kvm->mmu_lock);
6853                 kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
6854                 write_unlock(&kvm->mmu_lock);
6855         }
6856
6857         read_lock(&kvm->mmu_lock);
6858         kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
6859         read_unlock(&kvm->mmu_lock);
6860
6861         /*
6862          * No TLB flush is necessary here. KVM will flush TLBs after
6863          * write-protecting and/or clearing dirty on the newly split SPTEs to
6864          * ensure that guest writes are reflected in the dirty log before the
6865          * ioctl to enable dirty logging on this memslot completes. Since the
6866          * split SPTEs retain the write and dirty bits of the huge SPTE, it is
6867          * safe for KVM to decide if a TLB flush is necessary based on the split
6868          * SPTEs.
6869          */
6870 }
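/*
 * kvm_mmu_try_split_huge_pages() and kvm_mmu_slot_try_split_huge_pages()
 * are the eager page splitting entry points: when dirty logging is enabled
 * on a memslot, huge pages can be proactively split (typically down to 4K)
 * so that writes during dirty tracking don't each have to fault in order to
 * split the huge mapping on demand.
 */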
6871
6872 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6873                                          struct kvm_rmap_head *rmap_head,
6874                                          const struct kvm_memory_slot *slot)
6875 {
6876         u64 *sptep;
6877         struct rmap_iterator iter;
6878         int need_tlb_flush = 0;
6879         struct kvm_mmu_page *sp;
6880
6881 restart:
6882         for_each_rmap_spte(rmap_head, &iter, sptep) {
6883                 sp = sptep_to_sp(sptep);
6884
6885                 /*
6886                  * Huge page mappings cannot be created for indirect shadow pages,
6887                  * which are found in the lowest rmap level (level = 1) when not
6888                  * using TDP; such shadow pages are kept in sync with the guest's
6889                  * page table, and the guest is using 4K mappings if the indirect
6890                  * sp has level = 1.
6891                  */
6892                 if (sp->role.direct &&
6893                     sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
6894                         kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
6895
6896                         if (kvm_available_flush_remote_tlbs_range())
6897                                 kvm_flush_remote_tlbs_sptep(kvm, sptep);
6898                         else
6899                                 need_tlb_flush = 1;
6900
6901                         goto restart;
6902                 }
6903         }
6904
6905         return need_tlb_flush;
6906 }
6907 EXPORT_SYMBOL_GPL(kvm_zap_gfn_range);
6908
6909 static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6910                                            const struct kvm_memory_slot *slot)
6911 {
6912         /*
6913          * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
6914          * pages that are already mapped at the maximum hugepage level.
6915          */
6916         if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
6917                             PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
6918                 kvm_flush_remote_tlbs_memslot(kvm, slot);
6919 }
6920
6921 void kvm_mmu_recover_huge_pages(struct kvm *kvm,
6922                                 const struct kvm_memory_slot *slot)
6923 {
6924         if (kvm_memslots_have_rmaps(kvm)) {
6925                 write_lock(&kvm->mmu_lock);
6926                 kvm_rmap_zap_collapsible_sptes(kvm, slot);
6927                 write_unlock(&kvm->mmu_lock);
6928         }
6929
6930         if (tdp_mmu_enabled) {
6931                 read_lock(&kvm->mmu_lock);
6932                 kvm_tdp_mmu_recover_huge_pages(kvm, slot);
6933                 read_unlock(&kvm->mmu_lock);
6934         }
6935 }
6936
6937 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6938                                    const struct kvm_memory_slot *memslot)
6939 {
6940         if (kvm_memslots_have_rmaps(kvm)) {
6941                 write_lock(&kvm->mmu_lock);
6942                 /*
6943                  * Clear dirty bits only on 4k SPTEs since the legacy MMU only
6944                  * supports dirty logging at a 4k granularity.
6945                  */
6946                 walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
6947                 write_unlock(&kvm->mmu_lock);
6948         }
6949
6950         if (tdp_mmu_enabled) {
6951                 read_lock(&kvm->mmu_lock);
6952                 kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
6953                 read_unlock(&kvm->mmu_lock);
6954         }
6955
6956         /*
6957          * The caller will flush the TLBs after this function returns.
6958          *
6959          * It's also safe to flush TLBs out of mmu lock here as currently this
6960          * function is only used for dirty logging, in which case flushing TLB
6961          * out of mmu lock also guarantees no dirty pages will be lost in
6962          * dirty_bitmap.
6963          */
6964 }
6965
6966 static void kvm_mmu_zap_all(struct kvm *kvm)
6967 {
6968         struct kvm_mmu_page *sp, *node;
6969         LIST_HEAD(invalid_list);
6970         int ign;
6971
6972         write_lock(&kvm->mmu_lock);
6973 restart:
6974         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6975                 if (WARN_ON_ONCE(sp->role.invalid))
6976                         continue;
6977                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6978                         goto restart;
6979                 if (cond_resched_rwlock_write(&kvm->mmu_lock))
6980                         goto restart;
6981         }
6982
6983         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6984
6985         if (tdp_mmu_enabled)
6986                 kvm_tdp_mmu_zap_all(kvm);
6987
6988         write_unlock(&kvm->mmu_lock);
6989 }
6990
6991 void kvm_arch_flush_shadow_all(struct kvm *kvm)
6992 {
6993         kvm_mmu_zap_all(kvm);
6994 }
6995
6996 static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm,
6997                                                 struct kvm_memory_slot *slot,
6998                                                 bool flush)
6999 {
7000         LIST_HEAD(invalid_list);
7001         unsigned long i;
7002
7003         if (list_empty(&kvm->arch.active_mmu_pages))
7004                 goto out_flush;
7005
7006         /*
7007          * Since accounting information is stored in struct kvm_arch_memory_slot,
7008          * all MMU pages that are shadowing guest PTEs must be zapped before the
7009          * memslot is deleted, as freeing such pages after the memslot is freed
7010          * will result in use-after-free, e.g. in unaccount_shadowed().
7011          */
7012         for (i = 0; i < slot->npages; i++) {
7013                 struct kvm_mmu_page *sp;
7014                 gfn_t gfn = slot->base_gfn + i;
7015
7016                 for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn)
7017                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
7018
7019                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
7020                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7021                         flush = false;
7022                         cond_resched_rwlock_write(&kvm->mmu_lock);
7023                 }
7024         }
7025
7026 out_flush:
7027         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7028 }
7029
7030 static void kvm_mmu_zap_memslot(struct kvm *kvm,
7031                                 struct kvm_memory_slot *slot)
7032 {
7033         struct kvm_gfn_range range = {
7034                 .slot = slot,
7035                 .start = slot->base_gfn,
7036                 .end = slot->base_gfn + slot->npages,
7037                 .may_block = true,
7038         };
7039         bool flush;
7040
7041         write_lock(&kvm->mmu_lock);
7042         flush = kvm_unmap_gfn_range(kvm, &range);
7043         kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush);
7044         write_unlock(&kvm->mmu_lock);
7045 }
7046
7047 static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
7048 {
7049         return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
7050                kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
7051 }
7052
7053 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7054                                    struct kvm_memory_slot *slot)
7055 {
7056         if (kvm_memslot_flush_zap_all(kvm))
7057                 kvm_mmu_zap_all_fast(kvm);
7058         else
7059                 kvm_mmu_zap_memslot(kvm, slot);
7060 }
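/*
 * Editor's note -- illustrative sketch, not part of upstream mmu.c: a VMM
 * that prefers per-slot zapping on memslot deletion can disable the quirk
 * tested above via KVM_CAP_DISABLE_QUIRKS2.  Roughly, given an existing VM
 * fd and with error handling omitted:
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_DISABLE_QUIRKS2,
 *		.args[0] = KVM_X86_QUIRK_SLOT_ZAP_ALL,
 *	};
 *
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 *
 * With the quirk disabled (or for non-default VM types),
 * kvm_memslot_flush_zap_all() returns false and only the deleted slot's
 * SPTEs are zapped via kvm_mmu_zap_memslot().
 */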
7061
7062 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
7063 {
7064         WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
7065
7066         gen &= MMIO_SPTE_GEN_MASK;
7067
7068         /*
7069          * Generation numbers are incremented in multiples of the number of
7070          * address spaces in order to provide unique generations across all
7071          * address spaces.  Strip what is effectively the address space
7072          * modifier prior to checking for a wrap of the MMIO generation so
7073          * that a wrap in any address space is detected.
7074          */
7075         gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
7076
7077         /*
7078          * The very rare case: if the MMIO generation number has wrapped,
7079          * zap all shadow pages.
7080          */
7081         if (unlikely(gen == 0)) {
7082                 kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
7083                 kvm_mmu_zap_all_fast(kvm);
7084         }
7085 }
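/*
 * Editor's worked example (not upstream text): with
 * kvm_arch_nr_memslot_as_ids() == 2 (e.g. SMM enabled), the mask above
 * reduces to gen &= ~1ull, i.e. the low bit acts as the address space
 * modifier.  A post-mask value of 0 -- reached from raw masked generations
 * 0 or 1 -- therefore indicates a wrap in either address space and
 * triggers the kvm_mmu_zap_all_fast() fallback.
 */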
7086
7087 static void mmu_destroy_caches(void)
7088 {
7089         kmem_cache_destroy(pte_list_desc_cache);
7090         kmem_cache_destroy(mmu_page_header_cache);
7091 }
7092
7093 static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
7094 {
7095         if (nx_hugepage_mitigation_hard_disabled)
7096                 return sysfs_emit(buffer, "never\n");
7097
7098         return param_get_bool(buffer, kp);
7099 }
7100
7101 static bool get_nx_auto_mode(void)
7102 {
7103         /* Return true when CPU has the bug, and mitigations are ON */
7104         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
7105 }
7106
7107 static void __set_nx_huge_pages(bool val)
7108 {
7109         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
7110 }
7111
7112 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
7113 {
7114         bool old_val = nx_huge_pages;
7115         bool new_val;
7116
7117         if (nx_hugepage_mitigation_hard_disabled)
7118                 return -EPERM;
7119
7120         /* In "auto" mode deploy workaround only if CPU has the bug. */
7121         if (sysfs_streq(val, "off")) {
7122                 new_val = 0;
7123         } else if (sysfs_streq(val, "force")) {
7124                 new_val = 1;
7125         } else if (sysfs_streq(val, "auto")) {
7126                 new_val = get_nx_auto_mode();
7127         } else if (sysfs_streq(val, "never")) {
7128                 new_val = 0;
7129
7130                 mutex_lock(&kvm_lock);
7131                 if (!list_empty(&vm_list)) {
7132                         mutex_unlock(&kvm_lock);
7133                         return -EBUSY;
7134                 }
7135                 nx_hugepage_mitigation_hard_disabled = true;
7136                 mutex_unlock(&kvm_lock);
7137         } else if (kstrtobool(val, &new_val) < 0) {
7138                 return -EINVAL;
7139         }
7140
7141         __set_nx_huge_pages(new_val);
7142
7143         if (new_val != old_val) {
7144                 struct kvm *kvm;
7145
7146                 mutex_lock(&kvm_lock);
7147
7148                 list_for_each_entry(kvm, &vm_list, vm_list) {
7149                         mutex_lock(&kvm->slots_lock);
7150                         kvm_mmu_zap_all_fast(kvm);
7151                         mutex_unlock(&kvm->slots_lock);
7152
7153                         vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
7154                 }
7155                 mutex_unlock(&kvm_lock);
7156         }
7157
7158         return 0;
7159 }
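/*
 * Editor's note (usage sketch, not upstream text): this setter backs the
 * writable nx_huge_pages module parameter, conventionally exposed at
 * /sys/module/kvm/parameters/nx_huge_pages, e.g.:
 *
 *	echo force > /sys/module/kvm/parameters/nx_huge_pages
 *	echo off   > /sys/module/kvm/parameters/nx_huge_pages
 *	echo auto  > /sys/module/kvm/parameters/nx_huge_pages
 *	echo never > /sys/module/kvm/parameters/nx_huge_pages
 *
 * Plain boolean strings accepted by kstrtobool() also work.  "never" is
 * only accepted while no VMs exist (-EBUSY otherwise) and is one-way: once
 * set, later writes fail with -EPERM.  Toggling the value zaps all roots
 * in every VM so the new NX behavior takes effect immediately.
 */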
7160
7161 /*
7162  * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
7163  * its default value of -1 is technically undefined behavior for a boolean.
7164  * Forward the module init call to SPTE code so that it too can handle module
7165  * params that need to be resolved/snapshot.
7166  */
7167 void __init kvm_mmu_x86_module_init(void)
7168 {
7169         if (nx_huge_pages == -1)
7170                 __set_nx_huge_pages(get_nx_auto_mode());
7171
7172         /*
7173          * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
7174          * TDP MMU is actually enabled is determined in kvm_configure_mmu()
7175          * when the vendor module is loaded.
7176          */
7177         tdp_mmu_allowed = tdp_mmu_enabled;
7178
7179         kvm_mmu_spte_module_init();
7180 }
7181
7182 /*
7183  * The bulk of the MMU initialization is deferred until the vendor module is
7184  * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
7185  * to be reset when a potentially different vendor module is loaded.
7186  */
7187 int kvm_mmu_vendor_module_init(void)
7188 {
7189         int ret = -ENOMEM;
7190
7191         /*
7192          * MMU roles use union aliasing which is, generally speaking, an
7193          * undefined behavior. However, we supposedly know how compilers behave
7194          * and the current status quo is unlikely to change. Guardians below are
7195          * supposed to let us know if the assumption becomes false.
7196          */
7197         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
7198         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
7199         BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
7200
7201         kvm_mmu_reset_all_pte_masks();
7202
7203         pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
7204         if (!pte_list_desc_cache)
7205                 goto out;
7206
7207         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
7208                                                   sizeof(struct kvm_mmu_page),
7209                                                   0, SLAB_ACCOUNT, NULL);
7210         if (!mmu_page_header_cache)
7211                 goto out;
7212
7213         return 0;
7214
7215 out:
7216         mmu_destroy_caches();
7217         return ret;
7218 }
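/*
 * Editor's illustration (sketch; relies on the union definitions in
 * arch/x86/include/asm/kvm_host.h): the BUILD_BUG_ONs above protect the
 * union-aliasing idiom in which a role bitfield is compared or hashed
 * through its integer view, roughly:
 *
 *	union kvm_mmu_page_role a, b;
 *
 *	if (a.word == b.word)	// whole-role compare via the u32 alias
 *		...
 *
 * If the bitfields ever grew past the aliased u32/u64, such compares would
 * silently drop bits, so a size mismatch is caught at build time instead.
 */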
7219
7220 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
7221 {
7222         kvm_mmu_unload(vcpu);
7223         free_mmu_pages(&vcpu->arch.root_mmu);
7224         free_mmu_pages(&vcpu->arch.guest_mmu);
7225         mmu_free_memory_caches(vcpu);
7226 }
7227
7228 void kvm_mmu_vendor_module_exit(void)
7229 {
7230         mmu_destroy_caches();
7231 }
7232
7233 /*
7234  * Calculate the effective recovery period, accounting for '0' meaning "let KVM
7235  * select a halving time of 1 hour".  Returns true if recovery is enabled.
7236  */
7237 static bool calc_nx_huge_pages_recovery_period(uint *period)
7238 {
7239         /*
7240          * Use READ_ONCE to get the params, this may be called outside of the
7241          * param setters, e.g. by the kthread to compute its next timeout.
7242          */
7243         bool enabled = READ_ONCE(nx_huge_pages);
7244         uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
7245
7246         if (!enabled || !ratio)
7247                 return false;
7248
7249         *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
7250         if (!*period) {
7251                 /* Make sure the period is not less than one second.  */
7252                 ratio = min(ratio, 3600u);
7253                 *period = 60 * 60 * 1000 / ratio;
7254         }
7255         return true;
7256 }
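/*
 * Editor's worked example (not upstream text): with the defaults of
 * nx_huge_pages_recovery_ratio == 60 and nx_huge_pages_recovery_period_ms
 * == 0, the computed period is 60 * 60 * 1000 / 60 = 60000 ms, i.e. one
 * recovery pass per minute.  The min(ratio, 3600u) clamp bounds the
 * auto-computed period at 3,600,000 / 3600 = 1000 ms, matching the
 * "not less than one second" comment above.
 */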
7257
7258 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
7259 {
7260         bool was_recovery_enabled, is_recovery_enabled;
7261         uint old_period, new_period;
7262         int err;
7263
7264         if (nx_hugepage_mitigation_hard_disabled)
7265                 return -EPERM;
7266
7267         was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
7268
7269         err = param_set_uint(val, kp);
7270         if (err)
7271                 return err;
7272
7273         is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
7274
7275         if (is_recovery_enabled &&
7276             (!was_recovery_enabled || old_period > new_period)) {
7277                 struct kvm *kvm;
7278
7279                 mutex_lock(&kvm_lock);
7280
7281                 list_for_each_entry(kvm, &vm_list, vm_list)
7282                         vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
7283
7284                 mutex_unlock(&kvm_lock);
7285         }
7286
7287         return err;
7288 }
7289
7290 static void kvm_recover_nx_huge_pages(struct kvm *kvm)
7291 {
7292         unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
7293         struct kvm_memory_slot *slot;
7294         int rcu_idx;
7295         struct kvm_mmu_page *sp;
7296         unsigned int ratio;
7297         LIST_HEAD(invalid_list);
7298         bool flush = false;
7299         ulong to_zap;
7300
7301         rcu_idx = srcu_read_lock(&kvm->srcu);
7302         write_lock(&kvm->mmu_lock);
7303
7304         /*
7305          * Zapping TDP MMU shadow pages, including the remote TLB flush, must
7306          * be done under RCU protection, because the pages are freed via RCU
7307          * callback.
7308          */
7309         rcu_read_lock();
7310
7311         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
7312         to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
7313         for ( ; to_zap; --to_zap) {
7314                 if (list_empty(&kvm->arch.possible_nx_huge_pages))
7315                         break;
7316
7317                 /*
7318                  * We use a separate list instead of just using active_mmu_pages
7319                  * because the number of shadow pages that can be replaced with an
7320                  * NX huge page is expected to be relatively small compared to
7321                  * the total number of shadow pages.  And because the TDP MMU
7322                  * doesn't use active_mmu_pages.
7323                  */
7324                 sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
7325                                       struct kvm_mmu_page,
7326                                       possible_nx_huge_page_link);
7327                 WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
7328                 WARN_ON_ONCE(!sp->role.direct);
7329
7330                 /*
7331                  * Unaccount and do not attempt to recover any NX Huge Pages
7332                  * that are being dirty tracked, as they would just be faulted
7333                  * back in as 4KiB pages. The NX Huge Pages in this slot will be
7334                  * recovered, along with all the other huge pages in the slot,
7335                  * when dirty logging is disabled.
7336                  *
7337                  * Since gfn_to_memslot() is relatively expensive, it helps to
7338                  * skip it if the test cannot possibly return true.  On the
7339                  * other hand, if any memslot has logging enabled, chances are
7340                  * good that all of them do, in which case unaccount_nx_huge_page()
7341                  * is much cheaper than zapping the page.
7342                  *
7343                  * If a memslot update is in progress, reading an incorrect value
7344                  * of kvm->nr_memslots_dirty_logging is not a problem: if it is
7345                  * becoming zero, gfn_to_memslot() will be done unnecessarily; if
7346                  * it is becoming nonzero, the page will be zapped unnecessarily.
7347                  * Either way, this only affects efficiency in racy situations,
7348                  * and not correctness.
7349                  */
7350                 slot = NULL;
7351                 if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
7352                         struct kvm_memslots *slots;
7353
7354                         slots = kvm_memslots_for_spte_role(kvm, sp->role);
7355                         slot = __gfn_to_memslot(slots, sp->gfn);
7356                         WARN_ON_ONCE(!slot);
7357                 }
7358
7359                 if (slot && kvm_slot_dirty_track_enabled(slot))
7360                         unaccount_nx_huge_page(kvm, sp);
7361                 else if (is_tdp_mmu_page(sp))
7362                         flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
7363                 else
7364                         kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
7365                 WARN_ON_ONCE(sp->nx_huge_page_disallowed);
7366
7367                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
7368                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7369                         rcu_read_unlock();
7370
7371                         cond_resched_rwlock_write(&kvm->mmu_lock);
7372                         flush = false;
7373
7374                         rcu_read_lock();
7375                 }
7376         }
7377         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
7378
7379         rcu_read_unlock();
7380
7381         write_unlock(&kvm->mmu_lock);
7382         srcu_read_unlock(&kvm->srcu, rcu_idx);
7383 }
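/*
 * Editor's worked example (not upstream text): with 1000 accumulated
 * nx_lpage_splits and the default recovery ratio of 60, a single pass sets
 * to_zap = DIV_ROUND_UP(1000, 60) = 17, so at most 17 shadow pages are
 * zapped (or merely unaccounted, when their slot is being dirty tracked)
 * before the pass ends and the worker sleeps again.
 */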
7384
7385 static void kvm_nx_huge_page_recovery_worker_kill(void *data)
7386 {
7387 }
7388
7389 static bool kvm_nx_huge_page_recovery_worker(void *data)
7390 {
7391         struct kvm *kvm = data;
7392         bool enabled;
7393         uint period;
7394         long remaining_time;
7395
7396         enabled = calc_nx_huge_pages_recovery_period(&period);
7397         if (!enabled)
7398                 return false;
7399
7400         remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
7401                 - get_jiffies_64();
7402         if (remaining_time > 0) {
7403                 schedule_timeout(remaining_time);
7404                 /* check for signals and come back */
7405                 return true;
7406         }
7407
7408         __set_current_state(TASK_RUNNING);
7409         kvm_recover_nx_huge_pages(kvm);
7410         kvm->arch.nx_huge_page_last = get_jiffies_64();
7411         return true;
7412 }
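/*
 * Editor's note (descriptive, based on the callers in this file): the
 * return value feeds the vhost_task loop -- returning false when recovery
 * is disabled lets the task sleep until something calls vhost_task_wake()
 * (see set_nx_huge_pages() and set_nx_huge_pages_recovery_param()), while
 * returning true after either the schedule_timeout() nap or a recovery
 * pass asks to be invoked again, so the worker effectively self-schedules
 * one pass per computed period.
 */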
7413
7414 int kvm_mmu_post_init_vm(struct kvm *kvm)
7415 {
7416         if (nx_hugepage_mitigation_hard_disabled)
7417                 return 0;
7418
7419         kvm->arch.nx_huge_page_last = get_jiffies_64();
7420         kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
7421                 kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
7422                 kvm, "kvm-nx-lpage-recovery");
7423
7424         if (!kvm->arch.nx_huge_page_recovery_thread)
7425                 return -ENOMEM;
7426
7427         vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
7428         return 0;
7429 }
7430
7431 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
7432 {
7433         if (kvm->arch.nx_huge_page_recovery_thread)
7434                 vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
7435 }
7436
7437 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
7438 bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
7439                                         struct kvm_gfn_range *range)
7440 {
7441         /*
7442          * Zap SPTEs even if the slot can't be mapped PRIVATE.  KVM x86 only
7443          * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM
7444          * can simply ignore such slots.  But if userspace is making memory
7445          * PRIVATE, then KVM must prevent the guest from accessing the memory
7446          * as shared.  And if userspace is making memory SHARED and this point
7447          * is reached, then at least one page within the range was previously
7448          * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
7449          * Zapping SPTEs in this case ensures KVM will reassess whether or not
7450          * a hugepage can be used for affected ranges.
7451          */
7452         if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
7453                 return false;
7454
7455         return kvm_unmap_gfn_range(kvm, range);
7456 }
7457
7458 static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
7459                                 int level)
7460 {
7461         return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
7462 }
7463
7464 static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
7465                                  int level)
7466 {
7467         lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
7468 }
7469
7470 static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
7471                                int level)
7472 {
7473         lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG;
7474 }
7475
7476 static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot,
7477                                gfn_t gfn, int level, unsigned long attrs)
7478 {
7479         const unsigned long start = gfn;
7480         const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
7481
7482         if (level == PG_LEVEL_2M)
7483                 return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs);
7484
7485         for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) {
7486                 if (hugepage_test_mixed(slot, gfn, level - 1) ||
7487                     attrs != kvm_get_memory_attributes(kvm, gfn))
7488                         return false;
7489         }
7490         return true;
7491 }
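/*
 * Editor's worked example (not upstream text): for level == PG_LEVEL_1G
 * the loop above walks the 1GiB-aligned range in 2MiB steps
 * (KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) == 512 gfns, 512 iterations) and
 * requires every 2MiB chunk to be both non-mixed at the 2MiB level and
 * tagged with exactly the requested attributes.  For level == PG_LEVEL_2M
 * the answer comes straight from kvm_range_has_memory_attributes() on the
 * 512 individual gfns.
 */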
7492
7493 bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
7494                                          struct kvm_gfn_range *range)
7495 {
7496         unsigned long attrs = range->arg.attributes;
7497         struct kvm_memory_slot *slot = range->slot;
7498         int level;
7499
7500         lockdep_assert_held_write(&kvm->mmu_lock);
7501         lockdep_assert_held(&kvm->slots_lock);
7502
7503         /*
7504          * Calculate which ranges can be mapped with hugepages even if the slot
7505          * can't map memory PRIVATE.  KVM mustn't create a SHARED hugepage over
7506          * a range that has PRIVATE GFNs, and conversely converting a range to
7507          * SHARED may now allow hugepages.
7508          */
7509         if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
7510                 return false;
7511
7512         /*
7513          * The sequence matters here: upper levels consume the result of lower
7514          * level's scanning.
7515          */
7516         for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
7517                 gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
7518                 gfn_t gfn = gfn_round_for_level(range->start, level);
7519
7520                 /* Process the head page if it straddles the range. */
7521                 if (gfn != range->start || gfn + nr_pages > range->end) {
7522                         /*
7523                          * Skip mixed tracking if the aligned gfn isn't covered
7524                          * by the memslot, as KVM can't use a hugepage due to the
7525                          * misaligned address regardless of memory attributes.
7526                          */
7527                         if (gfn >= slot->base_gfn &&
7528                             gfn + nr_pages <= slot->base_gfn + slot->npages) {
7529                                 if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
7530                                         hugepage_clear_mixed(slot, gfn, level);
7531                                 else
7532                                         hugepage_set_mixed(slot, gfn, level);
7533                         }
7534                         gfn += nr_pages;
7535                 }
7536
7537                 /*
7538                  * Pages entirely covered by the range are guaranteed to have
7539                  * only the attributes which were just set.
7540                  */
7541                 for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
7542                         hugepage_clear_mixed(slot, gfn, level);
7543
7544                 /*
7545                  * Process the last tail page if it straddles the range and is
7546                  * contained by the memslot.  Like the head page, KVM can't
7547                  * create a hugepage if the slot size is misaligned.
7548                  */
7549                 if (gfn < range->end &&
7550                     (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
7551                         if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
7552                                 hugepage_clear_mixed(slot, gfn, level);
7553                         else
7554                                 hugepage_set_mixed(slot, gfn, level);
7555                 }
7556         }
7557         return false;
7558 }
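/*
 * Editor's worked example (not upstream text): take a fully hugepage-
 * aligned slot and an attribute update over gfns [0x300, 0x900) at the
 * 2MiB level (nr_pages == 0x200):
 *
 *	head [0x200, 0x400): straddles the range, mixed flag recomputed
 *	                     via hugepage_has_attrs()
 *	body [0x400, 0x600),
 *	     [0x600, 0x800): entirely covered, unconditionally cleared
 *	tail [0x800, 0xa00): straddles the range, mixed flag recomputed
 *
 * The straddling pages must be rescanned because they contain gfns whose
 * attributes did not change in this update.
 */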
7559
7560 void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
7561                                             struct kvm_memory_slot *slot)
7562 {
7563         int level;
7564
7565         if (!kvm_arch_has_private_mem(kvm))
7566                 return;
7567
7568         for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
7569                 /*
7570                  * Don't bother tracking mixed attributes for pages that can't
7571                  * be huge due to alignment, i.e. process only pages that are
7572                  * entirely contained by the memslot.
7573                  */
7574                 gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
7575                 gfn_t start = gfn_round_for_level(slot->base_gfn, level);
7576                 gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
7577                 gfn_t gfn;
7578
7579                 if (start < slot->base_gfn)
7580                         start += nr_pages;
7581
7582                 /*
7583                  * Unlike setting attributes, every potential hugepage needs to
7584                  * be manually checked as the attributes may already be mixed.
7585                  */
7586                 for (gfn = start; gfn < end; gfn += nr_pages) {
7587                         unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
7588
7589                         if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
7590                                 hugepage_clear_mixed(slot, gfn, level);
7591                         else
7592                                 hugepage_set_mixed(slot, gfn, level);
7593                 }
7594         }
7595 }
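/*
 * Editor's worked example (not upstream text): for a slot covering gfns
 * [0x1100, 0x2100) at the 2MiB level (0x200 gfns per page), start rounds
 * down to 0x1000, is bumped to 0x1200 because it precedes base_gfn, and
 * end rounds down to 0x2000.  Only the seven fully contained 2MiB pages in
 * [0x1200, 0x2000) are scanned; the misaligned head and tail are skipped
 * since they can never be mapped huge regardless of attributes.
 */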
7596 #endif