[linux.git] arch/arm64/kvm/hyp/pgtable.c (Linux 6.14-rc3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
4  * No bombay mix was harmed in the writing of this file.
5  *
6  * Copyright (C) 2020 Google LLC
7  * Author: Will Deacon <[email protected]>
8  */
9
10 #include <linux/bitfield.h>
11 #include <asm/kvm_pgtable.h>
12 #include <asm/stage2_pgtable.h>
13
14
15 #define KVM_PTE_TYPE                    BIT(1)
16 #define KVM_PTE_TYPE_BLOCK              0
17 #define KVM_PTE_TYPE_PAGE               1
18 #define KVM_PTE_TYPE_TABLE              1
19
20 struct kvm_pgtable_walk_data {
21         struct kvm_pgtable_walker       *walker;
22
23         const u64                       start;
24         u64                             addr;
25         const u64                       end;
26 };
27
28 static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
29 {
30         return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
31 }
32
33 static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
34 {
35         return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
36 }
37
38 static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
39 {
40         u64 granule = kvm_granule_size(ctx->level);
41
42         if (!kvm_level_supports_block_mapping(ctx->level))
43                 return false;
44
45         if (granule > (ctx->end - ctx->addr))
46                 return false;
47
48         if (!IS_ALIGNED(phys, granule))
49                 return false;
50
51         return IS_ALIGNED(ctx->addr, granule);
52 }
53
54 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level)
55 {
56         u64 shift = kvm_granule_shift(level);
57         u64 mask = BIT(PAGE_SHIFT - 3) - 1;
58
59         return (data->addr >> shift) & mask;
60 }
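/*
 * Worked example (illustrative only, assuming a 4K granule, i.e.
 * PAGE_SHIFT == 12): each table holds BIT(PAGE_SHIFT - 3) == 512 entries,
 * so the index mask is 0x1ff, and kvm_granule_shift() yields 12, 21, 30
 * and 39 for levels 3, 2, 1 and 0. For addr == 0x40201000 the walk picks
 * index 1 at level 1 (bit 30), index 1 at level 2 (bit 21) and index 1 at
 * level 3 (bit 12).
 */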
61
62 static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
63 {
64         u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
65         u64 mask = BIT(pgt->ia_bits) - 1;
66
67         return (addr & mask) >> shift;
68 }
69
70 static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
71 {
72         struct kvm_pgtable pgt = {
73                 .ia_bits        = ia_bits,
74                 .start_level    = start_level,
75         };
76
77         return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
78 }
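/*
 * Illustrative example (assuming a 4K granule): a stage-2 configuration with
 * ia_bits == 40 and start_level == 1 uses concatenated PGD pages. Here
 * kvm_granule_shift(0) == 39, so the highest input address maps to PGD page
 * index 1 and kvm_pgd_pages() returns 2.
 */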
79
80 static bool kvm_pte_table(kvm_pte_t pte, s8 level)
81 {
82         if (level == KVM_PGTABLE_LAST_LEVEL)
83                 return false;
84
85         if (!kvm_pte_valid(pte))
86                 return false;
87
88         return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
89 }
90
91 static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
92 {
93         return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
94 }
95
96 static void kvm_clear_pte(kvm_pte_t *ptep)
97 {
98         WRITE_ONCE(*ptep, 0);
99 }
100
101 static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
102 {
103         kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));
104
105         pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
106         pte |= KVM_PTE_VALID;
107         return pte;
108 }
109
110 static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
111 {
112         kvm_pte_t pte = kvm_phys_to_pte(pa);
113         u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE :
114                                                        KVM_PTE_TYPE_BLOCK;
115
116         pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
117         pte |= FIELD_PREP(KVM_PTE_TYPE, type);
118         pte |= KVM_PTE_VALID;
119
120         return pte;
121 }
122
123 static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
124 {
125         return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
126 }
127
128 static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
129                                   const struct kvm_pgtable_visit_ctx *ctx,
130                                   enum kvm_pgtable_walk_flags visit)
131 {
132         struct kvm_pgtable_walker *walker = data->walker;
133
134         /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
135         WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
136         return walker->cb(ctx, visit);
137 }
138
139 static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
140                                       int r)
141 {
142         /*
143          * Visitor callbacks return EAGAIN when the conditions that led to a
144          * fault are no longer reflected in the page tables due to a race to
145          * update a PTE. In the context of a fault handler this is interpreted
146          * as a signal to retry guest execution.
147          *
148          * Ignore the return code altogether for walkers outside a fault handler
149          * (e.g. write protecting a range of memory) and chug along with the
150          * page table walk.
151          */
152         if (r == -EAGAIN)
153                 return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
154
155         return !r;
156 }
157
158 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
159                               struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level);
160
161 static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
162                                       struct kvm_pgtable_mm_ops *mm_ops,
163                                       kvm_pteref_t pteref, s8 level)
164 {
165         enum kvm_pgtable_walk_flags flags = data->walker->flags;
166         kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
167         struct kvm_pgtable_visit_ctx ctx = {
168                 .ptep   = ptep,
169                 .old    = READ_ONCE(*ptep),
170                 .arg    = data->walker->arg,
171                 .mm_ops = mm_ops,
172                 .start  = data->start,
173                 .addr   = data->addr,
174                 .end    = data->end,
175                 .level  = level,
176                 .flags  = flags,
177         };
178         int ret = 0;
179         bool reload = false;
180         kvm_pteref_t childp;
181         bool table = kvm_pte_table(ctx.old, level);
182
183         if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
184                 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
185                 reload = true;
186         }
187
188         if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
189                 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
190                 reload = true;
191         }
192
193         /*
194          * Reload the page table after invoking the walker callback for leaf
195          * entries or after pre-order traversal, to allow the walker to descend
196          * into a newly installed or replaced table.
197          */
198         if (reload) {
199                 ctx.old = READ_ONCE(*ptep);
200                 table = kvm_pte_table(ctx.old, level);
201         }
202
203         if (!kvm_pgtable_walk_continue(data->walker, ret))
204                 goto out;
205
206         if (!table) {
207                 data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
208                 data->addr += kvm_granule_size(level);
209                 goto out;
210         }
211
212         childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
213         ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
214         if (!kvm_pgtable_walk_continue(data->walker, ret))
215                 goto out;
216
217         if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
218                 ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
219
220 out:
221         if (kvm_pgtable_walk_continue(data->walker, ret))
222                 return 0;
223
224         return ret;
225 }
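/*
 * To summarise the visit above: depending on the walker's flags, a table
 * entry is visited with TABLE_PRE before the walk descends into its child
 * table and with TABLE_POST once the child walk returns, while a non-table
 * entry is visited with LEAF, after which data->addr is advanced to the next
 * granule-aligned boundary for that level.
 */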
226
227 static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
228                               struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level)
229 {
230         u32 idx;
231         int ret = 0;
232
233         if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL ||
234                          level > KVM_PGTABLE_LAST_LEVEL))
235                 return -EINVAL;
236
237         for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
238                 kvm_pteref_t pteref = &pgtable[idx];
239
240                 if (data->addr >= data->end)
241                         break;
242
243                 ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
244                 if (ret)
245                         break;
246         }
247
248         return ret;
249 }
250
251 static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
252 {
253         u32 idx;
254         int ret = 0;
255         u64 limit = BIT(pgt->ia_bits);
256
257         if (data->addr > limit || data->end > limit)
258                 return -ERANGE;
259
260         if (!pgt->pgd)
261                 return -EINVAL;
262
263         for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
264                 kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];
265
266                 ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
267                 if (ret)
268                         break;
269         }
270
271         return ret;
272 }
273
274 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
275                      struct kvm_pgtable_walker *walker)
276 {
277         struct kvm_pgtable_walk_data walk_data = {
278                 .start  = ALIGN_DOWN(addr, PAGE_SIZE),
279                 .addr   = ALIGN_DOWN(addr, PAGE_SIZE),
280                 .end    = PAGE_ALIGN(walk_data.addr + size),
281                 .walker = walker,
282         };
283         int r;
284
285         r = kvm_pgtable_walk_begin(walker);
286         if (r)
287                 return r;
288
289         r = _kvm_pgtable_walk(pgt, &walk_data);
290         kvm_pgtable_walk_end(walker);
291
292         return r;
293 }
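/*
 * Sketch of a typical caller (illustrative; my_walker_cb and my_data are
 * hypothetical, see leaf_walker()/kvm_pgtable_get_leaf() below for an
 * in-file example):
 *
 *	struct kvm_pgtable_walker walker = {
 *		.cb	= my_walker_cb,
 *		.flags	= KVM_PGTABLE_WALK_LEAF,
 *		.arg	= &my_data,
 *	};
 *
 *	kvm_pgtable_walk(pgt, addr, size, &walker);
 */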
294
295 struct leaf_walk_data {
296         kvm_pte_t       pte;
297         s8              level;
298 };
299
300 static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
301                        enum kvm_pgtable_walk_flags visit)
302 {
303         struct leaf_walk_data *data = ctx->arg;
304
305         data->pte   = ctx->old;
306         data->level = ctx->level;
307
308         return 0;
309 }
310
311 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
312                          kvm_pte_t *ptep, s8 *level)
313 {
314         struct leaf_walk_data data;
315         struct kvm_pgtable_walker walker = {
316                 .cb     = leaf_walker,
317                 .flags  = KVM_PGTABLE_WALK_LEAF,
318                 .arg    = &data,
319         };
320         int ret;
321
322         ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
323                                PAGE_SIZE, &walker);
324         if (!ret) {
325                 if (ptep)
326                         *ptep  = data.pte;
327                 if (level)
328                         *level = data.level;
329         }
330
331         return ret;
332 }
333
334 struct hyp_map_data {
335         const u64                       phys;
336         kvm_pte_t                       attr;
337 };
338
339 static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
340 {
341         bool device = prot & KVM_PGTABLE_PROT_DEVICE;
342         u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
343         kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
344         u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
345         u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
346                                                KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
347
348         if (!(prot & KVM_PGTABLE_PROT_R))
349                 return -EINVAL;
350
351         if (prot & KVM_PGTABLE_PROT_X) {
352                 if (prot & KVM_PGTABLE_PROT_W)
353                         return -EINVAL;
354
355                 if (device)
356                         return -EINVAL;
357
358                 if (system_supports_bti_kernel())
359                         attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
360         } else {
361                 attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
362         }
363
364         attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
365         if (!kvm_lpa2_is_enabled())
366                 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
367         attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
368         attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
369         *ptep = attr;
370
371         return 0;
372 }
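/*
 * For example (derived from the checks above, illustrative only): a normal
 * RW mapping ends up with MT_NORMAL, AP_RW, XN and AF set; an RX mapping
 * ends up with MT_NORMAL, AP_RO, AF and, when BTI is supported for the
 * kernel, the GP bit; requesting W+X or X on a device mapping is rejected
 * with -EINVAL.
 */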
373
374 enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
375 {
376         enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
377         u32 ap;
378
379         if (!kvm_pte_valid(pte))
380                 return prot;
381
382         if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
383                 prot |= KVM_PGTABLE_PROT_X;
384
385         ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
386         if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
387                 prot |= KVM_PGTABLE_PROT_R;
388         else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
389                 prot |= KVM_PGTABLE_PROT_RW;
390
391         return prot;
392 }
393
394 static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
395                                     struct hyp_map_data *data)
396 {
397         u64 phys = data->phys + (ctx->addr - ctx->start);
398         kvm_pte_t new;
399
400         if (!kvm_block_mapping_supported(ctx, phys))
401                 return false;
402
403         new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
404         if (ctx->old == new)
405                 return true;
406         if (!kvm_pte_valid(ctx->old))
407                 ctx->mm_ops->get_page(ctx->ptep);
408         else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
409                 return false;
410
411         smp_store_release(ctx->ptep, new);
412         return true;
413 }
414
415 static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
416                           enum kvm_pgtable_walk_flags visit)
417 {
418         kvm_pte_t *childp, new;
419         struct hyp_map_data *data = ctx->arg;
420         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
421
422         if (hyp_map_walker_try_leaf(ctx, data))
423                 return 0;
424
425         if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
426                 return -EINVAL;
427
428         childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
429         if (!childp)
430                 return -ENOMEM;
431
432         new = kvm_init_table_pte(childp, mm_ops);
433         mm_ops->get_page(ctx->ptep);
434         smp_store_release(ctx->ptep, new);
435
436         return 0;
437 }
438
439 int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
440                         enum kvm_pgtable_prot prot)
441 {
442         int ret;
443         struct hyp_map_data map_data = {
444                 .phys   = ALIGN_DOWN(phys, PAGE_SIZE),
445         };
446         struct kvm_pgtable_walker walker = {
447                 .cb     = hyp_map_walker,
448                 .flags  = KVM_PGTABLE_WALK_LEAF,
449                 .arg    = &map_data,
450         };
451
452         ret = hyp_set_prot_attr(prot, &map_data.attr);
453         if (ret)
454                 return ret;
455
456         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
457         dsb(ishst);
458         isb();
459         return ret;
460 }
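/*
 * Typical usage (illustrative; 'va' and 'pa' are hypothetical values chosen
 * by the caller):
 *
 *	ret = kvm_pgtable_hyp_map(pgt, va, PAGE_SIZE, pa,
 *				  KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W);
 *
 * The walk installs leaf entries (using block mappings where size and
 * alignment allow), and the dsb/isb above ensure the updated entries are
 * observable by the hardware walker before the call returns.
 */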
461
462 static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
463                             enum kvm_pgtable_walk_flags visit)
464 {
465         kvm_pte_t *childp = NULL;
466         u64 granule = kvm_granule_size(ctx->level);
467         u64 *unmapped = ctx->arg;
468         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
469
470         if (!kvm_pte_valid(ctx->old))
471                 return -EINVAL;
472
473         if (kvm_pte_table(ctx->old, ctx->level)) {
474                 childp = kvm_pte_follow(ctx->old, mm_ops);
475
476                 if (mm_ops->page_count(childp) != 1)
477                         return 0;
478
479                 kvm_clear_pte(ctx->ptep);
480                 dsb(ishst);
481                 __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
482         } else {
483                 if (ctx->end - ctx->addr < granule)
484                         return -EINVAL;
485
486                 kvm_clear_pte(ctx->ptep);
487                 dsb(ishst);
488                 __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
489                 *unmapped += granule;
490         }
491
492         dsb(ish);
493         isb();
494         mm_ops->put_page(ctx->ptep);
495
496         if (childp)
497                 mm_ops->put_page(childp);
498
499         return 0;
500 }
501
502 u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
503 {
504         u64 unmapped = 0;
505         struct kvm_pgtable_walker walker = {
506                 .cb     = hyp_unmap_walker,
507                 .arg    = &unmapped,
508                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
509         };
510
511         if (!pgt->mm_ops->page_count)
512                 return 0;
513
514         kvm_pgtable_walk(pgt, addr, size, &walker);
515         return unmapped;
516 }
517
518 int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
519                          struct kvm_pgtable_mm_ops *mm_ops)
520 {
521         s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 -
522                          ARM64_HW_PGTABLE_LEVELS(va_bits);
523
524         if (start_level < KVM_PGTABLE_FIRST_LEVEL ||
525             start_level > KVM_PGTABLE_LAST_LEVEL)
526                 return -EINVAL;
527
528         pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
529         if (!pgt->pgd)
530                 return -ENOMEM;
531
532         pgt->ia_bits            = va_bits;
533         pgt->start_level        = start_level;
534         pgt->mm_ops             = mm_ops;
535         pgt->mmu                = NULL;
536         pgt->force_pte_cb       = NULL;
537
538         return 0;
539 }
540
541 static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
542                            enum kvm_pgtable_walk_flags visit)
543 {
544         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
545
546         if (!kvm_pte_valid(ctx->old))
547                 return 0;
548
549         mm_ops->put_page(ctx->ptep);
550
551         if (kvm_pte_table(ctx->old, ctx->level))
552                 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
553
554         return 0;
555 }
556
557 void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
558 {
559         struct kvm_pgtable_walker walker = {
560                 .cb     = hyp_free_walker,
561                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
562         };
563
564         WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
565         pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
566         pgt->pgd = NULL;
567 }
568
569 struct stage2_map_data {
570         const u64                       phys;
571         kvm_pte_t                       attr;
572         u8                              owner_id;
573
574         kvm_pte_t                       *anchor;
575         kvm_pte_t                       *childp;
576
577         struct kvm_s2_mmu               *mmu;
578         void                            *memcache;
579
580         /* Force mappings to page granularity */
581         bool                            force_pte;
582
583         /* Walk should update owner_id only */
584         bool                            annotation;
585 };
586
587 u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
588 {
589         u64 vtcr = VTCR_EL2_FLAGS;
590         s8 lvls;
591
592         vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
593         vtcr |= VTCR_EL2_T0SZ(phys_shift);
594         /*
595          * Use a minimum 2 level page table to prevent splitting
596          * host PMD huge pages at stage2.
597          */
598         lvls = stage2_pgtable_levels(phys_shift);
599         if (lvls < 2)
600                 lvls = 2;
601
602         /*
603          * When LPA2 is enabled, the HW supports an extra level of translation
604          * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2
605  * as an addition to SL0 to enable encoding this extra start level.
606          * However, since we always use concatenated pages for the first level
607          * lookup, we will never need this extra level and therefore do not need
608          * to touch SL2.
609          */
610         vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
611
612 #ifdef CONFIG_ARM64_HW_AFDBM
613         /*
614          * Enable the Hardware Access Flag management, unconditionally
615          * on all CPUs. In systems that have asymmetric support for the feature
616          * this allows KVM to leverage hardware support on the subset of cores
617          * that implement the feature.
618          *
619          * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
620          * hardware) on implementations that do not advertise support for the
621          * feature. As such, setting HA unconditionally is safe, unless you
622          * happen to be running on a design that has unadvertised support for
623          * HAFDBS. Here be dragons.
624          */
625         if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
626                 vtcr |= VTCR_EL2_HA;
627 #endif /* CONFIG_ARM64_HW_AFDBM */
628
629         if (kvm_lpa2_is_enabled())
630                 vtcr |= VTCR_EL2_DS;
631
632         /* Set the vmid bits */
633         vtcr |= (get_vmid_bits(mmfr1) == 16) ?
634                 VTCR_EL2_VS_16BIT :
635                 VTCR_EL2_VS_8BIT;
636
637         return vtcr;
638 }
639
640 static bool stage2_has_fwb(struct kvm_pgtable *pgt)
641 {
642         if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
643                 return false;
644
645         return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
646 }
647
648 void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
649                                 phys_addr_t addr, size_t size)
650 {
651         unsigned long pages, inval_pages;
652
653         if (!system_supports_tlb_range()) {
654                 kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
655                 return;
656         }
657
658         pages = size >> PAGE_SHIFT;
659         while (pages > 0) {
660                 inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
661                 kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);
662
663                 addr += inval_pages << PAGE_SHIFT;
664                 pages -= inval_pages;
665         }
666 }
667
668 #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
669
670 static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
671                                 kvm_pte_t *ptep)
672 {
673         kvm_pte_t attr;
674         u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
675
676         switch (prot & (KVM_PGTABLE_PROT_DEVICE |
677                         KVM_PGTABLE_PROT_NORMAL_NC)) {
678         case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC:
679                 return -EINVAL;
680         case KVM_PGTABLE_PROT_DEVICE:
681                 if (prot & KVM_PGTABLE_PROT_X)
682                         return -EINVAL;
683                 attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
684                 break;
685         case KVM_PGTABLE_PROT_NORMAL_NC:
686                 if (prot & KVM_PGTABLE_PROT_X)
687                         return -EINVAL;
688                 attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
689                 break;
690         default:
691                 attr = KVM_S2_MEMATTR(pgt, NORMAL);
692         }
693
694         if (!(prot & KVM_PGTABLE_PROT_X))
695                 attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
696
697         if (prot & KVM_PGTABLE_PROT_R)
698                 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
699
700         if (prot & KVM_PGTABLE_PROT_W)
701                 attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
702
703         if (!kvm_lpa2_is_enabled())
704                 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
705
706         attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
707         attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
708         *ptep = attr;
709
710         return 0;
711 }
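/*
 * For example (derived from the switch above, illustrative only): normal
 * memory with R|W|X yields S2AP_R | S2AP_W with XN clear, whereas DEVICE and
 * NORMAL_NC mappings must not be executable and always carry XN; the AF bit
 * is set unconditionally so new mappings start out as already accessed.
 */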
712
713 enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
714 {
715         enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
716
717         if (!kvm_pte_valid(pte))
718                 return prot;
719
720         if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
721                 prot |= KVM_PGTABLE_PROT_R;
722         if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
723                 prot |= KVM_PGTABLE_PROT_W;
724         if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
725                 prot |= KVM_PGTABLE_PROT_X;
726
727         return prot;
728 }
729
730 static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
731 {
732         if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
733                 return true;
734
735         return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
736 }
737
738 static bool stage2_pte_is_counted(kvm_pte_t pte)
739 {
740         /*
741          * The refcount tracks valid entries, as well as invalid entries that
742          * encode ownership of a page by an entity other than the page-table
743          * owner, whose id is 0.
744          */
745         return !!pte;
746 }
747
748 static bool stage2_pte_is_locked(kvm_pte_t pte)
749 {
750         return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
751 }
752
753 static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
754 {
755         if (!kvm_pgtable_walk_shared(ctx)) {
756                 WRITE_ONCE(*ctx->ptep, new);
757                 return true;
758         }
759
760         return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
761 }
762
763 /**
764  * stage2_try_break_pte() - Invalidates a pte according to the
765  *                          'break-before-make' requirements of the
766  *                          architecture.
767  *
768  * @ctx: context of the visited pte.
769  * @mmu: stage-2 mmu
770  *
771  * Returns: true if the pte was successfully broken.
772  *
773  * If the removed pte was valid, performs the necessary serialization and TLB
774  * invalidation for the old value. For counted ptes, drops the reference count
775  * on the containing table page.
776  */
777 static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
778                                  struct kvm_s2_mmu *mmu)
779 {
780         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
781
782         if (stage2_pte_is_locked(ctx->old)) {
783                 /*
784                  * Should never occur if this walker has exclusive access to the
785                  * page tables.
786                  */
787                 WARN_ON(!kvm_pgtable_walk_shared(ctx));
788                 return false;
789         }
790
791         if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
792                 return false;
793
794         if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
795                 /*
796                  * Perform the appropriate TLB invalidation based on the
797                  * evicted pte value (if any).
798                  */
799                 if (kvm_pte_table(ctx->old, ctx->level)) {
800                         u64 size = kvm_granule_size(ctx->level);
801                         u64 addr = ALIGN_DOWN(ctx->addr, size);
802
803                         kvm_tlb_flush_vmid_range(mmu, addr, size);
804                 } else if (kvm_pte_valid(ctx->old)) {
805                         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
806                                      ctx->addr, ctx->level);
807                 }
808         }
809
810         if (stage2_pte_is_counted(ctx->old))
811                 mm_ops->put_page(ctx->ptep);
812
813         return true;
814 }
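/*
 * In short, the break-before-make sequence implemented above is:
 *
 *   1. Replace the old PTE with KVM_INVALID_PTE_LOCKED (via cmpxchg for
 *      shared walks) so that concurrent walkers back off.
 *   2. Invalidate the TLB for the evicted entry, unless the caller asked to
 *      skip BBM TLBI (e.g. when operating on an unlinked table).
 *   3. Drop the reference held by the old counted entry.
 *
 * The replacement PTE is then installed separately via stage2_make_pte().
 */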
815
816 static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
817 {
818         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
819
820         WARN_ON(!stage2_pte_is_locked(*ctx->ptep));
821
822         if (stage2_pte_is_counted(new))
823                 mm_ops->get_page(ctx->ptep);
824
825         smp_store_release(ctx->ptep, new);
826 }
827
828 static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
829 {
830         /*
831          * If FEAT_TLBIRANGE is implemented, defer the individual
832          * TLB invalidations until the entire walk is finished, and
833          * then use the range-based TLBI instructions to do the
834          * invalidations. Condition deferred TLB invalidation on the
835          * system supporting FWB as the optimization is entirely
836          * pointless when the unmap walker needs to perform CMOs.
837          */
838         return system_supports_tlb_range() && stage2_has_fwb(pgt);
839 }
840
841 static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
842                                 struct kvm_s2_mmu *mmu,
843                                 struct kvm_pgtable_mm_ops *mm_ops)
844 {
845         struct kvm_pgtable *pgt = ctx->arg;
846
847         /*
848          * Clear the existing PTE, and perform break-before-make if it was
849          * valid. Depending on system support, defer the TLB maintenance
850          * until the entire unmap walk is completed.
851          */
852         if (kvm_pte_valid(ctx->old)) {
853                 kvm_clear_pte(ctx->ptep);
854
855                 if (kvm_pte_table(ctx->old, ctx->level)) {
856                         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
857                                      TLBI_TTL_UNKNOWN);
858                 } else if (!stage2_unmap_defer_tlb_flush(pgt)) {
859                         kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
860                                      ctx->level);
861                 }
862         }
863
864         mm_ops->put_page(ctx->ptep);
865 }
866
867 static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
868 {
869         u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
870         return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL);
871 }
872
873 static bool stage2_pte_executable(kvm_pte_t pte)
874 {
875         return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
876 }
877
878 static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
879                                        const struct stage2_map_data *data)
880 {
881         u64 phys = data->phys;
882
883         /* Work out the correct PA based on how far the walk has gotten */
884         return phys + (ctx->addr - ctx->start);
885 }
886
887 static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
888                                         struct stage2_map_data *data)
889 {
890         u64 phys = stage2_map_walker_phys_addr(ctx, data);
891
892         if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
893                 return false;
894
895         if (data->annotation)
896                 return true;
897
898         return kvm_block_mapping_supported(ctx, phys);
899 }
900
901 static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
902                                       struct stage2_map_data *data)
903 {
904         kvm_pte_t new;
905         u64 phys = stage2_map_walker_phys_addr(ctx, data);
906         u64 granule = kvm_granule_size(ctx->level);
907         struct kvm_pgtable *pgt = data->mmu->pgt;
908         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
909
910         if (!stage2_leaf_mapping_allowed(ctx, data))
911                 return -E2BIG;
912
913         if (!data->annotation)
914                 new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
915         else
916                 new = kvm_init_invalid_leaf_owner(data->owner_id);
917
918         /*
919          * Skip updating the PTE if we are trying to recreate the exact
920          * same mapping or only change the access permissions. Instead,
921          * the vCPU will take another exit from the guest if still needed
922          * and then go through the path of relaxing permissions.
923          */
924         if (!stage2_pte_needs_update(ctx->old, new))
925                 return -EAGAIN;
926
927         /* If we're only changing software bits, then store them and go! */
928         if (!kvm_pgtable_walk_shared(ctx) &&
929             !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
930                 bool old_is_counted = stage2_pte_is_counted(ctx->old);
931
932                 if (old_is_counted != stage2_pte_is_counted(new)) {
933                         if (old_is_counted)
934                                 mm_ops->put_page(ctx->ptep);
935                         else
936                                 mm_ops->get_page(ctx->ptep);
937                 }
938                 WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
939                 return 0;
940         }
941
942         if (!stage2_try_break_pte(ctx, data->mmu))
943                 return -EAGAIN;
944
945         /* Perform CMOs before installation of the guest stage-2 PTE */
946         if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
947             stage2_pte_cacheable(pgt, new))
948                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
949                                                granule);
950
951         if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
952             stage2_pte_executable(new))
953                 mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
954
955         stage2_make_pte(ctx, new);
956
957         return 0;
958 }
959
960 static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
961                                      struct stage2_map_data *data)
962 {
963         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
964         kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
965         int ret;
966
967         if (!stage2_leaf_mapping_allowed(ctx, data))
968                 return 0;
969
970         ret = stage2_map_walker_try_leaf(ctx, data);
971         if (ret)
972                 return ret;
973
974         mm_ops->free_unlinked_table(childp, ctx->level);
975         return 0;
976 }
977
978 static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
979                                 struct stage2_map_data *data)
980 {
981         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
982         kvm_pte_t *childp, new;
983         int ret;
984
985         ret = stage2_map_walker_try_leaf(ctx, data);
986         if (ret != -E2BIG)
987                 return ret;
988
989         if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
990                 return -EINVAL;
991
992         if (!data->memcache)
993                 return -ENOMEM;
994
995         childp = mm_ops->zalloc_page(data->memcache);
996         if (!childp)
997                 return -ENOMEM;
998
999         if (!stage2_try_break_pte(ctx, data->mmu)) {
1000                 mm_ops->put_page(childp);
1001                 return -EAGAIN;
1002         }
1003
1004         /*
1005          * If we've run into an existing block mapping then replace it with
1006          * a table. Accesses beyond 'end' that fall within the new table
1007          * will be mapped lazily.
1008          */
1009         new = kvm_init_table_pte(childp, mm_ops);
1010         stage2_make_pte(ctx, new);
1011
1012         return 0;
1013 }
1014
1015 /*
1016  * The TABLE_PRE callback runs for table entries on the way down, looking
1017  * for table entries which we could conceivably replace with a block entry
1018  * for this mapping. If it finds one it replaces the entry and calls
1019  * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
1020  *
1021  * Otherwise, the LEAF callback performs the mapping at the existing leaves
1022  * instead.
1023  */
1024 static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
1025                              enum kvm_pgtable_walk_flags visit)
1026 {
1027         struct stage2_map_data *data = ctx->arg;
1028
1029         switch (visit) {
1030         case KVM_PGTABLE_WALK_TABLE_PRE:
1031                 return stage2_map_walk_table_pre(ctx, data);
1032         case KVM_PGTABLE_WALK_LEAF:
1033                 return stage2_map_walk_leaf(ctx, data);
1034         default:
1035                 return -EINVAL;
1036         }
1037 }
1038
1039 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
1040                            u64 phys, enum kvm_pgtable_prot prot,
1041                            void *mc, enum kvm_pgtable_walk_flags flags)
1042 {
1043         int ret;
1044         struct stage2_map_data map_data = {
1045                 .phys           = ALIGN_DOWN(phys, PAGE_SIZE),
1046                 .mmu            = pgt->mmu,
1047                 .memcache       = mc,
1048                 .force_pte      = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
1049         };
1050         struct kvm_pgtable_walker walker = {
1051                 .cb             = stage2_map_walker,
1052                 .flags          = flags |
1053                                   KVM_PGTABLE_WALK_TABLE_PRE |
1054                                   KVM_PGTABLE_WALK_LEAF,
1055                 .arg            = &map_data,
1056         };
1057
1058         if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
1059                 return -EINVAL;
1060
1061         ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
1062         if (ret)
1063                 return ret;
1064
1065         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1066         dsb(ishst);
1067         return ret;
1068 }
1069
1070 int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
1071                                  void *mc, u8 owner_id)
1072 {
1073         int ret;
1074         struct stage2_map_data map_data = {
1075                 .mmu            = pgt->mmu,
1076                 .memcache       = mc,
1077                 .owner_id       = owner_id,
1078                 .force_pte      = true,
1079                 .annotation     = true,
1080         };
1081         struct kvm_pgtable_walker walker = {
1082                 .cb             = stage2_map_walker,
1083                 .flags          = KVM_PGTABLE_WALK_TABLE_PRE |
1084                                   KVM_PGTABLE_WALK_LEAF,
1085                 .arg            = &map_data,
1086         };
1087
1088         if (owner_id > KVM_MAX_OWNER_ID)
1089                 return -EINVAL;
1090
1091         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1092         return ret;
1093 }
1094
1095 static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
1096                                enum kvm_pgtable_walk_flags visit)
1097 {
1098         struct kvm_pgtable *pgt = ctx->arg;
1099         struct kvm_s2_mmu *mmu = pgt->mmu;
1100         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1101         kvm_pte_t *childp = NULL;
1102         bool need_flush = false;
1103
1104         if (!kvm_pte_valid(ctx->old)) {
1105                 if (stage2_pte_is_counted(ctx->old)) {
1106                         kvm_clear_pte(ctx->ptep);
1107                         mm_ops->put_page(ctx->ptep);
1108                 }
1109                 return 0;
1110         }
1111
1112         if (kvm_pte_table(ctx->old, ctx->level)) {
1113                 childp = kvm_pte_follow(ctx->old, mm_ops);
1114
1115                 if (mm_ops->page_count(childp) != 1)
1116                         return 0;
1117         } else if (stage2_pte_cacheable(pgt, ctx->old)) {
1118                 need_flush = !stage2_has_fwb(pgt);
1119         }
1120
1121         /*
1122          * This is similar to the map() path in that we unmap the entire
1123          * block entry and rely on the remaining portions being faulted
1124          * back lazily.
1125          */
1126         stage2_unmap_put_pte(ctx, mmu, mm_ops);
1127
1128         if (need_flush && mm_ops->dcache_clean_inval_poc)
1129                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1130                                                kvm_granule_size(ctx->level));
1131
1132         if (childp)
1133                 mm_ops->put_page(childp);
1134
1135         return 0;
1136 }
1137
1138 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
1139 {
1140         int ret;
1141         struct kvm_pgtable_walker walker = {
1142                 .cb     = stage2_unmap_walker,
1143                 .arg    = pgt,
1144                 .flags  = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
1145         };
1146
1147         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1148         if (stage2_unmap_defer_tlb_flush(pgt))
1149                 /* Perform the deferred TLB invalidations */
1150                 kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
1151
1152         return ret;
1153 }
1154
1155 struct stage2_attr_data {
1156         kvm_pte_t                       attr_set;
1157         kvm_pte_t                       attr_clr;
1158         kvm_pte_t                       pte;
1159         s8                              level;
1160 };
1161
1162 static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
1163                               enum kvm_pgtable_walk_flags visit)
1164 {
1165         kvm_pte_t pte = ctx->old;
1166         struct stage2_attr_data *data = ctx->arg;
1167         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1168
1169         if (!kvm_pte_valid(ctx->old))
1170                 return -EAGAIN;
1171
1172         data->level = ctx->level;
1173         data->pte = pte;
1174         pte &= ~data->attr_clr;
1175         pte |= data->attr_set;
1176
1177         /*
1178          * We may race with the CPU trying to set the access flag here,
1179          * but worst-case the access flag update gets lost and will be
1180          * set on the next access instead.
1181          */
1182         if (data->pte != pte) {
1183                 /*
1184                  * Invalidate instruction cache before updating the guest
1185                  * stage-2 PTE if we are going to add executable permission.
1186                  */
1187                 if (mm_ops->icache_inval_pou &&
1188                     stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
1189                         mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
1190                                                   kvm_granule_size(ctx->level));
1191
1192                 if (!stage2_try_set_pte(ctx, pte))
1193                         return -EAGAIN;
1194         }
1195
1196         return 0;
1197 }
1198
1199 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
1200                                     u64 size, kvm_pte_t attr_set,
1201                                     kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
1202                                     s8 *level, enum kvm_pgtable_walk_flags flags)
1203 {
1204         int ret;
1205         kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
1206         struct stage2_attr_data data = {
1207                 .attr_set       = attr_set & attr_mask,
1208                 .attr_clr       = attr_clr & attr_mask,
1209         };
1210         struct kvm_pgtable_walker walker = {
1211                 .cb             = stage2_attr_walker,
1212                 .arg            = &data,
1213                 .flags          = flags | KVM_PGTABLE_WALK_LEAF,
1214         };
1215
1216         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1217         if (ret)
1218                 return ret;
1219
1220         if (orig_pte)
1221                 *orig_pte = data.pte;
1222
1223         if (level)
1224                 *level = data.level;
1225         return 0;
1226 }
1227
1228 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
1229 {
1230         return stage2_update_leaf_attrs(pgt, addr, size, 0,
1231                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
1232                                         NULL, NULL, 0);
1233 }
1234
1235 void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
1236                                 enum kvm_pgtable_walk_flags flags)
1237 {
1238         int ret;
1239
1240         ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
1241                                        NULL, NULL, flags);
1242         if (!ret)
1243                 dsb(ishst);
1244 }
1245
1246 struct stage2_age_data {
1247         bool    mkold;
1248         bool    young;
1249 };
1250
1251 static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
1252                              enum kvm_pgtable_walk_flags visit)
1253 {
1254         kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
1255         struct stage2_age_data *data = ctx->arg;
1256
1257         if (!kvm_pte_valid(ctx->old) || new == ctx->old)
1258                 return 0;
1259
1260         data->young = true;
1261
1262         /*
1263          * stage2_age_walker() is always called while holding the MMU lock for
1264          * write, so this will always succeed. Nonetheless, this deliberately
1265          * follows the race detection pattern of the other stage-2 walkers in
1266          * case the locking mechanics of the MMU notifiers is ever changed.
1267          */
1268         if (data->mkold && !stage2_try_set_pte(ctx, new))
1269                 return -EAGAIN;
1270
1271         /*
1272          * "But where's the TLBI?!", you scream.
1273          * "Over in the core code", I sigh.
1274          *
1275          * See the '->clear_flush_young()' callback on the KVM mmu notifier.
1276          */
1277         return 0;
1278 }
1279
1280 bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
1281                                          u64 size, bool mkold)
1282 {
1283         struct stage2_age_data data = {
1284                 .mkold          = mkold,
1285         };
1286         struct kvm_pgtable_walker walker = {
1287                 .cb             = stage2_age_walker,
1288                 .arg            = &data,
1289                 .flags          = KVM_PGTABLE_WALK_LEAF,
1290         };
1291
1292         WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
1293         return data.young;
1294 }
1295
1296 int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
1297                                    enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
1298 {
1299         int ret;
1300         s8 level;
1301         kvm_pte_t set = 0, clr = 0;
1302
1303         if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
1304                 return -EINVAL;
1305
1306         if (prot & KVM_PGTABLE_PROT_R)
1307                 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
1308
1309         if (prot & KVM_PGTABLE_PROT_W)
1310                 set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
1311
1312         if (prot & KVM_PGTABLE_PROT_X)
1313                 clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
1314
1315         ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
1316         if (!ret || ret == -EAGAIN)
1317                 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
1318         return ret;
1319 }
1320
1321 static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
1322                                enum kvm_pgtable_walk_flags visit)
1323 {
1324         struct kvm_pgtable *pgt = ctx->arg;
1325         struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1326
1327         if (!stage2_pte_cacheable(pgt, ctx->old))
1328                 return 0;
1329
1330         if (mm_ops->dcache_clean_inval_poc)
1331                 mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
1332                                                kvm_granule_size(ctx->level));
1333         return 0;
1334 }
1335
1336 int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
1337 {
1338         struct kvm_pgtable_walker walker = {
1339                 .cb     = stage2_flush_walker,
1340                 .flags  = KVM_PGTABLE_WALK_LEAF,
1341                 .arg    = pgt,
1342         };
1343
1344         if (stage2_has_fwb(pgt))
1345                 return 0;
1346
1347         return kvm_pgtable_walk(pgt, addr, size, &walker);
1348 }
1349
1350 kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
1351                                               u64 phys, s8 level,
1352                                               enum kvm_pgtable_prot prot,
1353                                               void *mc, bool force_pte)
1354 {
1355         struct stage2_map_data map_data = {
1356                 .phys           = phys,
1357                 .mmu            = pgt->mmu,
1358                 .memcache       = mc,
1359                 .force_pte      = force_pte,
1360         };
1361         struct kvm_pgtable_walker walker = {
1362                 .cb             = stage2_map_walker,
1363                 .flags          = KVM_PGTABLE_WALK_LEAF |
1364                                   KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
1365                                   KVM_PGTABLE_WALK_SKIP_CMO,
1366                 .arg            = &map_data,
1367         };
1368         /*
1369          * The input address (.addr) is irrelevant for walking an
1370          * unlinked table. Construct an arbitrary IA range to map
1371          * kvm_granule_size(level) worth of memory.
1372          */
1373         struct kvm_pgtable_walk_data data = {
1374                 .walker = &walker,
1375                 .addr   = 0,
1376                 .end    = kvm_granule_size(level),
1377         };
1378         struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
1379         kvm_pte_t *pgtable;
1380         int ret;
1381
1382         if (!IS_ALIGNED(phys, kvm_granule_size(level)))
1383                 return ERR_PTR(-EINVAL);
1384
1385         ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
1386         if (ret)
1387                 return ERR_PTR(ret);
1388
1389         pgtable = mm_ops->zalloc_page(mc);
1390         if (!pgtable)
1391                 return ERR_PTR(-ENOMEM);
1392
1393         ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
1394                                  level + 1);
1395         if (ret) {
1396                 kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
1397                 return ERR_PTR(ret);
1398         }
1399
1400         return pgtable;
1401 }
1402
1403 /*
1404  * Get the number of page-tables needed to replace a block with a
1405  * fully populated tree up to the PTE entries. Note that @level is
1406  * interpreted as in "level @level entry".
1407  */
1408 static int stage2_block_get_nr_page_tables(s8 level)
1409 {
1410         switch (level) {
1411         case 1:
1412                 return PTRS_PER_PTE + 1;
1413         case 2:
1414                 return 1;
1415         case 3:
1416                 return 0;
1417         default:
1418                 WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
1419                              level > KVM_PGTABLE_LAST_LEVEL);
1420                 return -EINVAL;
1421         };
1422 }
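/*
 * Worked example (illustrative, assuming a 4K granule): splitting a level-1
 * block (1GiB) needs one level-2 table plus PTRS_PER_PTE (512) level-3
 * tables, i.e. 513 pages; splitting a level-2 block (2MiB) needs a single
 * page of PTEs; a level-3 entry already maps a page, so no tables are
 * needed.
 */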
1423
1424 static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
1425                                enum kvm_pgtable_walk_flags visit)
1426 {
1427         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1428         struct kvm_mmu_memory_cache *mc = ctx->arg;
1429         struct kvm_s2_mmu *mmu;
1430         kvm_pte_t pte = ctx->old, new, *childp;
1431         enum kvm_pgtable_prot prot;
1432         s8 level = ctx->level;
1433         bool force_pte;
1434         int nr_pages;
1435         u64 phys;
1436
1437         /* No huge-pages exist at the last level */
1438         if (level == KVM_PGTABLE_LAST_LEVEL)
1439                 return 0;
1440
1441         /* We only split valid block mappings */
1442         if (!kvm_pte_valid(pte))
1443                 return 0;
1444
1445         nr_pages = stage2_block_get_nr_page_tables(level);
1446         if (nr_pages < 0)
1447                 return nr_pages;
1448
1449         if (mc->nobjs >= nr_pages) {
1450                 /* Build a tree mapped down to the PTE granularity. */
1451                 force_pte = true;
1452         } else {
1453                 /*
1454                  * Don't force PTEs, so create_unlinked() below does
1455                  * not populate the tree up to the PTE level. The
1456                  * consequence is that the call will require a single
1457                  * page of level 2 entries at level 1, or a single
1458                  * page of PTEs at level 2. If we are at level 1, the
1459                  * PTEs will be created recursively.
1460                  */
1461                 force_pte = false;
1462                 nr_pages = 1;
1463         }
1464
1465         if (mc->nobjs < nr_pages)
1466                 return -ENOMEM;
1467
1468         mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
1469         phys = kvm_pte_to_phys(pte);
1470         prot = kvm_pgtable_stage2_pte_prot(pte);
1471
1472         childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
1473                                                     level, prot, mc, force_pte);
1474         if (IS_ERR(childp))
1475                 return PTR_ERR(childp);
1476
1477         if (!stage2_try_break_pte(ctx, mmu)) {
1478                 kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
1479                 return -EAGAIN;
1480         }
1481
1482         /*
1483          * Note, the contents of the page table are guaranteed to be made
1484          * visible before the new PTE is assigned because stage2_make_pte()
1485          * writes the PTE using smp_store_release().
1486          */
1487         new = kvm_init_table_pte(childp, mm_ops);
1488         stage2_make_pte(ctx, new);
1489         return 0;
1490 }
1491
1492 int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
1493                              struct kvm_mmu_memory_cache *mc)
1494 {
1495         struct kvm_pgtable_walker walker = {
1496                 .cb     = stage2_split_walker,
1497                 .flags  = KVM_PGTABLE_WALK_LEAF,
1498                 .arg    = mc,
1499         };
1500         int ret;
1501
1502         ret = kvm_pgtable_walk(pgt, addr, size, &walker);
1503         dsb(ishst);
1504         return ret;
1505 }
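/*
 * Usage note (derived from stage2_split_walker() above): the caller is
 * expected to top up @mc, the split page cache, before walking. When the
 * cache holds fewer pages than a fully populated tree requires, the walker
 * only builds the intermediate level and then keeps splitting the resulting
 * entries as the walk descends, returning -ENOMEM once the cache is empty.
 */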
1506
1507 int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
1508                               struct kvm_pgtable_mm_ops *mm_ops,
1509                               enum kvm_pgtable_stage2_flags flags,
1510                               kvm_pgtable_force_pte_cb_t force_pte_cb)
1511 {
1512         size_t pgd_sz;
1513         u64 vtcr = mmu->vtcr;
1514         u32 ia_bits = VTCR_EL2_IPA(vtcr);
1515         u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1516         s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1517
1518         pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1519         pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
1520         if (!pgt->pgd)
1521                 return -ENOMEM;
1522
1523         pgt->ia_bits            = ia_bits;
1524         pgt->start_level        = start_level;
1525         pgt->mm_ops             = mm_ops;
1526         pgt->mmu                = mmu;
1527         pgt->flags              = flags;
1528         pgt->force_pte_cb       = force_pte_cb;
1529
1530         /* Ensure zeroed PGD pages are visible to the hardware walker */
1531         dsb(ishst);
1532         return 0;
1533 }
1534
1535 size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
1536 {
1537         u32 ia_bits = VTCR_EL2_IPA(vtcr);
1538         u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
1539         s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
1540
1541         return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
1542 }
1543
1544 static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
1545                               enum kvm_pgtable_walk_flags visit)
1546 {
1547         struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
1548
1549         if (!stage2_pte_is_counted(ctx->old))
1550                 return 0;
1551
1552         mm_ops->put_page(ctx->ptep);
1553
1554         if (kvm_pte_table(ctx->old, ctx->level))
1555                 mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
1556
1557         return 0;
1558 }
1559
1560 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
1561 {
1562         size_t pgd_sz;
1563         struct kvm_pgtable_walker walker = {
1564                 .cb     = stage2_free_walker,
1565                 .flags  = KVM_PGTABLE_WALK_LEAF |
1566                           KVM_PGTABLE_WALK_TABLE_POST,
1567         };
1568
1569         WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
1570         pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
1571         pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
1572         pgt->pgd = NULL;
1573 }
1574
1575 void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
1576 {
1577         kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
1578         struct kvm_pgtable_walker walker = {
1579                 .cb     = stage2_free_walker,
1580                 .flags  = KVM_PGTABLE_WALK_LEAF |
1581                           KVM_PGTABLE_WALK_TABLE_POST,
1582         };
1583         struct kvm_pgtable_walk_data data = {
1584                 .walker = &walker,
1585
1586                 /*
1587                  * At this point the IPA really doesn't matter, as the page
1588                  * table being traversed has already been removed from the stage
1589                  * 2. Set an appropriate range to cover the entire page table.
1590                  */
1591                 .addr   = 0,
1592                 .end    = kvm_granule_size(level),
1593         };
1594
1595         WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));
1596
1597         WARN_ON(mm_ops->page_count(pgtable) != 1);
1598         mm_ops->put_page(pgtable);
1599 }