[linux.git] / mm / pagewalk.c
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/pagewalk.h>
3 #include <linux/highmem.h>
4 #include <linux/sched.h>
5 #include <linux/hugetlb.h>
6 #include <linux/swap.h>
7 #include <linux/swapops.h>
8
9 /*
10  * We want to know the real level where an entry is located, ignoring any
11  * folding of levels which may be happening. For example, if p4d is folded then
12  * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
13  */
14 static int real_depth(int depth)
15 {
16         if (depth == 3 && PTRS_PER_PMD == 1)
17                 depth = 2;
18         if (depth == 2 && PTRS_PER_PUD == 1)
19                 depth = 1;
20         if (depth == 1 && PTRS_PER_P4D == 1)
21                 depth = 0;
22         return depth;
23 }
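
/*
 * Illustrative note (not from the original file): on a configuration where
 * both the pud and p4d levels are folded (PTRS_PER_PUD == 1 and
 * PTRS_PER_P4D == 1), the checks above cascade, so a hole found while
 * iterating puds (nominal depth 2) is reported to ->pte_hole() at its real
 * depth 0, i.e. the pgd level, matching the depth numbering used throughout
 * this file (pgd=0, p4d=1, pud=2, pmd=3).
 */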
24
25 static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
26                                 unsigned long end, struct mm_walk *walk)
27 {
28         const struct mm_walk_ops *ops = walk->ops;
29         int err = 0;
30
31         for (;;) {
32                 err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
33                 if (err)
34                        break;
35                 if (addr >= end - PAGE_SIZE)
36                         break;
37                 addr += PAGE_SIZE;
38                 pte++;
39         }
40         return err;
41 }
42
43 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
44                           struct mm_walk *walk)
45 {
46         pte_t *pte;
47         int err = 0;
48         spinlock_t *ptl;
49
50         if (walk->no_vma) {
51                 /*
52                  * pte_offset_map() might apply user-specific validation.
53                  * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
54                  * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
55                  * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
56                  */
57                 if (walk->mm == &init_mm || addr >= TASK_SIZE)
58                         pte = pte_offset_kernel(pmd, addr);
59                 else
60                         pte = pte_offset_map(pmd, addr);
61                 if (pte) {
62                         err = walk_pte_range_inner(pte, addr, end, walk);
63                         if (walk->mm != &init_mm && addr < TASK_SIZE)
64                                 pte_unmap(pte);
65                 }
66         } else {
67                 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
68                 if (pte) {
69                         err = walk_pte_range_inner(pte, addr, end, walk);
70                         pte_unmap_unlock(pte, ptl);
71                 }
72         }
73         if (!pte)
74                 walk->action = ACTION_AGAIN;
75         return err;
76 }
77
78 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
79                           struct mm_walk *walk)
80 {
81         pmd_t *pmd;
82         unsigned long next;
83         const struct mm_walk_ops *ops = walk->ops;
84         int err = 0;
85         int depth = real_depth(3);
86
87         pmd = pmd_offset(pud, addr);
88         do {
89 again:
90                 next = pmd_addr_end(addr, end);
91                 if (pmd_none(*pmd)) {
92                         if (ops->pte_hole)
93                                 err = ops->pte_hole(addr, next, depth, walk);
94                         if (err)
95                                 break;
96                         continue;
97                 }
98
99                 walk->action = ACTION_SUBTREE;
100
101                 /*
102                  * This implies that each ->pmd_entry() handler
103                  * needs to know about pmd_trans_huge() pmds
104                  */
105                 if (ops->pmd_entry)
106                         err = ops->pmd_entry(pmd, addr, next, walk);
107                 if (err)
108                         break;
109
110                 if (walk->action == ACTION_AGAIN)
111                         goto again;
112
113                 /*
114                  * Check this here so we only break down trans_huge
115                  * pages when we _need_ to
116                  */
117                 if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
118                     walk->action == ACTION_CONTINUE ||
119                     !(ops->pte_entry))
120                         continue;
121
122                 if (walk->vma)
123                         split_huge_pmd(walk->vma, pmd, addr);
124
125                 err = walk_pte_range(pmd, addr, next, walk);
126                 if (err)
127                         break;
128
129                 if (walk->action == ACTION_AGAIN)
130                         goto again;
131
132         } while (pmd++, addr = next, addr != end);
133
134         return err;
135 }
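
/*
 * Minimal illustrative sketch (not part of this file): a ->pmd_entry()
 * handler that deals with pmd_trans_huge() pmds itself and sets
 * ACTION_CONTINUE so that walk_pmd_range() above neither splits the pmd nor
 * descends to the pte level. The example_* names are hypothetical.
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_thps = walk->private;

        if (pmd_trans_huge(*pmd)) {
                (*nr_thps)++;
                /* Handled here: skip splitting and the pte-level walk. */
                walk->action = ACTION_CONTINUE;
        }
        /* The default ACTION_SUBTREE falls through to walk_pte_range(). */
        return 0;
}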
136
137 static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
138                           struct mm_walk *walk)
139 {
140         pud_t *pud;
141         unsigned long next;
142         const struct mm_walk_ops *ops = walk->ops;
143         int err = 0;
144         int depth = real_depth(2);
145
146         pud = pud_offset(p4d, addr);
147         do {
148  again:
149                 next = pud_addr_end(addr, end);
150                 if (pud_none(*pud)) {
151                         if (ops->pte_hole)
152                                 err = ops->pte_hole(addr, next, depth, walk);
153                         if (err)
154                                 break;
155                         continue;
156                 }
157
158                 walk->action = ACTION_SUBTREE;
159
160                 if (ops->pud_entry)
161                         err = ops->pud_entry(pud, addr, next, walk);
162                 if (err)
163                         break;
164
165                 if (walk->action == ACTION_AGAIN)
166                         goto again;
167
168                 if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
169                     walk->action == ACTION_CONTINUE ||
170                     !(ops->pmd_entry || ops->pte_entry))
171                         continue;
172
173                 if (walk->vma)
174                         split_huge_pud(walk->vma, pud, addr);
175                 if (pud_none(*pud))
176                         goto again;
177
178                 err = walk_pmd_range(pud, addr, next, walk);
179                 if (err)
180                         break;
181         } while (pud++, addr = next, addr != end);
182
183         return err;
184 }
185
186 static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
187                           struct mm_walk *walk)
188 {
189         p4d_t *p4d;
190         unsigned long next;
191         const struct mm_walk_ops *ops = walk->ops;
192         int err = 0;
193         int depth = real_depth(1);
194
195         p4d = p4d_offset(pgd, addr);
196         do {
197                 next = p4d_addr_end(addr, end);
198                 if (p4d_none_or_clear_bad(p4d)) {
199                         if (ops->pte_hole)
200                                 err = ops->pte_hole(addr, next, depth, walk);
201                         if (err)
202                                 break;
203                         continue;
204                 }
205                 if (ops->p4d_entry) {
206                         err = ops->p4d_entry(p4d, addr, next, walk);
207                         if (err)
208                                 break;
209                 }
210                 if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
211                         err = walk_pud_range(p4d, addr, next, walk);
212                 if (err)
213                         break;
214         } while (p4d++, addr = next, addr != end);
215
216         return err;
217 }
218
219 static int walk_pgd_range(unsigned long addr, unsigned long end,
220                           struct mm_walk *walk)
221 {
222         pgd_t *pgd;
223         unsigned long next;
224         const struct mm_walk_ops *ops = walk->ops;
225         int err = 0;
226
227         if (walk->pgd)
228                 pgd = walk->pgd + pgd_index(addr);
229         else
230                 pgd = pgd_offset(walk->mm, addr);
231         do {
232                 next = pgd_addr_end(addr, end);
233                 if (pgd_none_or_clear_bad(pgd)) {
234                         if (ops->pte_hole)
235                                 err = ops->pte_hole(addr, next, 0, walk);
236                         if (err)
237                                 break;
238                         continue;
239                 }
240                 if (ops->pgd_entry) {
241                         err = ops->pgd_entry(pgd, addr, next, walk);
242                         if (err)
243                                 break;
244                 }
245                 if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
246                         err = walk_p4d_range(pgd, addr, next, walk);
247                 if (err)
248                         break;
249         } while (pgd++, addr = next, addr != end);
250
251         return err;
252 }
253
254 #ifdef CONFIG_HUGETLB_PAGE
255 static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
256                                        unsigned long end)
257 {
258         unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
259         return boundary < end ? boundary : end;
260 }
261
262 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
263                               struct mm_walk *walk)
264 {
265         struct vm_area_struct *vma = walk->vma;
266         struct hstate *h = hstate_vma(vma);
267         unsigned long next;
268         unsigned long hmask = huge_page_mask(h);
269         unsigned long sz = huge_page_size(h);
270         pte_t *pte;
271         const struct mm_walk_ops *ops = walk->ops;
272         int err = 0;
273
274         hugetlb_vma_lock_read(vma);
275         do {
276                 next = hugetlb_entry_end(h, addr, end);
277                 pte = hugetlb_walk(vma, addr & hmask, sz);
278                 if (pte)
279                         err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
280                 else if (ops->pte_hole)
281                         err = ops->pte_hole(addr, next, -1, walk);
282                 if (err)
283                         break;
284         } while (addr = next, addr != end);
285         hugetlb_vma_unlock_read(vma);
286
287         return err;
288 }
289
290 #else /* CONFIG_HUGETLB_PAGE */
291 static int walk_hugetlb_range(unsigned long addr, unsigned long end,
292                               struct mm_walk *walk)
293 {
294         return 0;
295 }
296
297 #endif /* CONFIG_HUGETLB_PAGE */
298
299 /*
300  * Decide whether we really walk over the current vma on [@start, @end)
301  * or skip it via the returned value. Return 0 if we do walk over the
302  * current vma, and return 1 if we skip the vma. A negative value means an
303  * error, in which case we abort the current walk.
304  */
305 static int walk_page_test(unsigned long start, unsigned long end,
306                         struct mm_walk *walk)
307 {
308         struct vm_area_struct *vma = walk->vma;
309         const struct mm_walk_ops *ops = walk->ops;
310
311         if (ops->test_walk)
312                 return ops->test_walk(start, end, walk);
313
314         /*
315  * A vma(VM_PFNMAP) doesn't have any valid struct pages behind the VM_PFNMAP
316  * range, so we don't walk over it as we do for normal vmas. However,
317  * some callers are interested in handling hole ranges and they don't
318  * want to just ignore any single address range. Such users certainly
319          * define their ->pte_hole() callbacks, so let's delegate them to handle
320          * vma(VM_PFNMAP).
321          */
322         if (vma->vm_flags & VM_PFNMAP) {
323                 int err = 1;
324                 if (ops->pte_hole)
325                         err = ops->pte_hole(start, end, -1, walk);
326                 return err ? err : 1;
327         }
328         return 0;
329 }
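
/*
 * Minimal illustrative sketch (not part of this file): a ->test_walk()
 * callback following the convention above -- return 1 to skip the vma,
 * 0 to walk it, or a negative errno to abort the whole walk. The
 * example_* name is hypothetical.
 */
static int example_test_walk(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        /* Skip ranges that have no normal struct pages behind them. */
        if (walk->vma->vm_flags & (VM_PFNMAP | VM_IO))
                return 1;
        return 0;
}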
330
331 static int __walk_page_range(unsigned long start, unsigned long end,
332                         struct mm_walk *walk)
333 {
334         int err = 0;
335         struct vm_area_struct *vma = walk->vma;
336         const struct mm_walk_ops *ops = walk->ops;
337
338         if (ops->pre_vma) {
339                 err = ops->pre_vma(start, end, walk);
340                 if (err)
341                         return err;
342         }
343
344         if (is_vm_hugetlb_page(vma)) {
345                 if (ops->hugetlb_entry)
346                         err = walk_hugetlb_range(start, end, walk);
347         } else
348                 err = walk_pgd_range(start, end, walk);
349
350         if (ops->post_vma)
351                 ops->post_vma(walk);
352
353         return err;
354 }
355
356 static inline void process_mm_walk_lock(struct mm_struct *mm,
357                                         enum page_walk_lock walk_lock)
358 {
359         if (walk_lock == PGWALK_RDLOCK)
360                 mmap_assert_locked(mm);
361         else
362                 mmap_assert_write_locked(mm);
363 }
364
365 static inline void process_vma_walk_lock(struct vm_area_struct *vma,
366                                          enum page_walk_lock walk_lock)
367 {
368 #ifdef CONFIG_PER_VMA_LOCK
369         switch (walk_lock) {
370         case PGWALK_WRLOCK:
371                 vma_start_write(vma);
372                 break;
373         case PGWALK_WRLOCK_VERIFY:
374                 vma_assert_write_locked(vma);
375                 break;
376         case PGWALK_RDLOCK:
377                 /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
378                 break;
379         }
380 #endif
381 }
382
383 /**
384  * walk_page_range - walk page table with caller specific callbacks
385  * @mm:         mm_struct representing the target process of page table walk
386  * @start:      start address of the virtual address range
387  * @end:        end address of the virtual address range
388  * @ops:        operation to call during the walk
389  * @private:    private data for callbacks' usage
390  *
391  * Recursively walk the page table tree of the process represented by @mm
392  * within the virtual address range [@start, @end). During the walk, we can do
393  * some caller-specific work for each entry, by setting up pmd_entry(),
394  * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
395  * callbacks, the associated entries/pages are just ignored.
396  * The return values of these callbacks are commonly defined like below:
397  *
398  *  - 0  : succeeded in handling the current entry; if the end address has not
399  *         been reached yet, continue the walk.
400  *  - >0 : succeeded in handling the current entry, and return to the caller
401  *         with a caller-specific value.
402  *  - <0 : failed to handle the current entry, and return to the caller
403  *         with an error code.
404  *
405  * Before starting to walk the page tables, some callers want to check whether
406  * they really want to walk over the current vma, typically by checking
407  * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
408  * purpose.
409  *
410  * If operations need to be staged before and committed after a vma is walked,
411  * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
412  * since it is intended to handle commit-type operations, can't return any
413  * errors.
414  *
415  * struct mm_walk keeps current values of some common data like vma and pmd,
416  * which are useful for access from callbacks. If you want to pass some
417  * caller-specific data to callbacks, @private should be helpful.
418  *
419  * Locking:
420  *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
421  *   because these functions traverse the vma list and/or access vma data.
422  */
423 int walk_page_range(struct mm_struct *mm, unsigned long start,
424                 unsigned long end, const struct mm_walk_ops *ops,
425                 void *private)
426 {
427         int err = 0;
428         unsigned long next;
429         struct vm_area_struct *vma;
430         struct mm_walk walk = {
431                 .ops            = ops,
432                 .mm             = mm,
433                 .private        = private,
434         };
435
436         if (start >= end)
437                 return -EINVAL;
438
439         if (!walk.mm)
440                 return -EINVAL;
441
442         process_mm_walk_lock(walk.mm, ops->walk_lock);
443
444         vma = find_vma(walk.mm, start);
445         do {
446                 if (!vma) { /* after the last vma */
447                         walk.vma = NULL;
448                         next = end;
449                         if (ops->pte_hole)
450                                 err = ops->pte_hole(start, next, -1, &walk);
451                 } else if (start < vma->vm_start) { /* outside vma */
452                         walk.vma = NULL;
453                         next = min(end, vma->vm_start);
454                         if (ops->pte_hole)
455                                 err = ops->pte_hole(start, next, -1, &walk);
456                 } else { /* inside vma */
457                         process_vma_walk_lock(vma, ops->walk_lock);
458                         walk.vma = vma;
459                         next = min(end, vma->vm_end);
460                         vma = find_vma(mm, vma->vm_end);
461
462                         err = walk_page_test(start, next, &walk);
463                         if (err > 0) {
464                                 /*
465                                  * positive return values are purely for
466                                  * controlling the pagewalk, so should never
467                                  * be passed to the callers.
468                                  */
469                                 err = 0;
470                                 continue;
471                         }
472                         if (err < 0)
473                                 break;
474                         err = __walk_page_range(start, next, &walk);
475                 }
476                 if (err)
477                         break;
478         } while (start = next, start < end);
479         return err;
480 }
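
/*
 * Minimal usage sketch (not part of this file), tying the kernel-doc above
 * together: a ->pte_entry() callback plus a walk_page_range() call under the
 * mmap read lock, matching PGWALK_RDLOCK. All example_* names are
 * hypothetical.
 */
static int example_count_pte(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;

        if (pte_present(ptep_get(pte)))
                (*nr_present)++;
        return 0;
}

static const struct mm_walk_ops example_count_ops = {
        .pte_entry      = example_count_pte,
        .walk_lock      = PGWALK_RDLOCK,
};

static unsigned long example_count_present(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        unsigned long nr_present = 0;

        mmap_read_lock(mm);
        walk_page_range(mm, start, end, &example_count_ops, &nr_present);
        mmap_read_unlock(mm);
        return nr_present;
}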
481
482 /**
483  * walk_page_range_novma - walk a range of pagetables not backed by a vma
484  * @mm:         mm_struct representing the target process of page table walk
485  * @start:      start address of the virtual address range
486  * @end:        end address of the virtual address range
487  * @ops:        operation to call during the walk
488  * @pgd:        pgd to walk if different from mm->pgd
489  * @private:    private data for callbacks' usage
490  *
491  * Similar to walk_page_range() but can walk any page tables even if they are
492  * not backed by VMAs. Because 'unusual' entries may be walked, this function
493  * will also not lock the PTEs for the pte_entry() callback. This is useful for
494  * walking the kernel page tables or page tables for firmware.
495  *
496  * Note: Be careful when walking the kernel page tables; the caller may need to
497  * take other effective measures (the mmap lock may be insufficient) to prevent
498  * the intermediate kernel page tables belonging to the specified address range
499  * from being freed (e.g. by memory hot-remove).
500  */
501 int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
502                           unsigned long end, const struct mm_walk_ops *ops,
503                           pgd_t *pgd,
504                           void *private)
505 {
506         struct mm_walk walk = {
507                 .ops            = ops,
508                 .mm             = mm,
509                 .pgd            = pgd,
510                 .private        = private,
511                 .no_vma         = true
512         };
513
514         if (start >= end || !walk.mm)
515                 return -EINVAL;
516
517         /*
518          * 1) For walking the user virtual address space:
519          *
520          * The mmap lock protects the page walker from changes to the page
521          * tables during the walk.  However, a read lock is insufficient to
522          * protect those areas which don't have a VMA as munmap() detaches
523          * the VMAs before downgrading to a read lock and actually tearing
524          * down PTEs/page tables. In that case, the mmap write lock must
525          * be held.
526          *
527          * 2) For walking the kernel virtual address space:
528          *
529          * The intermediate kernel page tables are usually not freed, so
530          * the mmap read lock is sufficient. But there are some exceptions,
531          * e.g. memory hot-remove, in which case the mmap lock is insufficient
532          * to prevent the intermediate kernel page tables belonging to the
533          * specified address range from being freed. The caller should take
534          * other actions to prevent this race.
535          */
536         if (mm == &init_mm)
537                 mmap_assert_locked(walk.mm);
538         else
539                 mmap_assert_write_locked(walk.mm);
540
541         return walk_pgd_range(start, end, &walk);
542 }
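
/*
 * Minimal usage sketch (not part of this file): counting present leaf ptes
 * in a range of the kernel page tables via walk_page_range_novma(), in the
 * spirit of the ptdump users. The example_* names are hypothetical; as noted
 * above, the caller may need additional measures to keep the walked kernel
 * page tables from being freed.
 */
static int example_kernel_pte(pte_t *pte, unsigned long addr,
                              unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_mapped = walk->private;

        if (pte_present(ptep_get(pte)))
                (*nr_mapped)++;
        return 0;
}

static const struct mm_walk_ops example_kernel_ops = {
        .pte_entry      = example_kernel_pte,
};

static unsigned long example_count_kernel_ptes(unsigned long start,
                                               unsigned long end)
{
        unsigned long nr_mapped = 0;

        /* For init_mm a read lock satisfies the assertion above. */
        mmap_read_lock(&init_mm);
        walk_page_range_novma(&init_mm, start, end, &example_kernel_ops,
                              NULL, &nr_mapped);
        mmap_read_unlock(&init_mm);
        return nr_mapped;
}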
543
544 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
545                         unsigned long end, const struct mm_walk_ops *ops,
546                         void *private)
547 {
548         struct mm_walk walk = {
549                 .ops            = ops,
550                 .mm             = vma->vm_mm,
551                 .vma            = vma,
552                 .private        = private,
553         };
554
555         if (start >= end || !walk.mm)
556                 return -EINVAL;
557         if (start < vma->vm_start || end > vma->vm_end)
558                 return -EINVAL;
559
560         process_mm_walk_lock(walk.mm, ops->walk_lock);
561         process_vma_walk_lock(vma, ops->walk_lock);
562         return __walk_page_range(start, end, &walk);
563 }
564
565 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
566                 void *private)
567 {
568         struct mm_walk walk = {
569                 .ops            = ops,
570                 .mm             = vma->vm_mm,
571                 .vma            = vma,
572                 .private        = private,
573         };
574
575         if (!walk.mm)
576                 return -EINVAL;
577
578         process_mm_walk_lock(walk.mm, ops->walk_lock);
579         process_vma_walk_lock(vma, ops->walk_lock);
580         return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
581 }
582
583 /**
584  * walk_page_mapping - walk all memory areas mapped into a struct address_space.
585  * @mapping: Pointer to the struct address_space
586  * @first_index: First page offset in the address_space
587  * @nr: Number of incremental page offsets to cover
588  * @ops:        operation to call during the walk
589  * @private:    private data for callbacks' usage
590  *
591  * This function walks all memory areas mapped into a struct address_space.
592  * The walk is limited to only the given page-size index range, but if
593  * the index boundaries cross a huge page-table entry, that entry will be
594  * included.
595  *
596  * Also see walk_page_range() for additional information.
597  *
598  * Locking:
599  *   This function can't require that the struct mm_struct::mmap_lock is held,
600  *   since @mapping may be mapped by multiple processes. Instead
601  *   @mapping->i_mmap_rwsem must be held. This might have implications in the
602  *   callbacks, and it's up to the caller to ensure that the
603  *   struct mm_struct::mmap_lock is not needed.
604  *
605  *   Also this means that a caller can't rely on the struct
606  *   vm_area_struct::vm_flags to be constant across a call,
607  *   except for immutable flags. Callers requiring this shouldn't use
608  *   this function.
609  *
610  * Return: 0 on success, negative error code on failure, positive number on
611  * caller defined premature termination.
612  */
613 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
614                       pgoff_t nr, const struct mm_walk_ops *ops,
615                       void *private)
616 {
617         struct mm_walk walk = {
618                 .ops            = ops,
619                 .private        = private,
620         };
621         struct vm_area_struct *vma;
622         pgoff_t vba, vea, cba, cea;
623         unsigned long start_addr, end_addr;
624         int err = 0;
625
626         lockdep_assert_held(&mapping->i_mmap_rwsem);
627         vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
628                                   first_index + nr - 1) {
629                 /* Clip to the vma */
630                 vba = vma->vm_pgoff;
631                 vea = vba + vma_pages(vma);
632                 cba = first_index;
633                 cba = max(cba, vba);
634                 cea = first_index + nr;
635                 cea = min(cea, vea);
636
637                 start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
638                 end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
639                 if (start_addr >= end_addr)
640                         continue;
641
642                 walk.vma = vma;
643                 walk.mm = vma->vm_mm;
644
645                 err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
646                 if (err > 0) {
647                         err = 0;
648                         break;
649                 } else if (err < 0)
650                         break;
651
652                 err = __walk_page_range(start_addr, end_addr, &walk);
653                 if (err)
654                         break;
655         }
656
657         return err;
658 }
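
/*
 * Minimal usage sketch (not part of this file): walking every mapping of a
 * file range with walk_page_mapping() while holding i_mmap_rwsem, as
 * required by the kernel-doc above. The example_* names are hypothetical.
 */
static int example_mapping_pte(pte_t *pte, unsigned long addr,
                               unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_mapped = walk->private;

        if (pte_present(ptep_get(pte)))
                (*nr_mapped)++;
        return 0;
}

static const struct mm_walk_ops example_mapping_ops = {
        .pte_entry      = example_mapping_pte,
};

static unsigned long example_count_file_ptes(struct address_space *mapping,
                                             pgoff_t first_index, pgoff_t nr)
{
        unsigned long nr_mapped = 0;

        i_mmap_lock_read(mapping);
        walk_page_mapping(mapping, first_index, nr, &example_mapping_ops,
                          &nr_mapped);
        i_mmap_unlock_read(mapping);
        return nr_mapped;
}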
659
660 /**
661  * folio_walk_start - walk the page tables to a folio
662  * @fw: filled with information on success.
663  * @vma: the VMA.
664  * @addr: the virtual address to use for the page table walk.
665  * @flags: flags modifying which folios to walk to.
666  *
667  * Walk the page tables using @addr in a given @vma to a mapped folio and
668  * return the folio, making sure that the page table entry referenced by
669  * @addr cannot change until folio_walk_end() is called.
670  *
671  * By default, this function returns only folios that are not special (e.g., not
672  * the zeropage) and never returns folios that are supposed to be ignored by the
673  * VM as documented by vm_normal_page(). If requested, zeropages will be
674  * returned as well.
675  *
676  * By default, this function only considers present page table entries.
677  * If requested, it will also consider migration entries.
678  *
679  * If this function returns NULL it might either indicate "there is nothing" or
680  * "there is nothing suitable".
681  *
682  * On success, @fw is filled and the function returns the folio while the PTL
683  * is still held and folio_walk_end() must be called to clean up,
684  * releasing any held locks. The returned folio must *not* be used after the
685  * call to folio_walk_end(), unless a short-term folio reference is taken before
686  * that call.
687  *
688  * @fw->page will correspond to the page that is effectively referenced by
689  * @addr. However, for migration entries and shared zeropages @fw->page is
690  * set to NULL. Note that large folios might be mapped by multiple page table
691  * entries, and this function will always only lookup a single entry as
692  * specified by @addr, which might or might not cover more than a single page of
693  * the returned folio.
694  *
695  * This function must *not* be used as a naive replacement for
696  * get_user_pages() / pin_user_pages(), especially not to perform DMA or
697  * to carelessly modify page content. This function may *only* be used to grab
698  * short-term folio references, never to grab long-term folio references.
699  *
700  * Using the page table entry pointers in @fw for reading or modifying the
701  * entry should be avoided where possible: however, there might be valid
702  * use cases.
703  *
704  * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
705  * For example, PMD page table sharing might require prior unsharing. Also,
706  * logical hugetlb entries might span multiple physical page table entries,
707  * which *must* be modified in a single operation (set_huge_pte_at(),
708  * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
709  * not correspond to the first physical entry of a logical hugetlb entry.
710  *
711  * The mmap lock must be held in read mode.
712  *
713  * Return: folio pointer on success, otherwise NULL.
714  */
715 struct folio *folio_walk_start(struct folio_walk *fw,
716                 struct vm_area_struct *vma, unsigned long addr,
717                 folio_walk_flags_t flags)
718 {
719         unsigned long entry_size;
720         bool expose_page = true;
721         struct page *page;
722         pud_t *pudp, pud;
723         pmd_t *pmdp, pmd;
724         pte_t *ptep, pte;
725         spinlock_t *ptl;
726         pgd_t *pgdp;
727         p4d_t *p4dp;
728
729         mmap_assert_locked(vma->vm_mm);
730         vma_pgtable_walk_begin(vma);
731
732         if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
733                 goto not_found;
734
735         pgdp = pgd_offset(vma->vm_mm, addr);
736         if (pgd_none_or_clear_bad(pgdp))
737                 goto not_found;
738
739         p4dp = p4d_offset(pgdp, addr);
740         if (p4d_none_or_clear_bad(p4dp))
741                 goto not_found;
742
743         pudp = pud_offset(p4dp, addr);
744         pud = pudp_get(pudp);
745         if (pud_none(pud))
746                 goto not_found;
747         if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
748                 ptl = pud_lock(vma->vm_mm, pudp);
749                 pud = pudp_get(pudp);
750
751                 entry_size = PUD_SIZE;
752                 fw->level = FW_LEVEL_PUD;
753                 fw->pudp = pudp;
754                 fw->pud = pud;
755
756                 if (!pud_present(pud) || pud_devmap(pud)) {
757                         spin_unlock(ptl);
758                         goto not_found;
759                 } else if (!pud_leaf(pud)) {
760                         spin_unlock(ptl);
761                         goto pmd_table;
762                 }
763                 /*
764                  * TODO: vm_normal_page_pud() will be handy once we want to
765                  * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
766                  */
767                 page = pud_page(pud);
768                 goto found;
769         }
770
771 pmd_table:
772         VM_WARN_ON_ONCE(pud_leaf(*pudp));
773         pmdp = pmd_offset(pudp, addr);
774         pmd = pmdp_get_lockless(pmdp);
775         if (pmd_none(pmd))
776                 goto not_found;
777         if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
778                 ptl = pmd_lock(vma->vm_mm, pmdp);
779                 pmd = pmdp_get(pmdp);
780
781                 entry_size = PMD_SIZE;
782                 fw->level = FW_LEVEL_PMD;
783                 fw->pmdp = pmdp;
784                 fw->pmd = pmd;
785
786                 if (pmd_none(pmd)) {
787                         spin_unlock(ptl);
788                         goto not_found;
789                 } else if (!pmd_leaf(pmd)) {
790                         spin_unlock(ptl);
791                         goto pte_table;
792                 } else if (pmd_present(pmd)) {
793                         page = vm_normal_page_pmd(vma, addr, pmd);
794                         if (page) {
795                                 goto found;
796                         } else if ((flags & FW_ZEROPAGE) &&
797                                     is_huge_zero_pmd(pmd)) {
798                                 page = pfn_to_page(pmd_pfn(pmd));
799                                 expose_page = false;
800                                 goto found;
801                         }
802                 } else if ((flags & FW_MIGRATION) &&
803                            is_pmd_migration_entry(pmd)) {
804                         swp_entry_t entry = pmd_to_swp_entry(pmd);
805
806                         page = pfn_swap_entry_to_page(entry);
807                         expose_page = false;
808                         goto found;
809                 }
810                 spin_unlock(ptl);
811                 goto not_found;
812         }
813
814 pte_table:
815         VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
816         ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
817         if (!ptep)
818                 goto not_found;
819         pte = ptep_get(ptep);
820
821         entry_size = PAGE_SIZE;
822         fw->level = FW_LEVEL_PTE;
823         fw->ptep = ptep;
824         fw->pte = pte;
825
826         if (pte_present(pte)) {
827                 page = vm_normal_page(vma, addr, pte);
828                 if (page)
829                         goto found;
830                 if ((flags & FW_ZEROPAGE) &&
831                     is_zero_pfn(pte_pfn(pte))) {
832                         page = pfn_to_page(pte_pfn(pte));
833                         expose_page = false;
834                         goto found;
835                 }
836         } else if (!pte_none(pte)) {
837                 swp_entry_t entry = pte_to_swp_entry(pte);
838
839                 if ((flags & FW_MIGRATION) &&
840                     is_migration_entry(entry)) {
841                         page = pfn_swap_entry_to_page(entry);
842                         expose_page = false;
843                         goto found;
844                 }
845         }
846         pte_unmap_unlock(ptep, ptl);
847 not_found:
848         vma_pgtable_walk_end(vma);
849         return NULL;
850 found:
851         if (expose_page)
852                 /* Note: Offset from the mapped page, not the folio start. */
853                 fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
854         else
855                 fw->page = NULL;
856         fw->ptl = ptl;
857         return page_folio(page);
858 }