// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/*
 * We want to know the real level where an entry is located ignoring any
 * folding of levels which may be happening. For example if p4d is folded then
 * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
 */
static int real_depth(int depth)
{
        if (depth == 3 && PTRS_PER_PMD == 1)
                depth = 2;
        if (depth == 2 && PTRS_PER_PUD == 1)
                depth = 1;
        if (depth == 1 && PTRS_PER_P4D == 1)
                depth = 0;
        return depth;
}

static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                if (addr >= end - PAGE_SIZE)
                        break;
                addr += PAGE_SIZE;
                pte++;
        }
        return err;
}

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        spinlock_t *ptl;

        if (walk->no_vma) {
                /*
                 * pte_offset_map() might apply user-specific validation.
                 * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
                 * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
                 * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
                 */
                if (walk->mm == &init_mm || addr >= TASK_SIZE)
                        pte = pte_offset_kernel(pmd, addr);
                else
                        pte = pte_offset_map(pmd, addr);
                if (pte) {
                        err = walk_pte_range_inner(pte, addr, end, walk);
                        if (walk->mm != &init_mm && addr < TASK_SIZE)
                                pte_unmap(pte);
                }
        } else {
                pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
                if (pte) {
                        err = walk_pte_range_inner(pte, addr, end, walk);
                        pte_unmap_unlock(pte, ptl);
                }
        }
        if (!pte)
                walk->action = ACTION_AGAIN;
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(3);

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pmd(walk->vma, pmd, addr);

                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

        } while (pmd++, addr = next, addr != end);

        return err;
}

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(2);

        pud = pud_offset(p4d, addr);
        do {
again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }

                walk->action = ACTION_SUBTREE;

                if (ops->pud_entry)
                        err = ops->pud_entry(pud, addr, next, walk);
                if (err)
                        break;

                if (walk->action == ACTION_AGAIN)
                        goto again;

                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
                    walk->action == ACTION_CONTINUE ||
                    !(ops->pmd_entry || ops->pte_entry))
                        continue;

                if (walk->vma)
                        split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
        int depth = real_depth(1);

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, depth, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->p4d_entry) {
                        err = ops->p4d_entry(p4d, addr, next, walk);
                        if (err)
                                break;
                }
                if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        if (walk->pgd)
                pgd = walk->pgd + pgd_index(addr);
        else
                pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, 0, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pgd_entry) {
                        err = ops->pgd_entry(pgd, addr, next, walk);
                        if (err)
                                break;
                }
                if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        hugetlb_vma_lock_read(vma);
        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = hugetlb_walk(vma, addr & hmask, sz);
                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, -1, walk);
                if (err)
                        break;
        } while (addr = next, addr != end);
        hugetlb_vma_unlock_read(vma);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values mean
 * error, where we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
         * range, so we don't walk over it as we do for normal vmas. However,
         * some callers are interested in handling hole ranges and they don't
         * want to just ignore any single address range. Such users certainly
         * define their ->pte_hole() callbacks, so let's delegate them to handle
         * such ranges.
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;

                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, -1, walk);
                return err ? err : 1;
        }
        return 0;
}

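/*
 * Illustrative sketch (not part of the kernel sources): a ->test_walk()
 * callback that skips any VMA which is not anonymous. Returning 1 skips
 * the VMA, 0 walks it, and a negative value aborts the whole walk. The
 * callback name is hypothetical.
 *
 *	static int skip_non_anon_test(unsigned long start, unsigned long end,
 *				      struct mm_walk *walk)
 *	{
 *		return vma_is_anonymous(walk->vma) ? 0 : 1;
 *	}
 */
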
static int __walk_page_range(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->pre_vma) {
                err = ops->pre_vma(start, end, walk);
                if (err)
                        return err;
        }

        if (is_vm_hugetlb_page(vma)) {
                if (ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        if (ops->post_vma)
                ops->post_vma(walk);

        return err;
}

static inline void process_mm_walk_lock(struct mm_struct *mm,
                                        enum page_walk_lock walk_lock)
{
        if (walk_lock == PGWALK_RDLOCK)
                mmap_assert_locked(mm);
        else
                mmap_assert_write_locked(mm);
}

static inline void process_vma_walk_lock(struct vm_area_struct *vma,
                                         enum page_walk_lock walk_lock)
{
#ifdef CONFIG_PER_VMA_LOCK
        switch (walk_lock) {
        case PGWALK_WRLOCK:
                vma_start_write(vma);
                break;
        case PGWALK_WRLOCK_VERIFY:
                vma_assert_write_locked(vma);
                break;
        case PGWALK_RDLOCK:
                /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
                break;
        }
#endif
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined like below:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk the page tables, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * If operations need to be staged before and committed after a vma is walked,
 * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
 * since it is intended to handle commit-type operations, can't return any
 * errors.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
 *   because these functions traverse the vma list and/or access the vma's data.
 */

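/*
 * Illustrative sketch (not part of the kernel sources): counting present
 * PTEs in a range with a minimal mm_walk_ops. The callback and variable
 * names are hypothetical; mm, start and end are assumed to be in scope.
 *
 *	static int count_pte_entry(pte_t *pte, unsigned long addr,
 *				   unsigned long next, struct mm_walk *walk)
 *	{
 *		unsigned long *count = walk->private;
 *
 *		if (pte_present(ptep_get(pte)))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static const struct mm_walk_ops count_ops = {
 *		.pte_entry	= count_pte_entry,
 *		.walk_lock	= PGWALK_RDLOCK,
 *	};
 *
 *	unsigned long count = 0;
 *
 *	mmap_read_lock(mm);
 *	walk_page_range(mm, start, end, &count_ops, &count);
 *	mmap_read_unlock(mm);
 */
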
int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = mm,
                .private        = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        process_mm_walk_lock(walk.mm, ops->walk_lock);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                        if (ops->pte_hole)
                                err = ops->pte_hole(start, next, -1, &walk);
                } else { /* inside vma */
                        process_vma_walk_lock(vma, ops->walk_lock);
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = find_vma(mm, vma->vm_end);

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                        err = __walk_page_range(start, next, &walk);
                }
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}

/**
 * walk_page_range_novma - walk a range of pagetables not backed by a vma
 * @mm: mm_struct representing the target process of page table walk
 * @start: start address of the virtual address range
 * @end: end address of the virtual address range
 * @ops: operation to call during the walk
 * @pgd: pgd to walk if different from mm->pgd
 * @private: private data for callbacks' usage
 *
 * Similar to walk_page_range() but can walk any page tables even if they are
 * not backed by VMAs. Because 'unusual' entries may be walked this function
 * will also not lock the PTEs for the pte_entry() callback. This is useful for
 * walking the kernel page tables or page tables for firmware.
 *
 * Note: Take care when walking the kernel page tables; the caller may need to
 * take other effective measures (the mmap lock may be insufficient) to prevent
 * the intermediate kernel page tables belonging to the specified address range
 * from being freed (e.g. by memory hot-remove).
 */

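/*
 * Illustrative sketch (not part of the kernel sources): walking a kernel
 * virtual address range, ptdump-style. The dump_ops structure and its
 * dump_pte() callback are hypothetical; the read lock on init_mm is only
 * sufficient when the walked tables cannot be freed concurrently (see the
 * locking comment in the function below).
 *
 *	static const struct mm_walk_ops dump_ops = {
 *		.pte_entry	= dump_pte,
 *	};
 *
 *	mmap_read_lock(&init_mm);
 *	walk_page_range_novma(&init_mm, VMALLOC_START, VMALLOC_END,
 *			      &dump_ops, NULL, NULL);
 *	mmap_read_unlock(&init_mm);
 */
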
int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
                          unsigned long end, const struct mm_walk_ops *ops,
                          pgd_t *pgd,
                          void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = mm,
                .pgd            = pgd,
                .private        = private,
                .no_vma         = true
        };

        if (start >= end || !walk.mm)
                return -EINVAL;

        /*
         * 1) For walking the user virtual address space:
         *
         * The mmap lock protects the page walker from changes to the page
         * tables during the walk.  However a read lock is insufficient to
         * protect those areas which don't have a VMA as munmap() detaches
         * the VMAs before downgrading to a read lock and actually tearing
         * down PTEs/page tables. In which case, the mmap write lock should
         * be held.
         *
         * 2) For walking the kernel virtual address space:
         *
         * The kernel intermediate page tables are usually not freed, so
         * the mmap read lock is sufficient. But there are some exceptions,
         * e.g. memory hot-remove, in which case the mmap lock is insufficient
         * to prevent the intermediate kernel page tables belonging to the
         * specified address range from being freed. The caller should take
         * other actions to prevent this race.
         */
        if (mm == &init_mm)
                mmap_assert_locked(walk.mm);
        else
                mmap_assert_write_locked(walk.mm);

        return walk_pgd_range(start, end, &walk);
}

int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, const struct mm_walk_ops *ops,
                        void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = vma->vm_mm,
                .vma            = vma,
                .private        = private,
        };

        if (start >= end || !walk.mm)
                return -EINVAL;
        if (start < vma->vm_start || end > vma->vm_end)
                return -EINVAL;

        process_mm_walk_lock(walk.mm, ops->walk_lock);
        process_vma_walk_lock(vma, ops->walk_lock);
        return __walk_page_range(start, end, &walk);
}

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = vma->vm_mm,
                .vma            = vma,
                .private        = private,
        };

        if (!walk.mm)
                return -EINVAL;

        process_mm_walk_lock(walk.mm, ops->walk_lock);
        process_vma_walk_lock(vma, ops->walk_lock);
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}

/**
 * walk_page_mapping - walk all memory areas mapped into a struct address_space.
 * @mapping: Pointer to the struct address_space
 * @first_index: First page offset in the address_space
 * @nr: Number of incremental page offsets to cover
 * @ops: operation to call during the walk
 * @private: private data for callbacks' usage
 *
 * This function walks all memory areas mapped into a struct address_space.
 * The walk is limited to only the given page-size index range, but if
 * the index boundaries cross a huge page-table entry, that entry will be
 * included.
 *
 * Also see walk_page_range() for additional information.
 *
 * Locking:
 *   This function can't require that the struct mm_struct::mmap_lock is held,
 *   since @mapping may be mapped by multiple processes. Instead
 *   @mapping->i_mmap_rwsem must be held. This might have implications in the
 *   callbacks, and it's up to the caller to ensure that the
 *   struct mm_struct::mmap_lock is not needed.
 *
 *   Also this means that a caller can't rely on the struct
 *   vm_area_struct::vm_flags to be constant across a call,
 *   except for immutable flags. Callers requiring this shouldn't use
 *   this function.
 *
 * Return: 0 on success, negative error code on failure, positive number on
 * caller defined premature termination.
 */

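/*
 * Illustrative sketch (not part of the kernel sources): walking every
 * mapping of a file range while holding the i_mmap lock in read mode,
 * roughly how callers such as mm/mapping_dirty_helpers.c use this
 * interface. The my_wp_ops structure is hypothetical.
 *
 *	i_mmap_lock_read(mapping);
 *	err = walk_page_mapping(mapping, first_index, nr, &my_wp_ops, NULL);
 *	i_mmap_unlock_read(mapping);
 */
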
int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
                      pgoff_t nr, const struct mm_walk_ops *ops,
                      void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .private        = private,
        };
        struct vm_area_struct *vma;
        pgoff_t vba, vea, cba, cea;
        unsigned long start_addr, end_addr;
        int err = 0;

        lockdep_assert_held(&mapping->i_mmap_rwsem);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
                                  first_index + nr - 1) {
                /* Clip to the vma */
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma);
                cba = first_index;
                cba = max(cba, vba);
                cea = first_index + nr;
                cea = min(cea, vea);

                start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
                end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
                if (start_addr >= end_addr)
                        continue;

                walk.vma = vma;
                walk.mm = vma->vm_mm;

                err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
                if (err > 0) {
                        err = 0;
                        break;
                } else if (err < 0)
                        break;

                err = __walk_page_range(start_addr, end_addr, &walk);
                if (err)
                        break;
        }

        return err;
}

/**
 * folio_walk_start - walk the page tables to a folio
 * @fw: filled with information on success.
 * @vma: the VMA.
 * @addr: the virtual address to use for the page table walk.
 * @flags: flags modifying which folios to walk to.
 *
 * Walk the page tables using @addr in a given @vma to a mapped folio and
 * return the folio, making sure that the page table entry referenced by
 * @addr cannot change until folio_walk_end() has been called.
 *
 * By default, this function returns only folios that are not special (e.g., not
 * the zeropage) and never returns folios that are supposed to be ignored by the
 * VM as documented by vm_normal_page(). If requested, zeropages will be
 * returned as well.
 *
 * By default, this function only considers present page table entries.
 * If requested, it will also consider migration entries.
 *
 * If this function returns NULL it might either indicate "there is nothing" or
 * "there is nothing suitable".
 *
 * On success, @fw is filled and the function returns the folio while the PTL
 * is still held and folio_walk_end() must be called to clean up,
 * releasing any held locks. The returned folio must *not* be used after the
 * call to folio_walk_end(), unless a short-term folio reference is taken before
 * that call.
 *
 * @fw->page will correspond to the page that is effectively referenced by
 * @addr. However, for migration entries and shared zeropages @fw->page is
 * set to NULL. Note that large folios might be mapped by multiple page table
 * entries, and this function will always only look up a single entry as
 * specified by @addr, which might or might not cover more than a single page of
 * the returned folio.
 *
 * This function must *not* be used as a naive replacement for
 * get_user_pages() / pin_user_pages(), especially not to perform DMA or
 * to carelessly modify page content. This function may *only* be used to grab
 * short-term folio references, never to grab long-term folio references.
 *
 * Using the page table entry pointers in @fw for reading or modifying the
 * entry should be avoided where possible: however, there might be valid
 * reasons to do so.
 *
 * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
 * For example, PMD page table sharing might require prior unsharing. Also,
 * logical hugetlb entries might span multiple physical page table entries,
 * which *must* be modified in a single operation (set_huge_pte_at(),
 * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
 * not correspond to the first physical entry of a logical hugetlb entry.
 *
 * The mmap lock must be held in read mode.
 *
 * Return: folio pointer on success, otherwise NULL.
 */

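/*
 * Illustrative sketch (not part of the kernel sources): look up the folio
 * mapped at addr and take a short-term reference before dropping the page
 * table lock again. mm and addr are assumed to be in scope and the mmap
 * lock is assumed to be held in read mode.
 *
 *	struct folio_walk fw;
 *	struct folio *folio = NULL;
 *	struct vm_area_struct *vma;
 *
 *	vma = vma_lookup(mm, addr);
 *	if (vma) {
 *		folio = folio_walk_start(&fw, vma, addr, 0);
 *		if (folio) {
 *			folio_get(folio);
 *			folio_walk_end(&fw, vma);
 *			// ... use the folio, then folio_put(folio) ...
 *		}
 *	}
 */
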
struct folio *folio_walk_start(struct folio_walk *fw,
                struct vm_area_struct *vma, unsigned long addr,
                folio_walk_flags_t flags)
{
        unsigned long entry_size;
        bool expose_page = true;
        struct page *page;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;
        pgd_t *pgdp;
        p4d_t *p4dp;

        mmap_assert_locked(vma->vm_mm);
        vma_pgtable_walk_begin(vma);

        if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
                goto not_found;

        pgdp = pgd_offset(vma->vm_mm, addr);
        if (pgd_none_or_clear_bad(pgdp))
                goto not_found;

        p4dp = p4d_offset(pgdp, addr);
        if (p4d_none_or_clear_bad(p4dp))
                goto not_found;

        pudp = pud_offset(p4dp, addr);
        pud = pudp_get(pudp);
        if (pud_none(pud))
                goto not_found;
        if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
                ptl = pud_lock(vma->vm_mm, pudp);
                pud = pudp_get(pudp);

                entry_size = PUD_SIZE;
                fw->level = FW_LEVEL_PUD;
                fw->pudp = pudp;
                fw->pud = pud;

                if (!pud_present(pud) || pud_devmap(pud)) {
                        spin_unlock(ptl);
                        goto not_found;
                } else if (!pud_leaf(pud)) {
                        spin_unlock(ptl);
                        goto pmd_table;
                }
                /*
                 * TODO: vm_normal_page_pud() will be handy once we want to
                 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
                 */
                page = pud_page(pud);
                goto found;
        }

pmd_table:
        VM_WARN_ON_ONCE(pud_leaf(*pudp));
        pmdp = pmd_offset(pudp, addr);
        pmd = pmdp_get_lockless(pmdp);
        if (pmd_none(pmd))
                goto not_found;
        if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
                ptl = pmd_lock(vma->vm_mm, pmdp);
                pmd = pmdp_get(pmdp);

                entry_size = PMD_SIZE;
                fw->level = FW_LEVEL_PMD;
                fw->pmdp = pmdp;
                fw->pmd = pmd;

                if (pmd_none(pmd)) {
                        spin_unlock(ptl);
                        goto not_found;
                } else if (!pmd_leaf(pmd)) {
                        spin_unlock(ptl);
                        goto pte_table;
                } else if (pmd_present(pmd)) {
                        page = vm_normal_page_pmd(vma, addr, pmd);
                        if (page) {
                                goto found;
                        } else if ((flags & FW_ZEROPAGE) &&
                                    is_huge_zero_pmd(pmd)) {
                                page = pfn_to_page(pmd_pfn(pmd));
                                expose_page = false;
                                goto found;
                        }
                } else if ((flags & FW_MIGRATION) &&
                           is_pmd_migration_entry(pmd)) {
                        swp_entry_t entry = pmd_to_swp_entry(pmd);

                        page = pfn_swap_entry_to_page(entry);
                        expose_page = false;
                        goto found;
                }
                spin_unlock(ptl);
                goto not_found;
        }

pte_table:
        VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
        ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
        if (!ptep)
                goto not_found;
        pte = ptep_get(ptep);

        entry_size = PAGE_SIZE;
        fw->level = FW_LEVEL_PTE;
        fw->ptep = ptep;
        fw->pte = pte;

        if (pte_present(pte)) {
                page = vm_normal_page(vma, addr, pte);
                if (page)
                        goto found;
                if ((flags & FW_ZEROPAGE) &&
                    is_zero_pfn(pte_pfn(pte))) {
                        page = pfn_to_page(pte_pfn(pte));
                        expose_page = false;
                        goto found;
                }
        } else if (!pte_none(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                if ((flags & FW_MIGRATION) &&
                    is_migration_entry(entry)) {
                        page = pfn_swap_entry_to_page(entry);
                        expose_page = false;
                        goto found;
                }
        }
        pte_unmap_unlock(ptep, ptl);
not_found:
        vma_pgtable_walk_end(vma);
        return NULL;
found:
        if (expose_page)
                /* Note: Offset from the mapped page, not the folio start. */
                fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
        else
                fw->page = NULL;
        fw->ptl = ptl;
        return page_folio(page);
}