mm/madvise.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *      linux/mm/madvise.c
   4  *
   5  * Copyright (C) 1999  Linus Torvalds
   6  * Copyright (C) 2002  Christoph Hellwig
   7  */
   8
   9 #include <linux/mman.h>
  10 #include <linux/pagemap.h>
  11 #include <linux/syscalls.h>
  12 #include <linux/mempolicy.h>
  13 #include <linux/page-isolation.h>
  14 #include <linux/page_idle.h>
  15 #include <linux/userfaultfd_k.h>
  16 #include <linux/hugetlb.h>
  17 #include <linux/falloc.h>
  18 #include <linux/fadvise.h>
  19 #include <linux/sched.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/string.h>
  23 #include <linux/uio.h>
  24 #include <linux/ksm.h>
  25 #include <linux/fs.h>
  26 #include <linux/file.h>
  27 #include <linux/blkdev.h>
  28 #include <linux/backing-dev.h>
  29 #include <linux/pagewalk.h>
  30 #include <linux/swap.h>
  31 #include <linux/swapops.h>
  32 #include <linux/shmem_fs.h>
  33 #include <linux/mmu_notifier.h>
  34
  35 #include <asm/tlb.h>
  36
  37 #include "internal.h"
  38 #include "swap.h"
  39
  40 struct madvise_walk_private {
  41         struct mmu_gather *tlb;
  42         bool pageout;
  43 };
  44
  45 /*
  46  * Any behaviour which results in changes to the vma->vm_flags needs to
  47  * take mmap_lock for writing. Others, which simply traverse vmas, need
  48  * to only take it for reading.
  49  */
  50 static int madvise_need_mmap_write(int behavior)
  51 {
  52         switch (behavior) {
  53         case MADV_REMOVE:
  54         case MADV_WILLNEED:
  55         case MADV_DONTNEED:
  56         case MADV_DONTNEED_LOCKED:
  57         case MADV_COLD:
  58         case MADV_PAGEOUT:
  59         case MADV_FREE:
  60         case MADV_POPULATE_READ:
  61         case MADV_POPULATE_WRITE:
  62                 return 0;
  63         default:
  64                 /* be safe, default to 1. list exceptions explicitly */
  65                 return 1;
  66         }
  67 }
  68
  69 #ifdef CONFIG_ANON_VMA_NAME
  70 struct anon_vma_name *anon_vma_name_alloc(const char *name)
  71 {
  72         struct anon_vma_name *anon_name;
  73         size_t count;
  74
  75         /* Add 1 for NUL terminator at the end of the anon_name->name */
  76         count = strlen(name) + 1;
  77         anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
  78         if (anon_name) {
  79                 kref_init(&anon_name->kref);
  80                 memcpy(anon_name->name, name, count);
  81         }
  82
  83         return anon_name;
  84 }
  85
  86 void anon_vma_name_free(struct kref *kref)
  87 {
  88         struct anon_vma_name *anon_name =
  89                         container_of(kref, struct anon_vma_name, kref);
  90         kfree(anon_name);
  91 }
  92
  93 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
  94 {
  95         mmap_assert_locked(vma->vm_mm);
  96
  97         if (vma->vm_file)
  98                 return NULL;
  99
 100         return vma->anon_name;
 101 }
 102
 103 /* mmap_lock should be write-locked */
 104 static int replace_anon_vma_name(struct vm_area_struct *vma,
 105                                  struct anon_vma_name *anon_name)
 106 {
 107         struct anon_vma_name *orig_name = anon_vma_name(vma);
 108
 109         if (!anon_name) {
 110                 vma->anon_name = NULL;
 111                 anon_vma_name_put(orig_name);
 112                 return 0;
 113         }
 114
 115         if (anon_vma_name_eq(orig_name, anon_name))
 116                 return 0;
 117
 118         vma->anon_name = anon_vma_name_reuse(anon_name);
 119         anon_vma_name_put(orig_name);
 120
 121         return 0;
 122 }
 123 #else /* CONFIG_ANON_VMA_NAME */
 124 static int replace_anon_vma_name(struct vm_area_struct *vma,
 125                                  struct anon_vma_name *anon_name)
 126 {
 127         if (anon_name)
 128                 return -EINVAL;
 129
 130         return 0;
 131 }
 132 #endif /* CONFIG_ANON_VMA_NAME */
 133 /*
 134  * Update the vm_flags on region of a vma, splitting it or merging it as
 135  * necessary.  Must be called with mmap_sem held for writing;
 136  * Caller should ensure anon_name stability by raising its refcount even when
 137  * anon_name belongs to a valid vma because this function might free that vma.
 138  */
 139 static int madvise_update_vma(struct vm_area_struct *vma,
 140                               struct vm_area_struct **prev, unsigned long start,
 141                               unsigned long end, unsigned long new_flags,
 142                               struct anon_vma_name *anon_name)
 143 {
 144         struct mm_struct *mm = vma->vm_mm;
 145         int error;
 146         pgoff_t pgoff;
 147
 148         if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
 149                 *prev = vma;
 150                 return 0;
 151         }
 152
 153         pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 154         *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
 155                           vma->vm_file, pgoff, vma_policy(vma),
 156                           vma->vm_userfaultfd_ctx, anon_name);
 157         if (*prev) {
 158                 vma = *prev;
 159                 goto success;
 160         }
 161
 162         *prev = vma;
 163
 164         if (start != vma->vm_start) {
 165                 if (unlikely(mm->map_count >= sysctl_max_map_count))
 166                         return -ENOMEM;
 167                 error = __split_vma(mm, vma, start, 1);
 168                 if (error)
 169                         return error;
 170         }
 171
 172         if (end != vma->vm_end) {
 173                 if (unlikely(mm->map_count >= sysctl_max_map_count))
 174                         return -ENOMEM;
 175                 error = __split_vma(mm, vma, end, 0);
 176                 if (error)
 177                         return error;
 178         }
 179
 180 success:
 181         /*
 182          * vm_flags is protected by the mmap_lock held in write mode.
 183          */
 184         vma->vm_flags = new_flags;
 185         if (!vma->vm_file) {
 186                 error = replace_anon_vma_name(vma, anon_name);
 187                 if (error)
 188                         return error;
 189         }
 190
 191         return 0;
 192 }
 193
 194 #ifdef CONFIG_SWAP
 195 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 196         unsigned long end, struct mm_walk *walk)
 197 {
 198         pte_t *orig_pte;
 199         struct vm_area_struct *vma = walk->private;
 200         unsigned long index;
 201         struct swap_iocb *splug = NULL;
 202
 203         if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 204                 return 0;
 205
 206         for (index = start; index != end; index += PAGE_SIZE) {
 207                 pte_t pte;
 208                 swp_entry_t entry;
 209                 struct page *page;
 210                 spinlock_t *ptl;
 211
 212                 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
 213                 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
 214                 pte_unmap_unlock(orig_pte, ptl);
 215
 216                 if (pte_present(pte) || pte_none(pte))
 217                         continue;
 218                 entry = pte_to_swp_entry(pte);
 219                 if (unlikely(non_swap_entry(entry)))
 220                         continue;
 221
 222                 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
 223                                              vma, index, false, &splug);
 224                 if (page)
 225                         put_page(page);
 226         }
 227         swap_read_unplug(splug);
 228
 229         return 0;
 230 }
 231
 232 static const struct mm_walk_ops swapin_walk_ops = {
 233         .pmd_entry              = swapin_walk_pmd_entry,
 234 };
 235
 236 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 237                 unsigned long start, unsigned long end,
 238                 struct address_space *mapping)
 239 {
 240         XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
 241         pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
 242         struct page *page;
 243         struct swap_iocb *splug = NULL;
 244
 245         rcu_read_lock();
 246         xas_for_each(&xas, page, end_index) {
 247                 swp_entry_t swap;
 248
 249                 if (!xa_is_value(page))
 250                         continue;
 251                 xas_pause(&xas);
 252                 rcu_read_unlock();
 253
 254                 swap = radix_to_swp_entry(page);
 255                 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
 256                                              NULL, 0, false, &splug);
 257                 if (page)
 258                         put_page(page);
 259
 260                 rcu_read_lock();
 261         }
 262         rcu_read_unlock();
 263         swap_read_unplug(splug);
 264
 265         lru_add_drain();        /* Push any new pages onto the LRU now */
 266 }
 267 #endif          /* CONFIG_SWAP */
 268
 269 /*
 270  * Schedule all required I/O operations.  Do not wait for completion.
 271  */
 272 static long madvise_willneed(struct vm_area_struct *vma,
 273                              struct vm_area_struct **prev,
 274                              unsigned long start, unsigned long end)
 275 {
 276         struct mm_struct *mm = vma->vm_mm;
 277         struct file *file = vma->vm_file;
 278         loff_t offset;
 279
 280         *prev = vma;
 281 #ifdef CONFIG_SWAP
 282         if (!file) {
 283                 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
 284                 lru_add_drain(); /* Push any new pages onto the LRU now */
 285                 return 0;
 286         }
 287
 288         if (shmem_mapping(file->f_mapping)) {
 289                 force_shm_swapin_readahead(vma, start, end,
 290                                         file->f_mapping);
 291                 return 0;
 292         }
 293 #else
 294         if (!file)
 295                 return -EBADF;
 296 #endif
 297
 298         if (IS_DAX(file_inode(file))) {
 299                 /* no bad return value, but ignore advice */
 300                 return 0;
 301         }
 302
 303         /*
 304          * Filesystem's fadvise may need to take various locks.  We need to
 305          * explicitly grab a reference because the vma (and hence the
 306          * vma's reference to the file) can go away as soon as we drop
 307          * mmap_lock.
 308          */
 309         *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
 310         get_file(file);
 311         offset = (loff_t)(start - vma->vm_start)
 312                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 313         mmap_read_unlock(mm);
 314         vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
 315         fput(file);
 316         mmap_read_lock(mm);
 317         return 0;
 318 }
 319
 320 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 321                                 unsigned long addr, unsigned long end,
 322                                 struct mm_walk *walk)
 323 {
 324         struct madvise_walk_private *private = walk->private;
 325         struct mmu_gather *tlb = private->tlb;
 326         bool pageout = private->pageout;
 327         struct mm_struct *mm = tlb->mm;
 328         struct vm_area_struct *vma = walk->vma;
 329         pte_t *orig_pte, *pte, ptent;
 330         spinlock_t *ptl;
 331         struct page *page = NULL;
 332         LIST_HEAD(page_list);
 333
 334         if (fatal_signal_pending(current))
 335                 return -EINTR;
 336
 337 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 338         if (pmd_trans_huge(*pmd)) {
 339                 pmd_t orig_pmd;
 340                 unsigned long next = pmd_addr_end(addr, end);
 341
 342                 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
 343                 ptl = pmd_trans_huge_lock(pmd, vma);
 344                 if (!ptl)
 345                         return 0;
 346
 347                 orig_pmd = *pmd;
 348                 if (is_huge_zero_pmd(orig_pmd))
 349                         goto huge_unlock;
 350
 351                 if (unlikely(!pmd_present(orig_pmd))) {
 352                         VM_BUG_ON(thp_migration_supported() &&
 353                                         !is_pmd_migration_entry(orig_pmd));
 354                         goto huge_unlock;
 355                 }
 356
 357                 page = pmd_page(orig_pmd);
 358
 359                 /* Do not interfere with other mappings of this page */
 360                 if (page_mapcount(page) != 1)
 361                         goto huge_unlock;
 362
 363                 if (next - addr != HPAGE_PMD_SIZE) {
 364                         int err;
 365
 366                         get_page(page);
 367                         spin_unlock(ptl);
 368                         lock_page(page);
 369                         err = split_huge_page(page);
 370                         unlock_page(page);
 371                         put_page(page);
 372                         if (!err)
 373                                 goto regular_page;
 374                         return 0;
 375                 }
 376
 377                 if (pmd_young(orig_pmd)) {
 378                         pmdp_invalidate(vma, addr, pmd);
 379                         orig_pmd = pmd_mkold(orig_pmd);
 380
 381                         set_pmd_at(mm, addr, pmd, orig_pmd);
 382                         tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 383                 }
 384
 385                 ClearPageReferenced(page);
 386                 test_and_clear_page_young(page);
 387                 if (pageout) {
 388                         if (!isolate_lru_page(page)) {
 389                                 if (PageUnevictable(page))
 390                                         putback_lru_page(page);
 391                                 else
 392                                         list_add(&page->lru, &page_list);
 393                         }
 394                 } else
 395                         deactivate_page(page);
 396 huge_unlock:
 397                 spin_unlock(ptl);
 398                 if (pageout)
 399                         reclaim_pages(&page_list);
 400                 return 0;
 401         }
 402
 403 regular_page:
 404         if (pmd_trans_unstable(pmd))
 405                 return 0;
 406 #endif
 407         tlb_change_page_size(tlb, PAGE_SIZE);
 408         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 409         flush_tlb_batched_pending(mm);
 410         arch_enter_lazy_mmu_mode();
 411         for (; addr < end; pte++, addr += PAGE_SIZE) {
 412                 ptent = *pte;
 413
 414                 if (pte_none(ptent))
 415                         continue;
 416
 417                 if (!pte_present(ptent))
 418                         continue;
 419
 420                 page = vm_normal_page(vma, addr, ptent);
 421                 if (!page)
 422                         continue;
 423
 424                 /*
 425                  * Creating a THP page is expensive so split it only if we
 426                  * are sure it's worth. Split it if we are only owner.
 427                  */
 428                 if (PageTransCompound(page)) {
 429                         if (page_mapcount(page) != 1)
 430                                 break;
 431                         get_page(page);
 432                         if (!trylock_page(page)) {
 433                                 put_page(page);
 434                                 break;
 435                         }
 436                         pte_unmap_unlock(orig_pte, ptl);
 437                         if (split_huge_page(page)) {
 438                                 unlock_page(page);
 439                                 put_page(page);
 440                                 orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 441                                 break;
 442                         }
 443                         unlock_page(page);
 444                         put_page(page);
 445                         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 446                         pte--;
 447                         addr -= PAGE_SIZE;
 448                         continue;
 449                 }
 450
 451                 /* Do not interfere with other mappings of this page */
 452                 if (page_mapcount(page) != 1)
 453                         continue;
 454
 455                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
 456
 457                 if (pte_young(ptent)) {
 458                         ptent = ptep_get_and_clear_full(mm, addr, pte,
 459                                                         tlb->fullmm);
 460                         ptent = pte_mkold(ptent);
 461                         set_pte_at(mm, addr, pte, ptent);
 462                         tlb_remove_tlb_entry(tlb, pte, addr);
 463                 }
 464
 465                 /*
 466                  * We are deactivating a page for accelerating reclaiming.
 467                  * VM couldn't reclaim the page unless we clear PG_young.
 468                  * As a side effect, it makes confuse idle-page tracking
 469                  * because they will miss recent referenced history.
 470                  */
 471                 ClearPageReferenced(page);
 472                 test_and_clear_page_young(page);
 473                 if (pageout) {
 474                         if (!isolate_lru_page(page)) {
 475                                 if (PageUnevictable(page))
 476                                         putback_lru_page(page);
 477                                 else
 478                                         list_add(&page->lru, &page_list);
 479                         }
 480                 } else
 481                         deactivate_page(page);
 482         }
 483
 484         arch_leave_lazy_mmu_mode();
 485         pte_unmap_unlock(orig_pte, ptl);
 486         if (pageout)
 487                 reclaim_pages(&page_list);
 488         cond_resched();
 489
 490         return 0;
 491 }
 492
 493 static const struct mm_walk_ops cold_walk_ops = {
 494         .pmd_entry = madvise_cold_or_pageout_pte_range,
 495 };
 496
 497 static void madvise_cold_page_range(struct mmu_gather *tlb,
 498                              struct vm_area_struct *vma,
 499                              unsigned long addr, unsigned long end)
 500 {
 501         struct madvise_walk_private walk_private = {
 502                 .pageout = false,
 503                 .tlb = tlb,
 504         };
 505
 506         tlb_start_vma(tlb, vma);
 507         walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
 508         tlb_end_vma(tlb, vma);
 509 }
 510
 511 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 512 {
 513         return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
 514 }
 515
 516 static long madvise_cold(struct vm_area_struct *vma,
 517                         struct vm_area_struct **prev,
 518                         unsigned long start_addr, unsigned long end_addr)
 519 {
 520         struct mm_struct *mm = vma->vm_mm;
 521         struct mmu_gather tlb;
 522
 523         *prev = vma;
 524         if (!can_madv_lru_vma(vma))
 525                 return -EINVAL;
 526
 527         lru_add_drain();
 528         tlb_gather_mmu(&tlb, mm);
 529         madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
 530         tlb_finish_mmu(&tlb);
 531
 532         return 0;
 533 }
 534
 535 static void madvise_pageout_page_range(struct mmu_gather *tlb,
 536                              struct vm_area_struct *vma,
 537                              unsigned long addr, unsigned long end)
 538 {
 539         struct madvise_walk_private walk_private = {
 540                 .pageout = true,
 541                 .tlb = tlb,
 542         };
 543
 544         tlb_start_vma(tlb, vma);
 545         walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
 546         tlb_end_vma(tlb, vma);
 547 }
 548
 549 static inline bool can_do_pageout(struct vm_area_struct *vma)
 550 {
 551         if (vma_is_anonymous(vma))
 552                 return true;
 553         if (!vma->vm_file)
 554                 return false;
 555         /*
 556          * paging out pagecache only for non-anonymous mappings that correspond
 557          * to the files the calling process could (if tried) open for writing;
 558          * otherwise we'd be including shared non-exclusive mappings, which
 559          * opens a side channel.
 560          */
 561         return inode_owner_or_capable(&init_user_ns,
 562                                       file_inode(vma->vm_file)) ||
 563                file_permission(vma->vm_file, MAY_WRITE) == 0;
 564 }
 565
 566 static long madvise_pageout(struct vm_area_struct *vma,
 567                         struct vm_area_struct **prev,
 568                         unsigned long start_addr, unsigned long end_addr)
 569 {
 570         struct mm_struct *mm = vma->vm_mm;
 571         struct mmu_gather tlb;
 572
 573         *prev = vma;
 574         if (!can_madv_lru_vma(vma))
 575                 return -EINVAL;
 576
 577         if (!can_do_pageout(vma))
 578                 return 0;
 579
 580         lru_add_drain();
 581         tlb_gather_mmu(&tlb, mm);
 582         madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
 583         tlb_finish_mmu(&tlb);
 584
 585         return 0;
 586 }
 587
 588 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 589                                 unsigned long end, struct mm_walk *walk)
 590
 591 {
 592         struct mmu_gather *tlb = walk->private;
 593         struct mm_struct *mm = tlb->mm;
 594         struct vm_area_struct *vma = walk->vma;
 595         spinlock_t *ptl;
 596         pte_t *orig_pte, *pte, ptent;
 597         struct page *page;
 598         int nr_swap = 0;
 599         unsigned long next;
 600
 601         next = pmd_addr_end(addr, end);
 602         if (pmd_trans_huge(*pmd))
 603                 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
 604                         goto next;
 605
 606         if (pmd_trans_unstable(pmd))
 607                 return 0;
 608
 609         tlb_change_page_size(tlb, PAGE_SIZE);
 610         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 611         flush_tlb_batched_pending(mm);
 612         arch_enter_lazy_mmu_mode();
 613         for (; addr != end; pte++, addr += PAGE_SIZE) {
 614                 ptent = *pte;
 615
 616                 if (pte_none(ptent))
 617                         continue;
 618                 /*
 619                  * If the pte has swp_entry, just clear page table to
 620                  * prevent swap-in which is more expensive rather than
 621                  * (page allocation + zeroing).
 622                  */
 623                 if (!pte_present(ptent)) {
 624                         swp_entry_t entry;
 625
 626                         entry = pte_to_swp_entry(ptent);
 627                         if (non_swap_entry(entry))
 628                                 continue;
 629                         nr_swap--;
 630                         free_swap_and_cache(entry);
 631                         pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 632                         continue;
 633                 }
 634
 635                 page = vm_normal_page(vma, addr, ptent);
 636                 if (!page)
 637                         continue;
 638
 639                 /*
 640                  * If pmd isn't transhuge but the page is THP and
 641                  * is owned by only this process, split it and
 642                  * deactivate all pages.
 643                  */
 644                 if (PageTransCompound(page)) {
 645                         if (page_mapcount(page) != 1)
 646                                 goto out;
 647                         get_page(page);
 648                         if (!trylock_page(page)) {
 649                                 put_page(page);
 650                                 goto out;
 651                         }
 652                         pte_unmap_unlock(orig_pte, ptl);
 653                         if (split_huge_page(page)) {
 654                                 unlock_page(page);
 655                                 put_page(page);
 656                                 orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 657                                 goto out;
 658                         }
 659                         unlock_page(page);
 660                         put_page(page);
 661                         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 662                         pte--;
 663                         addr -= PAGE_SIZE;
 664                         continue;
 665                 }
 666
 667                 VM_BUG_ON_PAGE(PageTransCompound(page), page);
 668
 669                 if (PageSwapCache(page) || PageDirty(page)) {
 670                         if (!trylock_page(page))
 671                                 continue;
 672                         /*
 673                          * If page is shared with others, we couldn't clear
 674                          * PG_dirty of the page.
 675                          */
 676                         if (page_mapcount(page) != 1) {
 677                                 unlock_page(page);
 678                                 continue;
 679                         }
 680
 681                         if (PageSwapCache(page) && !try_to_free_swap(page)) {
 682                                 unlock_page(page);
 683                                 continue;
 684                         }
 685
 686                         ClearPageDirty(page);
 687                         unlock_page(page);
 688                 }
 689
 690                 if (pte_young(ptent) || pte_dirty(ptent)) {
 691                         /*
 692                          * Some of architecture(ex, PPC) don't update TLB
 693                          * with set_pte_at and tlb_remove_tlb_entry so for
 694                          * the portability, remap the pte with old|clean
 695                          * after pte clearing.
 696                          */
 697                         ptent = ptep_get_and_clear_full(mm, addr, pte,
 698                                                         tlb->fullmm);
 699
 700                         ptent = pte_mkold(ptent);
 701                         ptent = pte_mkclean(ptent);
 702                         set_pte_at(mm, addr, pte, ptent);
 703                         tlb_remove_tlb_entry(tlb, pte, addr);
 704                 }
 705                 mark_page_lazyfree(page);
 706         }
 707 out:
 708         if (nr_swap) {
 709                 if (current->mm == mm)
 710                         sync_mm_rss(mm);
 711
 712                 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
 713         }
 714         arch_leave_lazy_mmu_mode();
 715         pte_unmap_unlock(orig_pte, ptl);
 716         cond_resched();
 717 next:
 718         return 0;
 719 }
 720
 721 static const struct mm_walk_ops madvise_free_walk_ops = {
 722         .pmd_entry              = madvise_free_pte_range,
 723 };
 724
 725 static int madvise_free_single_vma(struct vm_area_struct *vma,
 726                         unsigned long start_addr, unsigned long end_addr)
 727 {
 728         struct mm_struct *mm = vma->vm_mm;
 729         struct mmu_notifier_range range;
 730         struct mmu_gather tlb;
 731
 732         /* MADV_FREE works for only anon vma at the moment */
 733         if (!vma_is_anonymous(vma))
 734                 return -EINVAL;
 735
 736         range.start = max(vma->vm_start, start_addr);
 737         if (range.start >= vma->vm_end)
 738                 return -EINVAL;
 739         range.end = min(vma->vm_end, end_addr);
 740         if (range.end <= vma->vm_start)
 741                 return -EINVAL;
 742         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 743                                 range.start, range.end);
 744
 745         lru_add_drain();
 746         tlb_gather_mmu(&tlb, mm);
 747         update_hiwater_rss(mm);
 748
 749         mmu_notifier_invalidate_range_start(&range);
 750         tlb_start_vma(&tlb, vma);
 751         walk_page_range(vma->vm_mm, range.start, range.end,
 752                         &madvise_free_walk_ops, &tlb);
 753         tlb_end_vma(&tlb, vma);
 754         mmu_notifier_invalidate_range_end(&range);
 755         tlb_finish_mmu(&tlb);
 756
 757         return 0;
 758 }
 759
 760 /*
 761  * Application no longer needs these pages.  If the pages are dirty,
 762  * it's OK to just throw them away.  The app will be more careful about
 763  * data it wants to keep.  Be sure to free swap resources too.  The
 764  * zap_page_range call sets things up for shrink_active_list to actually free
 765  * these pages later if no one else has touched them in the meantime,
 766  * although we could add these pages to a global reuse list for
 767  * shrink_active_list to pick up before reclaiming other pages.
 768  *
 769  * NB: This interface discards data rather than pushes it out to swap,
 770  * as some implementations do.  This has performance implications for
 771  * applications like large transactional databases which want to discard
 772  * pages in anonymous maps after committing to backing store the data
 773  * that was kept in them.  There is no reason to write this data out to
 774  * the swap area if the application is discarding it.
 775  *
 776  * An interface that causes the system to free clean pages and flush
 777  * dirty pages is already available as msync(MS_INVALIDATE).
 778  */
 779 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 780                                         unsigned long start, unsigned long end)
 781 {
 782         zap_page_range(vma, start, end - start);
 783         return 0;
 784 }
 785
 786 static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
 787                                             unsigned long start,
 788                                             unsigned long *end,
 789                                             int behavior)
 790 {
 791         if (!is_vm_hugetlb_page(vma)) {
 792                 unsigned int forbidden = VM_PFNMAP;
 793
 794                 if (behavior != MADV_DONTNEED_LOCKED)
 795                         forbidden |= VM_LOCKED;
 796
 797                 return !(vma->vm_flags & forbidden);
 798         }
 799
 800         if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
 801                 return false;
 802         if (start & ~huge_page_mask(hstate_vma(vma)))
 803                 return false;
 804
 805         *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
 806         return true;
 807 }
 808
 809 static long madvise_dontneed_free(struct vm_area_struct *vma,
 810                                   struct vm_area_struct **prev,
 811                                   unsigned long start, unsigned long end,
 812                                   int behavior)
 813 {
 814         struct mm_struct *mm = vma->vm_mm;
 815
 816         *prev = vma;
 817         if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
 818                 return -EINVAL;
 819
 820         if (!userfaultfd_remove(vma, start, end)) {
 821                 *prev = NULL; /* mmap_lock has been dropped, prev is stale */
 822
 823                 mmap_read_lock(mm);
 824                 vma = find_vma(mm, start);
 825                 if (!vma)
 826                         return -ENOMEM;
 827                 if (start < vma->vm_start) {
 828                         /*
 829                          * This "vma" under revalidation is the one
 830                          * with the lowest vma->vm_start where start
 831                          * is also < vma->vm_end. If start <
 832                          * vma->vm_start it means an hole materialized
 833                          * in the user address space within the
 834                          * virtual range passed to MADV_DONTNEED
 835                          * or MADV_FREE.
 836                          */
 837                         return -ENOMEM;
 838                 }
 839                 /*
 840                  * Potential end adjustment for hugetlb vma is OK as
 841                  * the check below keeps end within vma.
 842                  */
 843                 if (!madvise_dontneed_free_valid_vma(vma, start, &end,
 844                                                      behavior))
 845                         return -EINVAL;
 846                 if (end > vma->vm_end) {
 847                         /*
 848                          * Don't fail if end > vma->vm_end. If the old
 849                          * vma was split while the mmap_lock was
 850                          * released the effect of the concurrent
 851                          * operation may not cause madvise() to
 852                          * have an undefined result. There may be an
 853                          * adjacent next vma that we'll walk
 854                          * next. userfaultfd_remove() will generate an
 855                          * UFFD_EVENT_REMOVE repetition on the
 856                          * end-vma->vm_end range, but the manager can
 857                          * handle a repetition fine.
 858                          */
 859                         end = vma->vm_end;
 860                 }
 861                 VM_WARN_ON(start >= end);
 862         }
 863
 864         if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
 865                 return madvise_dontneed_single_vma(vma, start, end);
 866         else if (behavior == MADV_FREE)
 867                 return madvise_free_single_vma(vma, start, end);
 868         else
 869                 return -EINVAL;
 870 }
 871
 872 static long madvise_populate(struct vm_area_struct *vma,
 873                              struct vm_area_struct **prev,
 874                              unsigned long start, unsigned long end,
 875                              int behavior)
 876 {
 877         const bool write = behavior == MADV_POPULATE_WRITE;
 878         struct mm_struct *mm = vma->vm_mm;
 879         unsigned long tmp_end;
 880         int locked = 1;
 881         long pages;
 882
 883         *prev = vma;
 884
 885         while (start < end) {
 886                 /*
 887                  * We might have temporarily dropped the lock. For example,
 888                  * our VMA might have been split.
 889                  */
 890                 if (!vma || start >= vma->vm_end) {
 891                         vma = vma_lookup(mm, start);
 892                         if (!vma)
 893                                 return -ENOMEM;
 894                 }
 895
 896                 tmp_end = min_t(unsigned long, end, vma->vm_end);
 897                 /* Populate (prefault) page tables readable/writable. */
 898                 pages = faultin_vma_page_range(vma, start, tmp_end, write,
 899                                                &locked);
 900                 if (!locked) {
 901                         mmap_read_lock(mm);
 902                         locked = 1;
 903                         *prev = NULL;
 904                         vma = NULL;
 905                 }
 906                 if (pages < 0) {
 907                         switch (pages) {
 908                         case -EINTR:
 909                                 return -EINTR;
 910                         case -EINVAL: /* Incompatible mappings / permissions. */
 911                                 return -EINVAL;
 912                         case -EHWPOISON:
 913                                 return -EHWPOISON;
 914                         case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
 915                                 return -EFAULT;
 916                         default:
 917                                 pr_warn_once("%s: unhandled return value: %ld\n",
 918                                              __func__, pages);
 919                                 fallthrough;
 920                         case -ENOMEM:
 921                                 return -ENOMEM;
 922                         }
 923                 }
 924                 start += pages * PAGE_SIZE;
 925         }
 926         return 0;
 927 }
 928
 929 /*
 930  * Application wants to free up the pages and associated backing store.
 931  * This is effectively punching a hole into the middle of a file.
 932  */
 933 static long madvise_remove(struct vm_area_struct *vma,
 934                                 struct vm_area_struct **prev,
 935                                 unsigned long start, unsigned long end)
 936 {
 937         loff_t offset;
 938         int error;
 939         struct file *f;
 940         struct mm_struct *mm = vma->vm_mm;
 941
 942         *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
 943
 944         if (vma->vm_flags & VM_LOCKED)
 945                 return -EINVAL;
 946
 947         f = vma->vm_file;
 948
 949         if (!f || !f->f_mapping || !f->f_mapping->host) {
 950                         return -EINVAL;
 951         }
 952
 953         if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
 954                 return -EACCES;
 955
 956         offset = (loff_t)(start - vma->vm_start)
 957                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 958
 959         /*
 960          * Filesystem's fallocate may need to take i_rwsem.  We need to
 961          * explicitly grab a reference because the vma (and hence the
 962          * vma's reference to the file) can go away as soon as we drop
 963          * mmap_lock.
 964          */
 965         get_file(f);
 966         if (userfaultfd_remove(vma, start, end)) {
 967                 /* mmap_lock was not released by userfaultfd_remove() */
 968                 mmap_read_unlock(mm);
 969         }
 970         error = vfs_fallocate(f,
 971                                 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 972                                 offset, end - start);
 973         fput(f);
 974         mmap_read_lock(mm);
 975         return error;
 976 }
 977
 978 /*
 979  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 980  * will handle splitting a vm area into separate areas, each area with its own
 981  * behavior.
 982  */
 983 static int madvise_vma_behavior(struct vm_area_struct *vma,
 984                                 struct vm_area_struct **prev,
 985                                 unsigned long start, unsigned long end,
 986                                 unsigned long behavior)
 987 {
 988         int error;
 989         struct anon_vma_name *anon_name;
 990         unsigned long new_flags = vma->vm_flags;
 991
 992         switch (behavior) {
 993         case MADV_REMOVE:
 994                 return madvise_remove(vma, prev, start, end);
 995         case MADV_WILLNEED:
 996                 return madvise_willneed(vma, prev, start, end);
 997         case MADV_COLD:
 998                 return madvise_cold(vma, prev, start, end);
 999         case MADV_PAGEOUT:
1000                 return madvise_pageout(vma, prev, start, end);
1001         case MADV_FREE:
1002         case MADV_DONTNEED:
1003         case MADV_DONTNEED_LOCKED:
1004                 return madvise_dontneed_free(vma, prev, start, end, behavior);
1005         case MADV_POPULATE_READ:
1006         case MADV_POPULATE_WRITE:
1007                 return madvise_populate(vma, prev, start, end, behavior);
1008         case MADV_NORMAL:
1009                 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
1010                 break;
1011         case MADV_SEQUENTIAL:
1012                 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
1013                 break;
1014         case MADV_RANDOM:
1015                 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
1016                 break;
1017         case MADV_DONTFORK:
1018                 new_flags |= VM_DONTCOPY;
1019                 break;
1020         case MADV_DOFORK:
1021                 if (vma->vm_flags & VM_IO)
1022                         return -EINVAL;
1023                 new_flags &= ~VM_DONTCOPY;
1024                 break;
1025         case MADV_WIPEONFORK:
1026                 /* MADV_WIPEONFORK is only supported on anonymous memory. */
1027                 if (vma->vm_file || vma->vm_flags & VM_SHARED)
1028                         return -EINVAL;
1029                 new_flags |= VM_WIPEONFORK;
1030                 break;
1031         case MADV_KEEPONFORK:
1032                 new_flags &= ~VM_WIPEONFORK;
1033                 break;
1034         case MADV_DONTDUMP:
1035                 new_flags |= VM_DONTDUMP;
1036                 break;
1037         case MADV_DODUMP:
1038                 if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
1039                         return -EINVAL;
1040                 new_flags &= ~VM_DONTDUMP;
1041                 break;
1042         case MADV_MERGEABLE:
1043         case MADV_UNMERGEABLE:
1044                 error = ksm_madvise(vma, start, end, behavior, &new_flags);
1045                 if (error)
1046                         goto out;
1047                 break;
1048         case MADV_HUGEPAGE:
1049         case MADV_NOHUGEPAGE:
1050                 error = hugepage_madvise(vma, &new_flags, behavior);
1051                 if (error)
1052                         goto out;
1053                 break;
1054         }
1055
1056         anon_name = anon_vma_name(vma);
1057         anon_vma_name_get(anon_name);
1058         error = madvise_update_vma(vma, prev, start, end, new_flags,
1059                                    anon_name);
1060         anon_vma_name_put(anon_name);
1061
1062 out:
1063         /*
1064          * madvise() returns EAGAIN if kernel resources, such as
1065          * slab, are temporarily unavailable.
1066          */
1067         if (error == -ENOMEM)
1068                 error = -EAGAIN;
1069         return error;
1070 }
1071
1072 #ifdef CONFIG_MEMORY_FAILURE
1073 /*
1074  * Error injection support for memory error handling.
1075  */
1076 static int madvise_inject_error(int behavior,
1077                 unsigned long start, unsigned long end)
1078 {
1079         unsigned long size;
1080
1081         if (!capable(CAP_SYS_ADMIN))
1082                 return -EPERM;
1083
1084
1085         for (; start < end; start += size) {
1086                 unsigned long pfn;
1087                 struct page *page;
1088                 int ret;
1089
1090                 ret = get_user_pages_fast(start, 1, 0, &page);
1091                 if (ret != 1)
1092                         return ret;
1093                 pfn = page_to_pfn(page);
1094
1095                 /*
1096                  * When soft offlining hugepages, after migrating the page
1097                  * we dissolve it, therefore in the second loop "page" will
1098                  * no longer be a compound page.
1099                  */
1100                 size = page_size(compound_head(page));
1101
1102                 if (behavior == MADV_SOFT_OFFLINE) {
1103                         pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1104                                  pfn, start);
1105                         ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1106                 } else {
1107                         pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1108                                  pfn, start);
1109                         ret = memory_failure(pfn, MF_COUNT_INCREASED);
1110                         if (ret == -EOPNOTSUPP)
1111                                 ret = 0;
1112                 }
1113
1114                 if (ret)
1115                         return ret;
1116         }
1117
1118         return 0;
1119 }
1120 #endif
1121
1122 static bool
1123 madvise_behavior_valid(int behavior)
1124 {
1125         switch (behavior) {
1126         case MADV_DOFORK:
1127         case MADV_DONTFORK:
1128         case MADV_NORMAL:
1129         case MADV_SEQUENTIAL:
1130         case MADV_RANDOM:
1131         case MADV_REMOVE:
1132         case MADV_WILLNEED:
1133         case MADV_DONTNEED:
1134         case MADV_DONTNEED_LOCKED:
1135         case MADV_FREE:
1136         case MADV_COLD:
1137         case MADV_PAGEOUT:
1138         case MADV_POPULATE_READ:
1139         case MADV_POPULATE_WRITE:
1140 #ifdef CONFIG_KSM
1141         case MADV_MERGEABLE:
1142         case MADV_UNMERGEABLE:
1143 #endif
1144 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1145         case MADV_HUGEPAGE:
1146         case MADV_NOHUGEPAGE:
1147 #endif
1148         case MADV_DONTDUMP:
1149         case MADV_DODUMP:
1150         case MADV_WIPEONFORK:
1151         case MADV_KEEPONFORK:
1152 #ifdef CONFIG_MEMORY_FAILURE
1153         case MADV_SOFT_OFFLINE:
1154         case MADV_HWPOISON:
1155 #endif
1156                 return true;
1157
1158         default:
1159                 return false;
1160         }
1161 }
1162
1163 static bool
1164 process_madvise_behavior_valid(int behavior)
1165 {
1166         switch (behavior) {
1167         case MADV_COLD:
1168         case MADV_PAGEOUT:
1169         case MADV_WILLNEED:
1170                 return true;
1171         default:
1172                 return false;
1173         }
1174 }
1175
1176 /*
1177  * Walk the vmas in range [start,end), and call the visit function on each one.
1178  * The visit function will get start and end parameters that cover the overlap
1179  * between the current vma and the original range.  Any unmapped regions in the
1180  * original range will result in this function returning -ENOMEM while still
1181  * calling the visit function on all of the existing vmas in the range.
1182  * Must be called with the mmap_lock held for reading or writing.
1183  */
1184 static
1185 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1186                       unsigned long end, unsigned long arg,
1187                       int (*visit)(struct vm_area_struct *vma,
1188                                    struct vm_area_struct **prev, unsigned long start,
1189                                    unsigned long end, unsigned long arg))
1190 {
1191         struct vm_area_struct *vma;
1192         struct vm_area_struct *prev;
1193         unsigned long tmp;
1194         int unmapped_error = 0;
1195
1196         /*
1197          * If the interval [start,end) covers some unmapped address
1198          * ranges, just ignore them, but return -ENOMEM at the end.
1199          * - different from the way of handling in mlock etc.
1200          */
1201         vma = find_vma_prev(mm, start, &prev);
1202         if (vma && start > vma->vm_start)
1203                 prev = vma;
1204
1205         for (;;) {
1206                 int error;
1207
1208                 /* Still start < end. */
1209                 if (!vma)
1210                         return -ENOMEM;
1211
1212                 /* Here start < (end|vma->vm_end). */
1213                 if (start < vma->vm_start) {
1214                         unmapped_error = -ENOMEM;
1215                         start = vma->vm_start;
1216                         if (start >= end)
1217                                 break;
1218                 }
1219
1220                 /* Here vma->vm_start <= start < (end|vma->vm_end) */
1221                 tmp = vma->vm_end;
1222                 if (end < tmp)
1223                         tmp = end;
1224
1225                 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1226                 error = visit(vma, &prev, start, tmp, arg);
1227                 if (error)
1228                         return error;
1229                 start = tmp;
1230                 if (prev && start < prev->vm_end)
1231                         start = prev->vm_end;
1232                 if (start >= end)
1233                         break;
1234                 if (prev)
1235                         vma = prev->vm_next;
1236                 else    /* madvise_remove dropped mmap_lock */
1237                         vma = find_vma(mm, start);
1238         }
1239
1240         return unmapped_error;
1241 }
1242
1243 #ifdef CONFIG_ANON_VMA_NAME
1244 static int madvise_vma_anon_name(struct vm_area_struct *vma,
1245                                  struct vm_area_struct **prev,
1246                                  unsigned long start, unsigned long end,
1247                                  unsigned long anon_name)
1248 {
1249         int error;
1250
1251         /* Only anonymous mappings can be named */
1252         if (vma->vm_file)
1253                 return -EBADF;
1254
1255         error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
1256                                    (struct anon_vma_name *)anon_name);
1257
1258         /*
1259          * madvise() returns EAGAIN if kernel resources, such as
1260          * slab, are temporarily unavailable.
1261          */
1262         if (error == -ENOMEM)
1263                 error = -EAGAIN;
1264         return error;
1265 }
1266
1267 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
1268                           unsigned long len_in, struct anon_vma_name *anon_name)
1269 {
1270         unsigned long end;
1271         unsigned long len;
1272
1273         if (start & ~PAGE_MASK)
1274                 return -EINVAL;
1275         len = (len_in + ~PAGE_MASK) & PAGE_MASK;
1276
1277         /* Check to see whether len was rounded up from small -ve to zero */
1278         if (len_in && !len)
1279                 return -EINVAL;
1280
1281         end = start + len;
1282         if (end < start)
1283                 return -EINVAL;
1284
1285         if (end == start)
1286                 return 0;
1287
1288         return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
1289                                  madvise_vma_anon_name);
1290 }
1291 #endif /* CONFIG_ANON_VMA_NAME */
1292 /*
1293  * The madvise(2) system call.
1294  *
1295  * Applications can use madvise() to advise the kernel how it should
1296  * handle paging I/O in this VM area.  The idea is to help the kernel
1297  * use appropriate read-ahead and caching techniques.  The information
1298  * provided is advisory only, and can be safely disregarded by the
1299  * kernel without affecting the correct operation of the application.
1300  *
1301  * behavior values:
1302  *  MADV_NORMAL - the default behavior is to read clusters.  This
1303  *              results in some read-ahead and read-behind.
1304  *  MADV_RANDOM - the system should read the minimum amount of data
1305  *              on any access, since it is unlikely that the appli-
1306  *              cation will need more than what it asks for.
1307  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
1308  *              once, so they can be aggressively read ahead, and
1309  *              can be freed soon after they are accessed.
1310  *  MADV_WILLNEED - the application is notifying the system to read
1311  *              some pages ahead.
1312  *  MADV_DONTNEED - the application is finished with the given range,
1313  *              so the kernel can free resources associated with it.
1314  *  MADV_FREE - the application marks pages in the given range as lazy free,
1315  *              where actual purges are postponed until memory pressure happens.
1316  *  MADV_REMOVE - the application wants to free up the given range of
1317  *              pages and associated backing store.
1318  *  MADV_DONTFORK - omit this area from child's address space when forking:
1319  *              typically, to avoid COWing pages pinned by get_user_pages().
1320  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1321  *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
1322  *              range after a fork.
1323  *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1324  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
1325  *              were corrupted by unrecoverable hardware memory failure.
1326  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1327  *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1328  *              this area with pages of identical content from other such areas.
1329  *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
1330  *  MADV_HUGEPAGE - the application wants to back the given range by transparent
1331  *              huge pages in the future. Existing pages might be coalesced and
1332  *              new pages might be allocated as THP.
1333  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1334  *              transparent huge pages so the existing pages will not be
1335  *              coalesced into THP and new pages will not be allocated as THP.
1336  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
1337  *              from being included in its core dump.
1338  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1339  *  MADV_COLD - the application is not expected to use this memory soon,
1340  *              deactivate pages in this range so that they can be reclaimed
1341  *              easily if memory pressure happens.
1342  *  MADV_PAGEOUT - the application is not expected to use this memory soon,
1343  *              page out the pages in this range immediately.
1344  *  MADV_POPULATE_READ - populate (prefault) page tables readable by
1345  *              triggering read faults if required
1346  *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1347  *              triggering write faults if required
1348  *
1349  * return values:
1350  *  zero    - success
1351  *  -EINVAL - start + len < 0, start is not page-aligned,
1352  *              "behavior" is not a valid value, or application
1353  *              is attempting to release locked or shared pages,
1354  *              or the specified address range includes file, Huge TLB,
 1355  *              MAP_SHARED or VM_PFNMAP range.
1356  *  -ENOMEM - addresses in the specified range are not currently
1357  *              mapped, or are outside the AS of the process.
1358  *  -EIO    - an I/O error occurred while paging in data.
1359  *  -EBADF  - map exists, but area maps something that isn't a file.
1360  *  -EAGAIN - a kernel resource was temporarily unavailable.
1361  */
1362 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
1363 {
1364         unsigned long end;
1365         int error;
1366         int write;
1367         size_t len;
1368         struct blk_plug plug;
1369
1370         start = untagged_addr(start);
1371
1372         if (!madvise_behavior_valid(behavior))
1373                 return -EINVAL;
1374
1375         if (!PAGE_ALIGNED(start))
1376                 return -EINVAL;
1377         len = PAGE_ALIGN(len_in);
1378
1379         /* Check to see whether len was rounded up from small -ve to zero */
1380         if (len_in && !len)
1381                 return -EINVAL;
1382
1383         end = start + len;
1384         if (end < start)
1385                 return -EINVAL;
1386
1387         if (end == start)
1388                 return 0;
1389
1390 #ifdef CONFIG_MEMORY_FAILURE
1391         if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
1392                 return madvise_inject_error(behavior, start, start + len_in);
1393 #endif
1394
1395         write = madvise_need_mmap_write(behavior);
1396         if (write) {
1397                 if (mmap_write_lock_killable(mm))
1398                         return -EINTR;
1399         } else {
1400                 mmap_read_lock(mm);
1401         }
1402
1403         blk_start_plug(&plug);
1404         error = madvise_walk_vmas(mm, start, end, behavior,
1405                         madvise_vma_behavior);
1406         blk_finish_plug(&plug);
1407         if (write)
1408                 mmap_write_unlock(mm);
1409         else
1410                 mmap_read_unlock(mm);
1411
1412         return error;
1413 }
1414
1415 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1416 {
1417         return do_madvise(current->mm, start, len_in, behavior);
1418 }
1419
1420 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1421                 size_t, vlen, int, behavior, unsigned int, flags)
1422 {
1423         ssize_t ret;
1424         struct iovec iovstack[UIO_FASTIOV], iovec;
1425         struct iovec *iov = iovstack;
1426         struct iov_iter iter;
1427         struct task_struct *task;
1428         struct mm_struct *mm;
1429         size_t total_len;
1430         unsigned int f_flags;
1431
1432         if (flags != 0) {
1433                 ret = -EINVAL;
1434                 goto out;
1435         }
1436
1437         ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1438         if (ret < 0)
1439                 goto out;
1440
1441         task = pidfd_get_task(pidfd, &f_flags);
1442         if (IS_ERR(task)) {
1443                 ret = PTR_ERR(task);
1444                 goto free_iov;
1445         }
1446
1447         if (!process_madvise_behavior_valid(behavior)) {
1448                 ret = -EINVAL;
1449                 goto release_task;
1450         }
1451
1452         /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
1453         mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1454         if (IS_ERR_OR_NULL(mm)) {
1455                 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1456                 goto release_task;
1457         }
1458
1459         /*
1460          * Require CAP_SYS_NICE for influencing process performance. Note that
1461          * only non-destructive hints are currently supported.
1462          */
1463         if (!capable(CAP_SYS_NICE)) {
1464                 ret = -EPERM;
1465                 goto release_mm;
1466         }
1467
1468         total_len = iov_iter_count(&iter);
1469
1470         while (iov_iter_count(&iter)) {
1471                 iovec = iov_iter_iovec(&iter);
1472                 ret = do_madvise(mm, (unsigned long)iovec.iov_base,
1473                                         iovec.iov_len, behavior);
1474                 if (ret < 0)
1475                         break;
1476                 iov_iter_advance(&iter, iovec.iov_len);
1477         }
1478
1479         ret = (total_len - iov_iter_count(&iter)) ? : ret;
1480
1481 release_mm:
1482         mmput(mm);
1483 release_task:
1484         put_task_struct(task);
1485 free_iov:
1486         kfree(iov);
1487 out:
1488         return ret;
1489 }