mm/memory-failure.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) 2008, 2009 Intel Corporation
   4  * Authors: Andi Kleen, Fengguang Wu
   5  *
   6  * High level machine check handler. Handles pages reported by the
   7  * hardware as being corrupted usually due to a multi-bit ECC memory or cache
   8  * failure.
   9  *
 * In addition there is a "soft offline" entry point that allows us to stop
 * using not-yet-corrupted-but-suspicious pages without killing anything.
  12  *
  13  * Handles page cache pages in various states.  The tricky part
  14  * here is that we can access any page asynchronously in respect to
  15  * other VM users, because memory failures could happen anytime and
  16  * anywhere. This could violate some of their assumptions. This is why
  17  * this code has to be extremely careful. Generally it tries to use
  18  * normal locking rules, as in get the standard locks, even if that means
  19  * the error handling takes potentially a long time.
  20  *
  21  * It can be very tempting to add handling for obscure cases here.
  22  * In general any code for handling new cases should only be added iff:
  23  * - You know how to test it.
  24  * - You have a test that can be added to mce-test
  25  *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
  26  * - The case actually shows up as a frequent (top 10) page state in
  27  *   tools/mm/page-types when running a real workload.
  28  *
  29  * There are several operations here with exponential complexity because
  30  * of unsuitable VM data structures. For example the operation to map back
  31  * from RMAP chains to processes has to walk the complete process list and
 * has non-linear complexity in the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids impacting
 * the core VM.
  35  */
  36
  37 #define pr_fmt(fmt) "Memory failure: " fmt
  38
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
  66
  67 static int sysctl_memory_failure_early_kill __read_mostly;
  68
  69 static int sysctl_memory_failure_recovery __read_mostly = 1;
  70
  71 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
  72
  73 static bool hw_memory_failure __read_mostly = false;
  74
  75 static DEFINE_MUTEX(mf_mutex);
  76
/*
 * Account one more poisoned page: increment the global num_poisoned_pages
 * counter and pass @pfn on to memblk_nr_poison_inc().
 */
void num_poisoned_pages_inc(unsigned long pfn)
{
        atomic_long_inc(&num_poisoned_pages);
        memblk_nr_poison_inc(pfn);
}
  82
  83 void num_poisoned_pages_sub(unsigned long pfn, long i)
  84 {
  85         atomic_long_sub(i, &num_poisoned_pages);
  86         if (pfn != -1UL)
  87                 memblk_nr_poison_sub(pfn, i);
  88 }
  89
  90 /**
  91  * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
  92  * @_name: name of the file in the per NUMA sysfs directory.
  93  */
  94 #define MF_ATTR_RO(_name)                                       \
  95 static ssize_t _name##_show(struct device *dev,                 \
  96                             struct device_attribute *attr,      \
  97                             char *buf)                          \
  98 {                                                               \
  99         struct memory_failure_stats *mf_stats =                 \
 100                 &NODE_DATA(dev->id)->mf_stats;                  \
 101         return sprintf(buf, "%lu\n", mf_stats->_name);          \
 102 }                                                               \
 103 static DEVICE_ATTR_RO(_name)
 104
/* One show() callback and read-only attribute per memory failure counter. */
MF_ATTR_RO(total);
MF_ATTR_RO(ignored);
MF_ATTR_RO(failed);
MF_ATTR_RO(delayed);
MF_ATTR_RO(recovered);

/* NULL-terminated list of the attributes generated by MF_ATTR_RO() above. */
static struct attribute *memory_failure_attr[] = {
        &dev_attr_total.attr,
        &dev_attr_ignored.attr,
        &dev_attr_failed.attr,
        &dev_attr_delayed.attr,
        &dev_attr_recovered.attr,
        NULL,
};

/*
 * Attribute group named "memory_failure" that bundles the counters.
 * It is not static, so it is presumably registered from another file.
 */
const struct attribute_group memory_failure_attr_group = {
        .name = "memory_failure",
        .attrs = memory_failure_attr,
};
 124
/*
 * Sysctl entries for the two tunables defined at the top of this file.
 * Both are ints with mode 0644, handled by proc_dointvec_minmax and bounded
 * by SYSCTL_ZERO and SYSCTL_ONE.
 * NOTE(review): the registration call is not in this part of the file.
 */
static struct ctl_table memory_failure_table[] = {
        {
                .procname       = "memory_failure_early_kill",
                .data           = &sysctl_memory_failure_early_kill,
                .maxlen         = sizeof(sysctl_memory_failure_early_kill),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
        {
                .procname       = "memory_failure_recovery",
                .data           = &sysctl_memory_failure_recovery,
                .maxlen         = sizeof(sysctl_memory_failure_recovery),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
};
 145
 146 /*
 147  * Return values:
 148  *   1:   the page is dissolved (if needed) and taken off from buddy,
 149  *   0:   the page is dissolved (if needed) and not taken off from buddy,
 150  *   < 0: failed to dissolve.
 151  */
 152 static int __page_handle_poison(struct page *page)
 153 {
 154         int ret;
 155
 156         /*
 157          * zone_pcp_disable() can't be used here. It will
 158          * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
 159          * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
 160          * optimization is enabled. This will break current lock dependency
 161          * chain and leads to deadlock.
 162          * Disabling pcp before dissolving the page was a deterministic
 163          * approach because we made sure that those pages cannot end up in any
 164          * PCP list. Draining PCP lists expels those pages to the buddy system,
 165          * but nothing guarantees that those pages do not get back to a PCP
 166          * queue if we need to refill those.
 167          */
 168         ret = dissolve_free_hugetlb_folio(page_folio(page));
 169         if (!ret) {
 170                 drain_all_pages(page_zone(page));
 171                 ret = take_page_off_buddy(page);
 172         }
 173
 174         return ret;
 175 }
 176
 177 static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
 178 {
 179         if (hugepage_or_freepage) {
 180                 /*
 181                  * Doing this check for free pages is also fine since
 182                  * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
 183                  */
 184                 if (__page_handle_poison(page) <= 0)
 185                         /*
 186                          * We could fail to take off the target page from buddy
 187                          * for example due to racy page allocation, but that's
 188                          * acceptable because soft-offlined page is not broken
 189                          * and if someone really want to use it, they should
 190                          * take it.
 191                          */
 192                         return false;
 193         }
 194
 195         SetPageHWPoison(page);
 196         if (release)
 197                 put_page(page);
 198         page_ref_inc(page);
 199         num_poisoned_pages_inc(page_to_pfn(page));
 200
 201         return true;
 202 }
 203
 204 #if IS_ENABLED(CONFIG_HWPOISON_INJECT)
 205
 206 u32 hwpoison_filter_enable = 0;
 207 u32 hwpoison_filter_dev_major = ~0U;
 208 u32 hwpoison_filter_dev_minor = ~0U;
 209 u64 hwpoison_filter_flags_mask;
 210 u64 hwpoison_filter_flags_value;
 211 EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
 212 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
 213 EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
 214 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
 215 EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
 216
 217 static int hwpoison_filter_dev(struct page *p)
 218 {
 219         struct folio *folio = page_folio(p);
 220         struct address_space *mapping;
 221         dev_t dev;
 222
 223         if (hwpoison_filter_dev_major == ~0U &&
 224             hwpoison_filter_dev_minor == ~0U)
 225                 return 0;
 226
 227         mapping = folio_mapping(folio);
 228         if (mapping == NULL || mapping->host == NULL)
 229                 return -EINVAL;
 230
 231         dev = mapping->host->i_sb->s_dev;
 232         if (hwpoison_filter_dev_major != ~0U &&
 233             hwpoison_filter_dev_major != MAJOR(dev))
 234                 return -EINVAL;
 235         if (hwpoison_filter_dev_minor != ~0U &&
 236             hwpoison_filter_dev_minor != MINOR(dev))
 237                 return -EINVAL;
 238
 239         return 0;
 240 }
 241
 242 static int hwpoison_filter_flags(struct page *p)
 243 {
 244         if (!hwpoison_filter_flags_mask)
 245                 return 0;
 246
 247         if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
 248                                     hwpoison_filter_flags_value)
 249                 return 0;
 250         else
 251                 return -EINVAL;
 252 }
 253
 254 /*
 255  * This allows stress tests to limit test scope to a collection of tasks
 256  * by putting them under some memcg. This prevents killing unrelated/important
 257  * processes such as /sbin/init. Note that the target task may share clean
 258  * pages with init (eg. libc text), which is harmless. If the target task
 259  * share _dirty_ pages with another task B, the test scheme must make sure B
 260  * is also included in the memcg. At last, due to race conditions this filter
 261  * can only guarantee that the page either belongs to the memcg tasks, or is
 262  * a freed page.
 263  */
 264 #ifdef CONFIG_MEMCG
 265 u64 hwpoison_filter_memcg;
 266 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 267 static int hwpoison_filter_task(struct page *p)
 268 {
 269         if (!hwpoison_filter_memcg)
 270                 return 0;
 271
 272         if (page_cgroup_ino(p) != hwpoison_filter_memcg)
 273                 return -EINVAL;
 274
 275         return 0;
 276 }
 277 #else
 278 static int hwpoison_filter_task(struct page *p) { return 0; }
 279 #endif
 280
 281 int hwpoison_filter(struct page *p)
 282 {
 283         if (!hwpoison_filter_enable)
 284                 return 0;
 285
 286         if (hwpoison_filter_dev(p))
 287                 return -EINVAL;
 288
 289         if (hwpoison_filter_flags(p))
 290                 return -EINVAL;
 291
 292         if (hwpoison_filter_task(p))
 293                 return -EINVAL;
 294
 295         return 0;
 296 }
 297 #else
/* Variant built without CONFIG_HWPOISON_INJECT: no filtering, every page passes. */
int hwpoison_filter(struct page *p)
{
        return 0;
}
 302 #endif
 303
 304 EXPORT_SYMBOL_GPL(hwpoison_filter);
 305
 306 /*
 307  * Kill all processes that have a poisoned page mapped and then isolate
 308  * the page.
 309  *
 310  * General strategy:
 311  * Find all processes having the page mapped and kill them.
 312  * But we keep a page reference around so that the page is not
 313  * actually freed yet.
 314  * Then stash the page away
 315  *
 316  * There's no convenient way to get back to mapped processes
 317  * from the VMAs. So do a brute-force search over all
 318  * running processes.
 319  *
 320  * Remember that machine checks are not common (or rather
 321  * if they are common you have other problems), so this shouldn't
 322  * be a performance issue.
 323  *
 324  * Also there are some races possible while we get from the
 325  * error detection to actually handle it.
 326  */
 327
/*
 * One task to signal for a poisoned page. Entries queued by __add_to_kill()
 * are allocated there and released in kill_procs().
 */
struct to_kill {
        struct list_head nd;            /* linkage on the to_kill list */
        struct task_struct *tsk;        /* task to signal; list entries hold a task reference */
        unsigned long addr;             /* user address reported with the signal, or -EFAULT */
        short size_shift;               /* handed to the signal helpers as addr_lsb */
};
 334
 335 /*
 336  * Send all the processes who have the page mapped a signal.
 337  * ``action optional'' if they are not immediately affected by the error
 338  * ``action required'' if error happened in current execution context
 339  */
 340 static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 341 {
 342         struct task_struct *t = tk->tsk;
 343         short addr_lsb = tk->size_shift;
 344         int ret = 0;
 345
 346         pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
 347                         pfn, t->comm, t->pid);
 348
 349         if ((flags & MF_ACTION_REQUIRED) && (t == current))
 350                 ret = force_sig_mceerr(BUS_MCEERR_AR,
 351                                  (void __user *)tk->addr, addr_lsb);
 352         else
 353                 /*
 354                  * Signal other processes sharing the page if they have
 355                  * PF_MCE_EARLY set.
 356                  * Don't use force here, it's convenient if the signal
 357                  * can be temporarily blocked.
 358                  * This could cause a loop when the user sets SIGBUS
 359                  * to SIG_IGN, but hopefully no one will do that?
 360                  */
 361                 ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
 362                                       addr_lsb, t);
 363         if (ret < 0)
 364                 pr_info("Error sending signal to %s:%d: %d\n",
 365                         t->comm, t->pid, ret);
 366         return ret;
 367 }
 368
 369 /*
 370  * Unknown page type encountered. Try to check whether it can turn PageLRU by
 371  * lru_add_drain_all.
 372  */
 373 void shake_folio(struct folio *folio)
 374 {
 375         if (folio_test_hugetlb(folio))
 376                 return;
 377         /*
 378          * TODO: Could shrink slab caches here if a lightweight range-based
 379          * shrinker will be available.
 380          */
 381         if (folio_test_slab(folio))
 382                 return;
 383
 384         lru_add_drain_all();
 385 }
 386 EXPORT_SYMBOL_GPL(shake_folio);
 387
 388 static void shake_page(struct page *page)
 389 {
 390         shake_folio(page_folio(page));
 391 }
 392
 393 static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 394                 unsigned long address)
 395 {
 396         unsigned long ret = 0;
 397         pgd_t *pgd;
 398         p4d_t *p4d;
 399         pud_t *pud;
 400         pmd_t *pmd;
 401         pte_t *pte;
 402         pte_t ptent;
 403
 404         VM_BUG_ON_VMA(address == -EFAULT, vma);
 405         pgd = pgd_offset(vma->vm_mm, address);
 406         if (!pgd_present(*pgd))
 407                 return 0;
 408         p4d = p4d_offset(pgd, address);
 409         if (!p4d_present(*p4d))
 410                 return 0;
 411         pud = pud_offset(p4d, address);
 412         if (!pud_present(*pud))
 413                 return 0;
 414         if (pud_devmap(*pud))
 415                 return PUD_SHIFT;
 416         pmd = pmd_offset(pud, address);
 417         if (!pmd_present(*pmd))
 418                 return 0;
 419         if (pmd_devmap(*pmd))
 420                 return PMD_SHIFT;
 421         pte = pte_offset_map(pmd, address);
 422         if (!pte)
 423                 return 0;
 424         ptent = ptep_get(pte);
 425         if (pte_present(ptent) && pte_devmap(ptent))
 426                 ret = PAGE_SHIFT;
 427         pte_unmap(pte);
 428         return ret;
 429 }
 430
 431 /*
 432  * Failure handling: if we can't find or can't kill a process there's
 433  * not much we can do.  We just print a message and ignore otherwise.
 434  */
 435
 436 /*
 437  * Schedule a process for later kill.
 438  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 439  */
 440 static void __add_to_kill(struct task_struct *tsk, struct page *p,
 441                           struct vm_area_struct *vma, struct list_head *to_kill,
 442                           unsigned long addr)
 443 {
 444         struct to_kill *tk;
 445
 446         tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
 447         if (!tk) {
 448                 pr_err("Out of memory while machine check handling\n");
 449                 return;
 450         }
 451
 452         tk->addr = addr;
 453         if (is_zone_device_page(p))
 454                 tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
 455         else
 456                 tk->size_shift = page_shift(compound_head(p));
 457
 458         /*
 459          * Send SIGKILL if "tk->addr == -EFAULT". Also, as
 460          * "tk->size_shift" is always non-zero for !is_zone_device_page(),
 461          * so "tk->size_shift == 0" effectively checks no mapping on
 462          * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
 463          * to a process' address space, it's possible not all N VMAs
 464          * contain mappings for the page, but at least one VMA does.
 465          * Only deliver SIGBUS with payload derived from the VMA that
 466          * has a mapping for the page.
 467          */
 468         if (tk->addr == -EFAULT) {
 469                 pr_info("Unable to find user space address %lx in %s\n",
 470                         page_to_pfn(p), tsk->comm);
 471         } else if (tk->size_shift == 0) {
 472                 kfree(tk);
 473                 return;
 474         }
 475
 476         get_task_struct(tsk);
 477         tk->tsk = tsk;
 478         list_add_tail(&tk->nd, to_kill);
 479 }
 480
 481 static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
 482                 struct vm_area_struct *vma, struct list_head *to_kill,
 483                 unsigned long addr)
 484 {
 485         if (addr == -EFAULT)
 486                 return;
 487         __add_to_kill(tsk, p, vma, to_kill, addr);
 488 }
 489
 490 #ifdef CONFIG_KSM
 491 static bool task_in_to_kill_list(struct list_head *to_kill,
 492                                  struct task_struct *tsk)
 493 {
 494         struct to_kill *tk, *next;
 495
 496         list_for_each_entry_safe(tk, next, to_kill, nd) {
 497                 if (tk->tsk == tsk)
 498                         return true;
 499         }
 500
 501         return false;
 502 }
 503
 504 void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
 505                      struct vm_area_struct *vma, struct list_head *to_kill,
 506                      unsigned long addr)
 507 {
 508         if (!task_in_to_kill_list(to_kill, tsk))
 509                 __add_to_kill(tsk, p, vma, to_kill, addr);
 510 }
 511 #endif
 512 /*
 513  * Kill the processes that have been collected earlier.
 514  *
 515  * Only do anything when FORCEKILL is set, otherwise just free the
 516  * list (this is used for clean pages which do not need killing)
 517  * Also when FAIL is set do a force kill because something went
 518  * wrong earlier.
 519  */
 520 static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
 521                 unsigned long pfn, int flags)
 522 {
 523         struct to_kill *tk, *next;
 524
 525         list_for_each_entry_safe(tk, next, to_kill, nd) {
 526                 if (forcekill) {
 527                         /*
 528                          * In case something went wrong with munmapping
 529                          * make sure the process doesn't catch the
 530                          * signal and then access the memory. Just kill it.
 531                          */
 532                         if (fail || tk->addr == -EFAULT) {
 533                                 pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 534                                        pfn, tk->tsk->comm, tk->tsk->pid);
 535                                 do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
 536                                                  tk->tsk, PIDTYPE_PID);
 537                         }
 538
 539                         /*
 540                          * In theory the process could have mapped
 541                          * something else on the address in-between. We could
 542                          * check for that, but we need to tell the
 543                          * process anyways.
 544                          */
 545                         else if (kill_proc(tk, pfn, flags) < 0)
 546                                 pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
 547                                        pfn, tk->tsk->comm, tk->tsk->pid);
 548                 }
 549                 list_del(&tk->nd);
 550                 put_task_struct(tk->tsk);
 551                 kfree(tk);
 552         }
 553 }
 554
 555 /*
 556  * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 557  * on behalf of the thread group. Return task_struct of the (first found)
 558  * dedicated thread if found, and return NULL otherwise.
 559  *
 560  * We already hold rcu lock in the caller, so we don't have to call
 561  * rcu_read_lock/unlock() in this function.
 562  */
 563 static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 564 {
 565         struct task_struct *t;
 566
 567         for_each_thread(tsk, t) {
 568                 if (t->flags & PF_MCE_PROCESS) {
 569                         if (t->flags & PF_MCE_EARLY)
 570                                 return t;
 571                 } else {
 572                         if (sysctl_memory_failure_early_kill)
 573                                 return t;
 574                 }
 575         }
 576         return NULL;
 577 }
 578
 579 /*
 580  * Determine whether a given process is "early kill" process which expects
 581  * to be signaled when some page under the process is hwpoisoned.
 582  * Return task_struct of the dedicated thread (main thread unless explicitly
 583  * specified) if the process is "early kill" and otherwise returns NULL.
 584  *
 585  * Note that the above is true for Action Optional case. For Action Required
 586  * case, it's only meaningful to the current thread which need to be signaled
 587  * with SIGBUS, this error is Action Optional for other non current
 588  * processes sharing the same error page,if the process is "early kill", the
 589  * task_struct of the dedicated thread will also be returned.
 590  */
 591 struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
 592 {
 593         if (!tsk->mm)
 594                 return NULL;
 595         /*
 596          * Comparing ->mm here because current task might represent
 597          * a subthread, while tsk always points to the main thread.
 598          */
 599         if (force_early && tsk->mm == current->mm)
 600                 return current;
 601
 602         return find_early_kill_thread(tsk);
 603 }
 604
 605 /*
 606  * Collect processes when the error hit an anonymous page.
 607  */
 608 static void collect_procs_anon(struct folio *folio, struct page *page,
 609                 struct list_head *to_kill, int force_early)
 610 {
 611         struct task_struct *tsk;
 612         struct anon_vma *av;
 613         pgoff_t pgoff;
 614
 615         av = folio_lock_anon_vma_read(folio, NULL);
 616         if (av == NULL) /* Not actually mapped anymore */
 617                 return;
 618
 619         pgoff = page_to_pgoff(page);
 620         rcu_read_lock();
 621         for_each_process(tsk) {
 622                 struct vm_area_struct *vma;
 623                 struct anon_vma_chain *vmac;
 624                 struct task_struct *t = task_early_kill(tsk, force_early);
 625                 unsigned long addr;
 626
 627                 if (!t)
 628                         continue;
 629                 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
 630                                                pgoff, pgoff) {
 631                         vma = vmac->vma;
 632                         if (vma->vm_mm != t->mm)
 633                                 continue;
 634                         addr = page_mapped_in_vma(page, vma);
 635                         add_to_kill_anon_file(t, page, vma, to_kill, addr);
 636                 }
 637         }
 638         rcu_read_unlock();
 639         anon_vma_unlock_read(av);
 640 }
 641
 642 /*
 643  * Collect processes when the error hit a file mapped page.
 644  */
 645 static void collect_procs_file(struct folio *folio, struct page *page,
 646                 struct list_head *to_kill, int force_early)
 647 {
 648         struct vm_area_struct *vma;
 649         struct task_struct *tsk;
 650         struct address_space *mapping = folio->mapping;
 651         pgoff_t pgoff;
 652
 653         i_mmap_lock_read(mapping);
 654         rcu_read_lock();
 655         pgoff = page_to_pgoff(page);
 656         for_each_process(tsk) {
 657                 struct task_struct *t = task_early_kill(tsk, force_early);
 658                 unsigned long addr;
 659
 660                 if (!t)
 661                         continue;
 662                 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 663                                       pgoff) {
 664                         /*
 665                          * Send early kill signal to tasks where a vma covers
 666                          * the page but the corrupted page is not necessarily
 667                          * mapped in its pte.
 668                          * Assume applications who requested early kill want
 669                          * to be informed of all such data corruptions.
 670                          */
 671                         if (vma->vm_mm != t->mm)
 672                                 continue;
 673                         addr = page_address_in_vma(page, vma);
 674                         add_to_kill_anon_file(t, page, vma, to_kill, addr);
 675                 }
 676         }
 677         rcu_read_unlock();
 678         i_mmap_unlock_read(mapping);
 679 }
 680
 681 #ifdef CONFIG_FS_DAX
 682 static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
 683                               struct vm_area_struct *vma,
 684                               struct list_head *to_kill, pgoff_t pgoff)
 685 {
 686         unsigned long addr = vma_address(vma, pgoff, 1);
 687         __add_to_kill(tsk, p, vma, to_kill, addr);
 688 }
 689
 690 /*
 691  * Collect processes when the error hit a fsdax page.
 692  */
 693 static void collect_procs_fsdax(struct page *page,
 694                 struct address_space *mapping, pgoff_t pgoff,
 695                 struct list_head *to_kill, bool pre_remove)
 696 {
 697         struct vm_area_struct *vma;
 698         struct task_struct *tsk;
 699
 700         i_mmap_lock_read(mapping);
 701         rcu_read_lock();
 702         for_each_process(tsk) {
 703                 struct task_struct *t = tsk;
 704
 705                 /*
 706                  * Search for all tasks while MF_MEM_PRE_REMOVE is set, because
 707                  * the current may not be the one accessing the fsdax page.
 708                  * Otherwise, search for the current task.
 709                  */
 710                 if (!pre_remove)
 711                         t = task_early_kill(tsk, true);
 712                 if (!t)
 713                         continue;
 714                 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 715                         if (vma->vm_mm == t->mm)
 716                                 add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
 717                 }
 718         }
 719         rcu_read_unlock();
 720         i_mmap_unlock_read(mapping);
 721 }
 722 #endif /* CONFIG_FS_DAX */
 723
 724 /*
 725  * Collect the processes who have the corrupted page mapped to kill.
 726  */
 727 static void collect_procs(struct folio *folio, struct page *page,
 728                 struct list_head *tokill, int force_early)
 729 {
 730         if (!folio->mapping)
 731                 return;
 732         if (unlikely(folio_test_ksm(folio)))
 733                 collect_procs_ksm(folio, page, tokill, force_early);
 734         else if (folio_test_anon(folio))
 735                 collect_procs_anon(folio, page, tokill, force_early);
 736         else
 737                 collect_procs_file(folio, page, tokill, force_early);
 738 }
 739
/* Private state of a hwpoison page table walk, reached via walk->private. */
struct hwpoison_walk {
        struct to_kill tk;      /* address and shift are filled in when the pfn is found */
        unsigned long pfn;      /* pfn the walk callbacks look for */
        int flags;              /* NOTE(review): not read by the callbacks visible here */
};
 745
/* Record the user address and mapping size shift of a hit in @tk. */
static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{
        tk->addr = addr;
        tk->size_shift = shift;
}
 751
 752 static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
 753                                 unsigned long poisoned_pfn, struct to_kill *tk)
 754 {
 755         unsigned long pfn = 0;
 756
 757         if (pte_present(pte)) {
 758                 pfn = pte_pfn(pte);
 759         } else {
 760                 swp_entry_t swp = pte_to_swp_entry(pte);
 761
 762                 if (is_hwpoison_entry(swp))
 763                         pfn = swp_offset_pfn(swp);
 764         }
 765
 766         if (!pfn || pfn != poisoned_pfn)
 767                 return 0;
 768
 769         set_to_kill(tk, addr, shift);
 770         return 1;
 771 }
 772
 773 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 774 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
 775                                       struct hwpoison_walk *hwp)
 776 {
 777         pmd_t pmd = *pmdp;
 778         unsigned long pfn;
 779         unsigned long hwpoison_vaddr;
 780
 781         if (!pmd_present(pmd))
 782                 return 0;
 783         pfn = pmd_pfn(pmd);
 784         if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
 785                 hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
 786                 set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
 787                 return 1;
 788         }
 789         return 0;
 790 }
 791 #else
 792 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
 793                                       struct hwpoison_walk *hwp)
 794 {
 795         return 0;
 796 }
 797 #endif
 798
 799 static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
 800                               unsigned long end, struct mm_walk *walk)
 801 {
 802         struct hwpoison_walk *hwp = walk->private;
 803         int ret = 0;
 804         pte_t *ptep, *mapped_pte;
 805         spinlock_t *ptl;
 806
 807         ptl = pmd_trans_huge_lock(pmdp, walk->vma);
 808         if (ptl) {
 809                 ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
 810                 spin_unlock(ptl);
 811                 goto out;
 812         }
 813
 814         mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
 815                                                 addr, &ptl);
 816         if (!ptep)
 817                 goto out;
 818
 819         for (; addr != end; ptep++, addr += PAGE_SIZE) {
 820                 ret = check_hwpoisoned_entry(ptep_get(ptep), addr, PAGE_SHIFT,
 821                                              hwp->pfn, &hwp->tk);
 822                 if (ret == 1)
 823                         break;
 824         }
 825         pte_unmap_unlock(mapped_pte, ptl);
 826 out:
 827         cond_resched();
 828         return ret;
 829 }
 830
 831 #ifdef CONFIG_HUGETLB_PAGE
 832 static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 833                             unsigned long addr, unsigned long end,
 834                             struct mm_walk *walk)
 835 {
 836         struct hwpoison_walk *hwp = walk->private;
 837         pte_t pte = huge_ptep_get(ptep);
 838         struct hstate *h = hstate_vma(walk->vma);
 839
 840         return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
 841                                       hwp->pfn, &hwp->tk);
 842 }
 843 #else
 844 #define hwpoison_hugetlb_range  NULL
 845 #endif
 846
 847 static const struct mm_walk_ops hwpoison_walk_ops = {
 848         .pmd_entry = hwpoison_pte_range,
 849         .hugetlb_entry = hwpoison_hugetlb_range,
 850         .walk_lock = PGWALK_RDLOCK,
 851 };
 852
 853 /*
 854  * Sends SIGBUS to the current process with error info.
 855  *
 856  * This function is intended to handle "Action Required" MCEs on already
 857  * hardware poisoned pages. They could happen, for example, when
 858  * memory_failure() failed to unmap the error page at the first call, or
 859  * when multiple local machine checks happened on different CPUs.
 860  *
 861  * MCE handler currently has no easy access to the error virtual address,
 862  * so this function walks page table to find it. The returned virtual address
 863  * is proper in most cases, but it could be wrong when the application
 864  * process has multiple entries mapping the error page.
 865  */
 866 static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 867                                   int flags)
 868 {
 869         int ret;
 870         struct hwpoison_walk priv = {
 871                 .pfn = pfn,
 872         };
 873         priv.tk.tsk = p;
 874
 875         if (!p->mm)
 876                 return -EFAULT;
 877
 878         mmap_read_lock(p->mm);
 879         ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
 880                               (void *)&priv);
 881         if (ret == 1 && priv.tk.addr)
 882                 kill_proc(&priv.tk, pfn, flags);
 883         else
 884                 ret = 0;
 885         mmap_read_unlock(p->mm);
 886         return ret > 0 ? -EHWPOISON : -EFAULT;
 887 }
 888
 889 static const char *action_name[] = {
 890         [MF_IGNORED] = "Ignored",
 891         [MF_FAILED] = "Failed",
 892         [MF_DELAYED] = "Delayed",
 893         [MF_RECOVERED] = "Recovered",
 894 };
 895
 896 static const char * const action_page_types[] = {
 897         [MF_MSG_KERNEL]                 = "reserved kernel page",
 898         [MF_MSG_KERNEL_HIGH_ORDER]      = "high-order kernel page",
 899         [MF_MSG_SLAB]                   = "kernel slab page",
 900         [MF_MSG_DIFFERENT_COMPOUND]     = "different compound page after locking",
 901         [MF_MSG_HUGE]                   = "huge page",
 902         [MF_MSG_FREE_HUGE]              = "free huge page",
 903         [MF_MSG_UNMAP_FAILED]           = "unmapping failed page",
 904         [MF_MSG_DIRTY_SWAPCACHE]        = "dirty swapcache page",
 905         [MF_MSG_CLEAN_SWAPCACHE]        = "clean swapcache page",
 906         [MF_MSG_DIRTY_MLOCKED_LRU]      = "dirty mlocked LRU page",
 907         [MF_MSG_CLEAN_MLOCKED_LRU]      = "clean mlocked LRU page",
 908         [MF_MSG_DIRTY_UNEVICTABLE_LRU]  = "dirty unevictable LRU page",
 909         [MF_MSG_CLEAN_UNEVICTABLE_LRU]  = "clean unevictable LRU page",
 910         [MF_MSG_DIRTY_LRU]              = "dirty LRU page",
 911         [MF_MSG_CLEAN_LRU]              = "clean LRU page",
 912         [MF_MSG_TRUNCATED_LRU]          = "already truncated LRU page",
 913         [MF_MSG_BUDDY]                  = "free buddy page",
 914         [MF_MSG_DAX]                    = "dax page",
 915         [MF_MSG_UNSPLIT_THP]            = "unsplit thp",
 916         [MF_MSG_UNKNOWN]                = "unknown page",
 917 };
 918
 919 /*
 920  * XXX: It is possible that a page is isolated from LRU cache,
 921  * and then kept in swap cache or failed to remove from page cache.
 922  * The page count will stop it from being freed by unpoison.
 923  * Stress tests should be aware of this memory leak problem.
 924  */
 925 static int delete_from_lru_cache(struct folio *folio)
 926 {
 927         if (folio_isolate_lru(folio)) {
 928                 /*
 929                  * Clear sensible page flags, so that the buddy system won't
 930                  * complain when the folio is unpoison-and-freed.
 931                  */
 932                 folio_clear_active(folio);
 933                 folio_clear_unevictable(folio);
 934
 935                 /*
 936                  * Poisoned page might never drop its ref count to 0 so we have
 937                  * to uncharge it manually from its memcg.
 938                  */
 939                 mem_cgroup_uncharge(folio);
 940
 941                 /*
 942                  * drop the refcount elevated by folio_isolate_lru()
 943                  */
 944                 folio_put(folio);
 945                 return 0;
 946         }
 947         return -EIO;
 948 }
 949
 950 static int truncate_error_folio(struct folio *folio, unsigned long pfn,
 951                                 struct address_space *mapping)
 952 {
 953         int ret = MF_FAILED;
 954
 955         if (mapping->a_ops->error_remove_folio) {
 956                 int err = mapping->a_ops->error_remove_folio(mapping, folio);
 957
 958                 if (err != 0)
 959                         pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
 960                 else if (!filemap_release_folio(folio, GFP_NOIO))
 961                         pr_info("%#lx: failed to release buffers\n", pfn);
 962                 else
 963                         ret = MF_RECOVERED;
 964         } else {
 965                 /*
 966                  * If the file system doesn't support it just invalidate
 967                  * This fails on dirty or anything with private pages
 968                  */
 969                 if (mapping_evict_folio(mapping, folio))
 970                         ret = MF_RECOVERED;
 971                 else
 972                         pr_info("%#lx: Failed to invalidate\n", pfn);
 973         }
 974
 975         return ret;
 976 }
 977
 978 struct page_state {
 979         unsigned long mask;
 980         unsigned long res;
 981         enum mf_action_page_type type;
 982
 983         /* Callback ->action() has to unlock the relevant page inside it. */
 984         int (*action)(struct page_state *ps, struct page *p);
 985 };
 986
 987 /*
 988  * Return true if page is still referenced by others, otherwise return
 989  * false.
 990  *
 991  * The extra_pins is true when one extra refcount is expected.
 992  */
 993 static bool has_extra_refcount(struct page_state *ps, struct page *p,
 994                                bool extra_pins)
 995 {
 996         int count = page_count(p) - 1;
 997
 998         if (extra_pins)
 999                 count -= folio_nr_pages(page_folio(p));
1000
1001         if (count > 0) {
1002                 pr_err("%#lx: %s still referenced by %d users\n",
1003                        page_to_pfn(p), action_page_types[ps->type], count);
1004                 return true;
1005         }
1006
1007         return false;
1008 }
1009
1010 /*
1011  * Error hit kernel page.
1012  * Do nothing, try to be lucky and not touch this instead. For a few cases we
1013  * could be more sophisticated.
1014  */
1015 static int me_kernel(struct page_state *ps, struct page *p)
1016 {
1017         unlock_page(p);
1018         return MF_IGNORED;
1019 }
1020
1021 /*
1022  * Page in unknown state. Do nothing.
1023  */
1024 static int me_unknown(struct page_state *ps, struct page *p)
1025 {
1026         pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
1027         unlock_page(p);
1028         return MF_FAILED;
1029 }
1030
1031 /*
1032  * Clean (or cleaned) page cache page.
1033  */
1034 static int me_pagecache_clean(struct page_state *ps, struct page *p)
1035 {
1036         struct folio *folio = page_folio(p);
1037         int ret;
1038         struct address_space *mapping;
1039         bool extra_pins;
1040
1041         delete_from_lru_cache(folio);
1042
1043         /*
1044          * For anonymous folios the only reference left
1045          * should be the one m_f() holds.
1046          */
1047         if (folio_test_anon(folio)) {
1048                 ret = MF_RECOVERED;
1049                 goto out;
1050         }
1051
1052         /*
1053          * Now truncate the page in the page cache. This is really
1054          * more like a "temporary hole punch"
1055          * Don't do this for block devices when someone else
1056          * has a reference, because it could be file system metadata
1057          * and that's not safe to truncate.
1058          */
1059         mapping = folio_mapping(folio);
1060         if (!mapping) {
1061                 /* Folio has been torn down in the meantime */
1062                 ret = MF_FAILED;
1063                 goto out;
1064         }
1065
1066         /*
1067          * The shmem page is kept in page cache instead of truncating
1068          * so is expected to have an extra refcount after error-handling.
1069          */
1070         extra_pins = shmem_mapping(mapping);
1071
1072         /*
1073          * Truncation is a bit tricky. Enable it per file system for now.
1074          *
1075          * Open: to take i_rwsem or not for this? Right now we don't.
1076          */
1077         ret = truncate_error_folio(folio, page_to_pfn(p), mapping);
1078         if (has_extra_refcount(ps, p, extra_pins))
1079                 ret = MF_FAILED;
1080
1081 out:
1082         folio_unlock(folio);
1083
1084         return ret;
1085 }
1086
1087 /*
1088  * Dirty pagecache page
1089  * Issues: when the error hit a hole page the error is not properly
1090  * propagated.
1091  */
1092 static int me_pagecache_dirty(struct page_state *ps, struct page *p)
1093 {
1094         struct folio *folio = page_folio(p);
1095         struct address_space *mapping = folio_mapping(folio);
1096
1097         SetPageError(p);
1098         /* TBD: print more information about the file. */
1099         if (mapping) {
1100                 /*
1101                  * IO error will be reported by write(), fsync(), etc.
1102                  * who check the mapping.
1103                  * This way the application knows that something went
1104                  * wrong with its dirty file data.
1105                  *
1106                  * There's one open issue:
1107                  *
1108                  * The EIO will be only reported on the next IO
1109                  * operation and then cleared through the IO map.
1110                  * Normally Linux has two mechanisms to pass IO error
1111                  * first through the AS_EIO flag in the address space
1112                  * and then through the PageError flag in the page.
1113                  * Since we drop pages on memory failure handling the
1114                  * only mechanism open to use is through AS_AIO.
1115                  *
1116                  * This has the disadvantage that it gets cleared on
1117                  * the first operation that returns an error, while
1118                  * the PageError bit is more sticky and only cleared
1119                  * when the page is reread or dropped.  If an
1120                  * application assumes it will always get error on
1121                  * fsync, but does other operations on the fd before
1122                  * and the page is dropped between then the error
1123                  * will not be properly reported.
1124                  *
1125                  * This can already happen even without hwpoisoned
1126                  * pages: first on metadata IO errors (which only
1127                  * report through AS_EIO) or when the page is dropped
1128                  * at the wrong time.
1129                  *
1130                  * So right now we assume that the application DTRT on
1131                  * the first EIO, but we're not worse than other parts
1132                  * of the kernel.
1133                  */
1134                 mapping_set_error(mapping, -EIO);
1135         }
1136
1137         return me_pagecache_clean(ps, p);
1138 }
1139
1140 /*
1141  * Clean and dirty swap cache.
1142  *
1143  * Dirty swap cache page is tricky to handle. The page could live both in page
1144  * cache and swap cache(ie. page is freshly swapped in). So it could be
1145  * referenced concurrently by 2 types of PTEs:
1146  * normal PTEs and swap PTEs. We try to handle them consistently by calling
1147  * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
1148  * and then
1149  *      - clear dirty bit to prevent IO
1150  *      - remove from LRU
1151  *      - but keep in the swap cache, so that when we return to it on
1152  *        a later page fault, we know the application is accessing
1153  *        corrupted data and shall be killed (we installed simple
1154  *        interception code in do_swap_page to catch it).
1155  *
1156  * Clean swap cache pages can be directly isolated. A later page fault will
1157  * bring in the known good data from disk.
1158  */
1159 static int me_swapcache_dirty(struct page_state *ps, struct page *p)
1160 {
1161         struct folio *folio = page_folio(p);
1162         int ret;
1163         bool extra_pins = false;
1164
1165         folio_clear_dirty(folio);
1166         /* Trigger EIO in shmem: */
1167         folio_clear_uptodate(folio);
1168
1169         ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_DELAYED;
1170         folio_unlock(folio);
1171
1172         if (ret == MF_DELAYED)
1173                 extra_pins = true;
1174
1175         if (has_extra_refcount(ps, p, extra_pins))
1176                 ret = MF_FAILED;
1177
1178         return ret;
1179 }
1180
1181 static int me_swapcache_clean(struct page_state *ps, struct page *p)
1182 {
1183         struct folio *folio = page_folio(p);
1184         int ret;
1185
1186         delete_from_swap_cache(folio);
1187
1188         ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED;
1189         folio_unlock(folio);
1190
1191         if (has_extra_refcount(ps, p, false))
1192                 ret = MF_FAILED;
1193
1194         return ret;
1195 }
1196
1197 /*
1198  * Huge pages. Needs work.
1199  * Issues:
1200  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
1201  *   To narrow down kill region to one page, we need to break up pmd.
1202  */
1203 static int me_huge_page(struct page_state *ps, struct page *p)
1204 {
1205         struct folio *folio = page_folio(p);
1206         int res;
1207         struct address_space *mapping;
1208         bool extra_pins = false;
1209
1210         mapping = folio_mapping(folio);
1211         if (mapping) {
1212                 res = truncate_error_folio(folio, page_to_pfn(p), mapping);
1213                 /* The page is kept in page cache. */
1214                 extra_pins = true;
1215                 folio_unlock(folio);
1216         } else {
1217                 folio_unlock(folio);
1218                 /*
1219                  * migration entry prevents later access on error hugepage,
1220                  * so we can free and dissolve it into buddy to save healthy
1221                  * subpages.
1222                  */
1223                 folio_put(folio);
1224                 if (__page_handle_poison(p) >= 0) {
1225                         page_ref_inc(p);
1226                         res = MF_RECOVERED;
1227                 } else {
1228                         res = MF_FAILED;
1229                 }
1230         }
1231
1232         if (has_extra_refcount(ps, p, extra_pins))
1233                 res = MF_FAILED;
1234
1235         return res;
1236 }
1237
1238 /*
1239  * Various page states we can handle.
1240  *
1241  * A page state is defined by its current page->flags bits.
1242  * The table matches them in order and calls the right handler.
1243  *
1244  * This is quite tricky because we can access page at any time
1245  * in its live cycle, so all accesses have to be extremely careful.
1246  *
1247  * This is not complete. More states could be added.
1248  * For any missing state don't attempt recovery.
1249  */
1250
1251 #define dirty           (1UL << PG_dirty)
1252 #define sc              ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
1253 #define unevict         (1UL << PG_unevictable)
1254 #define mlock           (1UL << PG_mlocked)
1255 #define lru             (1UL << PG_lru)
1256 #define head            (1UL << PG_head)
1257 #define reserved        (1UL << PG_reserved)
1258
1259 static struct page_state error_states[] = {
1260         { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
1261         /*
1262          * free pages are specially detected outside this table:
1263          * PG_buddy pages only make a small fraction of all free pages.
1264          */
1265
1266         { head,         head,           MF_MSG_HUGE,            me_huge_page },
1267
1268         { sc|dirty,     sc|dirty,       MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
1269         { sc|dirty,     sc,             MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
1270
1271         { mlock|dirty,  mlock|dirty,    MF_MSG_DIRTY_MLOCKED_LRU,       me_pagecache_dirty },
1272         { mlock|dirty,  mlock,          MF_MSG_CLEAN_MLOCKED_LRU,       me_pagecache_clean },
1273
1274         { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU,   me_pagecache_dirty },
1275         { unevict|dirty, unevict,       MF_MSG_CLEAN_UNEVICTABLE_LRU,   me_pagecache_clean },
1276
1277         { lru|dirty,    lru|dirty,      MF_MSG_DIRTY_LRU,       me_pagecache_dirty },
1278         { lru|dirty,    lru,            MF_MSG_CLEAN_LRU,       me_pagecache_clean },
1279
1280         /*
1281          * Catchall entry: must be at end.
1282          */
1283         { 0,            0,              MF_MSG_UNKNOWN, me_unknown },
1284 };
1285
1286 #undef dirty
1287 #undef sc
1288 #undef unevict
1289 #undef mlock
1290 #undef lru
1291 #undef head
1292 #undef reserved
1293
1294 static void update_per_node_mf_stats(unsigned long pfn,
1295                                      enum mf_result result)
1296 {
1297         int nid = MAX_NUMNODES;
1298         struct memory_failure_stats *mf_stats = NULL;
1299
1300         nid = pfn_to_nid(pfn);
1301         if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) {
1302                 WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid);
1303                 return;
1304         }
1305
1306         mf_stats = &NODE_DATA(nid)->mf_stats;
1307         switch (result) {
1308         case MF_IGNORED:
1309                 ++mf_stats->ignored;
1310                 break;
1311         case MF_FAILED:
1312                 ++mf_stats->failed;
1313                 break;
1314         case MF_DELAYED:
1315                 ++mf_stats->delayed;
1316                 break;
1317         case MF_RECOVERED:
1318                 ++mf_stats->recovered;
1319                 break;
1320         default:
1321                 WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result);
1322                 break;
1323         }
1324         ++mf_stats->total;
1325 }
1326
1327 /*
1328  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
1329  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
1330  */
1331 static int action_result(unsigned long pfn, enum mf_action_page_type type,
1332                          enum mf_result result)
1333 {
1334         trace_memory_failure_event(pfn, type, result);
1335
1336         num_poisoned_pages_inc(pfn);
1337
1338         update_per_node_mf_stats(pfn, result);
1339
1340         pr_err("%#lx: recovery action for %s: %s\n",
1341                 pfn, action_page_types[type], action_name[result]);
1342
1343         return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
1344 }
1345
1346 static int page_action(struct page_state *ps, struct page *p,
1347                         unsigned long pfn)
1348 {
1349         int result;
1350
1351         /* page p should be unlocked after returning from ps->action().  */
1352         result = ps->action(ps, p);
1353
1354         /* Could do more checks here if page looks ok */
1355         /*
1356          * Could adjust zone counters here to correct for the missing page.
1357          */
1358
1359         return action_result(pfn, ps->type, result);
1360 }
1361
1362 static inline bool PageHWPoisonTakenOff(struct page *page)
1363 {
1364         return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
1365 }
1366
1367 void SetPageHWPoisonTakenOff(struct page *page)
1368 {
1369         set_page_private(page, MAGIC_HWPOISON);
1370 }
1371
1372 void ClearPageHWPoisonTakenOff(struct page *page)
1373 {
1374         if (PageHWPoison(page))
1375                 set_page_private(page, 0);
1376 }
1377
1378 /*
1379  * Return true if a page type of a given page is supported by hwpoison
1380  * mechanism (while handling could fail), otherwise false.  This function
1381  * does not return true for hugetlb or device memory pages, so it's assumed
1382  * to be called only in the context where we never have such pages.
1383  */
1384 static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
1385 {
1386         if (PageSlab(page))
1387                 return false;
1388
1389         /* Soft offline could migrate non-LRU movable pages */
1390         if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
1391                 return true;
1392
1393         return PageLRU(page) || is_free_buddy_page(page);
1394 }
1395
1396 static int __get_hwpoison_page(struct page *page, unsigned long flags)
1397 {
1398         struct folio *folio = page_folio(page);
1399         int ret = 0;
1400         bool hugetlb = false;
1401
1402         ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false);
1403         if (hugetlb) {
1404                 /* Make sure hugetlb demotion did not happen from under us. */
1405                 if (folio == page_folio(page))
1406                         return ret;
1407                 if (ret > 0) {
1408                         folio_put(folio);
1409                         folio = page_folio(page);
1410                 }
1411         }
1412
1413         /*
1414          * This check prevents from calling folio_try_get() for any
1415          * unsupported type of folio in order to reduce the risk of unexpected
1416          * races caused by taking a folio refcount.
1417          */
1418         if (!HWPoisonHandlable(&folio->page, flags))
1419                 return -EBUSY;
1420
1421         if (folio_try_get(folio)) {
1422                 if (folio == page_folio(page))
1423                         return 1;
1424
1425                 pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
1426                 folio_put(folio);
1427         }
1428
1429         return 0;
1430 }
1431
1432 static int get_any_page(struct page *p, unsigned long flags)
1433 {
1434         int ret = 0, pass = 0;
1435         bool count_increased = false;
1436
1437         if (flags & MF_COUNT_INCREASED)
1438                 count_increased = true;
1439
1440 try_again:
1441         if (!count_increased) {
1442                 ret = __get_hwpoison_page(p, flags);
1443                 if (!ret) {
1444                         if (page_count(p)) {
1445                                 /* We raced with an allocation, retry. */
1446                                 if (pass++ < 3)
1447                                         goto try_again;
1448                                 ret = -EBUSY;
1449                         } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
1450                                 /* We raced with put_page, retry. */
1451                                 if (pass++ < 3)
1452                                         goto try_again;
1453                                 ret = -EIO;
1454                         }
1455                         goto out;
1456                 } else if (ret == -EBUSY) {
1457                         /*
1458                          * We raced with (possibly temporary) unhandlable
1459                          * page, retry.
1460                          */
1461                         if (pass++ < 3) {
1462                                 shake_page(p);
1463                                 goto try_again;
1464                         }
1465                         ret = -EIO;
1466                         goto out;
1467                 }
1468         }
1469
1470         if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
1471                 ret = 1;
1472         } else {
1473                 /*
1474                  * A page we cannot handle. Check whether we can turn
1475                  * it into something we can handle.
1476                  */
1477                 if (pass++ < 3) {
1478                         put_page(p);
1479                         shake_page(p);
1480                         count_increased = false;
1481                         goto try_again;
1482                 }
1483                 put_page(p);
1484                 ret = -EIO;
1485         }
1486 out:
1487         if (ret == -EIO)
1488                 pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
1489
1490         return ret;
1491 }
1492
1493 static int __get_unpoison_page(struct page *page)
1494 {
1495         struct folio *folio = page_folio(page);
1496         int ret = 0;
1497         bool hugetlb = false;
1498
1499         ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true);
1500         if (hugetlb) {
1501                 /* Make sure hugetlb demotion did not happen from under us. */
1502                 if (folio == page_folio(page))
1503                         return ret;
1504                 if (ret > 0)
1505                         folio_put(folio);
1506         }
1507
1508         /*
1509          * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
1510          * but also isolated from buddy freelist, so need to identify the
1511          * state and have to cancel both operations to unpoison.
1512          */
1513         if (PageHWPoisonTakenOff(page))
1514                 return -EHWPOISON;
1515
1516         return get_page_unless_zero(page) ? 1 : 0;
1517 }
1518
1519 /**
1520  * get_hwpoison_page() - Get refcount for memory error handling
1521  * @p:          Raw error page (hit by memory error)
1522  * @flags:      Flags controlling behavior of error handling
1523  *
1524  * get_hwpoison_page() takes a page refcount of an error page to handle memory
1525  * error on it, after checking that the error page is in a well-defined state
1526  * (defined as a page-type we can successfully handle the memory error on it,
1527  * such as LRU page and hugetlb page).
1528  *
1529  * Memory error handling could be triggered at any time on any type of page,
1530  * so it's prone to race with typical memory management lifecycle (like
1531  * allocation and free).  So to avoid such races, get_hwpoison_page() takes
1532  * extra care for the error page's state (as done in __get_hwpoison_page()),
1533  * and has some retry logic in get_any_page().
1534  *
1535  * When called from unpoison_memory(), the caller should already ensure that
1536  * the given page has PG_hwpoison. So it's never reused for other page
1537  * allocations, and __get_unpoison_page() never races with them.
1538  *
1539  * Return: 0 on failure,
1540  *         1 on success for in-use pages in a well-defined state,
1541  *         -EIO for pages on which we can not handle memory errors,
1542  *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
1543  *         operations like allocation and free,
1544  *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
1545  */
1546 static int get_hwpoison_page(struct page *p, unsigned long flags)
1547 {
1548         int ret;
1549
1550         zone_pcp_disable(page_zone(p));
1551         if (flags & MF_UNPOISON)
1552                 ret = __get_unpoison_page(p);
1553         else
1554                 ret = get_any_page(p, flags);
1555         zone_pcp_enable(page_zone(p));
1556
1557         return ret;
1558 }
1559
1560 /*
1561  * Do all that is necessary to remove user space mappings. Unmap
1562  * the pages and send SIGBUS to the processes if the data was dirty.
1563  */
1564 static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
1565                 unsigned long pfn, int flags)
1566 {
1567         enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
1568         struct address_space *mapping;
1569         LIST_HEAD(tokill);
1570         bool unmap_success;
1571         int forcekill;
1572         bool mlocked = folio_test_mlocked(folio);
1573
1574         /*
1575          * Here we are interested only in user-mapped pages, so skip any
1576          * other types of pages.
1577          */
1578         if (folio_test_reserved(folio) || folio_test_slab(folio) ||
1579             folio_test_pgtable(folio) || folio_test_offline(folio))
1580                 return true;
1581         if (!(folio_test_lru(folio) || folio_test_hugetlb(folio)))
1582                 return true;
1583
1584         /*
1585          * This check implies we don't kill processes if their pages
1586          * are in the swap cache early. Those are always late kills.
1587          */
1588         if (!page_mapped(p))
1589                 return true;
1590
1591         if (folio_test_swapcache(folio)) {
1592                 pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
1593                 ttu &= ~TTU_HWPOISON;
1594         }
1595
1596         /*
1597          * Propagate the dirty bit from PTEs to struct page first, because we
1598          * need this to decide if we should kill or just drop the page.
1599          * XXX: the dirty test could be racy: set_page_dirty() may not always
1600          * be called inside page lock (it's recommended but not enforced).
1601          */
1602         mapping = folio_mapping(folio);
1603         if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
1604             mapping_can_writeback(mapping)) {
1605                 if (folio_mkclean(folio)) {
1606                         folio_set_dirty(folio);
1607                 } else {
1608                         ttu &= ~TTU_HWPOISON;
1609                         pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
1610                                 pfn);
1611                 }
1612         }
1613
1614         /*
1615          * First collect all the processes that have the page
1616          * mapped in dirty form.  This has to be done before try_to_unmap,
1617          * because ttu takes the rmap data structures down.
1618          */
1619         collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
1620
1621         if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
1622                 /*
1623                  * For hugetlb pages in shared mappings, try_to_unmap
1624                  * could potentially call huge_pmd_unshare.  Because of
1625                  * this, take semaphore in write mode here and set
1626                  * TTU_RMAP_LOCKED to indicate we have taken the lock
1627                  * at this higher level.
1628                  */
1629                 mapping = hugetlb_folio_mapping_lock_write(folio);
1630                 if (mapping) {
1631                         try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
1632                         i_mmap_unlock_write(mapping);
1633                 } else
1634                         pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
1635         } else {
1636                 try_to_unmap(folio, ttu);
1637         }
1638
1639         unmap_success = !page_mapped(p);
1640         if (!unmap_success)
1641                 pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
1642                        pfn, folio_mapcount(page_folio(p)));
1643
1644         /*
1645          * try_to_unmap() might put mlocked page in lru cache, so call
1646          * shake_page() again to ensure that it's flushed.
1647          */
1648         if (mlocked)
1649                 shake_folio(folio);
1650
1651         /*
1652          * Now that the dirty bit has been propagated to the
1653          * struct page and all unmaps done we can decide if
1654          * killing is needed or not.  Only kill when the page
1655          * was dirty or the process is not restartable,
1656          * otherwise the tokill list is merely
1657          * freed.  When there was a problem unmapping earlier
1658          * use a more force-full uncatchable kill to prevent
1659          * any accesses to the poisoned memory.
1660          */
1661         forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) ||
1662                     !unmap_success;
1663         kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1664
1665         return unmap_success;
1666 }
1667
1668 static int identify_page_state(unsigned long pfn, struct page *p,
1669                                 unsigned long page_flags)
1670 {
1671         struct page_state *ps;
1672
1673         /*
1674          * The first check uses the current page flags which may not have any
1675          * relevant information. The second check with the saved page flags is
1676          * carried out only if the first check can't determine the page status.
1677          */
1678         for (ps = error_states;; ps++)
1679                 if ((p->flags & ps->mask) == ps->res)
1680                         break;
1681
1682         page_flags |= (p->flags & (1UL << PG_dirty));
1683
1684         if (!ps->mask)
1685                 for (ps = error_states;; ps++)
1686                         if ((page_flags & ps->mask) == ps->res)
1687                                 break;
1688         return page_action(ps, p, pfn);
1689 }
1690
/*
 * Split the huge page containing @page while holding its page lock.
 *
 * Return: the split_huge_page() result.  On a non-zero result one
 * reference on @page is dropped before returning.
 */
static int try_to_split_thp_page(struct page *page)
{
        int err;

        lock_page(page);
        err = split_huge_page(page);
        unlock_page(page);

        if (!unlikely(err))
                return err;

        put_page(page);
        return err;
}
1704
1705 static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
1706                 struct address_space *mapping, pgoff_t index, int flags)
1707 {
1708         struct to_kill *tk;
1709         unsigned long size = 0;
1710
1711         list_for_each_entry(tk, to_kill, nd)
1712                 if (tk->size_shift)
1713                         size = max(size, 1UL << tk->size_shift);
1714
1715         if (size) {
1716                 /*
1717                  * Unmap the largest mapping to avoid breaking up device-dax
1718                  * mappings which are constant size. The actual size of the
1719                  * mapping being torn down is communicated in siginfo, see
1720                  * kill_proc()
1721                  */
1722                 loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);
1723
1724                 unmap_mapping_range(mapping, start, size, 0);
1725         }
1726
1727         kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
1728 }
1729
1730 /*
1731  * Only dev_pagemap pages get here, such as fsdax when the filesystem
1732  * either do not claim or fails to claim a hwpoison event, or devdax.
1733  * The fsdax pages are initialized per base page, and the devdax pages
1734  * could be initialized either as base pages, or as compound pages with
1735  * vmemmap optimization enabled. Devdax is simplistic in its dealing with
1736  * hwpoison, such that, if a subpage of a compound page is poisoned,
1737  * simply mark the compound head page is by far sufficient.
1738  */
1739 static int mf_generic_kill_procs(unsigned long long pfn, int flags,
1740                 struct dev_pagemap *pgmap)
1741 {
1742         struct folio *folio = pfn_folio(pfn);
1743         LIST_HEAD(to_kill);
1744         dax_entry_t cookie;
1745         int rc = 0;
1746
1747         /*
1748          * Prevent the inode from being freed while we are interrogating
1749          * the address_space, typically this would be handled by
1750          * lock_page(), but dax pages do not use the page lock. This
1751          * also prevents changes to the mapping of this pfn until
1752          * poison signaling is complete.
1753          */
1754         cookie = dax_lock_folio(folio);
1755         if (!cookie)
1756                 return -EBUSY;
1757
1758         if (hwpoison_filter(&folio->page)) {
1759                 rc = -EOPNOTSUPP;
1760                 goto unlock;
1761         }
1762
1763         switch (pgmap->type) {
1764         case MEMORY_DEVICE_PRIVATE:
1765         case MEMORY_DEVICE_COHERENT:
1766                 /*
1767                  * TODO: Handle device pages which may need coordination
1768                  * with device-side memory.
1769                  */
1770                 rc = -ENXIO;
1771                 goto unlock;
1772         default:
1773                 break;
1774         }
1775
1776         /*
1777          * Use this flag as an indication that the dax page has been
1778          * remapped UC to prevent speculative consumption of poison.
1779          */
1780         SetPageHWPoison(&folio->page);
1781
1782         /*
1783          * Unlike System-RAM there is no possibility to swap in a
1784          * different physical page at a given virtual address, so all
1785          * userspace consumption of ZONE_DEVICE memory necessitates
1786          * SIGBUS (i.e. MF_MUST_KILL)
1787          */
1788         flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1789         collect_procs(folio, &folio->page, &to_kill, true);
1790
1791         unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags);
1792 unlock:
1793         dax_unlock_folio(folio, cookie);
1794         return rc;
1795 }
1796
1797 #ifdef CONFIG_FS_DAX
1798 /**
1799  * mf_dax_kill_procs - Collect and kill processes who are using this file range
1800  * @mapping:    address_space of the file in use
1801  * @index:      start pgoff of the range within the file
1802  * @count:      length of the range, in unit of PAGE_SIZE
1803  * @mf_flags:   memory failure flags
1804  */
1805 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
1806                 unsigned long count, int mf_flags)
1807 {
1808         LIST_HEAD(to_kill);
1809         dax_entry_t cookie;
1810         struct page *page;
1811         size_t end = index + count;
1812         bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
1813
1814         mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1815
1816         for (; index < end; index++) {
1817                 page = NULL;
1818                 cookie = dax_lock_mapping_entry(mapping, index, &page);
1819                 if (!cookie)
1820                         return -EBUSY;
1821                 if (!page)
1822                         goto unlock;
1823
1824                 if (!pre_remove)
1825                         SetPageHWPoison(page);
1826
1827                 /*
1828                  * The pre_remove case is revoking access, the memory is still
1829                  * good and could theoretically be put back into service.
1830                  */
1831                 collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
1832                 unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
1833                                 index, mf_flags);
1834 unlock:
1835                 dax_unlock_mapping_entry(mapping, index, cookie);
1836         }
1837         return 0;
1838 }
1839 EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
1840 #endif /* CONFIG_FS_DAX */
1841
1842 #ifdef CONFIG_HUGETLB_PAGE
1843
/*
 * Struct raw_hwp_page represents information about "raw error page",
 * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
 * One entry is allocated per page an error was reported on and is freed
 * again by __folio_free_raw_hwp().
 */
struct raw_hwp_page {
        /* link in the folio's raw error list */
        struct llist_node node;
        /* the page inside the hugetlb folio that the error was reported on */
        struct page *page;
};
1852
1853 static inline struct llist_head *raw_hwp_list_head(struct folio *folio)
1854 {
1855         return (struct llist_head *)&folio->_hugetlb_hwpoison;
1856 }
1857
1858 bool is_raw_hwpoison_page_in_hugepage(struct page *page)
1859 {
1860         struct llist_head *raw_hwp_head;
1861         struct raw_hwp_page *p;
1862         struct folio *folio = page_folio(page);
1863         bool ret = false;
1864
1865         if (!folio_test_hwpoison(folio))
1866                 return false;
1867
1868         if (!folio_test_hugetlb(folio))
1869                 return PageHWPoison(page);
1870
1871         /*
1872          * When RawHwpUnreliable is set, kernel lost track of which subpages
1873          * are HWPOISON. So return as if ALL subpages are HWPOISONed.
1874          */
1875         if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1876                 return true;
1877
1878         mutex_lock(&mf_mutex);
1879
1880         raw_hwp_head = raw_hwp_list_head(folio);
1881         llist_for_each_entry(p, raw_hwp_head->first, node) {
1882                 if (page == p->page) {
1883                         ret = true;
1884                         break;
1885                 }
1886         }
1887
1888         mutex_unlock(&mf_mutex);
1889
1890         return ret;
1891 }
1892
1893 static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
1894 {
1895         struct llist_node *head;
1896         struct raw_hwp_page *p, *next;
1897         unsigned long count = 0;
1898
1899         head = llist_del_all(raw_hwp_list_head(folio));
1900         llist_for_each_entry_safe(p, next, head, node) {
1901                 if (move_flag)
1902                         SetPageHWPoison(p->page);
1903                 else
1904                         num_poisoned_pages_sub(page_to_pfn(p->page), 1);
1905                 kfree(p);
1906                 count++;
1907         }
1908         return count;
1909 }
1910
1911 static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
1912 {
1913         struct llist_head *head;
1914         struct raw_hwp_page *raw_hwp;
1915         struct raw_hwp_page *p, *next;
1916         int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
1917
1918         /*
1919          * Once the hwpoison hugepage has lost reliable raw error info,
1920          * there is little meaning to keep additional error info precisely,
1921          * so skip to add additional raw error info.
1922          */
1923         if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1924                 return -EHWPOISON;
1925         head = raw_hwp_list_head(folio);
1926         llist_for_each_entry_safe(p, next, head->first, node) {
1927                 if (p->page == page)
1928                         return -EHWPOISON;
1929         }
1930
1931         raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
1932         if (raw_hwp) {
1933                 raw_hwp->page = page;
1934                 llist_add(&raw_hwp->node, head);
1935                 /* the first error event will be counted in action_result(). */
1936                 if (ret)
1937                         num_poisoned_pages_inc(page_to_pfn(page));
1938         } else {
1939                 /*
1940                  * Failed to save raw error info.  We no longer trace all
1941                  * hwpoisoned subpages, and we need refuse to free/dissolve
1942                  * this hwpoisoned hugepage.
1943                  */
1944                 folio_set_hugetlb_raw_hwp_unreliable(folio);
1945                 /*
1946                  * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not
1947                  * used any more, so free it.
1948                  */
1949                 __folio_free_raw_hwp(folio, false);
1950         }
1951         return ret;
1952 }
1953
1954 static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
1955 {
1956         /*
1957          * hugetlb_vmemmap_optimized hugepages can't be freed because struct
1958          * pages for tail pages are required but they don't exist.
1959          */
1960         if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio))
1961                 return 0;
1962
1963         /*
1964          * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by
1965          * definition.
1966          */
1967         if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1968                 return 0;
1969
1970         return __folio_free_raw_hwp(folio, move_flag);
1971 }
1972
1973 void folio_clear_hugetlb_hwpoison(struct folio *folio)
1974 {
1975         if (folio_test_hugetlb_raw_hwp_unreliable(folio))
1976                 return;
1977         if (folio_test_hugetlb_vmemmap_optimized(folio))
1978                 return;
1979         folio_clear_hwpoison(folio);
1980         folio_free_raw_hwp(folio, true);
1981 }
1982
1983 /*
1984  * Called from hugetlb code with hugetlb_lock held.
1985  *
1986  * Return values:
1987  *   0             - free hugepage
1988  *   1             - in-use hugepage
1989  *   2             - not a hugepage
1990  *   -EBUSY        - the hugepage is busy (try to retry)
1991  *   -EHWPOISON    - the hugepage is already hwpoisoned
1992  */
1993 int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
1994                                  bool *migratable_cleared)
1995 {
1996         struct page *page = pfn_to_page(pfn);
1997         struct folio *folio = page_folio(page);
1998         int ret = 2;    /* fallback to normal page handling */
1999         bool count_increased = false;
2000
2001         if (!folio_test_hugetlb(folio))
2002                 goto out;
2003
2004         if (flags & MF_COUNT_INCREASED) {
2005                 ret = 1;
2006                 count_increased = true;
2007         } else if (folio_test_hugetlb_freed(folio)) {
2008                 ret = 0;
2009         } else if (folio_test_hugetlb_migratable(folio)) {
2010                 ret = folio_try_get(folio);
2011                 if (ret)
2012                         count_increased = true;
2013         } else {
2014                 ret = -EBUSY;
2015                 if (!(flags & MF_NO_RETRY))
2016                         goto out;
2017         }
2018
2019         if (folio_set_hugetlb_hwpoison(folio, page)) {
2020                 ret = -EHWPOISON;
2021                 goto out;
2022         }
2023
2024         /*
2025          * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them
2026          * from being migrated by memory hotremove.
2027          */
2028         if (count_increased && folio_test_hugetlb_migratable(folio)) {
2029                 folio_clear_hugetlb_migratable(folio);
2030                 *migratable_cleared = true;
2031         }
2032
2033         return ret;
2034 out:
2035         if (count_increased)
2036                 folio_put(folio);
2037         return ret;
2038 }
2039
2040 /*
2041  * Taking refcount of hugetlb pages needs extra care about race conditions
2042  * with basic operations like hugepage allocation/free/demotion.
2043  * So some of prechecks for hwpoison (pinning, and testing/setting
2044  * PageHWPoison) should be done in single hugetlb_lock range.
2045  */
2046 static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
2047 {
2048         int res;
2049         struct page *p = pfn_to_page(pfn);
2050         struct folio *folio;
2051         unsigned long page_flags;
2052         bool migratable_cleared = false;
2053
2054         *hugetlb = 1;
2055 retry:
2056         res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
2057         if (res == 2) { /* fallback to normal page handling */
2058                 *hugetlb = 0;
2059                 return 0;
2060         } else if (res == -EHWPOISON) {
2061                 pr_err("%#lx: already hardware poisoned\n", pfn);
2062                 if (flags & MF_ACTION_REQUIRED) {
2063                         folio = page_folio(p);
2064                         res = kill_accessing_process(current, folio_pfn(folio), flags);
2065                 }
2066                 return res;
2067         } else if (res == -EBUSY) {
2068                 if (!(flags & MF_NO_RETRY)) {
2069                         flags |= MF_NO_RETRY;
2070                         goto retry;
2071                 }
2072                 return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
2073         }
2074
2075         folio = page_folio(p);
2076         folio_lock(folio);
2077
2078         if (hwpoison_filter(p)) {
2079                 folio_clear_hugetlb_hwpoison(folio);
2080                 if (migratable_cleared)
2081                         folio_set_hugetlb_migratable(folio);
2082                 folio_unlock(folio);
2083                 if (res == 1)
2084                         folio_put(folio);
2085                 return -EOPNOTSUPP;
2086         }
2087
2088         /*
2089          * Handling free hugepage.  The possible race with hugepage allocation
2090          * or demotion can be prevented by PageHWPoison flag.
2091          */
2092         if (res == 0) {
2093                 folio_unlock(folio);
2094                 if (__page_handle_poison(p) >= 0) {
2095                         page_ref_inc(p);
2096                         res = MF_RECOVERED;
2097                 } else {
2098                         res = MF_FAILED;
2099                 }
2100                 return action_result(pfn, MF_MSG_FREE_HUGE, res);
2101         }
2102
2103         page_flags = folio->flags;
2104
2105         if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
2106                 folio_unlock(folio);
2107                 return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
2108         }
2109
2110         return identify_page_state(pfn, p, page_flags);
2111 }
2112
2113 #else
/*
 * Stub for builds without CONFIG_HUGETLB_PAGE: nothing is handled here.
 * Returns 0 and leaves *hugetlb as the caller initialized it.
 */
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
        return 0;
}
2118
/*
 * Stub for builds without CONFIG_HUGETLB_PAGE: there is no raw error list,
 * so no entries are ever freed.
 */
static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag)
{
        return 0;
}
2123 #endif  /* CONFIG_HUGETLB_PAGE */
2124
2125 /* Drop the extra refcount in case we come from madvise() */
2126 static void put_ref_page(unsigned long pfn, int flags)
2127 {
2128         struct page *page;
2129
2130         if (!(flags & MF_COUNT_INCREASED))
2131                 return;
2132
2133         page = pfn_to_page(pfn);
2134         if (page)
2135                 put_page(page);
2136 }
2137
2138 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
2139                 struct dev_pagemap *pgmap)
2140 {
2141         int rc = -ENXIO;
2142
2143         /* device metadata space is not recoverable */
2144         if (!pgmap_pfn_valid(pgmap, pfn))
2145                 goto out;
2146
2147         /*
2148          * Call driver's implementation to handle the memory failure, otherwise
2149          * fall back to generic handler.
2150          */
2151         if (pgmap_has_memory_failure(pgmap)) {
2152                 rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
2153                 /*
2154                  * Fall back to generic handler too if operation is not
2155                  * supported inside the driver/device/filesystem.
2156                  */
2157                 if (rc != -EOPNOTSUPP)
2158                         goto out;
2159         }
2160
2161         rc = mf_generic_kill_procs(pfn, flags, pgmap);
2162 out:
2163         /* drop pgmap ref acquired in caller */
2164         put_dev_pagemap(pgmap);
2165         if (rc != -EOPNOTSUPP)
2166                 action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
2167         return rc;
2168 }
2169
/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 *
 * The whole handling runs under mf_mutex, which is released on every
 * return path.  If the memory_failure_recovery sysctl is off the function
 * panics instead of attempting recovery.
 *
 * Return: 0 for successfully handled the memory error,
 *         -EOPNOTSUPP for hwpoison_filter() filtered the error event,
 *         < 0(except -EOPNOTSUPP) on failure.
 */
int memory_failure(unsigned long pfn, int flags)
{
        struct page *p;
        struct folio *folio;
        struct dev_pagemap *pgmap;
        int res = 0;
        unsigned long page_flags;
        /* One retry is allowed when a race with the allocator/THP is lost. */
        bool retry = true;
        int hugetlb = 0;

        if (!sysctl_memory_failure_recovery)
                panic("Memory failure on page %lx", pfn);

        mutex_lock(&mf_mutex);

        /* Remember that a real (not software-simulated) error was seen. */
        if (!(flags & MF_SW_SIMULATED))
                hw_memory_failure = true;

        p = pfn_to_online_page(pfn);
        if (!p) {
                /*
                 * Not an online page: let the architecture try first, then
                 * fall back to dev_pagemap handling for valid pfns.
                 */
                res = arch_memory_failure(pfn, flags);
                if (res == 0)
                        goto unlock_mutex;

                if (pfn_valid(pfn)) {
                        pgmap = get_dev_pagemap(pfn, NULL);
                        put_ref_page(pfn, flags);
                        if (pgmap) {
                                /* The pgmap reference is dropped by the callee. */
                                res = memory_failure_dev_pagemap(pfn, flags,
                                                                 pgmap);
                                goto unlock_mutex;
                        }
                }
                pr_err("%#lx: memory outside kernel control\n", pfn);
                res = -ENXIO;
                goto unlock_mutex;
        }

try_again:
        /*
         * hugetlb pages are handled completely by the helper, which reports
         * through @hugetlb whether it took care of this pfn.
         */
        res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
        if (hugetlb)
                goto unlock_mutex;

        if (TestSetPageHWPoison(p)) {
                pr_err("%#lx: already hardware poisoned\n", pfn);
                res = -EHWPOISON;
                if (flags & MF_ACTION_REQUIRED)
                        res = kill_accessing_process(current, pfn, flags);
                /* Drop the reference the caller handed in. */
                if (flags & MF_COUNT_INCREASED)
                        put_page(p);
                goto unlock_mutex;
        }

        /*
         * We need/can do nothing about count=0 pages.
         * 1) it's a free page, and therefore in safe hand:
         *    check_new_page() will be the gate keeper.
         * 2) it's part of a non-compound high order page.
         *    Implies some kernel user: cannot stop them from
         *    R/W the page; let's pray that the page has been
         *    used and will be freed some time later.
         * In fact it's dangerous to directly bump up page count from 0,
         * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
         */
        if (!(flags & MF_COUNT_INCREASED)) {
                res = get_hwpoison_page(p, flags);
                if (!res) {
                        if (is_free_buddy_page(p)) {
                                if (take_page_off_buddy(p)) {
                                        /* Pin the page only once it is off the buddy list. */
                                        page_ref_inc(p);
                                        res = MF_RECOVERED;
                                } else {
                                        /* We lost the race, try again */
                                        if (retry) {
                                                ClearPageHWPoison(p);
                                                retry = false;
                                                goto try_again;
                                        }
                                        res = MF_FAILED;
                                }
                                res = action_result(pfn, MF_MSG_BUDDY, res);
                        } else {
                                res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
                        }
                        goto unlock_mutex;
                } else if (res < 0) {
                        res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
                        goto unlock_mutex;
                }
        }

        folio = page_folio(p);
        if (folio_test_large(folio)) {
                /*
                 * The flag must be set after the refcount is bumped
                 * otherwise it may race with THP split.
                 * And the flag can't be set in get_hwpoison_page() since
                 * it is called by soft offline too and it is just called
                 * for !MF_COUNT_INCREASED.  So here seems to be the best
                 * place.
                 *
                 * Don't need care about the above error handling paths for
                 * get_hwpoison_page() since they handle either free page
                 * or unhandlable page.  The refcount is bumped iff the
                 * page is a valid handlable page.
                 */
                folio_set_has_hwpoisoned(folio);
                /* On failure the helper has already dropped the page reference. */
                if (try_to_split_thp_page(p) < 0) {
                        res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
                        goto unlock_mutex;
                }
                VM_BUG_ON_PAGE(!page_count(p), p);
                /* The split changed which folio @p belongs to; look it up again. */
                folio = page_folio(p);
        }

        /*
         * We ignore non-LRU pages for good reasons.
         * - PG_locked is only well defined for LRU pages and a few others
         * - to avoid races with __SetPageLocked()
         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
        shake_folio(folio);

        folio_lock(folio);

        /*
         * We're only intended to deal with the non-Compound page here.
         * However, the page could have changed compound pages due to
         * race window. If this happens, we could try again to hopefully
         * handle the page next round.
         */
        if (folio_test_large(folio)) {
                if (retry) {
                        ClearPageHWPoison(p);
                        folio_unlock(folio);
                        folio_put(folio);
                        /*
                         * The reference was just dropped, so the next round
                         * must take its own via get_hwpoison_page().
                         */
                        flags &= ~MF_COUNT_INCREASED;
                        retry = false;
                        goto try_again;
                }
                res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
                goto unlock_page;
        }

        /*
         * We use page flags to determine what action should be taken, but
         * the flags can be modified by the error containment action.  One
         * example is an mlocked page, where PG_mlocked is cleared by
         * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page
         * status correctly, we save a copy of the page flags at this time.
         */
        page_flags = folio->flags;

        if (hwpoison_filter(p)) {
                /* Filtered: undo the poison mark and drop lock and reference. */
                ClearPageHWPoison(p);
                folio_unlock(folio);
                folio_put(folio);
                res = -EOPNOTSUPP;
                goto unlock_mutex;
        }

        /*
         * __munlock_folio() may clear a writeback folio's LRU flag without
         * the folio lock. We need to wait for writeback completion for this
         * folio or it may trigger a vfs BUG while evicting inode.
         */
        if (!folio_test_lru(folio) && !folio_test_writeback(folio))
                goto identify_page_state;

        /*
         * It's very difficult to mess with pages currently under IO
         * and in many cases impossible, so we just avoid it here.
         */
        folio_wait_writeback(folio);

        /*
         * Now take care of user space mappings.
         * Abort on fail: __filemap_remove_folio() assumes unmapped page.
         */
        if (!hwpoison_user_mappings(folio, p, pfn, flags)) {
                res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                goto unlock_page;
        }

        /*
         * Torn down by someone else?
         */
        if (folio_test_lru(folio) && !folio_test_swapcache(folio) &&
            folio->mapping == NULL) {
                res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
                goto unlock_page;
        }

identify_page_state:
        /*
         * NOTE(review): the folio lock is not released on this path here;
         * presumably that happens inside identify_page_state()/page_action(),
         * which are outside this block - confirm.
         */
        res = identify_page_state(pfn, p, page_flags);
        mutex_unlock(&mf_mutex);
        return res;
unlock_page:
        folio_unlock(folio);
unlock_mutex:
        mutex_unlock(&mf_mutex);
        return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
2397
/* Each per-CPU queue holds 1 << MEMORY_FAILURE_FIFO_ORDER (i.e. 16) entries. */
#define MEMORY_FAILURE_FIFO_ORDER       4
#define MEMORY_FAILURE_FIFO_SIZE        (1 << MEMORY_FAILURE_FIFO_ORDER)

/*
 * One queued event: the arguments of the deferred memory_failure() or
 * soft_offline_page() call made by memory_failure_work_func().
 */
struct memory_failure_entry {
        unsigned long pfn;
        int flags;
};

/* Per-CPU queue of pending memory failure events. */
struct memory_failure_cpu {
        /* pending events, filled by memory_failure_queue() */
        DECLARE_KFIFO(fifo, struct memory_failure_entry,
                      MEMORY_FAILURE_FIFO_SIZE);
        /* taken with interrupts disabled around every fifo put/get */
        spinlock_t lock;
        /* work item passed to memory_failure_work_func() to drain the fifo */
        struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
2414
2415 /**
2416  * memory_failure_queue - Schedule handling memory failure of a page.
2417  * @pfn: Page Number of the corrupted page
2418  * @flags: Flags for memory failure handling
2419  *
2420  * This function is called by the low level hardware error handler
2421  * when it detects hardware memory corruption of a page. It schedules
2422  * the recovering of error page, including dropping pages, killing
2423  * processes etc.
2424  *
2425  * The function is primarily of use for corruptions that
2426  * happen outside the current execution context (e.g. when
2427  * detected by a background scrubber)
2428  *
2429  * Can run in IRQ context.
2430  */
2431 void memory_failure_queue(unsigned long pfn, int flags)
2432 {
2433         struct memory_failure_cpu *mf_cpu;
2434         unsigned long proc_flags;
2435         struct memory_failure_entry entry = {
2436                 .pfn =          pfn,
2437                 .flags =        flags,
2438         };
2439
2440         mf_cpu = &get_cpu_var(memory_failure_cpu);
2441         spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2442         if (kfifo_put(&mf_cpu->fifo, entry))
2443                 schedule_work_on(smp_processor_id(), &mf_cpu->work);
2444         else
2445                 pr_err("buffer overflow when queuing memory failure at %#lx\n",
2446                        pfn);
2447         spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2448         put_cpu_var(memory_failure_cpu);
2449 }
2450 EXPORT_SYMBOL_GPL(memory_failure_queue);
2451
2452 static void memory_failure_work_func(struct work_struct *work)
2453 {
2454         struct memory_failure_cpu *mf_cpu;
2455         struct memory_failure_entry entry = { 0, };
2456         unsigned long proc_flags;
2457         int gotten;
2458
2459         mf_cpu = container_of(work, struct memory_failure_cpu, work);
2460         for (;;) {
2461                 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
2462                 gotten = kfifo_get(&mf_cpu->fifo, &entry);
2463                 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
2464                 if (!gotten)
2465                         break;
2466                 if (entry.flags & MF_SOFT_OFFLINE)
2467                         soft_offline_page(entry.pfn, entry.flags);
2468                 else
2469                         memory_failure(entry.pfn, entry.flags);
2470         }
2471 }
2472
2473 /*
2474  * Process memory_failure work queued on the specified CPU.
2475  * Used to avoid return-to-userspace racing with the memory_failure workqueue.
2476  */
2477 void memory_failure_queue_kick(int cpu)
2478 {
2479         struct memory_failure_cpu *mf_cpu;
2480
2481         mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2482         cancel_work_sync(&mf_cpu->work);
2483         memory_failure_work_func(&mf_cpu->work);
2484 }
2485
2486 static int __init memory_failure_init(void)
2487 {
2488         struct memory_failure_cpu *mf_cpu;
2489         int cpu;
2490
2491         for_each_possible_cpu(cpu) {
2492                 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
2493                 spin_lock_init(&mf_cpu->lock);
2494                 INIT_KFIFO(mf_cpu->fifo);
2495                 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
2496         }
2497
2498         register_sysctl_init("vm", memory_failure_table);
2499
2500         return 0;
2501 }
2502 core_initcall(memory_failure_init);
2503
/* From here on, log messages no longer carry the "Memory failure: " prefix. */
#undef pr_fmt
#define pr_fmt(fmt)	"" fmt

/*
 * Log an info message whose format takes @pfn as its only argument, unless
 * the ratelimit state @rs says the message should be suppressed.
 */
#define unpoison_pr_info(fmt, pfn, rs)			\
do {							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
} while (0)
2511
2512 /**
2513  * unpoison_memory - Unpoison a previously poisoned page
2514  * @pfn: Page number of the to be unpoisoned page
2515  *
2516  * Software-unpoison a page that has been poisoned by
2517  * memory_failure() earlier.
2518  *
2519  * This is only done on the software-level, so it only works
2520  * for linux injected failures, not real hardware failures
2521  *
2522  * Returns 0 for success, otherwise -errno.
2523  */
2524 int unpoison_memory(unsigned long pfn)
2525 {
2526         struct folio *folio;
2527         struct page *p;
2528         int ret = -EBUSY, ghp;
2529         unsigned long count = 1;
2530         bool huge = false;
2531         static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
2532                                         DEFAULT_RATELIMIT_BURST);
2533
2534         if (!pfn_valid(pfn))
2535                 return -ENXIO;
2536
2537         p = pfn_to_page(pfn);
2538         folio = page_folio(p);
2539
2540         mutex_lock(&mf_mutex);
2541
2542         if (hw_memory_failure) {
2543                 unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
2544                                  pfn, &unpoison_rs);
2545                 ret = -EOPNOTSUPP;
2546                 goto unlock_mutex;
2547         }
2548
2549         if (!PageHWPoison(p)) {
2550                 unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
2551                                  pfn, &unpoison_rs);
2552                 goto unlock_mutex;
2553         }
2554
2555         if (folio_ref_count(folio) > 1) {
2556                 unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
2557                                  pfn, &unpoison_rs);
2558                 goto unlock_mutex;
2559         }
2560
2561         if (folio_test_slab(folio) || folio_test_pgtable(folio) ||
2562             folio_test_reserved(folio) || folio_test_offline(folio))
2563                 goto unlock_mutex;
2564
2565         /*
2566          * Note that folio->_mapcount is overloaded in SLAB, so the simple test
2567          * in folio_mapped() has to be done after folio_test_slab() is checked.
2568          */
2569         if (folio_mapped(folio)) {
2570                 unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
2571                                  pfn, &unpoison_rs);
2572                 goto unlock_mutex;
2573         }
2574
2575         if (folio_mapping(folio)) {
2576                 unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
2577                                  pfn, &unpoison_rs);
2578                 goto unlock_mutex;
2579         }
2580
2581         ghp = get_hwpoison_page(p, MF_UNPOISON);
2582         if (!ghp) {
2583                 if (folio_test_hugetlb(folio)) {
2584                         huge = true;
2585                         count = folio_free_raw_hwp(folio, false);
2586                         if (count == 0)
2587                                 goto unlock_mutex;
2588                 }
2589                 ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
2590         } else if (ghp < 0) {
2591                 if (ghp == -EHWPOISON) {
2592                         ret = put_page_back_buddy(p) ? 0 : -EBUSY;
2593                 } else {
2594                         ret = ghp;
2595                         unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
2596                                          pfn, &unpoison_rs);
2597                 }
2598         } else {
2599                 if (folio_test_hugetlb(folio)) {
2600                         huge = true;
2601                         count = folio_free_raw_hwp(folio, false);
2602                         if (count == 0) {
2603                                 folio_put(folio);
2604                                 goto unlock_mutex;
2605                         }
2606                 }
2607
2608                 folio_put(folio);
2609                 if (TestClearPageHWPoison(p)) {
2610                         folio_put(folio);
2611                         ret = 0;
2612                 }
2613         }
2614
2615 unlock_mutex:
2616         mutex_unlock(&mf_mutex);
2617         if (!ret) {
2618                 if (!huge)
2619                         num_poisoned_pages_sub(pfn, 1);
2620                 unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
2621                                  page_to_pfn(p), &unpoison_rs);
2622         }
2623         return ret;
2624 }
2625 EXPORT_SYMBOL(unpoison_memory);
2626
2627 static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist)
2628 {
2629         bool isolated = false;
2630
2631         if (folio_test_hugetlb(folio)) {
2632                 isolated = isolate_hugetlb(folio, pagelist);
2633         } else {
2634                 bool lru = !__folio_test_movable(folio);
2635
2636                 if (lru)
2637                         isolated = folio_isolate_lru(folio);
2638                 else
2639                         isolated = isolate_movable_page(&folio->page,
2640                                                         ISOLATE_UNEVICTABLE);
2641
2642                 if (isolated) {
2643                         list_add(&folio->lru, pagelist);
2644                         if (lru)
2645                                 node_stat_add_folio(folio, NR_ISOLATED_ANON +
2646                                                     folio_is_file_lru(folio));
2647                 }
2648         }
2649
2650         /*
2651          * If we succeed to isolate the folio, we grabbed another refcount on
2652          * the folio, so we can safely drop the one we got from get_any_page().
2653          * If we failed to isolate the folio, it means that we cannot go further
2654          * and we will return an error, so drop the reference we got from
2655          * get_any_page() as well.
2656          */
2657         folio_put(folio);
2658         return isolated;
2659 }
2660
2661 /*
2662  * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
2663  * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
2664  * If the page is mapped, it migrates the contents over.
2665  */
2666 static int soft_offline_in_use_page(struct page *page)
2667 {
2668         long ret = 0;
2669         unsigned long pfn = page_to_pfn(page);
2670         struct folio *folio = page_folio(page);
2671         char const *msg_page[] = {"page", "hugepage"};
2672         bool huge = folio_test_hugetlb(folio);
2673         LIST_HEAD(pagelist);
2674         struct migration_target_control mtc = {
2675                 .nid = NUMA_NO_NODE,
2676                 .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
2677                 .reason = MR_MEMORY_FAILURE,
2678         };
2679
2680         if (!huge && folio_test_large(folio)) {
2681                 if (try_to_split_thp_page(page)) {
2682                         pr_info("soft offline: %#lx: thp split failed\n", pfn);
2683                         return -EBUSY;
2684                 }
2685                 folio = page_folio(page);
2686         }
2687
2688         folio_lock(folio);
2689         if (!huge)
2690                 folio_wait_writeback(folio);
2691         if (PageHWPoison(page)) {
2692                 folio_unlock(folio);
2693                 folio_put(folio);
2694                 pr_info("soft offline: %#lx page already poisoned\n", pfn);
2695                 return 0;
2696         }
2697
2698         if (!huge && folio_test_lru(folio) && !folio_test_swapcache(folio))
2699                 /*
2700                  * Try to invalidate first. This should work for
2701                  * non dirty unmapped page cache pages.
2702                  */
2703                 ret = mapping_evict_folio(folio_mapping(folio), folio);
2704         folio_unlock(folio);
2705
2706         if (ret) {
2707                 pr_info("soft_offline: %#lx: invalidated\n", pfn);
2708                 page_handle_poison(page, false, true);
2709                 return 0;
2710         }
2711
2712         if (mf_isolate_folio(folio, &pagelist)) {
2713                 ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
2714                         (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
2715                 if (!ret) {
2716                         bool release = !huge;
2717
2718                         if (!page_handle_poison(page, huge, release))
2719                                 ret = -EBUSY;
2720                 } else {
2721                         if (!list_empty(&pagelist))
2722                                 putback_movable_pages(&pagelist);
2723
2724                         pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
2725                                 pfn, msg_page[huge], ret, &page->flags);
2726                         if (ret > 0)
2727                                 ret = -EBUSY;
2728                 }
2729         } else {
2730                 pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
2731                         pfn, msg_page[huge], page_count(page), &page->flags);
2732                 ret = -EBUSY;
2733         }
2734         return ret;
2735 }
2736
2737 /**
2738  * soft_offline_page - Soft offline a page.
2739  * @pfn: pfn to soft-offline
2740  * @flags: flags. Same as memory_failure().
2741  *
2742  * Returns 0 on success
2743  *         -EOPNOTSUPP for hwpoison_filter() filtered the error event
2744  *         < 0 otherwise negated errno.
2745  *
2746  * Soft offline a page, by migration or invalidation,
2747  * without killing anything. This is for the case when
2748  * a page is not corrupted yet (so it's still valid to access),
2749  * but has had a number of corrected errors and is better taken
2750  * out.
2751  *
2752  * The actual policy on when to do that is maintained by
2753  * user space.
2754  *
2755  * This should never impact any application or cause data loss,
2756  * however it might take some time.
2757  *
2758  * This is not a 100% solution for all memory, but tries to be
2759  * ``good enough'' for the majority of memory.
2760  */
2761 int soft_offline_page(unsigned long pfn, int flags)
2762 {
2763         int ret;
2764         bool try_again = true;
2765         struct page *page;
2766
2767         if (!pfn_valid(pfn)) {
2768                 WARN_ON_ONCE(flags & MF_COUNT_INCREASED);
2769                 return -ENXIO;
2770         }
2771
2772         /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
2773         page = pfn_to_online_page(pfn);
2774         if (!page) {
2775                 put_ref_page(pfn, flags);
2776                 return -EIO;
2777         }
2778
2779         mutex_lock(&mf_mutex);
2780
2781         if (PageHWPoison(page)) {
2782                 pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
2783                 put_ref_page(pfn, flags);
2784                 mutex_unlock(&mf_mutex);
2785                 return 0;
2786         }
2787
2788 retry:
2789         get_online_mems();
2790         ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
2791         put_online_mems();
2792
2793         if (hwpoison_filter(page)) {
2794                 if (ret > 0)
2795                         put_page(page);
2796
2797                 mutex_unlock(&mf_mutex);
2798                 return -EOPNOTSUPP;
2799         }
2800
2801         if (ret > 0) {
2802                 ret = soft_offline_in_use_page(page);
2803         } else if (ret == 0) {
2804                 if (!page_handle_poison(page, true, false)) {
2805                         if (try_again) {
2806                                 try_again = false;
2807                                 flags &= ~MF_COUNT_INCREASED;
2808                                 goto retry;
2809                         }
2810                         ret = -EBUSY;
2811                 }
2812         }
2813
2814         mutex_unlock(&mf_mutex);
2815
2816         return ret;
2817 }