mm/vmscan.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   4  *
   5  *  Swap reorganised 29.12.95, Stephen Tweedie.
   6  *  kswapd added: 7.1.96  sct
   7  *  Removed kswapd_ctl limits, and swap out as many pages as needed
   8  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
   9  *  Zone aware kswapd started 02/00, Kanoj Sarcar ([email protected]).
  10  *  Multiqueue VM started 5.8.00, Rik van Riel.
  11  */
  12
  13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15 #include <linux/mm.h>
  16 #include <linux/sched/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/gfp.h>
  19 #include <linux/kernel_stat.h>
  20 #include <linux/swap.h>
  21 #include <linux/pagemap.h>
  22 #include <linux/init.h>
  23 #include <linux/highmem.h>
  24 #include <linux/vmpressure.h>
  25 #include <linux/vmstat.h>
  26 #include <linux/file.h>
  27 #include <linux/writeback.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/buffer_head.h>  /* for buffer_heads_over_limit */
  30 #include <linux/mm_inline.h>
  31 #include <linux/backing-dev.h>
  32 #include <linux/rmap.h>
  33 #include <linux/topology.h>
  34 #include <linux/cpu.h>
  35 #include <linux/cpuset.h>
  36 #include <linux/compaction.h>
  37 #include <linux/notifier.h>
  38 #include <linux/delay.h>
  39 #include <linux/kthread.h>
  40 #include <linux/freezer.h>
  41 #include <linux/memcontrol.h>
  42 #include <linux/migrate.h>
  43 #include <linux/delayacct.h>
  44 #include <linux/sysctl.h>
  45 #include <linux/memory-tiers.h>
  46 #include <linux/oom.h>
  47 #include <linux/pagevec.h>
  48 #include <linux/prefetch.h>
  49 #include <linux/printk.h>
  50 #include <linux/dax.h>
  51 #include <linux/psi.h>
  52 #include <linux/pagewalk.h>
  53 #include <linux/shmem_fs.h>
  54 #include <linux/ctype.h>
  55 #include <linux/debugfs.h>
  56 #include <linux/khugepaged.h>
  57 #include <linux/rculist_nulls.h>
  58 #include <linux/random.h>
  59
  60 #include <asm/tlbflush.h>
  61 #include <asm/div64.h>
  62
  63 #include <linux/swapops.h>
  64 #include <linux/balloon_compaction.h>
  65 #include <linux/sched/sysctl.h>
  66
  67 #include "internal.h"
  68 #include "swap.h"
  69
  70 #define CREATE_TRACE_POINTS
  71 #include <trace/events/vmscan.h>
  72
  73 struct scan_control {
  74         /* How many pages shrink_list() should reclaim */
  75         unsigned long nr_to_reclaim;
  76
  77         /*
  78          * Nodemask of nodes allowed by the caller. If NULL, all nodes
  79          * are scanned.
  80          */
  81         nodemask_t      *nodemask;
  82
  83         /*
  84          * The memory cgroup that hit its limit and as a result is the
  85          * primary target of this reclaim invocation.
  86          */
  87         struct mem_cgroup *target_mem_cgroup;
  88
  89         /*
  90          * Scan pressure balancing between anon and file LRUs
  91          */
  92         unsigned long   anon_cost;
  93         unsigned long   file_cost;
  94
  95 #ifdef CONFIG_MEMCG
  96         /* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
  97         int *proactive_swappiness;
  98 #endif
  99
 100         /* Can active folios be deactivated as part of reclaim? */
 101 #define DEACTIVATE_ANON 1
 102 #define DEACTIVATE_FILE 2
 103         unsigned int may_deactivate:2;
 104         unsigned int force_deactivate:1;
 105         unsigned int skipped_deactivate:1;
 106
 107         /* Writepage batching in laptop mode; RECLAIM_WRITE */
 108         unsigned int may_writepage:1;
 109
 110         /* Can mapped folios be reclaimed? */
 111         unsigned int may_unmap:1;
 112
 113         /* Can folios be swapped as part of reclaim? */
 114         unsigned int may_swap:1;
 115
 116         /* Not allow cache_trim_mode to be turned on as part of reclaim? */
 117         unsigned int no_cache_trim_mode:1;
 118
 119         /* Has cache_trim_mode failed at least once? */
 120         unsigned int cache_trim_mode_failed:1;
 121
 122         /* Proactive reclaim invoked by userspace through memory.reclaim */
 123         unsigned int proactive:1;
 124
 125         /*
 126          * Cgroup memory below memory.low is protected as long as we
 127          * don't threaten to OOM. If any cgroup is reclaimed at
 128          * reduced force or passed over entirely due to its memory.low
 129          * setting (memcg_low_skipped), and nothing is reclaimed as a
 130          * result, then go back for one more cycle that reclaims the protected
 131          * memory (memcg_low_reclaim) to avert OOM.
 132          */
 133         unsigned int memcg_low_reclaim:1;
 134         unsigned int memcg_low_skipped:1;
 135
 136         /* Shared cgroup tree walk failed, rescan the whole tree */
 137         unsigned int memcg_full_walk:1;
 138
 139         unsigned int hibernation_mode:1;
 140
 141         /* One of the zones is ready for compaction */
 142         unsigned int compaction_ready:1;
 143
 144         /* There is easily reclaimable cold cache in the current node */
 145         unsigned int cache_trim_mode:1;
 146
 147         /* The file folios on the current node are dangerously low */
 148         unsigned int file_is_tiny:1;
 149
 150         /* Always discard instead of demoting to lower tier memory */
 151         unsigned int no_demotion:1;
 152
 153         /* Allocation order */
 154         s8 order;
 155
 156         /* Scan (total_size >> priority) pages at once */
 157         s8 priority;
 158
 159         /* The highest zone to isolate folios for reclaim from */
 160         s8 reclaim_idx;
 161
 162         /* This context's GFP mask */
 163         gfp_t gfp_mask;
 164
 165         /* Incremented by the number of inactive pages that were scanned */
 166         unsigned long nr_scanned;
 167
 168         /* Number of pages freed so far during a call to shrink_zones() */
 169         unsigned long nr_reclaimed;
 170
 171         struct {
 172                 unsigned int dirty;
 173                 unsigned int unqueued_dirty;
 174                 unsigned int congested;
 175                 unsigned int writeback;
 176                 unsigned int immediate;
 177                 unsigned int file_taken;
 178                 unsigned int taken;
 179         } nr;
 180
 181         /* for recording the reclaimed slab by now */
 182         struct reclaim_state reclaim_state;
 183 };
 184
 185 #ifdef ARCH_HAS_PREFETCHW
 186 #define prefetchw_prev_lru_folio(_folio, _base, _field)                 \
 187         do {                                                            \
 188                 if ((_folio)->lru.prev != _base) {                      \
 189                         struct folio *prev;                             \
 190                                                                         \
 191                         prev = lru_to_folio(&(_folio->lru));            \
 192                         prefetchw(&prev->_field);                       \
 193                 }                                                       \
 194         } while (0)
 195 #else
 196 #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
 197 #endif
 198
 199 /*
 200  * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
 201  */
 202 int vm_swappiness = 60;
 203
 204 #ifdef CONFIG_MEMCG
 205
 206 /* Returns true for reclaim through cgroup limits or cgroup interfaces. */
 207 static bool cgroup_reclaim(struct scan_control *sc)
 208 {
 209         return sc->target_mem_cgroup;
 210 }
 211
 212 /*
 213  * Returns true for reclaim on the root cgroup. This is true for direct
 214  * allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
 215  */
 216 static bool root_reclaim(struct scan_control *sc)
 217 {
 218         return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
 219 }
 220
 221 /**
 222  * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 223  * @sc: scan_control in question
 224  *
 225  * The normal page dirty throttling mechanism in balance_dirty_pages() is
 226  * completely broken with the legacy memcg and direct stalling in
 227  * shrink_folio_list() is used for throttling instead, which lacks all the
 228  * niceties such as fairness, adaptive pausing, bandwidth proportional
 229  * allocation and configurability.
 230  *
 231  * This function tests whether the vmscan currently in progress can assume
 232  * that the normal dirty throttling mechanism is operational.
 233  */
 234 static bool writeback_throttling_sane(struct scan_control *sc)
 235 {
 236         if (!cgroup_reclaim(sc))
 237                 return true;
 238 #ifdef CONFIG_CGROUP_WRITEBACK
 239         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 240                 return true;
 241 #endif
 242         return false;
 243 }
 244
 245 static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
 246 {
 247         if (sc->proactive && sc->proactive_swappiness)
 248                 return *sc->proactive_swappiness;
 249         return mem_cgroup_swappiness(memcg);
 250 }
 251 #else
 252 static bool cgroup_reclaim(struct scan_control *sc)
 253 {
 254         return false;
 255 }
 256
 257 static bool root_reclaim(struct scan_control *sc)
 258 {
 259         return true;
 260 }
 261
 262 static bool writeback_throttling_sane(struct scan_control *sc)
 263 {
 264         return true;
 265 }
 266
 267 static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg)
 268 {
 269         return READ_ONCE(vm_swappiness);
 270 }
 271 #endif
 272
 273 static void set_task_reclaim_state(struct task_struct *task,
 274                                    struct reclaim_state *rs)
 275 {
 276         /* Check for an overwrite */
 277         WARN_ON_ONCE(rs && task->reclaim_state);
 278
 279         /* Check for the nulling of an already-nulled member */
 280         WARN_ON_ONCE(!rs && !task->reclaim_state);
 281
 282         task->reclaim_state = rs;
 283 }
 284
 285 /*
 286  * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
 287  * scan_control->nr_reclaimed.
 288  */
 289 static void flush_reclaim_state(struct scan_control *sc)
 290 {
 291         /*
 292          * Currently, reclaim_state->reclaimed includes three types of pages
 293          * freed outside of vmscan:
 294          * (1) Slab pages.
 295          * (2) Clean file pages from pruned inodes (on highmem systems).
 296          * (3) XFS freed buffer pages.
 297          *
 298          * For all of these cases, we cannot universally link the pages to a
 299          * single memcg. For example, a memcg-aware shrinker can free one object
 300          * charged to the target memcg, causing an entire page to be freed.
 301          * If we count the entire page as reclaimed from the memcg, we end up
 302          * overestimating the reclaimed amount (potentially under-reclaiming).
 303          *
 304          * Only count such pages for global reclaim to prevent under-reclaiming
 305          * from the target memcg; preventing unnecessary retries during memcg
 306          * charging and false positives from proactive reclaim.
 307          *
 308          * For uncommon cases where the freed pages were actually mostly
 309          * charged to the target memcg, we end up underestimating the reclaimed
 310          * amount. This should be fine. The freed pages will be uncharged
 311          * anyway, even if they are not counted here properly, and we will be
 312          * able to make forward progress in charging (which is usually in a
 313          * retry loop).
 314          *
 315          * We can go one step further, and report the uncharged objcg pages in
 316          * memcg reclaim, to make reporting more accurate and reduce
 317          * underestimation, but it's probably not worth the complexity for now.
 318          */
 319         if (current->reclaim_state && root_reclaim(sc)) {
 320                 sc->nr_reclaimed += current->reclaim_state->reclaimed;
 321                 current->reclaim_state->reclaimed = 0;
 322         }
 323 }
 324
 325 static bool can_demote(int nid, struct scan_control *sc)
 326 {
 327         if (!numa_demotion_enabled)
 328                 return false;
 329         if (sc && sc->no_demotion)
 330                 return false;
 331         if (next_demotion_node(nid) == NUMA_NO_NODE)
 332                 return false;
 333
 334         return true;
 335 }
 336
 337 static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
 338                                           int nid,
 339                                           struct scan_control *sc)
 340 {
 341         if (memcg == NULL) {
 342                 /*
 343                  * For non-memcg reclaim, is there
 344                  * space in any swap device?
 345                  */
 346                 if (get_nr_swap_pages() > 0)
 347                         return true;
 348         } else {
 349                 /* Is the memcg below its swap limit? */
 350                 if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
 351                         return true;
 352         }
 353
 354         /*
 355          * The page can not be swapped.
 356          *
 357          * Can it be reclaimed from this node via demotion?
 358          */
 359         return can_demote(nid, sc);
 360 }
 361
 362 /*
 363  * This misses isolated folios which are not accounted for to save counters.
 364  * As the data only determines if reclaim or compaction continues, it is
 365  * not expected that isolated folios will be a dominating factor.
 366  */
 367 unsigned long zone_reclaimable_pages(struct zone *zone)
 368 {
 369         unsigned long nr;
 370
 371         nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
 372                 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
 373         if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
 374                 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
 375                         zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 376
 377         return nr;
 378 }
 379
 380 /**
 381  * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 382  * @lruvec: lru vector
 383  * @lru: lru to use
 384  * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
 385  */
 386 static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
 387                                      int zone_idx)
 388 {
 389         unsigned long size = 0;
 390         int zid;
 391
 392         for (zid = 0; zid <= zone_idx; zid++) {
 393                 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
 394
 395                 if (!managed_zone(zone))
 396                         continue;
 397
 398                 if (!mem_cgroup_disabled())
 399                         size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
 400                 else
 401                         size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
 402         }
 403         return size;
 404 }
 405
 406 static unsigned long drop_slab_node(int nid)
 407 {
 408         unsigned long freed = 0;
 409         struct mem_cgroup *memcg = NULL;
 410
 411         memcg = mem_cgroup_iter(NULL, NULL, NULL);
 412         do {
 413                 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 414         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 415
 416         return freed;
 417 }
 418
 419 void drop_slab(void)
 420 {
 421         int nid;
 422         int shift = 0;
 423         unsigned long freed;
 424
 425         do {
 426                 freed = 0;
 427                 for_each_online_node(nid) {
 428                         if (fatal_signal_pending(current))
 429                                 return;
 430
 431                         freed += drop_slab_node(nid);
 432                 }
 433         } while ((freed >> shift++) > 1);
 434 }
 435
 436 static int reclaimer_offset(void)
 437 {
 438         BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
 439                         PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
 440         BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
 441                         PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
 442         BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
 443                         PGSCAN_DIRECT - PGSCAN_KSWAPD);
 444         BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
 445                         PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
 446
 447         if (current_is_kswapd())
 448                 return 0;
 449         if (current_is_khugepaged())
 450                 return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
 451         return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 452 }
 453
 454 static inline int is_page_cache_freeable(struct folio *folio)
 455 {
 456         /*
 457          * A freeable page cache folio is referenced only by the caller
 458          * that isolated the folio, the page cache and optional filesystem
 459          * private data at folio->private.
 460          */
 461         return folio_ref_count(folio) - folio_test_private(folio) ==
 462                 1 + folio_nr_pages(folio);
 463 }
 464
 465 /*
 466  * We detected a synchronous write error writing a folio out.  Probably
 467  * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 468  * fsync(), msync() or close().
 469  *
 470  * The tricky part is that after writepage we cannot touch the mapping: nothing
 471  * prevents it from being freed up.  But we have a ref on the folio and once
 472  * that folio is locked, the mapping is pinned.
 473  *
 474  * We're allowed to run sleeping folio_lock() here because we know the caller has
 475  * __GFP_FS.
 476  */
 477 static void handle_write_error(struct address_space *mapping,
 478                                 struct folio *folio, int error)
 479 {
 480         folio_lock(folio);
 481         if (folio_mapping(folio) == mapping)
 482                 mapping_set_error(mapping, error);
 483         folio_unlock(folio);
 484 }
 485
 486 static bool skip_throttle_noprogress(pg_data_t *pgdat)
 487 {
 488         int reclaimable = 0, write_pending = 0;
 489         int i;
 490
 491         /*
 492          * If kswapd is disabled, reschedule if necessary but do not
 493          * throttle as the system is likely near OOM.
 494          */
 495         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
 496                 return true;
 497
 498         /*
 499          * If there are a lot of dirty/writeback folios then do not
 500          * throttle as throttling will occur when the folios cycle
 501          * towards the end of the LRU if still under writeback.
 502          */
 503         for (i = 0; i < MAX_NR_ZONES; i++) {
 504                 struct zone *zone = pgdat->node_zones + i;
 505
 506                 if (!managed_zone(zone))
 507                         continue;
 508
 509                 reclaimable += zone_reclaimable_pages(zone);
 510                 write_pending += zone_page_state_snapshot(zone,
 511                                                   NR_ZONE_WRITE_PENDING);
 512         }
 513         if (2 * write_pending <= reclaimable)
 514                 return true;
 515
 516         return false;
 517 }
 518
 519 void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
 520 {
 521         wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
 522         long timeout, ret;
 523         DEFINE_WAIT(wait);
 524
 525         /*
 526          * Do not throttle user workers, kthreads other than kswapd or
 527          * workqueues. They may be required for reclaim to make
 528          * forward progress (e.g. journalling workqueues or kthreads).
 529          */
 530         if (!current_is_kswapd() &&
 531             current->flags & (PF_USER_WORKER|PF_KTHREAD)) {
 532                 cond_resched();
 533                 return;
 534         }
 535
 536         /*
 537          * These figures are pulled out of thin air.
 538          * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
 539          * parallel reclaimers which is a short-lived event so the timeout is
 540          * short. Failing to make progress or waiting on writeback are
 541          * potentially long-lived events so use a longer timeout. This is shaky
 542          * logic as a failure to make progress could be due to anything from
 543          * writeback to a slow device to excessive referenced folios at the tail
 544          * of the inactive LRU.
 545          */
 546         switch(reason) {
 547         case VMSCAN_THROTTLE_WRITEBACK:
 548                 timeout = HZ/10;
 549
 550                 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
 551                         WRITE_ONCE(pgdat->nr_reclaim_start,
 552                                 node_page_state(pgdat, NR_THROTTLED_WRITTEN));
 553                 }
 554
 555                 break;
 556         case VMSCAN_THROTTLE_CONGESTED:
 557                 fallthrough;
 558         case VMSCAN_THROTTLE_NOPROGRESS:
 559                 if (skip_throttle_noprogress(pgdat)) {
 560                         cond_resched();
 561                         return;
 562                 }
 563
 564                 timeout = 1;
 565
 566                 break;
 567         case VMSCAN_THROTTLE_ISOLATED:
 568                 timeout = HZ/50;
 569                 break;
 570         default:
 571                 WARN_ON_ONCE(1);
 572                 timeout = HZ;
 573                 break;
 574         }
 575
 576         prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 577         ret = schedule_timeout(timeout);
 578         finish_wait(wqh, &wait);
 579
 580         if (reason == VMSCAN_THROTTLE_WRITEBACK)
 581                 atomic_dec(&pgdat->nr_writeback_throttled);
 582
 583         trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
 584                                 jiffies_to_usecs(timeout - ret),
 585                                 reason);
 586 }
 587
 588 /*
 589  * Account for folios written if tasks are throttled waiting on dirty
 590  * folios to clean. If enough folios have been cleaned since throttling
 591  * started then wakeup the throttled tasks.
 592  */
 593 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
 594                                                         int nr_throttled)
 595 {
 596         unsigned long nr_written;
 597
 598         node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
 599
 600         /*
 601          * This is an inaccurate read as the per-cpu deltas may not
 602          * be synchronised. However, given that the system is
 603          * writeback throttled, it is not worth taking the penalty
 604          * of getting an accurate count. At worst, the throttle
 605          * timeout guarantees forward progress.
 606          */
 607         nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
 608                 READ_ONCE(pgdat->nr_reclaim_start);
 609
 610         if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
 611                 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
 612 }
 613
 614 /* possible outcome of pageout() */
 615 typedef enum {
 616         /* failed to write folio out, folio is locked */
 617         PAGE_KEEP,
 618         /* move folio to the active list, folio is locked */
 619         PAGE_ACTIVATE,
 620         /* folio has been sent to the disk successfully, folio is unlocked */
 621         PAGE_SUCCESS,
 622         /* folio is clean and locked */
 623         PAGE_CLEAN,
 624 } pageout_t;
 625
 626 /*
 627  * pageout is called by shrink_folio_list() for each dirty folio.
 628  * Calls ->writepage().
 629  */
 630 static pageout_t pageout(struct folio *folio, struct address_space *mapping,
 631                          struct swap_iocb **plug)
 632 {
 633         /*
 634          * If the folio is dirty, only perform writeback if that write
 635          * will be non-blocking.  To prevent this allocation from being
 636          * stalled by pagecache activity.  But note that there may be
 637          * stalls if we need to run get_block().  We could test
 638          * PagePrivate for that.
 639          *
 640          * If this process is currently in __generic_file_write_iter() against
 641          * this folio's queue, we can perform writeback even if that
 642          * will block.
 643          *
 644          * If the folio is swapcache, write it back even if that would
 645          * block, for some throttling. This happens by accident, because
 646          * swap_backing_dev_info is bust: it doesn't reflect the
 647          * congestion state of the swapdevs.  Easy to fix, if needed.
 648          */
 649         if (!is_page_cache_freeable(folio))
 650                 return PAGE_KEEP;
 651         if (!mapping) {
 652                 /*
 653                  * Some data journaling orphaned folios can have
 654                  * folio->mapping == NULL while being dirty with clean buffers.
 655                  */
 656                 if (folio_test_private(folio)) {
 657                         if (try_to_free_buffers(folio)) {
 658                                 folio_clear_dirty(folio);
 659                                 pr_info("%s: orphaned folio\n", __func__);
 660                                 return PAGE_CLEAN;
 661                         }
 662                 }
 663                 return PAGE_KEEP;
 664         }
 665         if (mapping->a_ops->writepage == NULL)
 666                 return PAGE_ACTIVATE;
 667
 668         if (folio_clear_dirty_for_io(folio)) {
 669                 int res;
 670                 struct writeback_control wbc = {
 671                         .sync_mode = WB_SYNC_NONE,
 672                         .nr_to_write = SWAP_CLUSTER_MAX,
 673                         .range_start = 0,
 674                         .range_end = LLONG_MAX,
 675                         .for_reclaim = 1,
 676                         .swap_plug = plug,
 677                 };
 678
 679                 folio_set_reclaim(folio);
 680                 res = mapping->a_ops->writepage(&folio->page, &wbc);
 681                 if (res < 0)
 682                         handle_write_error(mapping, folio, res);
 683                 if (res == AOP_WRITEPAGE_ACTIVATE) {
 684                         folio_clear_reclaim(folio);
 685                         return PAGE_ACTIVATE;
 686                 }
 687
 688                 if (!folio_test_writeback(folio)) {
 689                         /* synchronous write or broken a_ops? */
 690                         folio_clear_reclaim(folio);
 691                 }
 692                 trace_mm_vmscan_write_folio(folio);
 693                 node_stat_add_folio(folio, NR_VMSCAN_WRITE);
 694                 return PAGE_SUCCESS;
 695         }
 696
 697         return PAGE_CLEAN;
 698 }
 699
 700 /*
 701  * Same as remove_mapping, but if the folio is removed from the mapping, it
 702  * gets returned with a refcount of 0.
 703  */
 704 static int __remove_mapping(struct address_space *mapping, struct folio *folio,
 705                             bool reclaimed, struct mem_cgroup *target_memcg)
 706 {
 707         int refcount;
 708         void *shadow = NULL;
 709
 710         BUG_ON(!folio_test_locked(folio));
 711         BUG_ON(mapping != folio_mapping(folio));
 712
 713         if (!folio_test_swapcache(folio))
 714                 spin_lock(&mapping->host->i_lock);
 715         xa_lock_irq(&mapping->i_pages);
 716         /*
 717          * The non racy check for a busy folio.
 718          *
 719          * Must be careful with the order of the tests. When someone has
 720          * a ref to the folio, it may be possible that they dirty it then
 721          * drop the reference. So if the dirty flag is tested before the
 722          * refcount here, then the following race may occur:
 723          *
 724          * get_user_pages(&page);
 725          * [user mapping goes away]
 726          * write_to(page);
 727          *                              !folio_test_dirty(folio)    [good]
 728          * folio_set_dirty(folio);
 729          * folio_put(folio);
 730          *                              !refcount(folio)   [good, discard it]
 731          *
 732          * [oops, our write_to data is lost]
 733          *
 734          * Reversing the order of the tests ensures such a situation cannot
 735          * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
 736          * load is not satisfied before that of folio->_refcount.
 737          *
 738          * Note that if the dirty flag is always set via folio_mark_dirty,
 739          * and thus under the i_pages lock, then this ordering is not required.
 740          */
 741         refcount = 1 + folio_nr_pages(folio);
 742         if (!folio_ref_freeze(folio, refcount))
 743                 goto cannot_free;
 744         /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
 745         if (unlikely(folio_test_dirty(folio))) {
 746                 folio_ref_unfreeze(folio, refcount);
 747                 goto cannot_free;
 748         }
 749
 750         if (folio_test_swapcache(folio)) {
 751                 swp_entry_t swap = folio->swap;
 752
 753                 if (reclaimed && !mapping_exiting(mapping))
 754                         shadow = workingset_eviction(folio, target_memcg);
 755                 __delete_from_swap_cache(folio, swap, shadow);
 756                 mem_cgroup_swapout(folio, swap);
 757                 xa_unlock_irq(&mapping->i_pages);
 758                 put_swap_folio(folio, swap);
 759         } else {
 760                 void (*free_folio)(struct folio *);
 761
 762                 free_folio = mapping->a_ops->free_folio;
 763                 /*
 764                  * Remember a shadow entry for reclaimed file cache in
 765                  * order to detect refaults, thus thrashing, later on.
 766                  *
 767                  * But don't store shadows in an address space that is
 768                  * already exiting.  This is not just an optimization,
 769                  * inode reclaim needs to empty out the radix tree or
 770                  * the nodes are lost.  Don't plant shadows behind its
 771                  * back.
 772                  *
 773                  * We also don't store shadows for DAX mappings because the
 774                  * only page cache folios found in these are zero pages
 775                  * covering holes, and because we don't want to mix DAX
 776                  * exceptional entries and shadow exceptional entries in the
 777                  * same address_space.
 778                  */
 779                 if (reclaimed && folio_is_file_lru(folio) &&
 780                     !mapping_exiting(mapping) && !dax_mapping(mapping))
 781                         shadow = workingset_eviction(folio, target_memcg);
 782                 __filemap_remove_folio(folio, shadow);
 783                 xa_unlock_irq(&mapping->i_pages);
 784                 if (mapping_shrinkable(mapping))
 785                         inode_add_lru(mapping->host);
 786                 spin_unlock(&mapping->host->i_lock);
 787
 788                 if (free_folio)
 789                         free_folio(folio);
 790         }
 791
 792         return 1;
 793
 794 cannot_free:
 795         xa_unlock_irq(&mapping->i_pages);
 796         if (!folio_test_swapcache(folio))
 797                 spin_unlock(&mapping->host->i_lock);
 798         return 0;
 799 }
 800
 801 /**
 802  * remove_mapping() - Attempt to remove a folio from its mapping.
 803  * @mapping: The address space.
 804  * @folio: The folio to remove.
 805  *
 806  * If the folio is dirty, under writeback or if someone else has a ref
 807  * on it, removal will fail.
 808  * Return: The number of pages removed from the mapping.  0 if the folio
 809  * could not be removed.
 810  * Context: The caller should have a single refcount on the folio and
 811  * hold its lock.
 812  */
 813 long remove_mapping(struct address_space *mapping, struct folio *folio)
 814 {
 815         if (__remove_mapping(mapping, folio, false, NULL)) {
 816                 /*
 817                  * Unfreezing the refcount with 1 effectively
 818                  * drops the pagecache ref for us without requiring another
 819                  * atomic operation.
 820                  */
 821                 folio_ref_unfreeze(folio, 1);
 822                 return folio_nr_pages(folio);
 823         }
 824         return 0;
 825 }
 826
 827 /**
 828  * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 829  * @folio: Folio to be returned to an LRU list.
 830  *
 831  * Add previously isolated @folio to appropriate LRU list.
 832  * The folio may still be unevictable for other reasons.
 833  *
 834  * Context: lru_lock must not be held, interrupts must be enabled.
 835  */
 836 void folio_putback_lru(struct folio *folio)
 837 {
 838         folio_add_lru(folio);
 839         folio_put(folio);               /* drop ref from isolate */
 840 }
 841
 842 enum folio_references {
 843         FOLIOREF_RECLAIM,
 844         FOLIOREF_RECLAIM_CLEAN,
 845         FOLIOREF_KEEP,
 846         FOLIOREF_ACTIVATE,
 847 };
 848
 849 static enum folio_references folio_check_references(struct folio *folio,
 850                                                   struct scan_control *sc)
 851 {
 852         int referenced_ptes, referenced_folio;
 853         unsigned long vm_flags;
 854
 855         referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
 856                                            &vm_flags);
 857         referenced_folio = folio_test_clear_referenced(folio);
 858
 859         /*
 860          * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
 861          * Let the folio, now marked Mlocked, be moved to the unevictable list.
 862          */
 863         if (vm_flags & VM_LOCKED)
 864                 return FOLIOREF_ACTIVATE;
 865
 866         /* rmap lock contention: rotate */
 867         if (referenced_ptes == -1)
 868                 return FOLIOREF_KEEP;
 869
 870         if (referenced_ptes) {
 871                 /*
 872                  * All mapped folios start out with page table
 873                  * references from the instantiating fault, so we need
 874                  * to look twice if a mapped file/anon folio is used more
 875                  * than once.
 876                  *
 877                  * Mark it and spare it for another trip around the
 878                  * inactive list.  Another page table reference will
 879                  * lead to its activation.
 880                  *
 881                  * Note: the mark is set for activated folios as well
 882                  * so that recently deactivated but used folios are
 883                  * quickly recovered.
 884                  */
 885                 folio_set_referenced(folio);
 886
 887                 if (referenced_folio || referenced_ptes > 1)
 888                         return FOLIOREF_ACTIVATE;
 889
 890                 /*
 891                  * Activate file-backed executable folios after first usage.
 892                  */
 893                 if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
 894                         return FOLIOREF_ACTIVATE;
 895
 896                 return FOLIOREF_KEEP;
 897         }
 898
 899         /* Reclaim if clean, defer dirty folios to writeback */
 900         if (referenced_folio && folio_is_file_lru(folio))
 901                 return FOLIOREF_RECLAIM_CLEAN;
 902
 903         return FOLIOREF_RECLAIM;
 904 }
 905
 906 /* Check if a folio is dirty or under writeback */
 907 static void folio_check_dirty_writeback(struct folio *folio,
 908                                        bool *dirty, bool *writeback)
 909 {
 910         struct address_space *mapping;
 911
 912         /*
 913          * Anonymous folios are not handled by flushers and must be written
 914          * from reclaim context. Do not stall reclaim based on them.
 915          * MADV_FREE anonymous folios are put into inactive file list too.
 916          * They could be mistakenly treated as file lru. So further anon
 917          * test is needed.
 918          */
 919         if (!folio_is_file_lru(folio) ||
 920             (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
 921                 *dirty = false;
 922                 *writeback = false;
 923                 return;
 924         }
 925
 926         /* By default assume that the folio flags are accurate */
 927         *dirty = folio_test_dirty(folio);
 928         *writeback = folio_test_writeback(folio);
 929
 930         /* Verify dirty/writeback state if the filesystem supports it */
 931         if (!folio_test_private(folio))
 932                 return;
 933
 934         mapping = folio_mapping(folio);
 935         if (mapping && mapping->a_ops->is_dirty_writeback)
 936                 mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
 937 }
 938
 939 struct folio *alloc_migrate_folio(struct folio *src, unsigned long private)
 940 {
 941         struct folio *dst;
 942         nodemask_t *allowed_mask;
 943         struct migration_target_control *mtc;
 944
 945         mtc = (struct migration_target_control *)private;
 946
 947         allowed_mask = mtc->nmask;
 948         /*
 949          * make sure we allocate from the target node first also trying to
 950          * demote or reclaim pages from the target node via kswapd if we are
 951          * low on free memory on target node. If we don't do this and if
 952          * we have free memory on the slower(lower) memtier, we would start
 953          * allocating pages from slower(lower) memory tiers without even forcing
 954          * a demotion of cold pages from the target memtier. This can result
 955          * in the kernel placing hot pages in slower(lower) memory tiers.
 956          */
 957         mtc->nmask = NULL;
 958         mtc->gfp_mask |= __GFP_THISNODE;
 959         dst = alloc_migration_target(src, (unsigned long)mtc);
 960         if (dst)
 961                 return dst;
 962
 963         mtc->gfp_mask &= ~__GFP_THISNODE;
 964         mtc->nmask = allowed_mask;
 965
 966         return alloc_migration_target(src, (unsigned long)mtc);
 967 }
 968
 969 /*
 970  * Take folios on @demote_folios and attempt to demote them to another node.
 971  * Folios which are not demoted are left on @demote_folios.
 972  */
 973 static unsigned int demote_folio_list(struct list_head *demote_folios,
 974                                      struct pglist_data *pgdat)
 975 {
 976         int target_nid = next_demotion_node(pgdat->node_id);
 977         unsigned int nr_succeeded;
 978         nodemask_t allowed_mask;
 979
 980         struct migration_target_control mtc = {
 981                 /*
 982                  * Allocate from 'node', or fail quickly and quietly.
 983                  * When this happens, 'page' will likely just be discarded
 984                  * instead of migrated.
 985                  */
 986                 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
 987                         __GFP_NOMEMALLOC | GFP_NOWAIT,
 988                 .nid = target_nid,
 989                 .nmask = &allowed_mask,
 990                 .reason = MR_DEMOTION,
 991         };
 992
 993         if (list_empty(demote_folios))
 994                 return 0;
 995
 996         if (target_nid == NUMA_NO_NODE)
 997                 return 0;
 998
 999         node_get_allowed_targets(pgdat, &allowed_mask);
1000
1001         /* Demotion ignores all cpuset and mempolicy settings */
1002         migrate_pages(demote_folios, alloc_migrate_folio, NULL,
1003                       (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
1004                       &nr_succeeded);
1005
1006         mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
1007                             nr_succeeded);
1008
1009         return nr_succeeded;
1010 }
1011
1012 static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1013 {
1014         if (gfp_mask & __GFP_FS)
1015                 return true;
1016         if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
1017                 return false;
1018         /*
1019          * We can "enter_fs" for swap-cache with only __GFP_IO
1020          * providing this isn't SWP_FS_OPS.
1021          * ->flags can be updated non-atomicially (scan_swap_map_slots),
1022          * but that will never affect SWP_FS_OPS, so the data_race
1023          * is safe.
1024          */
1025         return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1026 }
1027
1028 /*
1029  * shrink_folio_list() returns the number of reclaimed pages
1030  */
1031 static unsigned int shrink_folio_list(struct list_head *folio_list,
1032                 struct pglist_data *pgdat, struct scan_control *sc,
1033                 struct reclaim_stat *stat, bool ignore_references)
1034 {
1035         struct folio_batch free_folios;
1036         LIST_HEAD(ret_folios);
1037         LIST_HEAD(demote_folios);
1038         unsigned int nr_reclaimed = 0;
1039         unsigned int pgactivate = 0;
1040         bool do_demote_pass;
1041         struct swap_iocb *plug = NULL;
1042
1043         folio_batch_init(&free_folios);
1044         memset(stat, 0, sizeof(*stat));
1045         cond_resched();
1046         do_demote_pass = can_demote(pgdat->node_id, sc);
1047
1048 retry:
1049         while (!list_empty(folio_list)) {
1050                 struct address_space *mapping;
1051                 struct folio *folio;
1052                 enum folio_references references = FOLIOREF_RECLAIM;
1053                 bool dirty, writeback;
1054                 unsigned int nr_pages;
1055
1056                 cond_resched();
1057
1058                 folio = lru_to_folio(folio_list);
1059                 list_del(&folio->lru);
1060
1061                 if (!folio_trylock(folio))
1062                         goto keep;
1063
1064                 VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1065
1066                 nr_pages = folio_nr_pages(folio);
1067
1068                 /* Account the number of base pages */
1069                 sc->nr_scanned += nr_pages;
1070
1071                 if (unlikely(!folio_evictable(folio)))
1072                         goto activate_locked;
1073
1074                 if (!sc->may_unmap && folio_mapped(folio))
1075                         goto keep_locked;
1076
1077                 /* folio_update_gen() tried to promote this page? */
1078                 if (lru_gen_enabled() && !ignore_references &&
1079                     folio_mapped(folio) && folio_test_referenced(folio))
1080                         goto keep_locked;
1081
1082                 /*
1083                  * The number of dirty pages determines if a node is marked
1084                  * reclaim_congested. kswapd will stall and start writing
1085                  * folios if the tail of the LRU is all dirty unqueued folios.
1086                  */
1087                 folio_check_dirty_writeback(folio, &dirty, &writeback);
1088                 if (dirty || writeback)
1089                         stat->nr_dirty += nr_pages;
1090
1091                 if (dirty && !writeback)
1092                         stat->nr_unqueued_dirty += nr_pages;
1093
1094                 /*
1095                  * Treat this folio as congested if folios are cycling
1096                  * through the LRU so quickly that the folios marked
1097                  * for immediate reclaim are making it to the end of
1098                  * the LRU a second time.
1099                  */
1100                 if (writeback && folio_test_reclaim(folio))
1101                         stat->nr_congested += nr_pages;
1102
1103                 /*
1104                  * If a folio at the tail of the LRU is under writeback, there
1105                  * are three cases to consider.
1106                  *
1107                  * 1) If reclaim is encountering an excessive number
1108                  *    of folios under writeback and this folio has both
1109                  *    the writeback and reclaim flags set, then it
1110                  *    indicates that folios are being queued for I/O but
1111                  *    are being recycled through the LRU before the I/O
1112                  *    can complete. Waiting on the folio itself risks an
1113                  *    indefinite stall if it is impossible to writeback
1114                  *    the folio due to I/O error or disconnected storage
1115                  *    so instead note that the LRU is being scanned too
1116                  *    quickly and the caller can stall after the folio
1117                  *    list has been processed.
1118                  *
1119                  * 2) Global or new memcg reclaim encounters a folio that is
1120                  *    not marked for immediate reclaim, or the caller does not
1121                  *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
1122                  *    not to fs). In this case mark the folio for immediate
1123                  *    reclaim and continue scanning.
1124                  *
1125                  *    Require may_enter_fs() because we would wait on fs, which
1126                  *    may not have submitted I/O yet. And the loop driver might
1127                  *    enter reclaim, and deadlock if it waits on a folio for
1128                  *    which it is needed to do the write (loop masks off
1129                  *    __GFP_IO|__GFP_FS for this reason); but more thought
1130                  *    would probably show more reasons.
1131                  *
1132                  * 3) Legacy memcg encounters a folio that already has the
1133                  *    reclaim flag set. memcg does not have any dirty folio
1134                  *    throttling so we could easily OOM just because too many
1135                  *    folios are in writeback and there is nothing else to
1136                  *    reclaim. Wait for the writeback to complete.
1137                  *
1138                  * In cases 1) and 2) we activate the folios to get them out of
1139                  * the way while we continue scanning for clean folios on the
1140                  * inactive list and refilling from the active list. The
1141                  * observation here is that waiting for disk writes is more
1142                  * expensive than potentially causing reloads down the line.
1143                  * Since they're marked for immediate reclaim, they won't put
1144                  * memory pressure on the cache working set any longer than it
1145                  * takes to write them to disk.
1146                  */
1147                 if (folio_test_writeback(folio)) {
1148                         /* Case 1 above */
1149                         if (current_is_kswapd() &&
1150                             folio_test_reclaim(folio) &&
1151                             test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1152                                 stat->nr_immediate += nr_pages;
1153                                 goto activate_locked;
1154
1155                         /* Case 2 above */
1156                         } else if (writeback_throttling_sane(sc) ||
1157                             !folio_test_reclaim(folio) ||
1158                             !may_enter_fs(folio, sc->gfp_mask)) {
1159                                 /*
1160                                  * This is slightly racy -
1161                                  * folio_end_writeback() might have
1162                                  * just cleared the reclaim flag, then
1163                                  * setting the reclaim flag here ends up
1164                                  * interpreted as the readahead flag - but
1165                                  * that does not matter enough to care.
1166                                  * What we do want is for this folio to
1167                                  * have the reclaim flag set next time
1168                                  * memcg reclaim reaches the tests above,
1169                                  * so it will then wait for writeback to
1170                                  * avoid OOM; and it's also appropriate
1171                                  * in global reclaim.
1172                                  */
1173                                 folio_set_reclaim(folio);
1174                                 stat->nr_writeback += nr_pages;
1175                                 goto activate_locked;
1176
1177                         /* Case 3 above */
1178                         } else {
1179                                 folio_unlock(folio);
1180                                 folio_wait_writeback(folio);
1181                                 /* then go back and try same folio again */
1182                                 list_add_tail(&folio->lru, folio_list);
1183                                 continue;
1184                         }
1185                 }
1186
1187                 if (!ignore_references)
1188                         references = folio_check_references(folio, sc);
1189
1190                 switch (references) {
1191                 case FOLIOREF_ACTIVATE:
1192                         goto activate_locked;
1193                 case FOLIOREF_KEEP:
1194                         stat->nr_ref_keep += nr_pages;
1195                         goto keep_locked;
1196                 case FOLIOREF_RECLAIM:
1197                 case FOLIOREF_RECLAIM_CLEAN:
1198                         ; /* try to reclaim the folio below */
1199                 }
1200
1201                 /*
1202                  * Before reclaiming the folio, try to relocate
1203                  * its contents to another node.
1204                  */
1205                 if (do_demote_pass &&
1206                     (thp_migration_supported() || !folio_test_large(folio))) {
1207                         list_add(&folio->lru, &demote_folios);
1208                         folio_unlock(folio);
1209                         continue;
1210                 }
1211
1212                 /*
1213                  * Anonymous process memory has backing store?
1214                  * Try to allocate it some swap space here.
1215                  * Lazyfree folio could be freed directly
1216                  */
1217                 if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1218                         if (!folio_test_swapcache(folio)) {
1219                                 if (!(sc->gfp_mask & __GFP_IO))
1220                                         goto keep_locked;
1221                                 if (folio_maybe_dma_pinned(folio))
1222                                         goto keep_locked;
1223                                 if (folio_test_large(folio)) {
1224                                         /* cannot split folio, skip it */
1225                                         if (!can_split_folio(folio, NULL))
1226                                                 goto activate_locked;
1227                                         /*
1228                                          * Split partially mapped folios right away.
1229                                          * We can free the unmapped pages without IO.
1230                                          */
1231                                         if (data_race(!list_empty(&folio->_deferred_list)) &&
1232                                             split_folio_to_list(folio, folio_list))
1233                                                 goto activate_locked;
1234                                 }
1235                                 if (!add_to_swap(folio)) {
1236                                         int __maybe_unused order = folio_order(folio);
1237
1238                                         if (!folio_test_large(folio))
1239                                                 goto activate_locked_split;
1240                                         /* Fallback to swap normal pages */
1241                                         if (split_folio_to_list(folio, folio_list))
1242                                                 goto activate_locked;
1243 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1244                                         if (nr_pages >= HPAGE_PMD_NR) {
1245                                                 count_memcg_folio_events(folio,
1246                                                         THP_SWPOUT_FALLBACK, 1);
1247                                                 count_vm_event(THP_SWPOUT_FALLBACK);
1248                                         }
1249                                         count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
1250 #endif
1251                                         if (!add_to_swap(folio))
1252                                                 goto activate_locked_split;
1253                                 }
1254                         }
1255                 } else if (folio_test_swapbacked(folio) &&
1256                            folio_test_large(folio)) {
1257                         /* Split shmem folio */
1258                         if (split_folio_to_list(folio, folio_list))
1259                                 goto keep_locked;
1260                 }
1261
1262                 /*
1263                  * If the folio was split above, the tail pages will make
1264                  * their own pass through this function and be accounted
1265                  * then.
1266                  */
1267                 if ((nr_pages > 1) && !folio_test_large(folio)) {
1268                         sc->nr_scanned -= (nr_pages - 1);
1269                         nr_pages = 1;
1270                 }
1271
1272                 /*
1273                  * The folio is mapped into the page tables of one or more
1274                  * processes. Try to unmap it here.
1275                  */
1276                 if (folio_mapped(folio)) {
1277                         enum ttu_flags flags = TTU_BATCH_FLUSH;
1278                         bool was_swapbacked = folio_test_swapbacked(folio);
1279
1280                         if (folio_test_pmd_mappable(folio))
1281                                 flags |= TTU_SPLIT_HUGE_PMD;
1282                         /*
1283                          * Without TTU_SYNC, try_to_unmap will only begin to
1284                          * hold PTL from the first present PTE within a large
1285                          * folio. Some initial PTEs might be skipped due to
1286                          * races with parallel PTE writes in which PTEs can be
1287                          * cleared temporarily before being written new present
1288                          * values. This will lead to a large folio is still
1289                          * mapped while some subpages have been partially
1290                          * unmapped after try_to_unmap; TTU_SYNC helps
1291                          * try_to_unmap acquire PTL from the first PTE,
1292                          * eliminating the influence of temporary PTE values.
1293                          */
1294                         if (folio_test_large(folio))
1295                                 flags |= TTU_SYNC;
1296
1297                         try_to_unmap(folio, flags);
1298                         if (folio_mapped(folio)) {
1299                                 stat->nr_unmap_fail += nr_pages;
1300                                 if (!was_swapbacked &&
1301                                     folio_test_swapbacked(folio))
1302                                         stat->nr_lazyfree_fail += nr_pages;
1303                                 goto activate_locked;
1304                         }
1305                 }
1306
1307                 /*
1308                  * Folio is unmapped now so it cannot be newly pinned anymore.
1309                  * No point in trying to reclaim folio if it is pinned.
1310                  * Furthermore we don't want to reclaim underlying fs metadata
1311                  * if the folio is pinned and thus potentially modified by the
1312                  * pinning process as that may upset the filesystem.
1313                  */
1314                 if (folio_maybe_dma_pinned(folio))
1315                         goto activate_locked;
1316
1317                 mapping = folio_mapping(folio);
1318                 if (folio_test_dirty(folio)) {
1319                         /*
1320                          * Only kswapd can writeback filesystem folios
1321                          * to avoid risk of stack overflow. But avoid
1322                          * injecting inefficient single-folio I/O into
1323                          * flusher writeback as much as possible: only
1324                          * write folios when we've encountered many
1325                          * dirty folios, and when we've already scanned
1326                          * the rest of the LRU for clean folios and see
1327                          * the same dirty folios again (with the reclaim
1328                          * flag set).
1329                          */
1330                         if (folio_is_file_lru(folio) &&
1331                             (!current_is_kswapd() ||
1332                              !folio_test_reclaim(folio) ||
1333                              !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1334                                 /*
1335                                  * Immediately reclaim when written back.
1336                                  * Similar in principle to folio_deactivate()
1337                                  * except we already have the folio isolated
1338                                  * and know it's dirty
1339                                  */
1340                                 node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
1341                                                 nr_pages);
1342                                 folio_set_reclaim(folio);
1343
1344                                 goto activate_locked;
1345                         }
1346
1347                         if (references == FOLIOREF_RECLAIM_CLEAN)
1348                                 goto keep_locked;
1349                         if (!may_enter_fs(folio, sc->gfp_mask))
1350                                 goto keep_locked;
1351                         if (!sc->may_writepage)
1352                                 goto keep_locked;
1353
1354                         /*
1355                          * Folio is dirty. Flush the TLB if a writable entry
1356                          * potentially exists to avoid CPU writes after I/O
1357                          * starts and then write it out here.
1358                          */
1359                         try_to_unmap_flush_dirty();
1360                         switch (pageout(folio, mapping, &plug)) {
1361                         case PAGE_KEEP:
1362                                 goto keep_locked;
1363                         case PAGE_ACTIVATE:
1364                                 goto activate_locked;
1365                         case PAGE_SUCCESS:
1366                                 stat->nr_pageout += nr_pages;
1367
1368                                 if (folio_test_writeback(folio))
1369                                         goto keep;
1370                                 if (folio_test_dirty(folio))
1371                                         goto keep;
1372
1373                                 /*
1374                                  * A synchronous write - probably a ramdisk.  Go
1375                                  * ahead and try to reclaim the folio.
1376                                  */
1377                                 if (!folio_trylock(folio))
1378                                         goto keep;
1379                                 if (folio_test_dirty(folio) ||
1380                                     folio_test_writeback(folio))
1381                                         goto keep_locked;
1382                                 mapping = folio_mapping(folio);
1383                                 fallthrough;
1384                         case PAGE_CLEAN:
1385                                 ; /* try to free the folio below */
1386                         }
1387                 }
1388
1389                 /*
1390                  * If the folio has buffers, try to free the buffer
1391                  * mappings associated with this folio. If we succeed
1392                  * we try to free the folio as well.
1393                  *
1394                  * We do this even if the folio is dirty.
1395                  * filemap_release_folio() does not perform I/O, but it
1396                  * is possible for a folio to have the dirty flag set,
1397                  * but it is actually clean (all its buffers are clean).
1398                  * This happens if the buffers were written out directly,
1399                  * with submit_bh(). ext3 will do this, as well as
1400                  * the blockdev mapping.  filemap_release_folio() will
1401                  * discover that cleanness and will drop the buffers
1402                  * and mark the folio clean - it can be freed.
1403                  *
1404                  * Rarely, folios can have buffers and no ->mapping.
1405                  * These are the folios which were not successfully
1406                  * invalidated in truncate_cleanup_folio().  We try to
1407                  * drop those buffers here and if that worked, and the
1408                  * folio is no longer mapped into process address space
1409                  * (refcount == 1) it can be freed.  Otherwise, leave
1410                  * the folio on the LRU so it is swappable.
1411                  */
1412                 if (folio_needs_release(folio)) {
1413                         if (!filemap_release_folio(folio, sc->gfp_mask))
1414                                 goto activate_locked;
1415                         if (!mapping && folio_ref_count(folio) == 1) {
1416                                 folio_unlock(folio);
1417                                 if (folio_put_testzero(folio))
1418                                         goto free_it;
1419                                 else {
1420                                         /*
1421                                          * rare race with speculative reference.
1422                                          * the speculative reference will free
1423                                          * this folio shortly, so we may
1424                                          * increment nr_reclaimed here (and
1425                                          * leave it off the LRU).
1426                                          */
1427                                         nr_reclaimed += nr_pages;
1428                                         continue;
1429                                 }
1430                         }
1431                 }
1432
1433                 if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1434                         /* follow __remove_mapping for reference */
1435                         if (!folio_ref_freeze(folio, 1))
1436                                 goto keep_locked;
1437                         /*
1438                          * The folio has only one reference left, which is
1439                          * from the isolation. After the caller puts the
1440                          * folio back on the lru and drops the reference, the
1441                          * folio will be freed anyway. It doesn't matter
1442                          * which lru it goes on. So we don't bother checking
1443                          * the dirty flag here.
1444                          */
1445                         count_vm_events(PGLAZYFREED, nr_pages);
1446                         count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
1447                 } else if (!mapping || !__remove_mapping(mapping, folio, true,
1448                                                          sc->target_mem_cgroup))
1449                         goto keep_locked;
1450
1451                 folio_unlock(folio);
1452 free_it:
1453                 /*
1454                  * Folio may get swapped out as a whole, need to account
1455                  * all pages in it.
1456                  */
1457                 nr_reclaimed += nr_pages;
1458
1459                 folio_undo_large_rmappable(folio);
1460                 if (folio_batch_add(&free_folios, folio) == 0) {
1461                         mem_cgroup_uncharge_folios(&free_folios);
1462                         try_to_unmap_flush();
1463                         free_unref_folios(&free_folios);
1464                 }
1465                 continue;
1466
1467 activate_locked_split:
1468                 /*
1469                  * The tail pages that are failed to add into swap cache
1470                  * reach here.  Fixup nr_scanned and nr_pages.
1471                  */
1472                 if (nr_pages > 1) {
1473                         sc->nr_scanned -= (nr_pages - 1);
1474                         nr_pages = 1;
1475                 }
1476 activate_locked:
1477                 /* Not a candidate for swapping, so reclaim swap space. */
1478                 if (folio_test_swapcache(folio) &&
1479                     (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
1480                         folio_free_swap(folio);
1481                 VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1482                 if (!folio_test_mlocked(folio)) {
1483                         int type = folio_is_file_lru(folio);
1484                         folio_set_active(folio);
1485                         stat->nr_activate[type] += nr_pages;
1486                         count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
1487                 }
1488 keep_locked:
1489                 folio_unlock(folio);
1490 keep:
1491                 list_add(&folio->lru, &ret_folios);
1492                 VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
1493                                 folio_test_unevictable(folio), folio);
1494         }
1495         /* 'folio_list' is always empty here */
1496
1497         /* Migrate folios selected for demotion */
1498         nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
1499         /* Folios that could not be demoted are still in @demote_folios */
1500         if (!list_empty(&demote_folios)) {
1501                 /* Folios which weren't demoted go back on @folio_list */
1502                 list_splice_init(&demote_folios, folio_list);
1503
1504                 /*
1505                  * goto retry to reclaim the undemoted folios in folio_list if
1506                  * desired.
1507                  *
1508                  * Reclaiming directly from top tier nodes is not often desired
1509                  * due to it breaking the LRU ordering: in general memory
1510                  * should be reclaimed from lower tier nodes and demoted from
1511                  * top tier nodes.
1512                  *
1513                  * However, disabling reclaim from top tier nodes entirely
1514                  * would cause ooms in edge scenarios where lower tier memory
1515                  * is unreclaimable for whatever reason, eg memory being
1516                  * mlocked or too hot to reclaim. We can disable reclaim
1517                  * from top tier nodes in proactive reclaim though as that is
1518                  * not real memory pressure.
1519                  */
1520                 if (!sc->proactive) {
1521                         do_demote_pass = false;
1522                         goto retry;
1523                 }
1524         }
1525
1526         pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1527
1528         mem_cgroup_uncharge_folios(&free_folios);
1529         try_to_unmap_flush();
1530         free_unref_folios(&free_folios);
1531
1532         list_splice(&ret_folios, folio_list);
1533         count_vm_events(PGACTIVATE, pgactivate);
1534
1535         if (plug)
1536                 swap_write_unplug(plug);
1537         return nr_reclaimed;
1538 }
1539
1540 unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1541                                            struct list_head *folio_list)
1542 {
1543         struct scan_control sc = {
1544                 .gfp_mask = GFP_KERNEL,
1545                 .may_unmap = 1,
1546         };
1547         struct reclaim_stat stat;
1548         unsigned int nr_reclaimed;
1549         struct folio *folio, *next;
1550         LIST_HEAD(clean_folios);
1551         unsigned int noreclaim_flag;
1552
1553         list_for_each_entry_safe(folio, next, folio_list, lru) {
1554                 if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
1555                     !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
1556                     !folio_test_unevictable(folio)) {
1557                         folio_clear_active(folio);
1558                         list_move(&folio->lru, &clean_folios);
1559                 }
1560         }
1561
1562         /*
1563          * We should be safe here since we are only dealing with file pages and
1564          * we are not kswapd and therefore cannot write dirty file pages. But
1565          * call memalloc_noreclaim_save() anyway, just in case these conditions
1566          * change in the future.
1567          */
1568         noreclaim_flag = memalloc_noreclaim_save();
1569         nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
1570                                         &stat, true);
1571         memalloc_noreclaim_restore(noreclaim_flag);
1572
1573         list_splice(&clean_folios, folio_list);
1574         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1575                             -(long)nr_reclaimed);
1576         /*
1577          * Since lazyfree pages are isolated from file LRU from the beginning,
1578          * they will rotate back to anonymous LRU in the end if it failed to
1579          * discard so isolated count will be mismatched.
1580          * Compensate the isolated count for both LRU lists.
1581          */
1582         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1583                             stat.nr_lazyfree_fail);
1584         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1585                             -(long)stat.nr_lazyfree_fail);
1586         return nr_reclaimed;
1587 }
1588
1589 /*
1590  * Update LRU sizes after isolating pages. The LRU size updates must
1591  * be complete before mem_cgroup_update_lru_size due to a sanity check.
1592  */
1593 static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1594                         enum lru_list lru, unsigned long *nr_zone_taken)
1595 {
1596         int zid;
1597
1598         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1599                 if (!nr_zone_taken[zid])
1600                         continue;
1601
1602                 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1603         }
1604
1605 }
1606
1607 /*
1608  * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
1609  *
1610  * lruvec->lru_lock is heavily contended.  Some of the functions that
1611  * shrink the lists perform better by taking out a batch of pages
1612  * and working on them outside the LRU lock.
1613  *
1614  * For pagecache intensive workloads, this function is the hottest
1615  * spot in the kernel (apart from copy_*_user functions).
1616  *
1617  * Lru_lock must be held before calling this function.
1618  *
1619  * @nr_to_scan: The number of eligible pages to look through on the list.
1620  * @lruvec:     The LRU vector to pull pages from.
1621  * @dst:        The temp list to put pages on to.
1622  * @nr_scanned: The number of pages that were scanned.
1623  * @sc:         The scan_control struct for this reclaim session
1624  * @lru:        LRU list id for isolating
1625  *
1626  * returns how many pages were moved onto *@dst.
1627  */
1628 static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
1629                 struct lruvec *lruvec, struct list_head *dst,
1630                 unsigned long *nr_scanned, struct scan_control *sc,
1631                 enum lru_list lru)
1632 {
1633         struct list_head *src = &lruvec->lists[lru];
1634         unsigned long nr_taken = 0;
1635         unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1636         unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1637         unsigned long skipped = 0;
1638         unsigned long scan, total_scan, nr_pages;
1639         LIST_HEAD(folios_skipped);
1640
1641         total_scan = 0;
1642         scan = 0;
1643         while (scan < nr_to_scan && !list_empty(src)) {
1644                 struct list_head *move_to = src;
1645                 struct folio *folio;
1646
1647                 folio = lru_to_folio(src);
1648                 prefetchw_prev_lru_folio(folio, src, flags);
1649
1650                 nr_pages = folio_nr_pages(folio);
1651                 total_scan += nr_pages;
1652
1653                 if (folio_zonenum(folio) > sc->reclaim_idx) {
1654                         nr_skipped[folio_zonenum(folio)] += nr_pages;
1655                         move_to = &folios_skipped;
1656                         goto move;
1657                 }
1658
1659                 /*
1660                  * Do not count skipped folios because that makes the function
1661                  * return with no isolated folios if the LRU mostly contains
1662                  * ineligible folios.  This causes the VM to not reclaim any
1663                  * folios, triggering a premature OOM.
1664                  * Account all pages in a folio.
1665                  */
1666                 scan += nr_pages;
1667
1668                 if (!folio_test_lru(folio))
1669                         goto move;
1670                 if (!sc->may_unmap && folio_mapped(folio))
1671                         goto move;
1672
1673                 /*
1674                  * Be careful not to clear the lru flag until after we're
1675                  * sure the folio is not being freed elsewhere -- the
1676                  * folio release code relies on it.
1677                  */
1678                 if (unlikely(!folio_try_get(folio)))
1679                         goto move;
1680
1681                 if (!folio_test_clear_lru(folio)) {
1682                         /* Another thread is already isolating this folio */
1683                         folio_put(folio);
1684                         goto move;
1685                 }
1686
1687                 nr_taken += nr_pages;
1688                 nr_zone_taken[folio_zonenum(folio)] += nr_pages;
1689                 move_to = dst;
1690 move:
1691                 list_move(&folio->lru, move_to);
1692         }
1693
1694         /*
1695          * Splice any skipped folios to the start of the LRU list. Note that
1696          * this disrupts the LRU order when reclaiming for lower zones but
1697          * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1698          * scanning would soon rescan the same folios to skip and waste lots
1699          * of cpu cycles.
1700          */
1701         if (!list_empty(&folios_skipped)) {
1702                 int zid;
1703
1704                 list_splice(&folios_skipped, src);
1705                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1706                         if (!nr_skipped[zid])
1707                                 continue;
1708
1709                         __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1710                         skipped += nr_skipped[zid];
1711                 }
1712         }
1713         *nr_scanned = total_scan;
1714         trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1715                                     total_scan, skipped, nr_taken, lru);
1716         update_lru_sizes(lruvec, lru, nr_zone_taken);
1717         return nr_taken;
1718 }
1719
1720 /**
1721  * folio_isolate_lru() - Try to isolate a folio from its LRU list.
1722  * @folio: Folio to isolate from its LRU list.
1723  *
1724  * Isolate a @folio from an LRU list and adjust the vmstat statistic
1725  * corresponding to whatever LRU list the folio was on.
1726  *
1727  * The folio will have its LRU flag cleared.  If it was found on the
1728  * active list, it will have the Active flag set.  If it was found on the
1729  * unevictable list, it will have the Unevictable flag set.  These flags
1730  * may need to be cleared by the caller before letting the page go.
1731  *
1732  * Context:
1733  *
1734  * (1) Must be called with an elevated refcount on the folio. This is a
1735  *     fundamental difference from isolate_lru_folios() (which is called
1736  *     without a stable reference).
1737  * (2) The lru_lock must not be held.
1738  * (3) Interrupts must be enabled.
1739  *
1740  * Return: true if the folio was removed from an LRU list.
1741  * false if the folio was not on an LRU list.
1742  */
1743 bool folio_isolate_lru(struct folio *folio)
1744 {
1745         bool ret = false;
1746
1747         VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
1748
1749         if (folio_test_clear_lru(folio)) {
1750                 struct lruvec *lruvec;
1751
1752                 folio_get(folio);
1753                 lruvec = folio_lruvec_lock_irq(folio);
1754                 lruvec_del_folio(lruvec, folio);
1755                 unlock_page_lruvec_irq(lruvec);
1756                 ret = true;
1757         }
1758
1759         return ret;
1760 }
1761
1762 /*
1763  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1764  * then get rescheduled. When there are massive number of tasks doing page
1765  * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1766  * the LRU list will go small and be scanned faster than necessary, leading to
1767  * unnecessary swapping, thrashing and OOM.
1768  */
1769 static bool too_many_isolated(struct pglist_data *pgdat, int file,
1770                 struct scan_control *sc)
1771 {
1772         unsigned long inactive, isolated;
1773         bool too_many;
1774
1775         if (current_is_kswapd())
1776                 return false;
1777
1778         if (!writeback_throttling_sane(sc))
1779                 return false;
1780
1781         if (file) {
1782                 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1783                 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1784         } else {
1785                 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1786                 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1787         }
1788
1789         /*
1790          * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1791          * won't get blocked by normal direct-reclaimers, forming a circular
1792          * deadlock.
1793          */
1794         if (gfp_has_io_fs(sc->gfp_mask))
1795                 inactive >>= 3;
1796
1797         too_many = isolated > inactive;
1798
1799         /* Wake up tasks throttled due to too_many_isolated. */
1800         if (!too_many)
1801                 wake_throttle_isolated(pgdat);
1802
1803         return too_many;
1804 }
1805
1806 /*
1807  * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
1808  *
1809  * Returns the number of pages moved to the given lruvec.
1810  */
1811 static unsigned int move_folios_to_lru(struct lruvec *lruvec,
1812                 struct list_head *list)
1813 {
1814         int nr_pages, nr_moved = 0;
1815         struct folio_batch free_folios;
1816
1817         folio_batch_init(&free_folios);
1818         while (!list_empty(list)) {
1819                 struct folio *folio = lru_to_folio(list);
1820
1821                 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
1822                 list_del(&folio->lru);
1823                 if (unlikely(!folio_evictable(folio))) {
1824                         spin_unlock_irq(&lruvec->lru_lock);
1825                         folio_putback_lru(folio);
1826                         spin_lock_irq(&lruvec->lru_lock);
1827                         continue;
1828                 }
1829
1830                 /*
1831                  * The folio_set_lru needs to be kept here for list integrity.
1832                  * Otherwise:
1833                  *   #0 move_folios_to_lru             #1 release_pages
1834                  *   if (!folio_put_testzero())
1835                  *                                    if (folio_put_testzero())
1836                  *                                      !lru //skip lru_lock
1837                  *     folio_set_lru()
1838                  *     list_add(&folio->lru,)
1839                  *                                        list_add(&folio->lru,)
1840                  */
1841                 folio_set_lru(folio);
1842
1843                 if (unlikely(folio_put_testzero(folio))) {
1844                         __folio_clear_lru_flags(folio);
1845
1846                         folio_undo_large_rmappable(folio);
1847                         if (folio_batch_add(&free_folios, folio) == 0) {
1848                                 spin_unlock_irq(&lruvec->lru_lock);
1849                                 mem_cgroup_uncharge_folios(&free_folios);
1850                                 free_unref_folios(&free_folios);
1851                                 spin_lock_irq(&lruvec->lru_lock);
1852                         }
1853
1854                         continue;
1855                 }
1856
1857                 /*
1858                  * All pages were isolated from the same lruvec (and isolation
1859                  * inhibits memcg migration).
1860                  */
1861                 VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
1862                 lruvec_add_folio(lruvec, folio);
1863                 nr_pages = folio_nr_pages(folio);
1864                 nr_moved += nr_pages;
1865                 if (folio_test_active(folio))
1866                         workingset_age_nonresident(lruvec, nr_pages);
1867         }
1868
1869         if (free_folios.nr) {
1870                 spin_unlock_irq(&lruvec->lru_lock);
1871                 mem_cgroup_uncharge_folios(&free_folios);
1872                 free_unref_folios(&free_folios);
1873                 spin_lock_irq(&lruvec->lru_lock);
1874         }
1875
1876         return nr_moved;
1877 }
1878
1879 /*
1880  * If a kernel thread (such as nfsd for loop-back mounts) services a backing
1881  * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
1882  * we should not throttle.  Otherwise it is safe to do so.
1883  */
1884 static int current_may_throttle(void)
1885 {
1886         return !(current->flags & PF_LOCAL_THROTTLE);
1887 }
1888
1889 /*
1890  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1891  * of reclaimed pages
1892  */
1893 static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
1894                 struct lruvec *lruvec, struct scan_control *sc,
1895                 enum lru_list lru)
1896 {
1897         LIST_HEAD(folio_list);
1898         unsigned long nr_scanned;
1899         unsigned int nr_reclaimed = 0;
1900         unsigned long nr_taken;
1901         struct reclaim_stat stat;
1902         bool file = is_file_lru(lru);
1903         enum vm_event_item item;
1904         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1905         bool stalled = false;
1906
1907         while (unlikely(too_many_isolated(pgdat, file, sc))) {
1908                 if (stalled)
1909                         return 0;
1910
1911                 /* wait a bit for the reclaimer. */
1912                 stalled = true;
1913                 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
1914
1915                 /* We are about to die and free our memory. Return now. */
1916                 if (fatal_signal_pending(current))
1917                         return SWAP_CLUSTER_MAX;
1918         }
1919
1920         lru_add_drain();
1921
1922         spin_lock_irq(&lruvec->lru_lock);
1923
1924         nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
1925                                      &nr_scanned, sc, lru);
1926
1927         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1928         item = PGSCAN_KSWAPD + reclaimer_offset();
1929         if (!cgroup_reclaim(sc))
1930                 __count_vm_events(item, nr_scanned);
1931         __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1932         __count_vm_events(PGSCAN_ANON + file, nr_scanned);
1933
1934         spin_unlock_irq(&lruvec->lru_lock);
1935
1936         if (nr_taken == 0)
1937                 return 0;
1938
1939         nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
1940
1941         spin_lock_irq(&lruvec->lru_lock);
1942         move_folios_to_lru(lruvec, &folio_list);
1943
1944         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1945         item = PGSTEAL_KSWAPD + reclaimer_offset();
1946         if (!cgroup_reclaim(sc))
1947                 __count_vm_events(item, nr_reclaimed);
1948         __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
1949         __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
1950         spin_unlock_irq(&lruvec->lru_lock);
1951
1952         lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
1953
1954         /*
1955          * If dirty folios are scanned that are not queued for IO, it
1956          * implies that flushers are not doing their job. This can
1957          * happen when memory pressure pushes dirty folios to the end of
1958          * the LRU before the dirty limits are breached and the dirty
1959          * data has expired. It can also happen when the proportion of
1960          * dirty folios grows not through writes but through memory
1961          * pressure reclaiming all the clean cache. And in some cases,
1962          * the flushers simply cannot keep up with the allocation
1963          * rate. Nudge the flusher threads in case they are asleep.
1964          */
1965         if (stat.nr_unqueued_dirty == nr_taken) {
1966                 wakeup_flusher_threads(WB_REASON_VMSCAN);
1967                 /*
1968                  * For cgroupv1 dirty throttling is achieved by waking up
1969                  * the kernel flusher here and later waiting on folios
1970                  * which are in writeback to finish (see shrink_folio_list()).
1971                  *
1972                  * Flusher may not be able to issue writeback quickly
1973                  * enough for cgroupv1 writeback throttling to work
1974                  * on a large system.
1975                  */
1976                 if (!writeback_throttling_sane(sc))
1977                         reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
1978         }
1979
1980         sc->nr.dirty += stat.nr_dirty;
1981         sc->nr.congested += stat.nr_congested;
1982         sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
1983         sc->nr.writeback += stat.nr_writeback;
1984         sc->nr.immediate += stat.nr_immediate;
1985         sc->nr.taken += nr_taken;
1986         if (file)
1987                 sc->nr.file_taken += nr_taken;
1988
1989         trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
1990                         nr_scanned, nr_reclaimed, &stat, sc->priority, file);
1991         return nr_reclaimed;
1992 }
1993
1994 /*
1995  * shrink_active_list() moves folios from the active LRU to the inactive LRU.
1996  *
1997  * We move them the other way if the folio is referenced by one or more
1998  * processes.
1999  *
2000  * If the folios are mostly unmapped, the processing is fast and it is
2001  * appropriate to hold lru_lock across the whole operation.  But if
2002  * the folios are mapped, the processing is slow (folio_referenced()), so
2003  * we should drop lru_lock around each folio.  It's impossible to balance
2004  * this, so instead we remove the folios from the LRU while processing them.
2005  * It is safe to rely on the active flag against the non-LRU folios in here
2006  * because nobody will play with that bit on a non-LRU folio.
2007  *
2008  * The downside is that we have to touch folio->_refcount against each folio.
2009  * But we had to alter folio->flags anyway.
2010  */
2011 static void shrink_active_list(unsigned long nr_to_scan,
2012                                struct lruvec *lruvec,
2013                                struct scan_control *sc,
2014                                enum lru_list lru)
2015 {
2016         unsigned long nr_taken;
2017         unsigned long nr_scanned;
2018         unsigned long vm_flags;
2019         LIST_HEAD(l_hold);      /* The folios which were snipped off */
2020         LIST_HEAD(l_active);
2021         LIST_HEAD(l_inactive);
2022         unsigned nr_deactivate, nr_activate;
2023         unsigned nr_rotated = 0;
2024         bool file = is_file_lru(lru);
2025         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2026
2027         lru_add_drain();
2028
2029         spin_lock_irq(&lruvec->lru_lock);
2030
2031         nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
2032                                      &nr_scanned, sc, lru);
2033
2034         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2035
2036         if (!cgroup_reclaim(sc))
2037                 __count_vm_events(PGREFILL, nr_scanned);
2038         __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2039
2040         spin_unlock_irq(&lruvec->lru_lock);
2041
2042         while (!list_empty(&l_hold)) {
2043                 struct folio *folio;
2044
2045                 cond_resched();
2046                 folio = lru_to_folio(&l_hold);
2047                 list_del(&folio->lru);
2048
2049                 if (unlikely(!folio_evictable(folio))) {
2050                         folio_putback_lru(folio);
2051                         continue;
2052                 }
2053
2054                 if (unlikely(buffer_heads_over_limit)) {
2055                         if (folio_needs_release(folio) &&
2056                             folio_trylock(folio)) {
2057                                 filemap_release_folio(folio, 0);
2058                                 folio_unlock(folio);
2059                         }
2060                 }
2061
2062                 /* Referenced or rmap lock contention: rotate */
2063                 if (folio_referenced(folio, 0, sc->target_mem_cgroup,
2064                                      &vm_flags) != 0) {
2065                         /*
2066                          * Identify referenced, file-backed active folios and
2067                          * give them one more trip around the active list. So
2068                          * that executable code get better chances to stay in
2069                          * memory under moderate memory pressure.  Anon folios
2070                          * are not likely to be evicted by use-once streaming
2071                          * IO, plus JVM can create lots of anon VM_EXEC folios,
2072                          * so we ignore them here.
2073                          */
2074                         if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2075                                 nr_rotated += folio_nr_pages(folio);
2076                                 list_add(&folio->lru, &l_active);
2077                                 continue;
2078                         }
2079                 }
2080
2081                 folio_clear_active(folio);      /* we are de-activating */
2082                 folio_set_workingset(folio);
2083                 list_add(&folio->lru, &l_inactive);
2084         }
2085
2086         /*
2087          * Move folios back to the lru list.
2088          */
2089         spin_lock_irq(&lruvec->lru_lock);
2090
2091         nr_activate = move_folios_to_lru(lruvec, &l_active);
2092         nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
2093
2094         __count_vm_events(PGDEACTIVATE, nr_deactivate);
2095         __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2096
2097         __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2098         spin_unlock_irq(&lruvec->lru_lock);
2099
2100         if (nr_rotated)
2101                 lru_note_cost(lruvec, file, 0, nr_rotated);
2102         trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2103                         nr_deactivate, nr_rotated, sc->priority, file);
2104 }
2105
2106 static unsigned int reclaim_folio_list(struct list_head *folio_list,
2107                                       struct pglist_data *pgdat)
2108 {
2109         struct reclaim_stat dummy_stat;
2110         unsigned int nr_reclaimed;
2111         struct folio *folio;
2112         struct scan_control sc = {
2113                 .gfp_mask = GFP_KERNEL,
2114                 .may_writepage = 1,
2115                 .may_unmap = 1,
2116                 .may_swap = 1,
2117                 .no_demotion = 1,
2118         };
2119
2120         nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
2121         while (!list_empty(folio_list)) {
2122                 folio = lru_to_folio(folio_list);
2123                 list_del(&folio->lru);
2124                 folio_putback_lru(folio);
2125         }
2126
2127         return nr_reclaimed;
2128 }
2129
2130 unsigned long reclaim_pages(struct list_head *folio_list)
2131 {
2132         int nid;
2133         unsigned int nr_reclaimed = 0;
2134         LIST_HEAD(node_folio_list);
2135         unsigned int noreclaim_flag;
2136
2137         if (list_empty(folio_list))
2138                 return nr_reclaimed;
2139
2140         noreclaim_flag = memalloc_noreclaim_save();
2141
2142         nid = folio_nid(lru_to_folio(folio_list));
2143         do {
2144                 struct folio *folio = lru_to_folio(folio_list);
2145
2146                 if (nid == folio_nid(folio)) {
2147                         folio_clear_active(folio);
2148                         list_move(&folio->lru, &node_folio_list);
2149                         continue;
2150                 }
2151
2152                 nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
2153                 nid = folio_nid(lru_to_folio(folio_list));
2154         } while (!list_empty(folio_list));
2155
2156         nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
2157
2158         memalloc_noreclaim_restore(noreclaim_flag);
2159
2160         return nr_reclaimed;
2161 }
2162
2163 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2164                                  struct lruvec *lruvec, struct scan_control *sc)
2165 {
2166         if (is_active_lru(lru)) {
2167                 if (sc->may_deactivate & (1 << is_file_lru(lru)))
2168                         shrink_active_list(nr_to_scan, lruvec, sc, lru);
2169                 else
2170                         sc->skipped_deactivate = 1;
2171                 return 0;
2172         }
2173
2174         return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2175 }
2176
2177 /*
2178  * The inactive anon list should be small enough that the VM never has
2179  * to do too much work.
2180  *
2181  * The inactive file list should be small enough to leave most memory
2182  * to the established workingset on the scan-resistant active list,
2183  * but large enough to avoid thrashing the aggregate readahead window.
2184  *
2185  * Both inactive lists should also be large enough that each inactive
2186  * folio has a chance to be referenced again before it is reclaimed.
2187  *
2188  * If that fails and refaulting is observed, the inactive list grows.
2189  *
2190  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
2191  * on this LRU, maintained by the pageout code. An inactive_ratio
2192  * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
2193  *
2194  * total     target    max
2195  * memory    ratio     inactive
2196  * -------------------------------------
2197  *   10MB       1         5MB
2198  *  100MB       1        50MB
2199  *    1GB       3       250MB
2200  *   10GB      10       0.9GB
2201  *  100GB      31         3GB
2202  *    1TB     101        10GB
2203  *   10TB     320        32GB
2204  */
2205 static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2206 {
2207         enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2208         unsigned long inactive, active;
2209         unsigned long inactive_ratio;
2210         unsigned long gb;
2211
2212         inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2213         active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2214
2215         gb = (inactive + active) >> (30 - PAGE_SHIFT);
2216         if (gb)
2217                 inactive_ratio = int_sqrt(10 * gb);
2218         else
2219                 inactive_ratio = 1;
2220
2221         return inactive * inactive_ratio < active;
2222 }
2223
2224 enum scan_balance {
2225         SCAN_EQUAL,
2226         SCAN_FRACT,
2227         SCAN_ANON,
2228         SCAN_FILE,
2229 };
2230
2231 static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
2232 {
2233         unsigned long file;
2234         struct lruvec *target_lruvec;
2235
2236         if (lru_gen_enabled())
2237                 return;
2238
2239         target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2240
2241         /*
2242          * Flush the memory cgroup stats, so that we read accurate per-memcg
2243          * lruvec stats for heuristics.
2244          */
2245         mem_cgroup_flush_stats(sc->target_mem_cgroup);
2246
2247         /*
2248          * Determine the scan balance between anon and file LRUs.
2249          */
2250         spin_lock_irq(&target_lruvec->lru_lock);
2251         sc->anon_cost = target_lruvec->anon_cost;
2252         sc->file_cost = target_lruvec->file_cost;
2253         spin_unlock_irq(&target_lruvec->lru_lock);
2254
2255         /*
2256          * Target desirable inactive:active list ratios for the anon
2257          * and file LRU lists.
2258          */
2259         if (!sc->force_deactivate) {
2260                 unsigned long refaults;
2261
2262                 /*
2263                  * When refaults are being observed, it means a new
2264                  * workingset is being established. Deactivate to get
2265                  * rid of any stale active pages quickly.
2266                  */
2267                 refaults = lruvec_page_state(target_lruvec,
2268                                 WORKINGSET_ACTIVATE_ANON);
2269                 if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
2270                         inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2271                         sc->may_deactivate |= DEACTIVATE_ANON;
2272                 else
2273                         sc->may_deactivate &= ~DEACTIVATE_ANON;
2274
2275                 refaults = lruvec_page_state(target_lruvec,
2276                                 WORKINGSET_ACTIVATE_FILE);
2277                 if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
2278                     inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2279                         sc->may_deactivate |= DEACTIVATE_FILE;
2280                 else
2281                         sc->may_deactivate &= ~DEACTIVATE_FILE;
2282         } else
2283                 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
2284
2285         /*
2286          * If we have plenty of inactive file pages that aren't
2287          * thrashing, try to reclaim those first before touching
2288          * anonymous pages.
2289          */
2290         file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2291         if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
2292             !sc->no_cache_trim_mode)
2293                 sc->cache_trim_mode = 1;
2294         else
2295                 sc->cache_trim_mode = 0;
2296
2297         /*
2298          * Prevent the reclaimer from falling into the cache trap: as
2299          * cache pages start out inactive, every cache fault will tip
2300          * the scan balance towards the file LRU.  And as the file LRU
2301          * shrinks, so does the window for rotation from references.
2302          * This means we have a runaway feedback loop where a tiny
2303          * thrashing file LRU becomes infinitely more attractive than
2304          * anon pages.  Try to detect this based on file LRU size.
2305          */
2306         if (!cgroup_reclaim(sc)) {
2307                 unsigned long total_high_wmark = 0;
2308                 unsigned long free, anon;
2309                 int z;
2310
2311                 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2312                 file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2313                            node_page_state(pgdat, NR_INACTIVE_FILE);
2314
2315                 for (z = 0; z < MAX_NR_ZONES; z++) {
2316                         struct zone *zone = &pgdat->node_zones[z];
2317
2318                         if (!managed_zone(zone))
2319                                 continue;
2320
2321                         total_high_wmark += high_wmark_pages(zone);
2322                 }
2323
2324                 /*
2325                  * Consider anon: if that's low too, this isn't a
2326                  * runaway file reclaim problem, but rather just
2327                  * extreme pressure. Reclaim as per usual then.
2328                  */
2329                 anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2330
2331                 sc->file_is_tiny =
2332                         file + free <= total_high_wmark &&
2333                         !(sc->may_deactivate & DEACTIVATE_ANON) &&
2334                         anon >> sc->priority;
2335         }
2336 }
2337
2338 /*
2339  * Determine how aggressively the anon and file LRU lists should be
2340  * scanned.
2341  *
2342  * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
2343  * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
2344  */
2345 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2346                            unsigned long *nr)
2347 {
2348         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2349         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2350         unsigned long anon_cost, file_cost, total_cost;
2351         int swappiness = sc_swappiness(sc, memcg);
2352         u64 fraction[ANON_AND_FILE];
2353         u64 denominator = 0;    /* gcc */
2354         enum scan_balance scan_balance;
2355         unsigned long ap, fp;
2356         enum lru_list lru;
2357
2358         /* If we have no swap space, do not bother scanning anon folios. */
2359         if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
2360                 scan_balance = SCAN_FILE;
2361                 goto out;
2362         }
2363
2364         /*
2365          * Global reclaim will swap to prevent OOM even with no
2366          * swappiness, but memcg users want to use this knob to
2367          * disable swapping for individual groups completely when
2368          * using the memory controller's swap limit feature would be
2369          * too expensive.
2370          */
2371         if (cgroup_reclaim(sc) && !swappiness) {
2372                 scan_balance = SCAN_FILE;
2373                 goto out;
2374         }
2375
2376         /*
2377          * Do not apply any pressure balancing cleverness when the
2378          * system is close to OOM, scan both anon and file equally
2379          * (unless the swappiness setting disagrees with swapping).
2380          */
2381         if (!sc->priority && swappiness) {
2382                 scan_balance = SCAN_EQUAL;
2383                 goto out;
2384         }
2385
2386         /*
2387          * If the system is almost out of file pages, force-scan anon.
2388          */
2389         if (sc->file_is_tiny) {
2390                 scan_balance = SCAN_ANON;
2391                 goto out;
2392         }
2393
2394         /*
2395          * If there is enough inactive page cache, we do not reclaim
2396          * anything from the anonymous working right now.
2397          */
2398         if (sc->cache_trim_mode) {
2399                 scan_balance = SCAN_FILE;
2400                 goto out;
2401         }
2402
2403         scan_balance = SCAN_FRACT;
2404         /*
2405          * Calculate the pressure balance between anon and file pages.
2406          *
2407          * The amount of pressure we put on each LRU is inversely
2408          * proportional to the cost of reclaiming each list, as
2409          * determined by the share of pages that are refaulting, times
2410          * the relative IO cost of bringing back a swapped out
2411          * anonymous page vs reloading a filesystem page (swappiness).
2412          *
2413          * Although we limit that influence to ensure no list gets
2414          * left behind completely: at least a third of the pressure is
2415          * applied, before swappiness.
2416          *
2417          * With swappiness at 100, anon and file have equal IO cost.
2418          */
2419         total_cost = sc->anon_cost + sc->file_cost;
2420         anon_cost = total_cost + sc->anon_cost;
2421         file_cost = total_cost + sc->file_cost;
2422         total_cost = anon_cost + file_cost;
2423
2424         ap = swappiness * (total_cost + 1);
2425         ap /= anon_cost + 1;
2426
2427         fp = (MAX_SWAPPINESS - swappiness) * (total_cost + 1);
2428         fp /= file_cost + 1;
2429
2430         fraction[0] = ap;
2431         fraction[1] = fp;
2432         denominator = ap + fp;
2433 out:
2434         for_each_evictable_lru(lru) {
2435                 bool file = is_file_lru(lru);
2436                 unsigned long lruvec_size;
2437                 unsigned long low, min;
2438                 unsigned long scan;
2439
2440                 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2441                 mem_cgroup_protection(sc->target_mem_cgroup, memcg,
2442                                       &min, &low);
2443
2444                 if (min || low) {
2445                         /*
2446                          * Scale a cgroup's reclaim pressure by proportioning
2447                          * its current usage to its memory.low or memory.min
2448                          * setting.
2449                          *
2450                          * This is important, as otherwise scanning aggression
2451                          * becomes extremely binary -- from nothing as we
2452                          * approach the memory protection threshold, to totally
2453                          * nominal as we exceed it.  This results in requiring
2454                          * setting extremely liberal protection thresholds. It
2455                          * also means we simply get no protection at all if we
2456                          * set it too low, which is not ideal.
2457                          *
2458                          * If there is any protection in place, we reduce scan
2459                          * pressure by how much of the total memory used is
2460                          * within protection thresholds.
2461                          *
2462                          * There is one special case: in the first reclaim pass,
2463                          * we skip over all groups that are within their low
2464                          * protection. If that fails to reclaim enough pages to
2465                          * satisfy the reclaim goal, we come back and override
2466                          * the best-effort low protection. However, we still
2467                          * ideally want to honor how well-behaved groups are in
2468                          * that case instead of simply punishing them all
2469                          * equally. As such, we reclaim them based on how much
2470                          * memory they are using, reducing the scan pressure
2471                          * again by how much of the total memory used is under
2472                          * hard protection.
2473                          */
2474                         unsigned long cgroup_size = mem_cgroup_size(memcg);
2475                         unsigned long protection;
2476
2477                         /* memory.low scaling, make sure we retry before OOM */
2478                         if (!sc->memcg_low_reclaim && low > min) {
2479                                 protection = low;
2480                                 sc->memcg_low_skipped = 1;
2481                         } else {
2482                                 protection = min;
2483                         }
2484
2485                         /* Avoid TOCTOU with earlier protection check */
2486                         cgroup_size = max(cgroup_size, protection);
2487
2488                         scan = lruvec_size - lruvec_size * protection /
2489                                 (cgroup_size + 1);
2490
2491                         /*
2492                          * Minimally target SWAP_CLUSTER_MAX pages to keep
2493                          * reclaim moving forwards, avoiding decrementing
2494                          * sc->priority further than desirable.
2495                          */
2496                         scan = max(scan, SWAP_CLUSTER_MAX);
2497                 } else {
2498                         scan = lruvec_size;
2499                 }
2500
2501                 scan >>= sc->priority;
2502
2503                 /*
2504                  * If the cgroup's already been deleted, make sure to
2505                  * scrape out the remaining cache.
2506                  */
2507                 if (!scan && !mem_cgroup_online(memcg))
2508                         scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2509
2510                 switch (scan_balance) {
2511                 case SCAN_EQUAL:
2512                         /* Scan lists relative to size */
2513                         break;
2514                 case SCAN_FRACT:
2515                         /*
2516                          * Scan types proportional to swappiness and
2517                          * their relative recent reclaim efficiency.
2518                          * Make sure we don't miss the last page on
2519                          * the offlined memory cgroups because of a
2520                          * round-off error.
2521                          */
2522                         scan = mem_cgroup_online(memcg) ?
2523                                div64_u64(scan * fraction[file], denominator) :
2524                                DIV64_U64_ROUND_UP(scan * fraction[file],
2525                                                   denominator);
2526                         break;
2527                 case SCAN_FILE:
2528                 case SCAN_ANON:
2529                         /* Scan one type exclusively */
2530                         if ((scan_balance == SCAN_FILE) != file)
2531                                 scan = 0;
2532                         break;
2533                 default:
2534                         /* Look ma, no brain */
2535                         BUG();
2536                 }
2537
2538                 nr[lru] = scan;
2539         }
2540 }
2541
2542 /*
2543  * Anonymous LRU management is a waste if there is
2544  * ultimately no way to reclaim the memory.
2545  */
2546 static bool can_age_anon_pages(struct pglist_data *pgdat,
2547                                struct scan_control *sc)
2548 {
2549         /* Aging the anon LRU is valuable if swap is present: */
2550         if (total_swap_pages > 0)
2551                 return true;
2552
2553         /* Also valuable if anon pages can be demoted: */
2554         return can_demote(pgdat->node_id, sc);
2555 }
2556
2557 #ifdef CONFIG_LRU_GEN
2558
2559 #ifdef CONFIG_LRU_GEN_ENABLED
2560 DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
2561 #define get_cap(cap)    static_branch_likely(&lru_gen_caps[cap])
2562 #else
2563 DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
2564 #define get_cap(cap)    static_branch_unlikely(&lru_gen_caps[cap])
2565 #endif
2566
2567 static bool should_walk_mmu(void)
2568 {
2569         return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
2570 }
2571
2572 static bool should_clear_pmd_young(void)
2573 {
2574         return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
2575 }
2576
2577 /******************************************************************************
2578  *                          shorthand helpers
2579  ******************************************************************************/
2580
2581 #define LRU_REFS_FLAGS  (BIT(PG_referenced) | BIT(PG_workingset))
2582
2583 #define DEFINE_MAX_SEQ(lruvec)                                          \
2584         unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
2585
2586 #define DEFINE_MIN_SEQ(lruvec)                                          \
2587         unsigned long min_seq[ANON_AND_FILE] = {                        \
2588                 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),      \
2589                 READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),      \
2590         }
2591
2592 #define for_each_gen_type_zone(gen, type, zone)                         \
2593         for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)                   \
2594                 for ((type) = 0; (type) < ANON_AND_FILE; (type)++)      \
2595                         for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
2596
2597 #define get_memcg_gen(seq)      ((seq) % MEMCG_NR_GENS)
2598 #define get_memcg_bin(bin)      ((bin) % MEMCG_NR_BINS)
2599
2600 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
2601 {
2602         struct pglist_data *pgdat = NODE_DATA(nid);
2603
2604 #ifdef CONFIG_MEMCG
2605         if (memcg) {
2606                 struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
2607
2608                 /* see the comment in mem_cgroup_lruvec() */
2609                 if (!lruvec->pgdat)
2610                         lruvec->pgdat = pgdat;
2611
2612                 return lruvec;
2613         }
2614 #endif
2615         VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2616
2617         return &pgdat->__lruvec;
2618 }
2619
2620 static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
2621 {
2622         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2623         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2624
2625         if (!sc->may_swap)
2626                 return 0;
2627
2628         if (!can_demote(pgdat->node_id, sc) &&
2629             mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
2630                 return 0;
2631
2632         return sc_swappiness(sc, memcg);
2633 }
2634
2635 static int get_nr_gens(struct lruvec *lruvec, int type)
2636 {
2637         return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
2638 }
2639
2640 static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
2641 {
2642         /* see the comment on lru_gen_folio */
2643         return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
2644                get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
2645                get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
2646 }
2647
2648 /******************************************************************************
2649  *                          Bloom filters
2650  ******************************************************************************/
2651
2652 /*
2653  * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
2654  * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
2655  * bits in a bitmap, k is the number of hash functions and n is the number of
2656  * inserted items.
2657  *
2658  * Page table walkers use one of the two filters to reduce their search space.
2659  * To get rid of non-leaf entries that no longer have enough leaf entries, the
2660  * aging uses the double-buffering technique to flip to the other filter each
2661  * time it produces a new generation. For non-leaf entries that have enough
2662  * leaf entries, the aging carries them over to the next generation in
2663  * walk_pmd_range(); the eviction also report them when walking the rmap
2664  * in lru_gen_look_around().
2665  *
2666  * For future optimizations:
2667  * 1. It's not necessary to keep both filters all the time. The spare one can be
2668  *    freed after the RCU grace period and reallocated if needed again.
2669  * 2. And when reallocating, it's worth scaling its size according to the number
2670  *    of inserted entries in the other filter, to reduce the memory overhead on
2671  *    small systems and false positives on large systems.
2672  * 3. Jenkins' hash function is an alternative to Knuth's.
2673  */
2674 #define BLOOM_FILTER_SHIFT      15
2675
2676 static inline int filter_gen_from_seq(unsigned long seq)
2677 {
2678         return seq % NR_BLOOM_FILTERS;
2679 }
2680
2681 static void get_item_key(void *item, int *key)
2682 {
2683         u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
2684
2685         BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
2686
2687         key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
2688         key[1] = hash >> BLOOM_FILTER_SHIFT;
2689 }
2690
2691 static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
2692                               void *item)
2693 {
2694         int key[2];
2695         unsigned long *filter;
2696         int gen = filter_gen_from_seq(seq);
2697
2698         filter = READ_ONCE(mm_state->filters[gen]);
2699         if (!filter)
2700                 return true;
2701
2702         get_item_key(item, key);
2703
2704         return test_bit(key[0], filter) && test_bit(key[1], filter);
2705 }
2706
2707 static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq,
2708                                 void *item)
2709 {
2710         int key[2];
2711         unsigned long *filter;
2712         int gen = filter_gen_from_seq(seq);
2713
2714         filter = READ_ONCE(mm_state->filters[gen]);
2715         if (!filter)
2716                 return;
2717
2718         get_item_key(item, key);
2719
2720         if (!test_bit(key[0], filter))
2721                 set_bit(key[0], filter);
2722         if (!test_bit(key[1], filter))
2723                 set_bit(key[1], filter);
2724 }
2725
2726 static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq)
2727 {
2728         unsigned long *filter;
2729         int gen = filter_gen_from_seq(seq);
2730
2731         filter = mm_state->filters[gen];
2732         if (filter) {
2733                 bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
2734                 return;
2735         }
2736
2737         filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
2738                                __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
2739         WRITE_ONCE(mm_state->filters[gen], filter);
2740 }
2741
2742 /******************************************************************************
2743  *                          mm_struct list
2744  ******************************************************************************/
2745
2746 #ifdef CONFIG_LRU_GEN_WALKS_MMU
2747
2748 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
2749 {
2750         static struct lru_gen_mm_list mm_list = {
2751                 .fifo = LIST_HEAD_INIT(mm_list.fifo),
2752                 .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
2753         };
2754
2755 #ifdef CONFIG_MEMCG
2756         if (memcg)
2757                 return &memcg->mm_list;
2758 #endif
2759         VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2760
2761         return &mm_list;
2762 }
2763
2764 static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
2765 {
2766         return &lruvec->mm_state;
2767 }
2768
2769 static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
2770 {
2771         int key;
2772         struct mm_struct *mm;
2773         struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
2774         struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
2775
2776         mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
2777         key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
2778
2779         if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
2780                 return NULL;
2781
2782         clear_bit(key, &mm->lru_gen.bitmap);
2783
2784         return mmget_not_zero(mm) ? mm : NULL;
2785 }
2786
2787 void lru_gen_add_mm(struct mm_struct *mm)
2788 {
2789         int nid;
2790         struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
2791         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
2792
2793         VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
2794 #ifdef CONFIG_MEMCG
2795         VM_WARN_ON_ONCE(mm->lru_gen.memcg);
2796         mm->lru_gen.memcg = memcg;
2797 #endif
2798         spin_lock(&mm_list->lock);
2799
2800         for_each_node_state(nid, N_MEMORY) {
2801                 struct lruvec *lruvec = get_lruvec(memcg, nid);
2802                 struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2803
2804                 /* the first addition since the last iteration */
2805                 if (mm_state->tail == &mm_list->fifo)
2806                         mm_state->tail = &mm->lru_gen.list;
2807         }
2808
2809         list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
2810
2811         spin_unlock(&mm_list->lock);
2812 }
2813
2814 void lru_gen_del_mm(struct mm_struct *mm)
2815 {
2816         int nid;
2817         struct lru_gen_mm_list *mm_list;
2818         struct mem_cgroup *memcg = NULL;
2819
2820         if (list_empty(&mm->lru_gen.list))
2821                 return;
2822
2823 #ifdef CONFIG_MEMCG
2824         memcg = mm->lru_gen.memcg;
2825 #endif
2826         mm_list = get_mm_list(memcg);
2827
2828         spin_lock(&mm_list->lock);
2829
2830         for_each_node(nid) {
2831                 struct lruvec *lruvec = get_lruvec(memcg, nid);
2832                 struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2833
2834                 /* where the current iteration continues after */
2835                 if (mm_state->head == &mm->lru_gen.list)
2836                         mm_state->head = mm_state->head->prev;
2837
2838                 /* where the last iteration ended before */
2839                 if (mm_state->tail == &mm->lru_gen.list)
2840                         mm_state->tail = mm_state->tail->next;
2841         }
2842
2843         list_del_init(&mm->lru_gen.list);
2844
2845         spin_unlock(&mm_list->lock);
2846
2847 #ifdef CONFIG_MEMCG
2848         mem_cgroup_put(mm->lru_gen.memcg);
2849         mm->lru_gen.memcg = NULL;
2850 #endif
2851 }
2852
2853 #ifdef CONFIG_MEMCG
2854 void lru_gen_migrate_mm(struct mm_struct *mm)
2855 {
2856         struct mem_cgroup *memcg;
2857         struct task_struct *task = rcu_dereference_protected(mm->owner, true);
2858
2859         VM_WARN_ON_ONCE(task->mm != mm);
2860         lockdep_assert_held(&task->alloc_lock);
2861
2862         /* for mm_update_next_owner() */
2863         if (mem_cgroup_disabled())
2864                 return;
2865
2866         /* migration can happen before addition */
2867         if (!mm->lru_gen.memcg)
2868                 return;
2869
2870         rcu_read_lock();
2871         memcg = mem_cgroup_from_task(task);
2872         rcu_read_unlock();
2873         if (memcg == mm->lru_gen.memcg)
2874                 return;
2875
2876         VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
2877
2878         lru_gen_del_mm(mm);
2879         lru_gen_add_mm(mm);
2880 }
2881 #endif
2882
2883 #else /* !CONFIG_LRU_GEN_WALKS_MMU */
2884
2885 static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
2886 {
2887         return NULL;
2888 }
2889
2890 static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec)
2891 {
2892         return NULL;
2893 }
2894
2895 static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk)
2896 {
2897         return NULL;
2898 }
2899
2900 #endif
2901
2902 static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
2903 {
2904         int i;
2905         int hist;
2906         struct lruvec *lruvec = walk->lruvec;
2907         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2908
2909         lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
2910
2911         hist = lru_hist_from_seq(walk->seq);
2912
2913         for (i = 0; i < NR_MM_STATS; i++) {
2914                 WRITE_ONCE(mm_state->stats[hist][i],
2915                            mm_state->stats[hist][i] + walk->mm_stats[i]);
2916                 walk->mm_stats[i] = 0;
2917         }
2918
2919         if (NR_HIST_GENS > 1 && last) {
2920                 hist = lru_hist_from_seq(walk->seq + 1);
2921
2922                 for (i = 0; i < NR_MM_STATS; i++)
2923                         WRITE_ONCE(mm_state->stats[hist][i], 0);
2924         }
2925 }
2926
2927 static bool iterate_mm_list(struct lru_gen_mm_walk *walk, struct mm_struct **iter)
2928 {
2929         bool first = false;
2930         bool last = false;
2931         struct mm_struct *mm = NULL;
2932         struct lruvec *lruvec = walk->lruvec;
2933         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2934         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
2935         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2936
2937         /*
2938          * mm_state->seq is incremented after each iteration of mm_list. There
2939          * are three interesting cases for this page table walker:
2940          * 1. It tries to start a new iteration with a stale max_seq: there is
2941          *    nothing left to do.
2942          * 2. It started the next iteration: it needs to reset the Bloom filter
2943          *    so that a fresh set of PTE tables can be recorded.
2944          * 3. It ended the current iteration: it needs to reset the mm stats
2945          *    counters and tell its caller to increment max_seq.
2946          */
2947         spin_lock(&mm_list->lock);
2948
2949         VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->seq);
2950
2951         if (walk->seq <= mm_state->seq)
2952                 goto done;
2953
2954         if (!mm_state->head)
2955                 mm_state->head = &mm_list->fifo;
2956
2957         if (mm_state->head == &mm_list->fifo)
2958                 first = true;
2959
2960         do {
2961                 mm_state->head = mm_state->head->next;
2962                 if (mm_state->head == &mm_list->fifo) {
2963                         WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
2964                         last = true;
2965                         break;
2966                 }
2967
2968                 /* force scan for those added after the last iteration */
2969                 if (!mm_state->tail || mm_state->tail == mm_state->head) {
2970                         mm_state->tail = mm_state->head->next;
2971                         walk->force_scan = true;
2972                 }
2973         } while (!(mm = get_next_mm(walk)));
2974 done:
2975         if (*iter || last)
2976                 reset_mm_stats(walk, last);
2977
2978         spin_unlock(&mm_list->lock);
2979
2980         if (mm && first)
2981                 reset_bloom_filter(mm_state, walk->seq + 1);
2982
2983         if (*iter)
2984                 mmput_async(*iter);
2985
2986         *iter = mm;
2987
2988         return last;
2989 }
2990
2991 static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long seq)
2992 {
2993         bool success = false;
2994         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2995         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
2996         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2997
2998         spin_lock(&mm_list->lock);
2999
3000         VM_WARN_ON_ONCE(mm_state->seq + 1 < seq);
3001
3002         if (seq > mm_state->seq) {
3003                 mm_state->head = NULL;
3004                 mm_state->tail = NULL;
3005                 WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
3006                 success = true;
3007         }
3008
3009         spin_unlock(&mm_list->lock);
3010
3011         return success;
3012 }
3013
3014 /******************************************************************************
3015  *                          PID controller
3016  ******************************************************************************/
3017
3018 /*
3019  * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
3020  *
3021  * The P term is refaulted/(evicted+protected) from a tier in the generation
3022  * currently being evicted; the I term is the exponential moving average of the
3023  * P term over the generations previously evicted, using the smoothing factor
3024  * 1/2; the D term isn't supported.
3025  *
3026  * The setpoint (SP) is always the first tier of one type; the process variable
3027  * (PV) is either any tier of the other type or any other tier of the same
3028  * type.
3029  *
3030  * The error is the difference between the SP and the PV; the correction is to
3031  * turn off protection when SP>PV or turn on protection when SP<PV.
3032  *
3033  * For future optimizations:
3034  * 1. The D term may discount the other two terms over time so that long-lived
3035  *    generations can resist stale information.
3036  */
3037 struct ctrl_pos {
3038         unsigned long refaulted;
3039         unsigned long total;
3040         int gain;
3041 };
3042
3043 static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
3044                           struct ctrl_pos *pos)
3045 {
3046         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3047         int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3048
3049         pos->refaulted = lrugen->avg_refaulted[type][tier] +
3050                          atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3051         pos->total = lrugen->avg_total[type][tier] +
3052                      atomic_long_read(&lrugen->evicted[hist][type][tier]);
3053         if (tier)
3054                 pos->total += lrugen->protected[hist][type][tier - 1];
3055         pos->gain = gain;
3056 }
3057
3058 static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
3059 {
3060         int hist, tier;
3061         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3062         bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
3063         unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
3064
3065         lockdep_assert_held(&lruvec->lru_lock);
3066
3067         if (!carryover && !clear)
3068                 return;
3069
3070         hist = lru_hist_from_seq(seq);
3071
3072         for (tier = 0; tier < MAX_NR_TIERS; tier++) {
3073                 if (carryover) {
3074                         unsigned long sum;
3075
3076                         sum = lrugen->avg_refaulted[type][tier] +
3077                               atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3078                         WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
3079
3080                         sum = lrugen->avg_total[type][tier] +
3081                               atomic_long_read(&lrugen->evicted[hist][type][tier]);
3082                         if (tier)
3083                                 sum += lrugen->protected[hist][type][tier - 1];
3084                         WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
3085                 }
3086
3087                 if (clear) {
3088                         atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
3089                         atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
3090                         if (tier)
3091                                 WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
3092                 }
3093         }
3094 }
3095
3096 static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
3097 {
3098         /*
3099          * Return true if the PV has a limited number of refaults or a lower
3100          * refaulted/total than the SP.
3101          */
3102         return pv->refaulted < MIN_LRU_BATCH ||
3103                pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
3104                (sp->refaulted + 1) * pv->total * pv->gain;
3105 }
3106
3107 /******************************************************************************
3108  *                          the aging
3109  ******************************************************************************/
3110
3111 /* promote pages accessed through page tables */
3112 static int folio_update_gen(struct folio *folio, int gen)
3113 {
3114         unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3115
3116         VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
3117         VM_WARN_ON_ONCE(!rcu_read_lock_held());
3118
3119         do {
3120                 /* lru_gen_del_folio() has isolated this page? */
3121                 if (!(old_flags & LRU_GEN_MASK)) {
3122                         /* for shrink_folio_list() */
3123                         new_flags = old_flags | BIT(PG_referenced);
3124                         continue;
3125                 }
3126
3127                 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
3128                 new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
3129         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3130
3131         return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3132 }
3133
3134 /* protect pages accessed multiple times through file descriptors */
3135 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
3136 {
3137         int type = folio_is_file_lru(folio);
3138         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3139         int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3140         unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
3141
3142         VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
3143
3144         do {
3145                 new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
3146                 /* folio_update_gen() has promoted this page? */
3147                 if (new_gen >= 0 && new_gen != old_gen)
3148                         return new_gen;
3149
3150                 new_gen = (old_gen + 1) % MAX_NR_GENS;
3151
3152                 new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
3153                 new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
3154                 /* for folio_end_writeback() */
3155                 if (reclaiming)
3156                         new_flags |= BIT(PG_reclaim);
3157         } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
3158
3159         lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3160
3161         return new_gen;
3162 }
3163
3164 static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
3165                               int old_gen, int new_gen)
3166 {
3167         int type = folio_is_file_lru(folio);
3168         int zone = folio_zonenum(folio);
3169         int delta = folio_nr_pages(folio);
3170
3171         VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
3172         VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
3173
3174         walk->batched++;
3175
3176         walk->nr_pages[old_gen][type][zone] -= delta;
3177         walk->nr_pages[new_gen][type][zone] += delta;
3178 }
3179
3180 static void reset_batch_size(struct lru_gen_mm_walk *walk)
3181 {
3182         int gen, type, zone;
3183         struct lruvec *lruvec = walk->lruvec;
3184         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3185
3186         walk->batched = 0;
3187
3188         for_each_gen_type_zone(gen, type, zone) {
3189                 enum lru_list lru = type * LRU_INACTIVE_FILE;
3190                 int delta = walk->nr_pages[gen][type][zone];
3191
3192                 if (!delta)
3193                         continue;
3194
3195                 walk->nr_pages[gen][type][zone] = 0;
3196                 WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
3197                            lrugen->nr_pages[gen][type][zone] + delta);
3198
3199                 if (lru_gen_is_active(lruvec, gen))
3200                         lru += LRU_ACTIVE;
3201                 __update_lru_size(lruvec, lru, zone, delta);
3202         }
3203 }
3204
3205 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
3206 {
3207         struct address_space *mapping;
3208         struct vm_area_struct *vma = args->vma;
3209         struct lru_gen_mm_walk *walk = args->private;
3210
3211         if (!vma_is_accessible(vma))
3212                 return true;
3213
3214         if (is_vm_hugetlb_page(vma))
3215                 return true;
3216
3217         if (!vma_has_recency(vma))
3218                 return true;
3219
3220         if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
3221                 return true;
3222
3223         if (vma == get_gate_vma(vma->vm_mm))
3224                 return true;
3225
3226         if (vma_is_anonymous(vma))
3227                 return !walk->can_swap;
3228
3229         if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
3230                 return true;
3231
3232         mapping = vma->vm_file->f_mapping;
3233         if (mapping_unevictable(mapping))
3234                 return true;
3235
3236         if (shmem_mapping(mapping))
3237                 return !walk->can_swap;
3238
3239         /* to exclude special mappings like dax, etc. */
3240         return !mapping->a_ops->read_folio;
3241 }
3242
3243 /*
3244  * Some userspace memory allocators map many single-page VMAs. Instead of
3245  * returning back to the PGD table for each of such VMAs, finish an entire PMD
3246  * table to reduce zigzags and improve cache performance.
3247  */
3248 static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
3249                          unsigned long *vm_start, unsigned long *vm_end)
3250 {
3251         unsigned long start = round_up(*vm_end, size);
3252         unsigned long end = (start | ~mask) + 1;
3253         VMA_ITERATOR(vmi, args->mm, start);
3254
3255         VM_WARN_ON_ONCE(mask & size);
3256         VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
3257
3258         for_each_vma(vmi, args->vma) {
3259                 if (end && end <= args->vma->vm_start)
3260                         return false;
3261
3262                 if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
3263                         continue;
3264
3265                 *vm_start = max(start, args->vma->vm_start);
3266                 *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
3267
3268                 return true;
3269         }
3270
3271         return false;
3272 }
3273
3274 static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
3275 {
3276         unsigned long pfn = pte_pfn(pte);
3277
3278         VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3279
3280         if (!pte_present(pte) || is_zero_pfn(pfn))
3281                 return -1;
3282
3283         if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
3284                 return -1;
3285
3286         if (WARN_ON_ONCE(!pfn_valid(pfn)))
3287                 return -1;
3288
3289         return pfn;
3290 }
3291
3292 static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
3293 {
3294         unsigned long pfn = pmd_pfn(pmd);
3295
3296         VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
3297
3298         if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
3299                 return -1;
3300
3301         if (WARN_ON_ONCE(pmd_devmap(pmd)))
3302                 return -1;
3303
3304         if (WARN_ON_ONCE(!pfn_valid(pfn)))
3305                 return -1;
3306
3307         return pfn;
3308 }
3309
3310 static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
3311                                    struct pglist_data *pgdat, bool can_swap)
3312 {
3313         struct folio *folio;
3314
3315         /* try to avoid unnecessary memory loads */
3316         if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3317                 return NULL;
3318
3319         folio = pfn_folio(pfn);
3320         if (folio_nid(folio) != pgdat->node_id)
3321                 return NULL;
3322
3323         if (folio_memcg_rcu(folio) != memcg)
3324                 return NULL;
3325
3326         /* file VMAs can contain anon pages from COW */
3327         if (!folio_is_file_lru(folio) && !can_swap)
3328                 return NULL;
3329
3330         return folio;
3331 }
3332
3333 static bool suitable_to_scan(int total, int young)
3334 {
3335         int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
3336
3337         /* suitable if the average number of young PTEs per cacheline is >=1 */
3338         return young * n >= total;
3339 }
3340
3341 static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
3342                            struct mm_walk *args)
3343 {
3344         int i;
3345         pte_t *pte;
3346         spinlock_t *ptl;
3347         unsigned long addr;
3348         int total = 0;
3349         int young = 0;
3350         struct lru_gen_mm_walk *walk = args->private;
3351         struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3352         struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3353         DEFINE_MAX_SEQ(walk->lruvec);
3354         int old_gen, new_gen = lru_gen_from_seq(max_seq);
3355
3356         pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl);
3357         if (!pte)
3358                 return false;
3359         if (!spin_trylock(ptl)) {
3360                 pte_unmap(pte);
3361                 return false;
3362         }
3363
3364         arch_enter_lazy_mmu_mode();
3365 restart:
3366         for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
3367                 unsigned long pfn;
3368                 struct folio *folio;
3369                 pte_t ptent = ptep_get(pte + i);
3370
3371                 total++;
3372                 walk->mm_stats[MM_LEAF_TOTAL]++;
3373
3374                 pfn = get_pte_pfn(ptent, args->vma, addr);
3375                 if (pfn == -1)
3376                         continue;
3377
3378                 if (!pte_young(ptent)) {
3379                         walk->mm_stats[MM_LEAF_OLD]++;
3380                         continue;
3381                 }
3382
3383                 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
3384                 if (!folio)
3385                         continue;
3386
3387                 if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
3388                         VM_WARN_ON_ONCE(true);
3389
3390                 young++;
3391                 walk->mm_stats[MM_LEAF_YOUNG]++;
3392
3393                 if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
3394                     !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3395                       !folio_test_swapcache(folio)))
3396                         folio_mark_dirty(folio);
3397
3398                 old_gen = folio_update_gen(folio, new_gen);
3399                 if (old_gen >= 0 && old_gen != new_gen)
3400                         update_batch_size(walk, folio, old_gen, new_gen);
3401         }
3402
3403         if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
3404                 goto restart;
3405
3406         arch_leave_lazy_mmu_mode();
3407         pte_unmap_unlock(pte, ptl);
3408
3409         return suitable_to_scan(total, young);
3410 }
3411
3412 static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
3413                                   struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
3414 {
3415         int i;
3416         pmd_t *pmd;
3417         spinlock_t *ptl;
3418         struct lru_gen_mm_walk *walk = args->private;
3419         struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3420         struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3421         DEFINE_MAX_SEQ(walk->lruvec);
3422         int old_gen, new_gen = lru_gen_from_seq(max_seq);
3423
3424         VM_WARN_ON_ONCE(pud_leaf(*pud));
3425
3426         /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
3427         if (*first == -1) {
3428                 *first = addr;
3429                 bitmap_zero(bitmap, MIN_LRU_BATCH);
3430                 return;
3431         }
3432
3433         i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
3434         if (i && i <= MIN_LRU_BATCH) {
3435                 __set_bit(i - 1, bitmap);
3436                 return;
3437         }
3438
3439         pmd = pmd_offset(pud, *first);
3440
3441         ptl = pmd_lockptr(args->mm, pmd);
3442         if (!spin_trylock(ptl))
3443                 goto done;
3444
3445         arch_enter_lazy_mmu_mode();
3446
3447         do {
3448                 unsigned long pfn;
3449                 struct folio *folio;
3450
3451                 /* don't round down the first address */
3452                 addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
3453
3454                 pfn = get_pmd_pfn(pmd[i], vma, addr);
3455                 if (pfn == -1)
3456                         goto next;
3457
3458                 if (!pmd_trans_huge(pmd[i])) {
3459                         if (should_clear_pmd_young())
3460                                 pmdp_test_and_clear_young(vma, addr, pmd + i);
3461                         goto next;
3462                 }
3463
3464                 folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
3465                 if (!folio)
3466                         goto next;
3467
3468                 if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
3469                         goto next;
3470
3471                 walk->mm_stats[MM_LEAF_YOUNG]++;
3472
3473                 if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
3474                     !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3475                       !folio_test_swapcache(folio)))
3476                         folio_mark_dirty(folio);
3477
3478                 old_gen = folio_update_gen(folio, new_gen);
3479                 if (old_gen >= 0 && old_gen != new_gen)
3480                         update_batch_size(walk, folio, old_gen, new_gen);
3481 next:
3482                 i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
3483         } while (i <= MIN_LRU_BATCH);
3484
3485         arch_leave_lazy_mmu_mode();
3486         spin_unlock(ptl);
3487 done:
3488         *first = -1;
3489 }
3490
3491 static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
3492                            struct mm_walk *args)
3493 {
3494         int i;
3495         pmd_t *pmd;
3496         unsigned long next;
3497         unsigned long addr;
3498         struct vm_area_struct *vma;
3499         DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
3500         unsigned long first = -1;
3501         struct lru_gen_mm_walk *walk = args->private;
3502         struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
3503
3504         VM_WARN_ON_ONCE(pud_leaf(*pud));
3505
3506         /*
3507          * Finish an entire PMD in two passes: the first only reaches to PTE
3508          * tables to avoid taking the PMD lock; the second, if necessary, takes
3509          * the PMD lock to clear the accessed bit in PMD entries.
3510          */
3511         pmd = pmd_offset(pud, start & PUD_MASK);
3512 restart:
3513         /* walk_pte_range() may call get_next_vma() */
3514         vma = args->vma;
3515         for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
3516                 pmd_t val = pmdp_get_lockless(pmd + i);
3517
3518                 next = pmd_addr_end(addr, end);
3519
3520                 if (!pmd_present(val) || is_huge_zero_pmd(val)) {
3521                         walk->mm_stats[MM_LEAF_TOTAL]++;
3522                         continue;
3523                 }
3524
3525                 if (pmd_trans_huge(val)) {
3526                         unsigned long pfn = pmd_pfn(val);
3527                         struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3528
3529                         walk->mm_stats[MM_LEAF_TOTAL]++;
3530
3531                         if (!pmd_young(val)) {
3532                                 walk->mm_stats[MM_LEAF_OLD]++;
3533                                 continue;
3534                         }
3535
3536                         /* try to avoid unnecessary memory loads */
3537                         if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
3538                                 continue;
3539
3540                         walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3541                         continue;
3542                 }
3543
3544                 walk->mm_stats[MM_NONLEAF_TOTAL]++;
3545
3546                 if (should_clear_pmd_young()) {
3547                         if (!pmd_young(val))
3548                                 continue;
3549
3550                         walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3551                 }
3552
3553                 if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
3554                         continue;
3555
3556                 walk->mm_stats[MM_NONLEAF_FOUND]++;
3557
3558                 if (!walk_pte_range(&val, addr, next, args))
3559                         continue;
3560
3561                 walk->mm_stats[MM_NONLEAF_ADDED]++;
3562
3563                 /* carry over to the next generation */
3564                 update_bloom_filter(mm_state, walk->seq + 1, pmd + i);
3565         }
3566
3567         walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
3568
3569         if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
3570                 goto restart;
3571 }
3572
3573 static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
3574                           struct mm_walk *args)
3575 {
3576         int i;
3577         pud_t *pud;
3578         unsigned long addr;
3579         unsigned long next;
3580         struct lru_gen_mm_walk *walk = args->private;
3581
3582         VM_WARN_ON_ONCE(p4d_leaf(*p4d));
3583
3584         pud = pud_offset(p4d, start & P4D_MASK);
3585 restart:
3586         for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
3587                 pud_t val = READ_ONCE(pud[i]);
3588
3589                 next = pud_addr_end(addr, end);
3590
3591                 if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
3592                         continue;
3593
3594                 walk_pmd_range(&val, addr, next, args);
3595
3596                 if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
3597                         end = (addr | ~PUD_MASK) + 1;
3598                         goto done;
3599                 }
3600         }
3601
3602         if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
3603                 goto restart;
3604
3605         end = round_up(end, P4D_SIZE);
3606 done:
3607         if (!end || !args->vma)
3608                 return 1;
3609
3610         walk->next_addr = max(end, args->vma->vm_start);
3611
3612         return -EAGAIN;
3613 }
3614
3615 static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
3616 {
3617         static const struct mm_walk_ops mm_walk_ops = {
3618                 .test_walk = should_skip_vma,
3619                 .p4d_entry = walk_pud_range,
3620                 .walk_lock = PGWALK_RDLOCK,
3621         };
3622
3623         int err;
3624         struct lruvec *lruvec = walk->lruvec;
3625         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3626
3627         walk->next_addr = FIRST_USER_ADDRESS;
3628
3629         do {
3630                 DEFINE_MAX_SEQ(lruvec);
3631
3632                 err = -EBUSY;
3633
3634                 /* another thread might have called inc_max_seq() */
3635                 if (walk->seq != max_seq)
3636                         break;
3637
3638                 /* folio_update_gen() requires stable folio_memcg() */
3639                 if (!mem_cgroup_trylock_pages(memcg))
3640                         break;
3641
3642                 /* the caller might be holding the lock for write */
3643                 if (mmap_read_trylock(mm)) {
3644                         err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
3645
3646                         mmap_read_unlock(mm);
3647                 }
3648
3649                 mem_cgroup_unlock_pages();
3650
3651                 if (walk->batched) {
3652                         spin_lock_irq(&lruvec->lru_lock);
3653                         reset_batch_size(walk);
3654                         spin_unlock_irq(&lruvec->lru_lock);
3655                 }
3656
3657                 cond_resched();
3658         } while (err == -EAGAIN);
3659 }
3660
3661 static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
3662 {
3663         struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3664
3665         if (pgdat && current_is_kswapd()) {
3666                 VM_WARN_ON_ONCE(walk);
3667
3668                 walk = &pgdat->mm_walk;
3669         } else if (!walk && force_alloc) {
3670                 VM_WARN_ON_ONCE(current_is_kswapd());
3671
3672                 walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
3673         }
3674
3675         current->reclaim_state->mm_walk = walk;
3676
3677         return walk;
3678 }
3679
3680 static void clear_mm_walk(void)
3681 {
3682         struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3683
3684         VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
3685         VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
3686
3687         current->reclaim_state->mm_walk = NULL;
3688
3689         if (!current_is_kswapd())
3690                 kfree(walk);
3691 }
3692
3693 static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
3694 {
3695         int zone;
3696         int remaining = MAX_LRU_BATCH;
3697         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3698         int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3699
3700         if (type == LRU_GEN_ANON && !can_swap)
3701                 goto done;
3702
3703         /* prevent cold/hot inversion if force_scan is true */
3704         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3705                 struct list_head *head = &lrugen->folios[old_gen][type][zone];
3706
3707                 while (!list_empty(head)) {
3708                         struct folio *folio = lru_to_folio(head);
3709
3710                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
3711                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
3712                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
3713                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
3714
3715                         new_gen = folio_inc_gen(lruvec, folio, false);
3716                         list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
3717
3718                         if (!--remaining)
3719                                 return false;
3720                 }
3721         }
3722 done:
3723         reset_ctrl_pos(lruvec, type, true);
3724         WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
3725
3726         return true;
3727 }
3728
3729 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
3730 {
3731         int gen, type, zone;
3732         bool success = false;
3733         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3734         DEFINE_MIN_SEQ(lruvec);
3735
3736         VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3737
3738         /* find the oldest populated generation */
3739         for (type = !can_swap; type < ANON_AND_FILE; type++) {
3740                 while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
3741                         gen = lru_gen_from_seq(min_seq[type]);
3742
3743                         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3744                                 if (!list_empty(&lrugen->folios[gen][type][zone]))
3745                                         goto next;
3746                         }
3747
3748                         min_seq[type]++;
3749                 }
3750 next:
3751                 ;
3752         }
3753
3754         /* see the comment on lru_gen_folio */
3755         if (can_swap) {
3756                 min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
3757                 min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
3758         }
3759
3760         for (type = !can_swap; type < ANON_AND_FILE; type++) {
3761                 if (min_seq[type] == lrugen->min_seq[type])
3762                         continue;
3763
3764                 reset_ctrl_pos(lruvec, type, true);
3765                 WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
3766                 success = true;
3767         }
3768
3769         return success;
3770 }
3771
3772 static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq,
3773                         bool can_swap, bool force_scan)
3774 {
3775         bool success;
3776         int prev, next;
3777         int type, zone;
3778         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3779 restart:
3780         if (seq < READ_ONCE(lrugen->max_seq))
3781                 return false;
3782
3783         spin_lock_irq(&lruvec->lru_lock);
3784
3785         VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3786
3787         success = seq == lrugen->max_seq;
3788         if (!success)
3789                 goto unlock;
3790
3791         for (type = ANON_AND_FILE - 1; type >= 0; type--) {
3792                 if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
3793                         continue;
3794
3795                 VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
3796
3797                 if (inc_min_seq(lruvec, type, can_swap))
3798                         continue;
3799
3800                 spin_unlock_irq(&lruvec->lru_lock);
3801                 cond_resched();
3802                 goto restart;
3803         }
3804
3805         /*
3806          * Update the active/inactive LRU sizes for compatibility. Both sides of
3807          * the current max_seq need to be covered, since max_seq+1 can overlap
3808          * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
3809          * overlap, cold/hot inversion happens.
3810          */
3811         prev = lru_gen_from_seq(lrugen->max_seq - 1);
3812         next = lru_gen_from_seq(lrugen->max_seq + 1);
3813
3814         for (type = 0; type < ANON_AND_FILE; type++) {
3815                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3816                         enum lru_list lru = type * LRU_INACTIVE_FILE;
3817                         long delta = lrugen->nr_pages[prev][type][zone] -
3818                                      lrugen->nr_pages[next][type][zone];
3819
3820                         if (!delta)
3821                                 continue;
3822
3823                         __update_lru_size(lruvec, lru, zone, delta);
3824                         __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
3825                 }
3826         }
3827
3828         for (type = 0; type < ANON_AND_FILE; type++)
3829                 reset_ctrl_pos(lruvec, type, false);
3830
3831         WRITE_ONCE(lrugen->timestamps[next], jiffies);
3832         /* make sure preceding modifications appear */
3833         smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
3834 unlock:
3835         spin_unlock_irq(&lruvec->lru_lock);
3836
3837         return success;
3838 }
3839
3840 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq,
3841                                bool can_swap, bool force_scan)
3842 {
3843         bool success;
3844         struct lru_gen_mm_walk *walk;
3845         struct mm_struct *mm = NULL;
3846         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3847         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3848
3849         VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
3850
3851         if (!mm_state)
3852                 return inc_max_seq(lruvec, seq, can_swap, force_scan);
3853
3854         /* see the comment in iterate_mm_list() */
3855         if (seq <= READ_ONCE(mm_state->seq))
3856                 return false;
3857
3858         /*
3859          * If the hardware doesn't automatically set the accessed bit, fallback
3860          * to lru_gen_look_around(), which only clears the accessed bit in a
3861          * handful of PTEs. Spreading the work out over a period of time usually
3862          * is less efficient, but it avoids bursty page faults.
3863          */
3864         if (!should_walk_mmu()) {
3865                 success = iterate_mm_list_nowalk(lruvec, seq);
3866                 goto done;
3867         }
3868
3869         walk = set_mm_walk(NULL, true);
3870         if (!walk) {
3871                 success = iterate_mm_list_nowalk(lruvec, seq);
3872                 goto done;
3873         }
3874
3875         walk->lruvec = lruvec;
3876         walk->seq = seq;
3877         walk->can_swap = can_swap;
3878         walk->force_scan = force_scan;
3879
3880         do {
3881                 success = iterate_mm_list(walk, &mm);
3882                 if (mm)
3883                         walk_mm(mm, walk);
3884         } while (mm);
3885 done:
3886         if (success) {
3887                 success = inc_max_seq(lruvec, seq, can_swap, force_scan);
3888                 WARN_ON_ONCE(!success);
3889         }
3890
3891         return success;
3892 }
3893
3894 /******************************************************************************
3895  *                          working set protection
3896  ******************************************************************************/
3897
3898 static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
3899 {
3900         int priority;
3901         unsigned long reclaimable;
3902
3903         if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
3904                 return;
3905         /*
3906          * Determine the initial priority based on
3907          * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
3908          * where reclaimed_to_scanned_ratio = inactive / total.
3909          */
3910         reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
3911         if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
3912                 reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
3913
3914         /* round down reclaimable and round up sc->nr_to_reclaim */
3915         priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
3916
3917         /*
3918          * The estimation is based on LRU pages only, so cap it to prevent
3919          * overshoots of shrinker objects by large margins.
3920          */
3921         sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
3922 }
3923
3924 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
3925 {
3926         int gen, type, zone;
3927         unsigned long total = 0;
3928         bool can_swap = get_swappiness(lruvec, sc);
3929         struct lru_gen_folio *lrugen = &lruvec->lrugen;
3930         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3931         DEFINE_MAX_SEQ(lruvec);
3932         DEFINE_MIN_SEQ(lruvec);
3933
3934         for (type = !can_swap; type < ANON_AND_FILE; type++) {
3935                 unsigned long seq;
3936
3937                 for (seq = min_seq[type]; seq <= max_seq; seq++) {
3938                         gen = lru_gen_from_seq(seq);
3939
3940                         for (zone = 0; zone < MAX_NR_ZONES; zone++)
3941                                 total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
3942                 }
3943         }
3944
3945         /* whether the size is big enough to be helpful */
3946         return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
3947 }
3948
3949 static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
3950                                   unsigned long min_ttl)
3951 {
3952         int gen;
3953         unsigned long birth;
3954         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3955         DEFINE_MIN_SEQ(lruvec);
3956
3957         if (mem_cgroup_below_min(NULL, memcg))
3958                 return false;
3959
3960         if (!lruvec_is_sizable(lruvec, sc))
3961                 return false;
3962
3963         /* see the comment on lru_gen_folio */
3964         gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
3965         birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
3966
3967         return time_is_before_jiffies(birth + min_ttl);
3968 }
3969
3970 /* to protect the working set of the last N jiffies */
3971 static unsigned long lru_gen_min_ttl __read_mostly;
3972
3973 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
3974 {
3975         struct mem_cgroup *memcg;
3976         unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
3977         bool reclaimable = !min_ttl;
3978
3979         VM_WARN_ON_ONCE(!current_is_kswapd());
3980
3981         set_initial_priority(pgdat, sc);
3982
3983         memcg = mem_cgroup_iter(NULL, NULL, NULL);
3984         do {
3985                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3986
3987                 mem_cgroup_calculate_protection(NULL, memcg);
3988
3989                 if (!reclaimable)
3990                         reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
3991         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
3992
3993         /*
3994          * The main goal is to OOM kill if every generation from all memcgs is
3995          * younger than min_ttl. However, another possibility is all memcgs are
3996          * either too small or below min.
3997          */
3998         if (!reclaimable && mutex_trylock(&oom_lock)) {
3999                 struct oom_control oc = {
4000                         .gfp_mask = sc->gfp_mask,
4001                 };
4002
4003                 out_of_memory(&oc);
4004
4005                 mutex_unlock(&oom_lock);
4006         }
4007 }
4008
4009 /******************************************************************************
4010  *                          rmap/PT walk feedback
4011  ******************************************************************************/
4012
4013 /*
4014  * This function exploits spatial locality when shrink_folio_list() walks the
4015  * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
4016  * the scan was done cacheline efficiently, it adds the PMD entry pointing to
4017  * the PTE table to the Bloom filter. This forms a feedback loop between the
4018  * eviction and the aging.
4019  */
4020 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4021 {
4022         int i;
4023         unsigned long start;
4024         unsigned long end;
4025         struct lru_gen_mm_walk *walk;
4026         int young = 0;
4027         pte_t *pte = pvmw->pte;
4028         unsigned long addr = pvmw->address;
4029         struct vm_area_struct *vma = pvmw->vma;
4030         struct folio *folio = pfn_folio(pvmw->pfn);
4031         bool can_swap = !folio_is_file_lru(folio);
4032         struct mem_cgroup *memcg = folio_memcg(folio);
4033         struct pglist_data *pgdat = folio_pgdat(folio);
4034         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4035         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4036         DEFINE_MAX_SEQ(lruvec);
4037         int old_gen, new_gen = lru_gen_from_seq(max_seq);
4038
4039         lockdep_assert_held(pvmw->ptl);
4040         VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
4041
4042         if (spin_is_contended(pvmw->ptl))
4043                 return;
4044
4045         /* exclude special VMAs containing anon pages from COW */
4046         if (vma->vm_flags & VM_SPECIAL)
4047                 return;
4048
4049         /* avoid taking the LRU lock under the PTL when possible */
4050         walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
4051
4052         start = max(addr & PMD_MASK, vma->vm_start);
4053         end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
4054
4055         if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
4056                 if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
4057                         end = start + MIN_LRU_BATCH * PAGE_SIZE;
4058                 else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
4059                         start = end - MIN_LRU_BATCH * PAGE_SIZE;
4060                 else {
4061                         start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
4062                         end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
4063                 }
4064         }
4065
4066         /* folio_update_gen() requires stable folio_memcg() */
4067         if (!mem_cgroup_trylock_pages(memcg))
4068                 return;
4069
4070         arch_enter_lazy_mmu_mode();
4071
4072         pte -= (addr - start) / PAGE_SIZE;
4073
4074         for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
4075                 unsigned long pfn;
4076                 pte_t ptent = ptep_get(pte + i);
4077
4078                 pfn = get_pte_pfn(ptent, vma, addr);
4079                 if (pfn == -1)
4080                         continue;
4081
4082                 if (!pte_young(ptent))
4083                         continue;
4084
4085                 folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
4086                 if (!folio)
4087                         continue;
4088
4089                 if (!ptep_test_and_clear_young(vma, addr, pte + i))
4090                         VM_WARN_ON_ONCE(true);
4091
4092                 young++;
4093
4094                 if (pte_dirty(ptent) && !folio_test_dirty(folio) &&
4095                     !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
4096                       !folio_test_swapcache(folio)))
4097                         folio_mark_dirty(folio);
4098
4099                 if (walk) {
4100                         old_gen = folio_update_gen(folio, new_gen);
4101                         if (old_gen >= 0 && old_gen != new_gen)
4102                                 update_batch_size(walk, folio, old_gen, new_gen);
4103
4104                         continue;
4105                 }
4106
4107                 old_gen = folio_lru_gen(folio);
4108                 if (old_gen < 0)
4109                         folio_set_referenced(folio);
4110                 else if (old_gen != new_gen)
4111                         folio_activate(folio);
4112         }
4113
4114         arch_leave_lazy_mmu_mode();
4115         mem_cgroup_unlock_pages();
4116
4117         /* feedback from rmap walkers to page table walkers */
4118         if (mm_state && suitable_to_scan(i, young))
4119                 update_bloom_filter(mm_state, max_seq, pvmw->pmd);
4120 }
4121
4122 /******************************************************************************
4123  *                          memcg LRU
4124  ******************************************************************************/
4125
4126 /* see the comment on MEMCG_NR_GENS */
4127 enum {
4128         MEMCG_LRU_NOP,
4129         MEMCG_LRU_HEAD,
4130         MEMCG_LRU_TAIL,
4131         MEMCG_LRU_OLD,
4132         MEMCG_LRU_YOUNG,
4133 };
4134
4135 static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
4136 {
4137         int seg;
4138         int old, new;
4139         unsigned long flags;
4140         int bin = get_random_u32_below(MEMCG_NR_BINS);
4141         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4142
4143         spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
4144
4145         VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
4146
4147         seg = 0;
4148         new = old = lruvec->lrugen.gen;
4149
4150         /* see the comment on MEMCG_NR_GENS */
4151         if (op == MEMCG_LRU_HEAD)
4152                 seg = MEMCG_LRU_HEAD;
4153         else if (op == MEMCG_LRU_TAIL)
4154                 seg = MEMCG_LRU_TAIL;
4155         else if (op == MEMCG_LRU_OLD)
4156                 new = get_memcg_gen(pgdat->memcg_lru.seq);
4157         else if (op == MEMCG_LRU_YOUNG)
4158                 new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
4159         else
4160                 VM_WARN_ON_ONCE(true);
4161
4162         WRITE_ONCE(lruvec->lrugen.seg, seg);
4163         WRITE_ONCE(lruvec->lrugen.gen, new);
4164
4165         hlist_nulls_del_rcu(&lruvec->lrugen.list);
4166
4167         if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
4168                 hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4169         else
4170                 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4171
4172         pgdat->memcg_lru.nr_memcgs[old]--;
4173         pgdat->memcg_lru.nr_memcgs[new]++;
4174
4175         if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
4176                 WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
4177
4178         spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
4179 }
4180
4181 #ifdef CONFIG_MEMCG
4182
4183 void lru_gen_online_memcg(struct mem_cgroup *memcg)
4184 {
4185         int gen;
4186         int nid;
4187         int bin = get_random_u32_below(MEMCG_NR_BINS);
4188
4189         for_each_node(nid) {
4190                 struct pglist_data *pgdat = NODE_DATA(nid);
4191                 struct lruvec *lruvec = get_lruvec(memcg, nid);
4192
4193                 spin_lock_irq(&pgdat->memcg_lru.lock);
4194
4195                 VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
4196
4197                 gen = get_memcg_gen(pgdat->memcg_lru.seq);
4198
4199                 lruvec->lrugen.gen = gen;
4200
4201                 hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
4202                 pgdat->memcg_lru.nr_memcgs[gen]++;
4203
4204                 spin_unlock_irq(&pgdat->memcg_lru.lock);
4205         }
4206 }
4207
4208 void lru_gen_offline_memcg(struct mem_cgroup *memcg)
4209 {
4210         int nid;
4211
4212         for_each_node(nid) {
4213                 struct lruvec *lruvec = get_lruvec(memcg, nid);
4214
4215                 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
4216         }
4217 }
4218
4219 void lru_gen_release_memcg(struct mem_cgroup *memcg)
4220 {
4221         int gen;
4222         int nid;
4223
4224         for_each_node(nid) {
4225                 struct pglist_data *pgdat = NODE_DATA(nid);
4226                 struct lruvec *lruvec = get_lruvec(memcg, nid);
4227
4228                 spin_lock_irq(&pgdat->memcg_lru.lock);
4229
4230                 if (hlist_nulls_unhashed(&lruvec->lrugen.list))
4231                         goto unlock;
4232
4233                 gen = lruvec->lrugen.gen;
4234
4235                 hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
4236                 pgdat->memcg_lru.nr_memcgs[gen]--;
4237
4238                 if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
4239                         WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
4240 unlock:
4241                 spin_unlock_irq(&pgdat->memcg_lru.lock);
4242         }
4243 }
4244
4245 void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
4246 {
4247         struct lruvec *lruvec = get_lruvec(memcg, nid);
4248
4249         /* see the comment on MEMCG_NR_GENS */
4250         if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
4251                 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
4252 }
4253
4254 #endif /* CONFIG_MEMCG */
4255
4256 /******************************************************************************
4257  *                          the eviction
4258  ******************************************************************************/
4259
4260 static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
4261                        int tier_idx)
4262 {
4263         bool success;
4264         int gen = folio_lru_gen(folio);
4265         int type = folio_is_file_lru(folio);
4266         int zone = folio_zonenum(folio);
4267         int delta = folio_nr_pages(folio);
4268         int refs = folio_lru_refs(folio);
4269         int tier = lru_tier_from_refs(refs);
4270         struct lru_gen_folio *lrugen = &lruvec->lrugen;
4271
4272         VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
4273
4274         /* unevictable */
4275         if (!folio_evictable(folio)) {
4276                 success = lru_gen_del_folio(lruvec, folio, true);
4277                 VM_WARN_ON_ONCE_FOLIO(!success, folio);
4278                 folio_set_unevictable(folio);
4279                 lruvec_add_folio(lruvec, folio);
4280                 __count_vm_events(UNEVICTABLE_PGCULLED, delta);
4281                 return true;
4282         }
4283
4284         /* promoted */
4285         if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
4286                 list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4287                 return true;
4288         }
4289
4290         /* protected */
4291         if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) {
4292                 int hist = lru_hist_from_seq(lrugen->min_seq[type]);
4293
4294                 gen = folio_inc_gen(lruvec, folio, false);
4295                 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
4296
4297                 WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
4298                            lrugen->protected[hist][type][tier - 1] + delta);
4299                 return true;
4300         }
4301
4302         /* ineligible */
4303         if (zone > sc->reclaim_idx) {
4304                 gen = folio_inc_gen(lruvec, folio, false);
4305                 list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
4306                 return true;
4307         }
4308
4309         /* waiting for writeback */
4310         if (folio_test_locked(folio) || folio_test_writeback(folio) ||
4311             (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
4312                 gen = folio_inc_gen(lruvec, folio, true);
4313                 list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4314                 return true;
4315         }
4316
4317         return false;
4318 }
4319
4320 static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
4321 {
4322         bool success;
4323
4324         /* swap constrained */
4325         if (!(sc->gfp_mask & __GFP_IO) &&
4326             (folio_test_dirty(folio) ||
4327              (folio_test_anon(folio) && !folio_test_swapcache(folio))))
4328                 return false;
4329
4330         /* raced with release_pages() */
4331         if (!folio_try_get(folio))
4332                 return false;
4333
4334         /* raced with another isolation */
4335         if (!folio_test_clear_lru(folio)) {
4336                 folio_put(folio);
4337                 return false;
4338         }
4339
4340         /* see the comment on MAX_NR_TIERS */
4341         if (!folio_test_referenced(folio))
4342                 set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
4343
4344         /* for shrink_folio_list() */
4345         folio_clear_reclaim(folio);
4346         folio_clear_referenced(folio);
4347
4348         success = lru_gen_del_folio(lruvec, folio, true);
4349         VM_WARN_ON_ONCE_FOLIO(!success, folio);
4350
4351         return true;
4352 }
4353
4354 static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
4355                        int type, int tier, struct list_head *list)
4356 {
4357         int i;
4358         int gen;
4359         enum vm_event_item item;
4360         int sorted = 0;
4361         int scanned = 0;
4362         int isolated = 0;
4363         int skipped = 0;
4364         int remaining = MAX_LRU_BATCH;
4365         struct lru_gen_folio *lrugen = &lruvec->lrugen;
4366         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4367
4368         VM_WARN_ON_ONCE(!list_empty(list));
4369
4370         if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
4371                 return 0;
4372
4373         gen = lru_gen_from_seq(lrugen->min_seq[type]);
4374
4375         for (i = MAX_NR_ZONES; i > 0; i--) {
4376                 LIST_HEAD(moved);
4377                 int skipped_zone = 0;
4378                 int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
4379                 struct list_head *head = &lrugen->folios[gen][type][zone];
4380
4381                 while (!list_empty(head)) {
4382                         struct folio *folio = lru_to_folio(head);
4383                         int delta = folio_nr_pages(folio);
4384
4385                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4386                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
4387                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4388                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
4389
4390                         scanned += delta;
4391
4392                         if (sort_folio(lruvec, folio, sc, tier))
4393                                 sorted += delta;
4394                         else if (isolate_folio(lruvec, folio, sc)) {
4395                                 list_add(&folio->lru, list);
4396                                 isolated += delta;
4397                         } else {
4398                                 list_move(&folio->lru, &moved);
4399                                 skipped_zone += delta;
4400                         }
4401
4402                         if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH)
4403                                 break;
4404                 }
4405
4406                 if (skipped_zone) {
4407                         list_splice(&moved, head);
4408                         __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone);
4409                         skipped += skipped_zone;
4410                 }
4411
4412                 if (!remaining || isolated >= MIN_LRU_BATCH)
4413                         break;
4414         }
4415
4416         item = PGSCAN_KSWAPD + reclaimer_offset();
4417         if (!cgroup_reclaim(sc)) {
4418                 __count_vm_events(item, isolated);
4419                 __count_vm_events(PGREFILL, sorted);
4420         }
4421         __count_memcg_events(memcg, item, isolated);
4422         __count_memcg_events(memcg, PGREFILL, sorted);
4423         __count_vm_events(PGSCAN_ANON + type, isolated);
4424         trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
4425                                 scanned, skipped, isolated,
4426                                 type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4427
4428         /*
4429          * There might not be eligible folios due to reclaim_idx. Check the
4430          * remaining to prevent livelock if it's not making progress.
4431          */
4432         return isolated || !remaining ? scanned : 0;
4433 }
4434
4435 static int get_tier_idx(struct lruvec *lruvec, int type)
4436 {
4437         int tier;
4438         struct ctrl_pos sp, pv;
4439
4440         /*
4441          * To leave a margin for fluctuations, use a larger gain factor (1:2).
4442          * This value is chosen because any other tier would have at least twice
4443          * as many refaults as the first tier.
4444          */
4445         read_ctrl_pos(lruvec, type, 0, 1, &sp);
4446         for (tier = 1; tier < MAX_NR_TIERS; tier++) {
4447                 read_ctrl_pos(lruvec, type, tier, 2, &pv);
4448                 if (!positive_ctrl_err(&sp, &pv))
4449                         break;
4450         }
4451
4452         return tier - 1;
4453 }
4454
4455 static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
4456 {
4457         int type, tier;
4458         struct ctrl_pos sp, pv;
4459         int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness };
4460
4461         /*
4462          * Compare the first tier of anon with that of file to determine which
4463          * type to scan. Also need to compare other tiers of the selected type
4464          * with the first tier of the other type to determine the last tier (of
4465          * the selected type) to evict.
4466          */
4467         read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
4468         read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
4469         type = positive_ctrl_err(&sp, &pv);
4470
4471         read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
4472         for (tier = 1; tier < MAX_NR_TIERS; tier++) {
4473                 read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
4474                 if (!positive_ctrl_err(&sp, &pv))
4475                         break;
4476         }
4477
4478         *tier_idx = tier - 1;
4479
4480         return type;
4481 }
4482
4483 static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
4484                           int *type_scanned, struct list_head *list)
4485 {
4486         int i;
4487         int type;
4488         int scanned;
4489         int tier = -1;
4490         DEFINE_MIN_SEQ(lruvec);
4491
4492         /*
4493          * Try to make the obvious choice first, and if anon and file are both
4494          * available from the same generation,
4495          * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon
4496          *    first.
4497          * 2. If !__GFP_IO, file first since clean pagecache is more likely to
4498          *    exist than clean swapcache.
4499          */
4500         if (!swappiness)
4501                 type = LRU_GEN_FILE;
4502         else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
4503                 type = LRU_GEN_ANON;
4504         else if (swappiness == 1)
4505                 type = LRU_GEN_FILE;
4506         else if (swappiness == MAX_SWAPPINESS)
4507                 type = LRU_GEN_ANON;
4508         else if (!(sc->gfp_mask & __GFP_IO))
4509                 type = LRU_GEN_FILE;
4510         else
4511                 type = get_type_to_scan(lruvec, swappiness, &tier);
4512
4513         for (i = !swappiness; i < ANON_AND_FILE; i++) {
4514                 if (tier < 0)
4515                         tier = get_tier_idx(lruvec, type);
4516
4517                 scanned = scan_folios(lruvec, sc, type, tier, list);
4518                 if (scanned)
4519                         break;
4520
4521                 type = !type;
4522                 tier = -1;
4523         }
4524
4525         *type_scanned = type;
4526
4527         return scanned;
4528 }
4529
4530 static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
4531 {
4532         int type;
4533         int scanned;
4534         int reclaimed;
4535         LIST_HEAD(list);
4536         LIST_HEAD(clean);
4537         struct folio *folio;
4538         struct folio *next;
4539         enum vm_event_item item;
4540         struct reclaim_stat stat;
4541         struct lru_gen_mm_walk *walk;
4542         bool skip_retry = false;
4543         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4544         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4545
4546         spin_lock_irq(&lruvec->lru_lock);
4547
4548         scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
4549
4550         scanned += try_to_inc_min_seq(lruvec, swappiness);
4551
4552         if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
4553                 scanned = 0;
4554
4555         spin_unlock_irq(&lruvec->lru_lock);
4556
4557         if (list_empty(&list))
4558                 return scanned;
4559 retry:
4560         reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
4561         sc->nr_reclaimed += reclaimed;
4562         trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
4563                         scanned, reclaimed, &stat, sc->priority,
4564                         type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4565
4566         list_for_each_entry_safe_reverse(folio, next, &list, lru) {
4567                 if (!folio_evictable(folio)) {
4568                         list_del(&folio->lru);
4569                         folio_putback_lru(folio);
4570                         continue;
4571                 }
4572
4573                 if (folio_test_reclaim(folio) &&
4574                     (folio_test_dirty(folio) || folio_test_writeback(folio))) {
4575                         /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
4576                         if (folio_test_workingset(folio))
4577                                 folio_set_referenced(folio);
4578                         continue;
4579                 }
4580
4581                 if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
4582                     folio_mapped(folio) || folio_test_locked(folio) ||
4583                     folio_test_dirty(folio) || folio_test_writeback(folio)) {
4584                         /* don't add rejected folios to the oldest generation */
4585                         set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
4586                                       BIT(PG_active));
4587                         continue;
4588                 }
4589
4590                 /* retry folios that may have missed folio_rotate_reclaimable() */
4591                 list_move(&folio->lru, &clean);
4592         }
4593
4594         spin_lock_irq(&lruvec->lru_lock);
4595
4596         move_folios_to_lru(lruvec, &list);
4597
4598         walk = current->reclaim_state->mm_walk;
4599         if (walk && walk->batched) {
4600                 walk->lruvec = lruvec;
4601                 reset_batch_size(walk);
4602         }
4603
4604         item = PGSTEAL_KSWAPD + reclaimer_offset();
4605         if (!cgroup_reclaim(sc))
4606                 __count_vm_events(item, reclaimed);
4607         __count_memcg_events(memcg, item, reclaimed);
4608         __count_vm_events(PGSTEAL_ANON + type, reclaimed);
4609
4610         spin_unlock_irq(&lruvec->lru_lock);
4611
4612         list_splice_init(&clean, &list);
4613
4614         if (!list_empty(&list)) {
4615                 skip_retry = true;
4616                 goto retry;
4617         }
4618
4619         return scanned;
4620 }
4621
4622 static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
4623                              bool can_swap, unsigned long *nr_to_scan)
4624 {
4625         int gen, type, zone;
4626         unsigned long old = 0;
4627         unsigned long young = 0;
4628         unsigned long total = 0;
4629         struct lru_gen_folio *lrugen = &lruvec->lrugen;
4630         DEFINE_MIN_SEQ(lruvec);
4631
4632         /* whether this lruvec is completely out of cold folios */
4633         if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
4634                 *nr_to_scan = 0;
4635                 return true;
4636         }
4637
4638         for (type = !can_swap; type < ANON_AND_FILE; type++) {
4639                 unsigned long seq;
4640
4641                 for (seq = min_seq[type]; seq <= max_seq; seq++) {
4642                         unsigned long size = 0;
4643
4644                         gen = lru_gen_from_seq(seq);
4645
4646                         for (zone = 0; zone < MAX_NR_ZONES; zone++)
4647                                 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
4648
4649                         total += size;
4650                         if (seq == max_seq)
4651                                 young += size;
4652                         else if (seq + MIN_NR_GENS == max_seq)
4653                                 old += size;
4654                 }
4655         }
4656
4657         *nr_to_scan = total;
4658
4659         /*
4660          * The aging tries to be lazy to reduce the overhead, while the eviction
4661          * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
4662          * ideal number of generations is MIN_NR_GENS+1.
4663          */
4664         if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
4665                 return false;
4666
4667         /*
4668          * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
4669          * of the total number of pages for each generation. A reasonable range
4670          * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
4671          * aging cares about the upper bound of hot pages, while the eviction
4672          * cares about the lower bound of cold pages.
4673          */
4674         if (young * MIN_NR_GENS > total)
4675                 return true;
4676         if (old * (MIN_NR_GENS + 2) < total)
4677                 return true;
4678
4679         return false;
4680 }
4681
4682 /*
4683  * For future optimizations:
4684  * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
4685  *    reclaim.
4686  */
4687 static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
4688 {
4689         bool success;
4690         unsigned long nr_to_scan;
4691         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4692         DEFINE_MAX_SEQ(lruvec);
4693
4694         if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
4695                 return -1;
4696
4697         success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan);
4698
4699         /* try to scrape all its memory if this memcg was deleted */
4700         if (nr_to_scan && !mem_cgroup_online(memcg))
4701                 return nr_to_scan;
4702
4703         /* try to get away with not aging at the default priority */
4704         if (!success || sc->priority == DEF_PRIORITY)
4705                 return nr_to_scan >> sc->priority;
4706
4707         /* stop scanning this lruvec as it's low on cold folios */
4708         return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
4709 }
4710
4711 static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
4712 {
4713         int i;
4714         enum zone_watermarks mark;
4715
4716         /* don't abort memcg reclaim to ensure fairness */
4717         if (!root_reclaim(sc))
4718                 return false;
4719
4720         if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
4721                 return true;
4722
4723         /* check the order to exclude compaction-induced reclaim */
4724         if (!current_is_kswapd() || sc->order)
4725                 return false;
4726
4727         mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
4728                WMARK_PROMO : WMARK_HIGH;
4729
4730         for (i = 0; i <= sc->reclaim_idx; i++) {
4731                 struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
4732                 unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
4733
4734                 if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0))
4735                         return false;
4736         }
4737
4738         /* kswapd should abort if all eligible zones are safe */
4739         return true;
4740 }
4741
4742 static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
4743 {
4744         long nr_to_scan;
4745         unsigned long scanned = 0;
4746         int swappiness = get_swappiness(lruvec, sc);
4747
4748         while (true) {
4749                 int delta;
4750
4751                 nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
4752                 if (nr_to_scan <= 0)
4753                         break;
4754
4755                 delta = evict_folios(lruvec, sc, swappiness);
4756                 if (!delta)
4757                         break;
4758
4759                 scanned += delta;
4760                 if (scanned >= nr_to_scan)
4761                         break;
4762
4763                 if (should_abort_scan(lruvec, sc))
4764                         break;
4765
4766                 cond_resched();
4767         }
4768
4769         /* whether this lruvec should be rotated */
4770         return nr_to_scan < 0;
4771 }
4772
4773 static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
4774 {
4775         bool success;
4776         unsigned long scanned = sc->nr_scanned;
4777         unsigned long reclaimed = sc->nr_reclaimed;
4778         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4779         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4780
4781         /* lru_gen_age_node() called mem_cgroup_calculate_protection() */
4782         if (mem_cgroup_below_min(NULL, memcg))
4783                 return MEMCG_LRU_YOUNG;
4784
4785         if (mem_cgroup_below_low(NULL, memcg)) {
4786                 /* see the comment on MEMCG_NR_GENS */
4787                 if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
4788                         return MEMCG_LRU_TAIL;
4789
4790                 memcg_memory_event(memcg, MEMCG_LOW);
4791         }
4792
4793         success = try_to_shrink_lruvec(lruvec, sc);
4794
4795         shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
4796
4797         if (!sc->proactive)
4798                 vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
4799                            sc->nr_reclaimed - reclaimed);
4800
4801         flush_reclaim_state(sc);
4802
4803         if (success && mem_cgroup_online(memcg))
4804                 return MEMCG_LRU_YOUNG;
4805
4806         if (!success && lruvec_is_sizable(lruvec, sc))
4807                 return 0;
4808
4809         /* one retry if offlined or too small */
4810         return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
4811                MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
4812 }
4813
4814 static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
4815 {
4816         int op;
4817         int gen;
4818         int bin;
4819         int first_bin;
4820         struct lruvec *lruvec;
4821         struct lru_gen_folio *lrugen;
4822         struct mem_cgroup *memcg;
4823         struct hlist_nulls_node *pos;
4824
4825         gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
4826         bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
4827 restart:
4828         op = 0;
4829         memcg = NULL;
4830
4831         rcu_read_lock();
4832
4833         hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
4834                 if (op) {
4835                         lru_gen_rotate_memcg(lruvec, op);
4836                         op = 0;
4837                 }
4838
4839                 mem_cgroup_put(memcg);
4840                 memcg = NULL;
4841
4842                 if (gen != READ_ONCE(lrugen->gen))
4843                         continue;
4844
4845                 lruvec = container_of(lrugen, struct lruvec, lrugen);
4846                 memcg = lruvec_memcg(lruvec);
4847
4848                 if (!mem_cgroup_tryget(memcg)) {
4849                         lru_gen_release_memcg(memcg);
4850                         memcg = NULL;
4851                         continue;
4852                 }
4853
4854                 rcu_read_unlock();
4855
4856                 op = shrink_one(lruvec, sc);
4857
4858                 rcu_read_lock();
4859
4860                 if (should_abort_scan(lruvec, sc))
4861                         break;
4862         }
4863
4864         rcu_read_unlock();
4865
4866         if (op)
4867                 lru_gen_rotate_memcg(lruvec, op);
4868
4869         mem_cgroup_put(memcg);
4870
4871         if (!is_a_nulls(pos))
4872                 return;
4873
4874         /* restart if raced with lru_gen_rotate_memcg() */
4875         if (gen != get_nulls_value(pos))
4876                 goto restart;
4877
4878         /* try the rest of the bins of the current generation */
4879         bin = get_memcg_bin(bin + 1);
4880         if (bin != first_bin)
4881                 goto restart;
4882 }
4883
4884 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
4885 {
4886         struct blk_plug plug;
4887
4888         VM_WARN_ON_ONCE(root_reclaim(sc));
4889         VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
4890
4891         lru_add_drain();
4892
4893         blk_start_plug(&plug);
4894
4895         set_mm_walk(NULL, sc->proactive);
4896
4897         if (try_to_shrink_lruvec(lruvec, sc))
4898                 lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
4899
4900         clear_mm_walk();
4901
4902         blk_finish_plug(&plug);
4903 }
4904
4905 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
4906 {
4907         struct blk_plug plug;
4908         unsigned long reclaimed = sc->nr_reclaimed;
4909
4910         VM_WARN_ON_ONCE(!root_reclaim(sc));
4911
4912         /*
4913          * Unmapped clean folios are already prioritized. Scanning for more of
4914          * them is likely futile and can cause high reclaim latency when there
4915          * is a large number of memcgs.
4916          */
4917         if (!sc->may_writepage || !sc->may_unmap)
4918                 goto done;
4919
4920         lru_add_drain();
4921
4922         blk_start_plug(&plug);
4923
4924         set_mm_walk(pgdat, sc->proactive);
4925
4926         set_initial_priority(pgdat, sc);
4927
4928         if (current_is_kswapd())
4929                 sc->nr_reclaimed = 0;
4930
4931         if (mem_cgroup_disabled())
4932                 shrink_one(&pgdat->__lruvec, sc);
4933         else
4934                 shrink_many(pgdat, sc);
4935
4936         if (current_is_kswapd())
4937                 sc->nr_reclaimed += reclaimed;
4938
4939         clear_mm_walk();
4940
4941         blk_finish_plug(&plug);
4942 done:
4943         /* kswapd should never fail */
4944         pgdat->kswapd_failures = 0;
4945 }
4946
4947 /******************************************************************************
4948  *                          state change
4949  ******************************************************************************/
4950
4951 static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
4952 {
4953         struct lru_gen_folio *lrugen = &lruvec->lrugen;
4954
4955         if (lrugen->enabled) {
4956                 enum lru_list lru;
4957
4958                 for_each_evictable_lru(lru) {
4959                         if (!list_empty(&lruvec->lists[lru]))
4960                                 return false;
4961                 }
4962         } else {
4963                 int gen, type, zone;
4964
4965                 for_each_gen_type_zone(gen, type, zone) {
4966                         if (!list_empty(&lrugen->folios[gen][type][zone]))
4967                                 return false;
4968                 }
4969         }
4970
4971         return true;
4972 }
4973
4974 static bool fill_evictable(struct lruvec *lruvec)
4975 {
4976         enum lru_list lru;
4977         int remaining = MAX_LRU_BATCH;
4978
4979         for_each_evictable_lru(lru) {
4980                 int type = is_file_lru(lru);
4981                 bool active = is_active_lru(lru);
4982                 struct list_head *head = &lruvec->lists[lru];
4983
4984                 while (!list_empty(head)) {
4985                         bool success;
4986                         struct folio *folio = lru_to_folio(head);
4987
4988                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4989                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
4990                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4991                         VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
4992
4993                         lruvec_del_folio(lruvec, folio);
4994                         success = lru_gen_add_folio(lruvec, folio, false);
4995                         VM_WARN_ON_ONCE(!success);
4996
4997                         if (!--remaining)
4998                                 return false;
4999                 }
5000         }
5001
5002         return true;
5003 }
5004
5005 static bool drain_evictable(struct lruvec *lruvec)
5006 {
5007         int gen, type, zone;
5008         int remaining = MAX_LRU_BATCH;
5009
5010         for_each_gen_type_zone(gen, type, zone) {
5011                 struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
5012
5013                 while (!list_empty(head)) {
5014                         bool success;
5015                         struct folio *folio = lru_to_folio(head);
5016
5017                         VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5018                         VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
5019                         VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5020                         VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
5021
5022                         success = lru_gen_del_folio(lruvec, folio, false);
5023                         VM_WARN_ON_ONCE(!success);
5024                         lruvec_add_folio(lruvec, folio);
5025
5026                         if (!--remaining)
5027                                 return false;
5028                 }
5029         }
5030
5031         return true;
5032 }
5033
5034 static void lru_gen_change_state(bool enabled)
5035 {
5036         static DEFINE_MUTEX(state_mutex);
5037
5038         struct mem_cgroup *memcg;
5039
5040         cgroup_lock();
5041         cpus_read_lock();
5042         get_online_mems();
5043         mutex_lock(&state_mutex);
5044
5045         if (enabled == lru_gen_enabled())
5046                 goto unlock;
5047
5048         if (enabled)
5049                 static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5050         else
5051                 static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5052
5053         memcg = mem_cgroup_iter(NULL, NULL, NULL);
5054         do {
5055                 int nid;
5056
5057                 for_each_node(nid) {
5058                         struct lruvec *lruvec = get_lruvec(memcg, nid);
5059
5060                         spin_lock_irq(&lruvec->lru_lock);
5061
5062                         VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
5063                         VM_WARN_ON_ONCE(!state_is_valid(lruvec));
5064
5065                         lruvec->lrugen.enabled = enabled;
5066
5067                         while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
5068                                 spin_unlock_irq(&lruvec->lru_lock);
5069                                 cond_resched();
5070                                 spin_lock_irq(&lruvec->lru_lock);
5071                         }
5072
5073                         spin_unlock_irq(&lruvec->lru_lock);
5074                 }
5075
5076                 cond_resched();
5077         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5078 unlock:
5079         mutex_unlock(&state_mutex);
5080         put_online_mems();
5081         cpus_read_unlock();
5082         cgroup_unlock();
5083 }
5084
5085 /******************************************************************************
5086  *                          sysfs interface
5087  ******************************************************************************/
5088
5089 static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
5090 {
5091         return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
5092 }
5093
5094 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5095 static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
5096                                 const char *buf, size_t len)
5097 {
5098         unsigned int msecs;
5099
5100         if (kstrtouint(buf, 0, &msecs))
5101                 return -EINVAL;
5102
5103         WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
5104
5105         return len;
5106 }
5107
5108 static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
5109
5110 static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
5111 {
5112         unsigned int caps = 0;
5113
5114         if (get_cap(LRU_GEN_CORE))
5115                 caps |= BIT(LRU_GEN_CORE);
5116
5117         if (should_walk_mmu())
5118                 caps |= BIT(LRU_GEN_MM_WALK);
5119
5120         if (should_clear_pmd_young())
5121                 caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
5122
5123         return sysfs_emit(buf, "0x%04x\n", caps);
5124 }
5125
5126 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5127 static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
5128                              const char *buf, size_t len)
5129 {
5130         int i;
5131         unsigned int caps;
5132
5133         if (tolower(*buf) == 'n')
5134                 caps = 0;
5135         else if (tolower(*buf) == 'y')
5136                 caps = -1;
5137         else if (kstrtouint(buf, 0, &caps))
5138                 return -EINVAL;
5139
5140         for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
5141                 bool enabled = caps & BIT(i);
5142
5143                 if (i == LRU_GEN_CORE)
5144                         lru_gen_change_state(enabled);
5145                 else if (enabled)
5146                         static_branch_enable(&lru_gen_caps[i]);
5147                 else
5148                         static_branch_disable(&lru_gen_caps[i]);
5149         }
5150
5151         return len;
5152 }
5153
5154 static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
5155
5156 static struct attribute *lru_gen_attrs[] = {
5157         &lru_gen_min_ttl_attr.attr,
5158         &lru_gen_enabled_attr.attr,
5159         NULL
5160 };
5161
5162 static const struct attribute_group lru_gen_attr_group = {
5163         .name = "lru_gen",
5164         .attrs = lru_gen_attrs,
5165 };
5166
5167 /******************************************************************************
5168  *                          debugfs interface
5169  ******************************************************************************/
5170
5171 static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
5172 {
5173         struct mem_cgroup *memcg;
5174         loff_t nr_to_skip = *pos;
5175
5176         m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
5177         if (!m->private)
5178                 return ERR_PTR(-ENOMEM);
5179
5180         memcg = mem_cgroup_iter(NULL, NULL, NULL);
5181         do {
5182                 int nid;
5183
5184                 for_each_node_state(nid, N_MEMORY) {
5185                         if (!nr_to_skip--)
5186                                 return get_lruvec(memcg, nid);
5187                 }
5188         } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5189
5190         return NULL;
5191 }
5192
5193 static void lru_gen_seq_stop(struct seq_file *m, void *v)
5194 {
5195         if (!IS_ERR_OR_NULL(v))
5196                 mem_cgroup_iter_break(NULL, lruvec_memcg(v));
5197
5198         kvfree(m->private);
5199         m->private = NULL;
5200 }
5201
5202 static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
5203 {
5204         int nid = lruvec_pgdat(v)->node_id;
5205         struct mem_cgroup *memcg = lruvec_memcg(v);
5206
5207         ++*pos;
5208
5209         nid = next_memory_node(nid);
5210         if (nid == MAX_NUMNODES) {
5211                 memcg = mem_cgroup_iter(NULL, memcg, NULL);
5212                 if (!memcg)
5213                         return NULL;
5214
5215                 nid = first_memory_node;
5216         }
5217
5218         return get_lruvec(memcg, nid);
5219 }
5220
5221 static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
5222                                   unsigned long max_seq, unsigned long *min_seq,
5223                                   unsigned long seq)
5224 {
5225         int i;
5226         int type, tier;
5227         int hist = lru_hist_from_seq(seq);
5228         struct lru_gen_folio *lrugen = &lruvec->lrugen;
5229         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5230
5231         for (tier = 0; tier < MAX_NR_TIERS; tier++) {
5232                 seq_printf(m, "            %10d", tier);
5233                 for (type = 0; type < ANON_AND_FILE; type++) {
5234                         const char *s = "   ";
5235                         unsigned long n[3] = {};
5236
5237                         if (seq == max_seq) {
5238                                 s = "RT ";
5239                                 n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
5240                                 n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
5241                         } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
5242                                 s = "rep";
5243                                 n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
5244                                 n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
5245                                 if (tier)
5246                                         n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
5247                         }
5248
5249                         for (i = 0; i < 3; i++)
5250                                 seq_printf(m, " %10lu%c", n[i], s[i]);
5251                 }
5252                 seq_putc(m, '\n');
5253         }
5254
5255         if (!mm_state)
5256                 return;
5257
5258         seq_puts(m, "                      ");
5259         for (i = 0; i < NR_MM_STATS; i++) {
5260                 const char *s = "      ";
5261                 unsigned long n = 0;
5262
5263                 if (seq == max_seq && NR_HIST_GENS == 1) {
5264                         s = "LOYNFA";
5265                         n = READ_ONCE(mm_state->stats[hist][i]);
5266                 } else if (seq != max_seq && NR_HIST_GENS > 1) {
5267                         s = "loynfa";
5268                         n = READ_ONCE(mm_state->stats[hist][i]);
5269                 }
5270
5271                 seq_printf(m, " %10lu%c", n, s[i]);
5272         }
5273         seq_putc(m, '\n');
5274 }
5275
5276 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5277 static int lru_gen_seq_show(struct seq_file *m, void *v)
5278 {
5279         unsigned long seq;
5280         bool full = !debugfs_real_fops(m->file)->write;
5281         struct lruvec *lruvec = v;
5282         struct lru_gen_folio *lrugen = &lruvec->lrugen;
5283         int nid = lruvec_pgdat(lruvec)->node_id;
5284         struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5285         DEFINE_MAX_SEQ(lruvec);
5286         DEFINE_MIN_SEQ(lruvec);
5287
5288         if (nid == first_memory_node) {
5289                 const char *path = memcg ? m->private : "";
5290
5291 #ifdef CONFIG_MEMCG
5292                 if (memcg)
5293                         cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
5294 #endif
5295                 seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
5296         }
5297
5298         seq_printf(m, " node %5d\n", nid);
5299
5300         if (!full)
5301                 seq = min_seq[LRU_GEN_ANON];
5302         else if (max_seq >= MAX_NR_GENS)
5303                 seq = max_seq - MAX_NR_GENS + 1;
5304         else
5305                 seq = 0;
5306
5307         for (; seq <= max_seq; seq++) {
5308                 int type, zone;
5309                 int gen = lru_gen_from_seq(seq);
5310                 unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
5311
5312                 seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
5313
5314                 for (type = 0; type < ANON_AND_FILE; type++) {
5315                         unsigned long size = 0;
5316                         char mark = full && seq < min_seq[type] ? 'x' : ' ';
5317
5318                         for (zone = 0; zone < MAX_NR_ZONES; zone++)
5319                                 size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
5320
5321                         seq_printf(m, " %10lu%c", size, mark);
5322                 }
5323
5324                 seq_putc(m, '\n');
5325
5326                 if (full)
5327                         lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
5328         }
5329
5330         return 0;
5331 }
5332
5333 static const struct seq_operations lru_gen_seq_ops = {
5334         .start = lru_gen_seq_start,
5335         .stop = lru_gen_seq_stop,
5336         .next = lru_gen_seq_next,
5337         .show = lru_gen_seq_show,
5338 };
5339
5340 static int run_aging(struct lruvec *lruvec, unsigned long seq,
5341                      bool can_swap, bool force_scan)
5342 {
5343         DEFINE_MAX_SEQ(lruvec);
5344         DEFINE_MIN_SEQ(lruvec);
5345
5346         if (seq < max_seq)
5347                 return 0;
5348
5349         if (seq > max_seq)
5350                 return -EINVAL;
5351
5352         if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
5353                 return -ERANGE;
5354
5355         try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
5356
5357         return 0;
5358 }
5359
5360 static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
5361                         int swappiness, unsigned long nr_to_reclaim)
5362 {
5363         DEFINE_MAX_SEQ(lruvec);
5364
5365         if (seq + MIN_NR_GENS > max_seq)
5366                 return -EINVAL;
5367
5368         sc->nr_reclaimed = 0;
5369
5370         while (!signal_pending(current)) {
5371                 DEFINE_MIN_SEQ(lruvec);
5372
5373                 if (seq < min_seq[!swappiness])
5374                         return 0;
5375
5376                 if (sc->nr_reclaimed >= nr_to_reclaim)
5377                         return 0;
5378
5379                 if (!evict_folios(lruvec, sc, swappiness))
5380                         return 0;
5381
5382                 cond_resched();
5383         }
5384
5385         return -EINTR;
5386 }
5387
5388 static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
5389                    struct scan_control *sc, int swappiness, unsigned long opt)
5390 {
5391         struct lruvec *lruvec;
5392         int err = -EINVAL;
5393         struct mem_cgroup *memcg = NULL;
5394
5395         if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
5396                 return -EINVAL;
5397
5398         if (!mem_cgroup_disabled()) {
5399                 rcu_read_lock();
5400
5401                 memcg = mem_cgroup_from_id(memcg_id);
5402                 if (!mem_cgroup_tryget(memcg))
5403                         memcg = NULL;
5404
5405                 rcu_read_unlock();
5406
5407                 if (!memcg)
5408                         return -EINVAL;
5409         }
5410
5411         if (memcg_id != mem_cgroup_id(memcg))
5412                 goto done;
5413
5414         lruvec = get_lruvec(memcg, nid);
5415
5416         if (swappiness < MIN_SWAPPINESS)
5417                 swappiness = get_swappiness(lruvec, sc);
5418         else if (swappiness > MAX_SWAPPINESS)
5419                 goto done;
5420
5421         switch (cmd) {
5422         case '+':
5423                 err = run_aging(lruvec, seq, swappiness, opt);
5424                 break;
5425         case '-':
5426                 err = run_eviction(lruvec, seq, sc, swappiness, opt);
5427                 break;
5428         }
5429 done:
5430         mem_cgroup_put(memcg);
5431
5432         return err;
5433 }
5434
5435 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
5436 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
5437                                  size_t len, loff_t *pos)
5438 {
5439         void *buf;
5440         char *cur, *next;
5441         unsigned int flags;
5442         struct blk_plug plug;
5443         int err = -EINVAL;
5444         struct scan_control sc = {
5445                 .may_writepage = true,
5446                 .may_unmap = true,
5447                 .may_swap = true,
5448                 .reclaim_idx = MAX_NR_ZONES - 1,
5449                 .gfp_mask = GFP_KERNEL,
5450         };
5451
5452         buf = kvmalloc(len + 1, GFP_KERNEL);
5453         if (!buf)
5454                 return -ENOMEM;
5455
5456         if (copy_from_user(buf, src, len)) {
5457                 kvfree(buf);
5458                 return -EFAULT;
5459         }
5460
5461         set_task_reclaim_state(current, &sc.reclaim_state);
5462         flags = memalloc_noreclaim_save();
5463         blk_start_plug(&plug);
5464         if (!set_mm_walk(NULL, true)) {
5465                 err = -ENOMEM;
5466                 goto done;
5467         }
5468
5469         next = buf;
5470         next[len] = '\0';
5471
5472         while ((cur = strsep(&next, ",;\n"))) {
5473                 int n;
5474                 int end;
5475                 char cmd;
5476                 unsigned int memcg_id;
5477                 unsigned int nid;
5478                 unsigned long seq;
5479                 unsigned int swappiness = -1;
5480                 unsigned long opt = -1;
5481
5482                 cur = skip_spaces(cur);
5483                 if (!*cur)
5484                         continue;
5485
5486                 n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
5487                            &seq, &end, &swappiness, &end, &opt, &end);
5488                 if (n < 4 || cur[end]) {
5489                         err = -EINVAL;
5490                         break;
5491                 }
5492
5493                 err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
5494                 if (err)
5495                         break;
5496         }
5497 done:
5498         clear_mm_walk();
5499         blk_finish_plug(&plug);
5500         memalloc_noreclaim_restore(flags);
5501         set_task_reclaim_state(current, NULL);
5502
5503         kvfree(buf);
5504
5505         return err ? : len;
5506 }
5507
5508 static int lru_gen_seq_open(struct inode *inode, struct file *file)
5509 {
5510         return seq_open(file, &lru_gen_seq_ops);
5511 }
5512
5513 static const struct file_operations lru_gen_rw_fops = {
5514         .open = lru_gen_seq_open,
5515         .read = seq_read,
5516         .write = lru_gen_seq_write,
5517         .llseek = seq_lseek,
5518         .release = seq_release,
5519 };
5520
5521 static const struct file_operations lru_gen_ro_fops = {
5522         .open = lru_gen_seq_open,
5523         .read = seq_read,
5524         .llseek = seq_lseek,
5525         .release = seq_release,
5526 };
5527
5528 /******************************************************************************
5529  *                          initialization
5530  ******************************************************************************/
5531
5532 void lru_gen_init_pgdat(struct pglist_data *pgdat)
5533 {
5534         int i, j;
5535
5536         spin_lock_init(&pgdat->memcg_lru.lock);
5537
5538         for (i = 0; i < MEMCG_NR_GENS; i++) {
5539                 for (j = 0; j < MEMCG_NR_BINS; j++)
5540                         INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
5541         }
5542 }
5543
5544 void lru_gen_init_lruvec(struct lruvec *lruvec)
5545 {
5546         int i;
5547         int gen, type, zone;
5548         struct lru_gen_folio *lrugen = &lruvec->lrugen;
5549         struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5550
5551         lrugen->max_seq = MIN_NR_GENS + 1;
5552         lrugen->enabled = lru_gen_enabled();
5553
5554         for (i = 0; i <= MIN_NR_GENS + 1; i++)
5555                 lrugen->timestamps[i] = jiffies;
5556
5557         for_each_gen_type_zone(gen, type, zone)
5558                 INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
5559
5560         if (mm_state)
5561                 mm_state->seq = MIN_NR_GENS;
5562 }
5563
5564 #ifdef CONFIG_MEMCG
5565
5566 void lru_gen_init_memcg(struct mem_cgroup *memcg)
5567 {
5568         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5569
5570         if (!mm_list)
5571                 return;
5572
5573         INIT_LIST_HEAD(&mm_list->fifo);
5574         spin_lock_init(&mm_list->lock);
5575 }
5576
5577 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
5578 {
5579         int i;
5580         int nid;
5581         struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5582
5583         VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
5584
5585         for_each_node(nid) {
5586                 struct lruvec *lruvec = get_lruvec(memcg, nid);
5587                 struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5588
5589                 VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
5590                                            sizeof(lruvec->lrugen.nr_pages)));
5591
5592                 lruvec->lrugen.list.next = LIST_POISON1;
5593
5594                 if (!mm_state)
5595                         continue;
5596
5597                 for (i = 0; i < NR_BLOOM_FILTERS; i++) {
5598                         bitmap_free(mm_state->filters[i]);
5599                         mm_state->filters[i] = NULL;
5600                 }
5601         }
5602 }
5603
5604 #endif /* CONFIG_MEMCG */
5605
5606 static int __init init_lru_gen(void)
5607 {
5608         BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
5609         BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
5610
5611         if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
5612                 pr_err("lru_gen: failed to create sysfs group\n");
5613
5614         debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
5615         debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
5616
5617         return 0;
5618 };
5619 late_initcall(init_lru_gen);
5620
5621 #else /* !CONFIG_LRU_GEN */
5622
5623 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
5624 {
5625         BUILD_BUG();
5626 }
5627
5628 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5629 {
5630         BUILD_BUG();
5631 }
5632
5633 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
5634 {
5635         BUILD_BUG();
5636 }
5637
5638 #endif /* CONFIG_LRU_GEN */
5639
5640 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
5641 {
5642         unsigned long nr[NR_LRU_LISTS];
5643         unsigned long targets[NR_LRU_LISTS];
5644         unsigned long nr_to_scan;
5645         enum lru_list lru;
5646         unsigned long nr_reclaimed = 0;
5647         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
5648         bool proportional_reclaim;
5649         struct blk_plug plug;
5650
5651         if (lru_gen_enabled() && !root_reclaim(sc)) {
5652                 lru_gen_shrink_lruvec(lruvec, sc);
5653                 return;
5654         }
5655
5656         get_scan_count(lruvec, sc, nr);
5657
5658         /* Record the original scan target for proportional adjustments later */
5659         memcpy(targets, nr, sizeof(nr));
5660
5661         /*
5662          * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
5663          * event that can occur when there is little memory pressure e.g.
5664          * multiple streaming readers/writers. Hence, we do not abort scanning
5665          * when the requested number of pages are reclaimed when scanning at
5666          * DEF_PRIORITY on the assumption that the fact we are direct
5667          * reclaiming implies that kswapd is not keeping up and it is best to
5668          * do a batch of work at once. For memcg reclaim one check is made to
5669          * abort proportional reclaim if either the file or anon lru has already
5670          * dropped to zero at the first pass.
5671          */
5672         proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
5673                                 sc->priority == DEF_PRIORITY);
5674
5675         blk_start_plug(&plug);
5676         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
5677                                         nr[LRU_INACTIVE_FILE]) {
5678                 unsigned long nr_anon, nr_file, percentage;
5679                 unsigned long nr_scanned;
5680
5681                 for_each_evictable_lru(lru) {
5682                         if (nr[lru]) {
5683                                 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
5684                                 nr[lru] -= nr_to_scan;
5685
5686                                 nr_reclaimed += shrink_list(lru, nr_to_scan,
5687                                                             lruvec, sc);
5688                         }
5689                 }
5690
5691                 cond_resched();
5692
5693                 if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
5694                         continue;
5695
5696                 /*
5697                  * For kswapd and memcg, reclaim at least the number of pages
5698                  * requested. Ensure that the anon and file LRUs are scanned
5699                  * proportionally what was requested by get_scan_count(). We
5700                  * stop reclaiming one LRU and reduce the amount scanning
5701                  * proportional to the original scan target.
5702                  */
5703                 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
5704                 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
5705
5706                 /*
5707                  * It's just vindictive to attack the larger once the smaller
5708                  * has gone to zero.  And given the way we stop scanning the
5709                  * smaller below, this makes sure that we only make one nudge
5710                  * towards proportionality once we've got nr_to_reclaim.
5711                  */
5712                 if (!nr_file || !nr_anon)
5713                         break;
5714
5715                 if (nr_file > nr_anon) {
5716                         unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
5717                                                 targets[LRU_ACTIVE_ANON] + 1;
5718                         lru = LRU_BASE;
5719                         percentage = nr_anon * 100 / scan_target;
5720                 } else {
5721                         unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
5722                                                 targets[LRU_ACTIVE_FILE] + 1;
5723                         lru = LRU_FILE;
5724                         percentage = nr_file * 100 / scan_target;
5725                 }
5726
5727                 /* Stop scanning the smaller of the LRU */
5728                 nr[lru] = 0;
5729                 nr[lru + LRU_ACTIVE] = 0;
5730
5731                 /*
5732                  * Recalculate the other LRU scan count based on its original
5733                  * scan target and the percentage scanning already complete
5734                  */
5735                 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
5736                 nr_scanned = targets[lru] - nr[lru];
5737                 nr[lru] = targets[lru] * (100 - percentage) / 100;
5738                 nr[lru] -= min(nr[lru], nr_scanned);
5739
5740                 lru += LRU_ACTIVE;
5741                 nr_scanned = targets[lru] - nr[lru];
5742                 nr[lru] = targets[lru] * (100 - percentage) / 100;
5743                 nr[lru] -= min(nr[lru], nr_scanned);
5744         }
5745         blk_finish_plug(&plug);
5746         sc->nr_reclaimed += nr_reclaimed;
5747
5748         /*
5749          * Even if we did not try to evict anon pages at all, we want to
5750          * rebalance the anon lru active/inactive ratio.
5751          */
5752         if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
5753             inactive_is_low(lruvec, LRU_INACTIVE_ANON))
5754                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
5755                                    sc, LRU_ACTIVE_ANON);
5756 }
5757
5758 /* Use reclaim/compaction for costly allocs or under memory pressure */
5759 static bool in_reclaim_compaction(struct scan_control *sc)
5760 {
5761         if (gfp_compaction_allowed(sc->gfp_mask) && sc->order &&
5762                         (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
5763                          sc->priority < DEF_PRIORITY - 2))
5764                 return true;
5765
5766         return false;
5767 }
5768
5769 /*
5770  * Reclaim/compaction is used for high-order allocation requests. It reclaims
5771  * order-0 pages before compacting the zone. should_continue_reclaim() returns
5772  * true if more pages should be reclaimed such that when the page allocator
5773  * calls try_to_compact_pages() that it will have enough free pages to succeed.
5774  * It will give up earlier than that if there is difficulty reclaiming pages.
5775  */
5776 static inline bool should_continue_reclaim(struct pglist_data *pgdat,
5777                                         unsigned long nr_reclaimed,
5778                                         struct scan_control *sc)
5779 {
5780         unsigned long pages_for_compaction;
5781         unsigned long inactive_lru_pages;
5782         int z;
5783
5784         /* If not in reclaim/compaction mode, stop */
5785         if (!in_reclaim_compaction(sc))
5786                 return false;
5787
5788         /*
5789          * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
5790          * number of pages that were scanned. This will return to the caller
5791          * with the risk reclaim/compaction and the resulting allocation attempt
5792          * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
5793          * allocations through requiring that the full LRU list has been scanned
5794          * first, by assuming that zero delta of sc->nr_scanned means full LRU
5795          * scan, but that approximation was wrong, and there were corner cases
5796          * where always a non-zero amount of pages were scanned.
5797          */
5798         if (!nr_reclaimed)
5799                 return false;
5800
5801         /* If compaction would go ahead or the allocation would succeed, stop */
5802         for (z = 0; z <= sc->reclaim_idx; z++) {
5803                 struct zone *zone = &pgdat->node_zones[z];
5804                 if (!managed_zone(zone))
5805                         continue;
5806
5807                 /* Allocation can already succeed, nothing to do */
5808                 if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
5809                                       sc->reclaim_idx, 0))
5810                         return false;
5811
5812                 if (compaction_suitable(zone, sc->order, sc->reclaim_idx))
5813                         return false;
5814         }
5815
5816         /*
5817          * If we have not reclaimed enough pages for compaction and the
5818          * inactive lists are large enough, continue reclaiming
5819          */
5820         pages_for_compaction = compact_gap(sc->order);
5821         inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
5822         if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
5823                 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
5824
5825         return inactive_lru_pages > pages_for_compaction;
5826 }
5827
5828 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
5829 {
5830         struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
5831         struct mem_cgroup_reclaim_cookie reclaim = {
5832                 .pgdat = pgdat,
5833         };
5834         struct mem_cgroup_reclaim_cookie *partial = &reclaim;
5835         struct mem_cgroup *memcg;
5836
5837         /*
5838          * In most cases, direct reclaimers can do partial walks
5839          * through the cgroup tree, using an iterator state that
5840          * persists across invocations. This strikes a balance between
5841          * fairness and allocation latency.
5842          *
5843          * For kswapd, reliable forward progress is more important
5844          * than a quick return to idle. Always do full walks.
5845          */
5846         if (current_is_kswapd() || sc->memcg_full_walk)
5847                 partial = NULL;
5848
5849         memcg = mem_cgroup_iter(target_memcg, NULL, partial);
5850         do {
5851                 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
5852                 unsigned long reclaimed;
5853                 unsigned long scanned;
5854
5855                 /*
5856                  * This loop can become CPU-bound when target memcgs
5857                  * aren't eligible for reclaim - either because they
5858                  * don't have any reclaimable pages, or because their
5859                  * memory is explicitly protected. Avoid soft lockups.
5860                  */
5861                 cond_resched();
5862
5863                 mem_cgroup_calculate_protection(target_memcg, memcg);
5864
5865                 if (mem_cgroup_below_min(target_memcg, memcg)) {
5866                         /*
5867                          * Hard protection.
5868                          * If there is no reclaimable memory, OOM.
5869                          */
5870                         continue;
5871                 } else if (mem_cgroup_below_low(target_memcg, memcg)) {
5872                         /*
5873                          * Soft protection.
5874                          * Respect the protection only as long as
5875                          * there is an unprotected supply
5876                          * of reclaimable memory from other cgroups.
5877                          */
5878                         if (!sc->memcg_low_reclaim) {
5879                                 sc->memcg_low_skipped = 1;
5880                                 continue;
5881                         }
5882                         memcg_memory_event(memcg, MEMCG_LOW);
5883                 }
5884
5885                 reclaimed = sc->nr_reclaimed;
5886                 scanned = sc->nr_scanned;
5887
5888                 shrink_lruvec(lruvec, sc);
5889
5890                 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
5891                             sc->priority);
5892
5893                 /* Record the group's reclaim efficiency */
5894                 if (!sc->proactive)
5895                         vmpressure(sc->gfp_mask, memcg, false,
5896                                    sc->nr_scanned - scanned,
5897                                    sc->nr_reclaimed - reclaimed);
5898
5899                 /* If partial walks are allowed, bail once goal is reached */
5900                 if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
5901                         mem_cgroup_iter_break(target_memcg, memcg);
5902                         break;
5903                 }
5904         } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
5905 }
5906
5907 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
5908 {
5909         unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
5910         struct lruvec *target_lruvec;
5911         bool reclaimable = false;
5912
5913         if (lru_gen_enabled() && root_reclaim(sc)) {
5914                 lru_gen_shrink_node(pgdat, sc);
5915                 return;
5916         }
5917
5918         target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
5919
5920 again:
5921         memset(&sc->nr, 0, sizeof(sc->nr));
5922
5923         nr_reclaimed = sc->nr_reclaimed;
5924         nr_scanned = sc->nr_scanned;
5925
5926         prepare_scan_control(pgdat, sc);
5927
5928         shrink_node_memcgs(pgdat, sc);
5929
5930         flush_reclaim_state(sc);
5931
5932         nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed;
5933
5934         /* Record the subtree's reclaim efficiency */
5935         if (!sc->proactive)
5936                 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
5937                            sc->nr_scanned - nr_scanned, nr_node_reclaimed);
5938
5939         if (nr_node_reclaimed)
5940                 reclaimable = true;
5941
5942         if (current_is_kswapd()) {
5943                 /*
5944                  * If reclaim is isolating dirty pages under writeback,
5945                  * it implies that the long-lived page allocation rate
5946                  * is exceeding the page laundering rate. Either the
5947                  * global limits are not being effective at throttling
5948                  * processes due to the page distribution throughout
5949                  * zones or there is heavy usage of a slow backing
5950                  * device. The only option is to throttle from reclaim
5951                  * context which is not ideal as there is no guarantee
5952                  * the dirtying process is throttled in the same way
5953                  * balance_dirty_pages() manages.
5954                  *
5955                  * Once a node is flagged PGDAT_WRITEBACK, kswapd will
5956                  * count the number of pages under pages flagged for
5957                  * immediate reclaim and stall if any are encountered
5958                  * in the nr_immediate check below.
5959                  */
5960                 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
5961                         set_bit(PGDAT_WRITEBACK, &pgdat->flags);
5962
5963                 /* Allow kswapd to start writing pages during reclaim.*/
5964                 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
5965                         set_bit(PGDAT_DIRTY, &pgdat->flags);
5966
5967                 /*
5968                  * If kswapd scans pages marked for immediate
5969                  * reclaim and under writeback (nr_immediate), it
5970                  * implies that pages are cycling through the LRU
5971                  * faster than they are written so forcibly stall
5972                  * until some pages complete writeback.
5973                  */
5974                 if (sc->nr.immediate)
5975                         reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
5976         }
5977
5978         /*
5979          * Tag a node/memcg as congested if all the dirty pages were marked
5980          * for writeback and immediate reclaim (counted in nr.congested).
5981          *
5982          * Legacy memcg will stall in page writeback so avoid forcibly
5983          * stalling in reclaim_throttle().
5984          */
5985         if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
5986                 if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
5987                         set_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags);
5988
5989                 if (current_is_kswapd())
5990                         set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags);
5991         }
5992
5993         /*
5994          * Stall direct reclaim for IO completions if the lruvec is
5995          * node is congested. Allow kswapd to continue until it
5996          * starts encountering unqueued dirty pages or cycling through
5997          * the LRU too quickly.
5998          */
5999         if (!current_is_kswapd() && current_may_throttle() &&
6000             !sc->hibernation_mode &&
6001             (test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) ||
6002              test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
6003                 reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
6004
6005         if (should_continue_reclaim(pgdat, nr_node_reclaimed, sc))
6006                 goto again;
6007
6008         /*
6009          * Kswapd gives up on balancing particular nodes after too
6010          * many failures to reclaim anything from them and goes to
6011          * sleep. On reclaim progress, reset the failure counter. A
6012          * successful direct reclaim run will revive a dormant kswapd.
6013          */
6014         if (reclaimable)
6015                 pgdat->kswapd_failures = 0;
6016         else if (sc->cache_trim_mode)
6017                 sc->cache_trim_mode_failed = 1;
6018 }
6019
6020 /*
6021  * Returns true if compaction should go ahead for a costly-order request, or
6022  * the allocation would already succeed without compaction. Return false if we
6023  * should reclaim first.
6024  */
6025 static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
6026 {
6027         unsigned long watermark;
6028
6029         if (!gfp_compaction_allowed(sc->gfp_mask))
6030                 return false;
6031
6032         /* Allocation can already succeed, nothing to do */
6033         if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
6034                               sc->reclaim_idx, 0))
6035                 return true;
6036
6037         /* Compaction cannot yet proceed. Do reclaim. */
6038         if (!compaction_suitable(zone, sc->order, sc->reclaim_idx))
6039                 return false;
6040
6041         /*
6042          * Compaction is already possible, but it takes time to run and there
6043          * are potentially other callers using the pages just freed. So proceed
6044          * with reclaim to make a buffer of free pages available to give
6045          * compaction a reasonable chance of completing and allocating the page.
6046          * Note that we won't actually reclaim the whole buffer in one attempt
6047          * as the target watermark in should_continue_reclaim() is lower. But if
6048          * we are already above the high+gap watermark, don't reclaim at all.
6049          */
6050         watermark = high_wmark_pages(zone) + compact_gap(sc->order);
6051
6052         return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
6053 }
6054
6055 static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
6056 {
6057         /*
6058          * If reclaim is making progress greater than 12% efficiency then
6059          * wake all the NOPROGRESS throttled tasks.
6060          */
6061         if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
6062                 wait_queue_head_t *wqh;
6063
6064                 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
6065                 if (waitqueue_active(wqh))
6066                         wake_up(wqh);
6067
6068                 return;
6069         }
6070
6071         /*
6072          * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
6073          * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
6074          * under writeback and marked for immediate reclaim at the tail of the
6075          * LRU.
6076          */
6077         if (current_is_kswapd() || cgroup_reclaim(sc))
6078                 return;
6079
6080         /* Throttle if making no progress at high prioities. */
6081         if (sc->priority == 1 && !sc->nr_reclaimed)
6082                 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
6083 }
6084
6085 /*
6086  * This is the direct reclaim path, for page-allocating processes.  We only
6087  * try to reclaim pages from zones which will satisfy the caller's allocation
6088  * request.
6089  *
6090  * If a zone is deemed to be full of pinned pages then just give it a light
6091  * scan then give up on it.
6092  */
6093 static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
6094 {
6095         struct zoneref *z;
6096         struct zone *zone;
6097         unsigned long nr_soft_reclaimed;
6098         unsigned long nr_soft_scanned;
6099         gfp_t orig_mask;
6100         pg_data_t *last_pgdat = NULL;
6101         pg_data_t *first_pgdat = NULL;
6102
6103         /*
6104          * If the number of buffer_heads in the machine exceeds the maximum
6105          * allowed level, force direct reclaim to scan the highmem zone as
6106          * highmem pages could be pinning lowmem pages storing buffer_heads
6107          */
6108         orig_mask = sc->gfp_mask;
6109         if (buffer_heads_over_limit) {
6110                 sc->gfp_mask |= __GFP_HIGHMEM;
6111                 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
6112         }
6113
6114         for_each_zone_zonelist_nodemask(zone, z, zonelist,
6115                                         sc->reclaim_idx, sc->nodemask) {
6116                 /*
6117                  * Take care memory controller reclaiming has small influence
6118                  * to global LRU.
6119                  */
6120                 if (!cgroup_reclaim(sc)) {
6121                         if (!cpuset_zone_allowed(zone,
6122                                                  GFP_KERNEL | __GFP_HARDWALL))
6123                                 continue;
6124
6125                         /*
6126                          * If we already have plenty of memory free for
6127                          * compaction in this zone, don't free any more.
6128                          * Even though compaction is invoked for any
6129                          * non-zero order, only frequent costly order
6130                          * reclamation is disruptive enough to become a
6131                          * noticeable problem, like transparent huge
6132                          * page allocations.
6133                          */
6134                         if (IS_ENABLED(CONFIG_COMPACTION) &&
6135                             sc->order > PAGE_ALLOC_COSTLY_ORDER &&
6136                             compaction_ready(zone, sc)) {
6137                                 sc->compaction_ready = true;
6138                                 continue;
6139                         }
6140
6141                         /*
6142                          * Shrink each node in the zonelist once. If the
6143                          * zonelist is ordered by zone (not the default) then a
6144                          * node may be shrunk multiple times but in that case
6145                          * the user prefers lower zones being preserved.
6146                          */
6147                         if (zone->zone_pgdat == last_pgdat)
6148                                 continue;
6149
6150                         /*
6151                          * This steals pages from memory cgroups over softlimit
6152                          * and returns the number of reclaimed pages and
6153                          * scanned pages. This works for global memory pressure
6154                          * and balancing, not for a memcg's limit.
6155                          */
6156                         nr_soft_scanned = 0;
6157                         nr_soft_reclaimed = memcg1_soft_limit_reclaim(zone->zone_pgdat,
6158                                                                       sc->order, sc->gfp_mask,
6159                                                                       &nr_soft_scanned);
6160                         sc->nr_reclaimed += nr_soft_reclaimed;
6161                         sc->nr_scanned += nr_soft_scanned;
6162                         /* need some check for avoid more shrink_zone() */
6163                 }
6164
6165                 if (!first_pgdat)
6166                         first_pgdat = zone->zone_pgdat;
6167
6168                 /* See comment about same check for global reclaim above */
6169                 if (zone->zone_pgdat == last_pgdat)
6170                         continue;
6171                 last_pgdat = zone->zone_pgdat;
6172                 shrink_node(zone->zone_pgdat, sc);
6173         }
6174
6175         if (first_pgdat)
6176                 consider_reclaim_throttle(first_pgdat, sc);
6177
6178         /*
6179          * Restore to original mask to avoid the impact on the caller if we
6180          * promoted it to __GFP_HIGHMEM.
6181          */
6182         sc->gfp_mask = orig_mask;
6183 }
6184
6185 static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
6186 {
6187         struct lruvec *target_lruvec;
6188         unsigned long refaults;
6189
6190         if (lru_gen_enabled())
6191                 return;
6192
6193         target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
6194         refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
6195         target_lruvec->refaults[WORKINGSET_ANON] = refaults;
6196         refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
6197         target_lruvec->refaults[WORKINGSET_FILE] = refaults;
6198 }
6199
6200 /*
6201  * This is the main entry point to direct page reclaim.
6202  *
6203  * If a full scan of the inactive list fails to free enough memory then we
6204  * are "out of memory" and something needs to be killed.
6205  *
6206  * If the caller is !__GFP_FS then the probability of a failure is reasonably
6207  * high - the zone may be full of dirty or under-writeback pages, which this
6208  * caller can't do much about.  We kick the writeback threads and take explicit
6209  * naps in the hope that some of these pages can be written.  But if the
6210  * allocating task holds filesystem locks which prevent writeout this might not
6211  * work, and the allocation attempt will fail.
6212  *
6213  * returns:     0, if no pages reclaimed
6214  *              else, the number of pages reclaimed
6215  */
6216 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
6217                                           struct scan_control *sc)
6218 {
6219         int initial_priority = sc->priority;
6220         pg_data_t *last_pgdat;
6221         struct zoneref *z;
6222         struct zone *zone;
6223 retry:
6224         delayacct_freepages_start();
6225
6226         if (!cgroup_reclaim(sc))
6227                 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
6228
6229         do {
6230                 if (!sc->proactive)
6231                         vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
6232                                         sc->priority);
6233                 sc->nr_scanned = 0;
6234                 shrink_zones(zonelist, sc);
6235
6236                 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
6237                         break;
6238
6239                 if (sc->compaction_ready)
6240                         break;
6241
6242                 /*
6243                  * If we're getting trouble reclaiming, start doing
6244                  * writepage even in laptop mode.
6245                  */
6246                 if (sc->priority < DEF_PRIORITY - 2)
6247                         sc->may_writepage = 1;
6248         } while (--sc->priority >= 0);
6249
6250         last_pgdat = NULL;
6251         for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
6252                                         sc->nodemask) {
6253                 if (zone->zone_pgdat == last_pgdat)
6254                         continue;
6255                 last_pgdat = zone->zone_pgdat;
6256
6257                 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
6258
6259                 if (cgroup_reclaim(sc)) {
6260                         struct lruvec *lruvec;
6261
6262                         lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
6263                                                    zone->zone_pgdat);
6264                         clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
6265                 }
6266         }
6267
6268         delayacct_freepages_end();
6269
6270         if (sc->nr_reclaimed)
6271                 return sc->nr_reclaimed;
6272
6273         /* Aborted reclaim to try compaction? don't OOM, then */
6274         if (sc->compaction_ready)
6275                 return 1;
6276
6277         /*
6278          * In most cases, direct reclaimers can do partial walks
6279          * through the cgroup tree to meet the reclaim goal while
6280          * keeping latency low. Since the iterator state is shared
6281          * among all direct reclaim invocations (to retain fairness
6282          * among cgroups), though, high concurrency can result in
6283          * individual threads not seeing enough cgroups to make
6284          * meaningful forward progress. Avoid false OOMs in this case.
6285          */
6286         if (!sc->memcg_full_walk) {
6287                 sc->priority = initial_priority;
6288                 sc->memcg_full_walk = 1;
6289                 goto retry;
6290         }
6291
6292         /*
6293          * We make inactive:active ratio decisions based on the node's
6294          * composition of memory, but a restrictive reclaim_idx or a
6295          * memory.low cgroup setting can exempt large amounts of
6296          * memory from reclaim. Neither of which are very common, so
6297          * instead of doing costly eligibility calculations of the
6298          * entire cgroup subtree up front, we assume the estimates are
6299          * good, and retry with forcible deactivation if that fails.
6300          */
6301         if (sc->skipped_deactivate) {
6302                 sc->priority = initial_priority;
6303                 sc->force_deactivate = 1;
6304                 sc->skipped_deactivate = 0;
6305                 goto retry;
6306         }
6307
6308         /* Untapped cgroup reserves?  Don't OOM, retry. */
6309         if (sc->memcg_low_skipped) {
6310                 sc->priority = initial_priority;
6311                 sc->force_deactivate = 0;
6312                 sc->memcg_low_reclaim = 1;
6313                 sc->memcg_low_skipped = 0;
6314                 goto retry;
6315         }
6316
6317         return 0;
6318 }
6319
6320 static bool allow_direct_reclaim(pg_data_t *pgdat)
6321 {
6322         struct zone *zone;
6323         unsigned long pfmemalloc_reserve = 0;
6324         unsigned long free_pages = 0;
6325         int i;
6326         bool wmark_ok;
6327
6328         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6329                 return true;
6330
6331         for (i = 0; i <= ZONE_NORMAL; i++) {
6332                 zone = &pgdat->node_zones[i];
6333                 if (!managed_zone(zone))
6334                         continue;
6335
6336                 if (!zone_reclaimable_pages(zone))
6337                         continue;
6338
6339                 pfmemalloc_reserve += min_wmark_pages(zone);
6340                 free_pages += zone_page_state_snapshot(zone, NR_FREE_PAGES);
6341         }
6342
6343         /* If there are no reserves (unexpected config) then do not throttle */
6344         if (!pfmemalloc_reserve)
6345                 return true;
6346
6347         wmark_ok = free_pages > pfmemalloc_reserve / 2;
6348
6349         /* kswapd must be awake if processes are being throttled */
6350         if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
6351                 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
6352                         WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
6353
6354                 wake_up_interruptible(&pgdat->kswapd_wait);
6355         }
6356
6357         return wmark_ok;
6358 }
6359
6360 /*
6361  * Throttle direct reclaimers if backing storage is backed by the network
6362  * and the PFMEMALLOC reserve for the preferred node is getting dangerously
6363  * depleted. kswapd will continue to make progress and wake the processes
6364  * when the low watermark is reached.
6365  *
6366  * Returns true if a fatal signal was delivered during throttling. If this
6367  * happens, the page allocator should not consider triggering the OOM killer.
6368  */
6369 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
6370                                         nodemask_t *nodemask)
6371 {
6372         struct zoneref *z;
6373         struct zone *zone;
6374         pg_data_t *pgdat = NULL;
6375
6376         /*
6377          * Kernel threads should not be throttled as they may be indirectly
6378          * responsible for cleaning pages necessary for reclaim to make forward
6379          * progress. kjournald for example may enter direct reclaim while
6380          * committing a transaction where throttling it could forcing other
6381          * processes to block on log_wait_commit().
6382          */
6383         if (current->flags & PF_KTHREAD)
6384                 goto out;
6385
6386         /*
6387          * If a fatal signal is pending, this process should not throttle.
6388          * It should return quickly so it can exit and free its memory
6389          */
6390         if (fatal_signal_pending(current))
6391                 goto out;
6392
6393         /*
6394          * Check if the pfmemalloc reserves are ok by finding the first node
6395          * with a usable ZONE_NORMAL or lower zone. The expectation is that
6396          * GFP_KERNEL will be required for allocating network buffers when
6397          * swapping over the network so ZONE_HIGHMEM is unusable.
6398          *
6399          * Throttling is based on the first usable node and throttled processes
6400          * wait on a queue until kswapd makes progress and wakes them. There
6401          * is an affinity then between processes waking up and where reclaim
6402          * progress has been made assuming the process wakes on the same node.
6403          * More importantly, processes running on remote nodes will not compete
6404          * for remote pfmemalloc reserves and processes on different nodes
6405          * should make reasonable progress.
6406          */
6407         for_each_zone_zonelist_nodemask(zone, z, zonelist,
6408                                         gfp_zone(gfp_mask), nodemask) {
6409                 if (zone_idx(zone) > ZONE_NORMAL)
6410                         continue;
6411
6412                 /* Throttle based on the first usable node */
6413                 pgdat = zone->zone_pgdat;
6414                 if (allow_direct_reclaim(pgdat))
6415                         goto out;
6416                 break;
6417         }
6418
6419         /* If no zone was usable by the allocation flags then do not throttle */
6420         if (!pgdat)
6421                 goto out;
6422
6423         /* Account for the throttling */
6424         count_vm_event(PGSCAN_DIRECT_THROTTLE);
6425
6426         /*
6427          * If the caller cannot enter the filesystem, it's possible that it
6428          * is due to the caller holding an FS lock or performing a journal
6429          * transaction in the case of a filesystem like ext[3|4]. In this case,
6430          * it is not safe to block on pfmemalloc_wait as kswapd could be
6431          * blocked waiting on the same lock. Instead, throttle for up to a
6432          * second before continuing.
6433          */
6434         if (!(gfp_mask & __GFP_FS))
6435                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
6436                         allow_direct_reclaim(pgdat), HZ);
6437         else
6438                 /* Throttle until kswapd wakes the process */
6439                 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
6440                         allow_direct_reclaim(pgdat));
6441
6442         if (fatal_signal_pending(current))
6443                 return true;
6444
6445 out:
6446         return false;
6447 }
6448
6449 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
6450                                 gfp_t gfp_mask, nodemask_t *nodemask)
6451 {
6452         unsigned long nr_reclaimed;
6453         struct scan_control sc = {
6454                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
6455                 .gfp_mask = current_gfp_context(gfp_mask),
6456                 .reclaim_idx = gfp_zone(gfp_mask),
6457                 .order = order,
6458                 .nodemask = nodemask,
6459                 .priority = DEF_PRIORITY,
6460                 .may_writepage = !laptop_mode,
6461                 .may_unmap = 1,
6462                 .may_swap = 1,
6463         };
6464
6465         /*
6466          * scan_control uses s8 fields for order, priority, and reclaim_idx.
6467          * Confirm they are large enough for max values.
6468          */
6469         BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
6470         BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
6471         BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
6472
6473         /*
6474          * Do not enter reclaim if fatal signal was delivered while throttled.
6475          * 1 is returned so that the page allocator does not OOM kill at this
6476          * point.
6477          */
6478         if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
6479                 return 1;
6480
6481         set_task_reclaim_state(current, &sc.reclaim_state);
6482         trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
6483
6484         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
6485
6486         trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
6487         set_task_reclaim_state(current, NULL);
6488
6489         return nr_reclaimed;
6490 }
6491
6492 #ifdef CONFIG_MEMCG
6493
6494 /* Only used by soft limit reclaim. Do not reuse for anything else. */
6495 unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
6496                                                 gfp_t gfp_mask, bool noswap,
6497                                                 pg_data_t *pgdat,
6498                                                 unsigned long *nr_scanned)
6499 {
6500         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
6501         struct scan_control sc = {
6502                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
6503                 .target_mem_cgroup = memcg,
6504                 .may_writepage = !laptop_mode,
6505                 .may_unmap = 1,
6506                 .reclaim_idx = MAX_NR_ZONES - 1,
6507                 .may_swap = !noswap,
6508         };
6509
6510         WARN_ON_ONCE(!current->reclaim_state);
6511
6512         sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
6513                         (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
6514
6515         trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
6516                                                       sc.gfp_mask);
6517
6518         /*
6519          * NOTE: Although we can get the priority field, using it
6520          * here is not a good idea, since it limits the pages we can scan.
6521          * if we don't reclaim here, the shrink_node from balance_pgdat
6522          * will pick up pages from other mem cgroup's as well. We hack
6523          * the priority and make it zero.
6524          */
6525         shrink_lruvec(lruvec, &sc);
6526
6527         trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
6528
6529         *nr_scanned = sc.nr_scanned;
6530
6531         return sc.nr_reclaimed;
6532 }
6533
6534 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
6535                                            unsigned long nr_pages,
6536                                            gfp_t gfp_mask,
6537                                            unsigned int reclaim_options,
6538                                            int *swappiness)
6539 {
6540         unsigned long nr_reclaimed;
6541         unsigned int noreclaim_flag;
6542         struct scan_control sc = {
6543                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
6544                 .proactive_swappiness = swappiness,
6545                 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
6546                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
6547                 .reclaim_idx = MAX_NR_ZONES - 1,
6548                 .target_mem_cgroup = memcg,
6549                 .priority = DEF_PRIORITY,
6550                 .may_writepage = !laptop_mode,
6551                 .may_unmap = 1,
6552                 .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
6553                 .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
6554         };
6555         /*
6556          * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
6557          * equal pressure on all the nodes. This is based on the assumption that
6558          * the reclaim does not bail out early.
6559          */
6560         struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
6561
6562         set_task_reclaim_state(current, &sc.reclaim_state);
6563         trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
6564         noreclaim_flag = memalloc_noreclaim_save();
6565
6566         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
6567
6568         memalloc_noreclaim_restore(noreclaim_flag);
6569         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
6570         set_task_reclaim_state(current, NULL);
6571
6572         return nr_reclaimed;
6573 }
6574 #endif
6575
6576 static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
6577 {
6578         struct mem_cgroup *memcg;
6579         struct lruvec *lruvec;
6580
6581         if (lru_gen_enabled()) {
6582                 lru_gen_age_node(pgdat, sc);
6583                 return;
6584         }
6585
6586         if (!can_age_anon_pages(pgdat, sc))
6587                 return;
6588
6589         lruvec = mem_cgroup_lruvec(NULL, pgdat);
6590         if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
6591                 return;
6592
6593         memcg = mem_cgroup_iter(NULL, NULL, NULL);
6594         do {
6595                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
6596                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
6597                                    sc, LRU_ACTIVE_ANON);
6598                 memcg = mem_cgroup_iter(NULL, memcg, NULL);
6599         } while (memcg);
6600 }
6601
6602 static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
6603 {
6604         int i;
6605         struct zone *zone;
6606
6607         /*
6608          * Check for watermark boosts top-down as the higher zones
6609          * are more likely to be boosted. Both watermarks and boosts
6610          * should not be checked at the same time as reclaim would
6611          * start prematurely when there is no boosting and a lower
6612          * zone is balanced.
6613          */
6614         for (i = highest_zoneidx; i >= 0; i--) {
6615                 zone = pgdat->node_zones + i;
6616                 if (!managed_zone(zone))
6617                         continue;
6618
6619                 if (zone->watermark_boost)
6620                         return true;
6621         }
6622
6623         return false;
6624 }
6625
6626 /*
6627  * Returns true if there is an eligible zone balanced for the request order
6628  * and highest_zoneidx
6629  */
6630 static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
6631 {
6632         int i;
6633         unsigned long mark = -1;
6634         struct zone *zone;
6635
6636         /*
6637          * Check watermarks bottom-up as lower zones are more likely to
6638          * meet watermarks.
6639          */
6640         for (i = 0; i <= highest_zoneidx; i++) {
6641                 zone = pgdat->node_zones + i;
6642
6643                 if (!managed_zone(zone))
6644                         continue;
6645
6646                 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
6647                         mark = wmark_pages(zone, WMARK_PROMO);
6648                 else
6649                         mark = high_wmark_pages(zone);
6650                 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
6651                         return true;
6652         }
6653
6654         /*
6655          * If a node has no managed zone within highest_zoneidx, it does not
6656          * need balancing by definition. This can happen if a zone-restricted
6657          * allocation tries to wake a remote kswapd.
6658          */
6659         if (mark == -1)
6660                 return true;
6661
6662         return false;
6663 }
6664
6665 /* Clear pgdat state for congested, dirty or under writeback. */
6666 static void clear_pgdat_congested(pg_data_t *pgdat)
6667 {
6668         struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
6669
6670         clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
6671         clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
6672         clear_bit(PGDAT_DIRTY, &pgdat->flags);
6673         clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
6674 }
6675
6676 /*
6677  * Prepare kswapd for sleeping. This verifies that there are no processes
6678  * waiting in throttle_direct_reclaim() and that watermarks have been met.
6679  *
6680  * Returns true if kswapd is ready to sleep
6681  */
6682 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
6683                                 int highest_zoneidx)
6684 {
6685         /*
6686          * The throttled processes are normally woken up in balance_pgdat() as
6687          * soon as allow_direct_reclaim() is true. But there is a potential
6688          * race between when kswapd checks the watermarks and a process gets
6689          * throttled. There is also a potential race if processes get
6690          * throttled, kswapd wakes, a large process exits thereby balancing the
6691          * zones, which causes kswapd to exit balance_pgdat() before reaching
6692          * the wake up checks. If kswapd is going to sleep, no process should
6693          * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
6694          * the wake up is premature, processes will wake kswapd and get
6695          * throttled again. The difference from wake ups in balance_pgdat() is
6696          * that here we are under prepare_to_wait().
6697          */
6698         if (waitqueue_active(&pgdat->pfmemalloc_wait))
6699                 wake_up_all(&pgdat->pfmemalloc_wait);
6700
6701         /* Hopeless node, leave it to direct reclaim */
6702         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
6703                 return true;
6704
6705         if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
6706                 clear_pgdat_congested(pgdat);
6707                 return true;
6708         }
6709
6710         return false;
6711 }
6712
6713 /*
6714  * kswapd shrinks a node of pages that are at or below the highest usable
6715  * zone that is currently unbalanced.
6716  *
6717  * Returns true if kswapd scanned at least the requested number of pages to
6718  * reclaim or if the lack of progress was due to pages under writeback.
6719  * This is used to determine if the scanning priority needs to be raised.
6720  */
6721 static bool kswapd_shrink_node(pg_data_t *pgdat,
6722                                struct scan_control *sc)
6723 {
6724         struct zone *zone;
6725         int z;
6726         unsigned long nr_reclaimed = sc->nr_reclaimed;
6727
6728         /* Reclaim a number of pages proportional to the number of zones */
6729         sc->nr_to_reclaim = 0;
6730         for (z = 0; z <= sc->reclaim_idx; z++) {
6731                 zone = pgdat->node_zones + z;
6732                 if (!managed_zone(zone))
6733                         continue;
6734
6735                 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
6736         }
6737
6738         /*
6739          * Historically care was taken to put equal pressure on all zones but
6740          * now pressure is applied based on node LRU order.
6741          */
6742         shrink_node(pgdat, sc);
6743
6744         /*
6745          * Fragmentation may mean that the system cannot be rebalanced for
6746          * high-order allocations. If twice the allocation size has been
6747          * reclaimed then recheck watermarks only at order-0 to prevent
6748          * excessive reclaim. Assume that a process requested a high-order
6749          * can direct reclaim/compact.
6750          */
6751         if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
6752                 sc->order = 0;
6753
6754         /* account for progress from mm_account_reclaimed_pages() */
6755         return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
6756 }
6757
6758 /* Page allocator PCP high watermark is lowered if reclaim is active. */
6759 static inline void
6760 update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
6761 {
6762         int i;
6763         struct zone *zone;
6764
6765         for (i = 0; i <= highest_zoneidx; i++) {
6766                 zone = pgdat->node_zones + i;
6767
6768                 if (!managed_zone(zone))
6769                         continue;
6770
6771                 if (active)
6772                         set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
6773                 else
6774                         clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
6775         }
6776 }
6777
6778 static inline void
6779 set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
6780 {
6781         update_reclaim_active(pgdat, highest_zoneidx, true);
6782 }
6783
6784 static inline void
6785 clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
6786 {
6787         update_reclaim_active(pgdat, highest_zoneidx, false);
6788 }
6789
6790 /*
6791  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
6792  * that are eligible for use by the caller until at least one zone is
6793  * balanced.
6794  *
6795  * Returns the order kswapd finished reclaiming at.
6796  *
6797  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
6798  * zones which have free_pages > high_wmark_pages(zone), but once a zone is
6799  * found to have free_pages <= high_wmark_pages(zone), any page in that zone
6800  * or lower is eligible for reclaim until at least one usable zone is
6801  * balanced.
6802  */
6803 static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
6804 {
6805         int i;
6806         unsigned long nr_soft_reclaimed;
6807         unsigned long nr_soft_scanned;
6808         unsigned long pflags;
6809         unsigned long nr_boost_reclaim;
6810         unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
6811         bool boosted;
6812         struct zone *zone;
6813         struct scan_control sc = {
6814                 .gfp_mask = GFP_KERNEL,
6815                 .order = order,
6816                 .may_unmap = 1,
6817         };
6818
6819         set_task_reclaim_state(current, &sc.reclaim_state);
6820         psi_memstall_enter(&pflags);
6821         __fs_reclaim_acquire(_THIS_IP_);
6822
6823         count_vm_event(PAGEOUTRUN);
6824
6825         /*
6826          * Account for the reclaim boost. Note that the zone boost is left in
6827          * place so that parallel allocations that are near the watermark will
6828          * stall or direct reclaim until kswapd is finished.
6829          */
6830         nr_boost_reclaim = 0;
6831         for (i = 0; i <= highest_zoneidx; i++) {
6832                 zone = pgdat->node_zones + i;
6833                 if (!managed_zone(zone))
6834                         continue;
6835
6836                 nr_boost_reclaim += zone->watermark_boost;
6837                 zone_boosts[i] = zone->watermark_boost;
6838         }
6839         boosted = nr_boost_reclaim;
6840
6841 restart:
6842         set_reclaim_active(pgdat, highest_zoneidx);
6843         sc.priority = DEF_PRIORITY;
6844         do {
6845                 unsigned long nr_reclaimed = sc.nr_reclaimed;
6846                 bool raise_priority = true;
6847                 bool balanced;
6848                 bool ret;
6849                 bool was_frozen;
6850
6851                 sc.reclaim_idx = highest_zoneidx;
6852
6853                 /*
6854                  * If the number of buffer_heads exceeds the maximum allowed
6855                  * then consider reclaiming from all zones. This has a dual
6856                  * purpose -- on 64-bit systems it is expected that
6857                  * buffer_heads are stripped during active rotation. On 32-bit
6858                  * systems, highmem pages can pin lowmem memory and shrinking
6859                  * buffers can relieve lowmem pressure. Reclaim may still not
6860                  * go ahead if all eligible zones for the original allocation
6861                  * request are balanced to avoid excessive reclaim from kswapd.
6862                  */
6863                 if (buffer_heads_over_limit) {
6864                         for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
6865                                 zone = pgdat->node_zones + i;
6866                                 if (!managed_zone(zone))
6867                                         continue;
6868
6869                                 sc.reclaim_idx = i;
6870                                 break;
6871                         }
6872                 }
6873
6874                 /*
6875                  * If the pgdat is imbalanced then ignore boosting and preserve
6876                  * the watermarks for a later time and restart. Note that the
6877                  * zone watermarks will be still reset at the end of balancing
6878                  * on the grounds that the normal reclaim should be enough to
6879                  * re-evaluate if boosting is required when kswapd next wakes.
6880                  */
6881                 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
6882                 if (!balanced && nr_boost_reclaim) {
6883                         nr_boost_reclaim = 0;
6884                         goto restart;
6885                 }
6886
6887                 /*
6888                  * If boosting is not active then only reclaim if there are no
6889                  * eligible zones. Note that sc.reclaim_idx is not used as
6890                  * buffer_heads_over_limit may have adjusted it.
6891                  */
6892                 if (!nr_boost_reclaim && balanced)
6893                         goto out;
6894
6895                 /* Limit the priority of boosting to avoid reclaim writeback */
6896                 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
6897                         raise_priority = false;
6898
6899                 /*
6900                  * Do not writeback or swap pages for boosted reclaim. The
6901                  * intent is to relieve pressure not issue sub-optimal IO
6902                  * from reclaim context. If no pages are reclaimed, the
6903                  * reclaim will be aborted.
6904                  */
6905                 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
6906                 sc.may_swap = !nr_boost_reclaim;
6907
6908                 /*
6909                  * Do some background aging, to give pages a chance to be
6910                  * referenced before reclaiming. All pages are rotated
6911                  * regardless of classzone as this is about consistent aging.
6912                  */
6913                 kswapd_age_node(pgdat, &sc);
6914
6915                 /*
6916                  * If we're getting trouble reclaiming, start doing writepage
6917                  * even in laptop mode.
6918                  */
6919                 if (sc.priority < DEF_PRIORITY - 2)
6920                         sc.may_writepage = 1;
6921
6922                 /* Call soft limit reclaim before calling shrink_node. */
6923                 sc.nr_scanned = 0;
6924                 nr_soft_scanned = 0;
6925                 nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, sc.order,
6926                                                               sc.gfp_mask, &nr_soft_scanned);
6927                 sc.nr_reclaimed += nr_soft_reclaimed;
6928
6929                 /*
6930                  * There should be no need to raise the scanning priority if
6931                  * enough pages are already being scanned that that high
6932                  * watermark would be met at 100% efficiency.
6933                  */
6934                 if (kswapd_shrink_node(pgdat, &sc))
6935                         raise_priority = false;
6936
6937                 /*
6938                  * If the low watermark is met there is no need for processes
6939                  * to be throttled on pfmemalloc_wait as they should not be
6940                  * able to safely make forward progress. Wake them
6941                  */
6942                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
6943                                 allow_direct_reclaim(pgdat))
6944                         wake_up_all(&pgdat->pfmemalloc_wait);
6945
6946                 /* Check if kswapd should be suspending */
6947                 __fs_reclaim_release(_THIS_IP_);
6948                 ret = kthread_freezable_should_stop(&was_frozen);
6949                 __fs_reclaim_acquire(_THIS_IP_);
6950                 if (was_frozen || ret)
6951                         break;
6952
6953                 /*
6954                  * Raise priority if scanning rate is too low or there was no
6955                  * progress in reclaiming pages
6956                  */
6957                 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
6958                 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
6959
6960                 /*
6961                  * If reclaim made no progress for a boost, stop reclaim as
6962                  * IO cannot be queued and it could be an infinite loop in
6963                  * extreme circumstances.
6964                  */
6965                 if (nr_boost_reclaim && !nr_reclaimed)
6966                         break;
6967
6968                 if (raise_priority || !nr_reclaimed)
6969                         sc.priority--;
6970         } while (sc.priority >= 1);
6971
6972         /*
6973          * Restart only if it went through the priority loop all the way,
6974          * but cache_trim_mode didn't work.
6975          */
6976         if (!sc.nr_reclaimed && sc.priority < 1 &&
6977             !sc.no_cache_trim_mode && sc.cache_trim_mode_failed) {
6978                 sc.no_cache_trim_mode = 1;
6979                 goto restart;
6980         }
6981
6982         if (!sc.nr_reclaimed)
6983                 pgdat->kswapd_failures++;
6984
6985 out:
6986         clear_reclaim_active(pgdat, highest_zoneidx);
6987
6988         /* If reclaim was boosted, account for the reclaim done in this pass */
6989         if (boosted) {
6990                 unsigned long flags;
6991
6992                 for (i = 0; i <= highest_zoneidx; i++) {
6993                         if (!zone_boosts[i])
6994                                 continue;
6995
6996                         /* Increments are under the zone lock */
6997                         zone = pgdat->node_zones + i;
6998                         spin_lock_irqsave(&zone->lock, flags);
6999                         zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
7000                         spin_unlock_irqrestore(&zone->lock, flags);
7001                 }
7002
7003                 /*
7004                  * As there is now likely space, wakeup kcompact to defragment
7005                  * pageblocks.
7006                  */
7007                 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
7008         }
7009
7010         snapshot_refaults(NULL, pgdat);
7011         __fs_reclaim_release(_THIS_IP_);
7012         psi_memstall_leave(&pflags);
7013         set_task_reclaim_state(current, NULL);
7014
7015         /*
7016          * Return the order kswapd stopped reclaiming at as
7017          * prepare_kswapd_sleep() takes it into account. If another caller
7018          * entered the allocator slow path while kswapd was awake, order will
7019          * remain at the higher level.
7020          */
7021         return sc.order;
7022 }
7023
7024 /*
7025  * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7026  * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
7027  * not a valid index then either kswapd runs for first time or kswapd couldn't
7028  * sleep after previous reclaim attempt (node is still unbalanced). In that
7029  * case return the zone index of the previous kswapd reclaim cycle.
7030  */
7031 static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
7032                                            enum zone_type prev_highest_zoneidx)
7033 {
7034         enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7035
7036         return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
7037 }
7038
7039 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
7040                                 unsigned int highest_zoneidx)
7041 {
7042         long remaining = 0;
7043         DEFINE_WAIT(wait);
7044
7045         if (freezing(current) || kthread_should_stop())
7046                 return;
7047
7048         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
7049
7050         /*
7051          * Try to sleep for a short interval. Note that kcompactd will only be
7052          * woken if it is possible to sleep for a short interval. This is
7053          * deliberate on the assumption that if reclaim cannot keep an
7054          * eligible zone balanced that it's also unlikely that compaction will
7055          * succeed.
7056          */
7057         if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
7058                 /*
7059                  * Compaction records what page blocks it recently failed to
7060                  * isolate pages from and skips them in the future scanning.
7061                  * When kswapd is going to sleep, it is reasonable to assume
7062                  * that pages and compaction may succeed so reset the cache.
7063                  */
7064                 reset_isolation_suitable(pgdat);
7065
7066                 /*
7067                  * We have freed the memory, now we should compact it to make
7068                  * allocation of the requested order possible.
7069                  */
7070                 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
7071
7072                 remaining = schedule_timeout(HZ/10);
7073
7074                 /*
7075                  * If woken prematurely then reset kswapd_highest_zoneidx and
7076                  * order. The values will either be from a wakeup request or
7077                  * the previous request that slept prematurely.
7078                  */
7079                 if (remaining) {
7080                         WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
7081                                         kswapd_highest_zoneidx(pgdat,
7082                                                         highest_zoneidx));
7083
7084                         if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
7085                                 WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
7086                 }
7087
7088                 finish_wait(&pgdat->kswapd_wait, &wait);
7089                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
7090         }
7091
7092         /*
7093          * After a short sleep, check if it was a premature sleep. If not, then
7094          * go fully to sleep until explicitly woken up.
7095          */
7096         if (!remaining &&
7097             prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
7098                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
7099
7100                 /*
7101                  * vmstat counters are not perfectly accurate and the estimated
7102                  * value for counters such as NR_FREE_PAGES can deviate from the
7103                  * true value by nr_online_cpus * threshold. To avoid the zone
7104                  * watermarks being breached while under pressure, we reduce the
7105                  * per-cpu vmstat threshold while kswapd is awake and restore
7106                  * them before going back to sleep.
7107                  */
7108                 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
7109
7110                 if (!kthread_should_stop())
7111                         schedule();
7112
7113                 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
7114         } else {
7115                 if (remaining)
7116                         count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
7117                 else
7118                         count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
7119         }
7120         finish_wait(&pgdat->kswapd_wait, &wait);
7121 }
7122
7123 /*
7124  * The background pageout daemon, started as a kernel thread
7125  * from the init process.
7126  *
7127  * This basically trickles out pages so that we have _some_
7128  * free memory available even if there is no other activity
7129  * that frees anything up. This is needed for things like routing
7130  * etc, where we otherwise might have all activity going on in
7131  * asynchronous contexts that cannot page things out.
7132  *
7133  * If there are applications that are active memory-allocators
7134  * (most normal use), this basically shouldn't matter.
7135  */
7136 static int kswapd(void *p)
7137 {
7138         unsigned int alloc_order, reclaim_order;
7139         unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
7140         pg_data_t *pgdat = (pg_data_t *)p;
7141         struct task_struct *tsk = current;
7142         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
7143
7144         if (!cpumask_empty(cpumask))
7145                 set_cpus_allowed_ptr(tsk, cpumask);
7146
7147         /*
7148          * Tell the memory management that we're a "memory allocator",
7149          * and that if we need more memory we should get access to it
7150          * regardless (see "__alloc_pages()"). "kswapd" should
7151          * never get caught in the normal page freeing logic.
7152          *
7153          * (Kswapd normally doesn't need memory anyway, but sometimes
7154          * you need a small amount of memory in order to be able to
7155          * page out something else, and this flag essentially protects
7156          * us from recursively trying to free more memory as we're
7157          * trying to free the first piece of memory in the first place).
7158          */
7159         tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
7160         set_freezable();
7161
7162         WRITE_ONCE(pgdat->kswapd_order, 0);
7163         WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7164         atomic_set(&pgdat->nr_writeback_throttled, 0);
7165         for ( ; ; ) {
7166                 bool was_frozen;
7167
7168                 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
7169                 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7170                                                         highest_zoneidx);
7171
7172 kswapd_try_sleep:
7173                 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
7174                                         highest_zoneidx);
7175
7176                 /* Read the new order and highest_zoneidx */
7177                 alloc_order = READ_ONCE(pgdat->kswapd_order);
7178                 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7179                                                         highest_zoneidx);
7180                 WRITE_ONCE(pgdat->kswapd_order, 0);
7181                 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7182
7183                 if (kthread_freezable_should_stop(&was_frozen))
7184                         break;
7185
7186                 /*
7187                  * We can speed up thawing tasks if we don't call balance_pgdat
7188                  * after returning from the refrigerator
7189                  */
7190                 if (was_frozen)
7191                         continue;
7192
7193                 /*
7194                  * Reclaim begins at the requested order but if a high-order
7195                  * reclaim fails then kswapd falls back to reclaiming for
7196                  * order-0. If that happens, kswapd will consider sleeping
7197                  * for the order it finished reclaiming at (reclaim_order)
7198                  * but kcompactd is woken to compact for the original
7199                  * request (alloc_order).
7200                  */
7201                 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
7202                                                 alloc_order);
7203                 reclaim_order = balance_pgdat(pgdat, alloc_order,
7204                                                 highest_zoneidx);
7205                 if (reclaim_order < alloc_order)
7206                         goto kswapd_try_sleep;
7207         }
7208
7209         tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
7210
7211         return 0;
7212 }
7213
7214 /*
7215  * A zone is low on free memory or too fragmented for high-order memory.  If
7216  * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
7217  * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
7218  * has failed or is not needed, still wake up kcompactd if only compaction is
7219  * needed.
7220  */
7221 void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
7222                    enum zone_type highest_zoneidx)
7223 {
7224         pg_data_t *pgdat;
7225         enum zone_type curr_idx;
7226
7227         if (!managed_zone(zone))
7228                 return;
7229
7230         if (!cpuset_zone_allowed(zone, gfp_flags))
7231                 return;
7232
7233         pgdat = zone->zone_pgdat;
7234         curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7235
7236         if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
7237                 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
7238
7239         if (READ_ONCE(pgdat->kswapd_order) < order)
7240                 WRITE_ONCE(pgdat->kswapd_order, order);
7241
7242         if (!waitqueue_active(&pgdat->kswapd_wait))
7243                 return;
7244
7245         /* Hopeless node, leave it to direct reclaim if possible */
7246         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
7247             (pgdat_balanced(pgdat, order, highest_zoneidx) &&
7248              !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
7249                 /*
7250                  * There may be plenty of free memory available, but it's too
7251                  * fragmented for high-order allocations.  Wake up kcompactd
7252                  * and rely on compaction_suitable() to determine if it's
7253                  * needed.  If it fails, it will defer subsequent attempts to
7254                  * ratelimit its work.
7255                  */
7256                 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
7257                         wakeup_kcompactd(pgdat, order, highest_zoneidx);
7258                 return;
7259         }
7260
7261         trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
7262                                       gfp_flags);
7263         wake_up_interruptible(&pgdat->kswapd_wait);
7264 }
7265
7266 #ifdef CONFIG_HIBERNATION
7267 /*
7268  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
7269  * freed pages.
7270  *
7271  * Rather than trying to age LRUs the aim is to preserve the overall
7272  * LRU order by reclaiming preferentially
7273  * inactive > active > active referenced > active mapped
7274  */
7275 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
7276 {
7277         struct scan_control sc = {
7278                 .nr_to_reclaim = nr_to_reclaim,
7279                 .gfp_mask = GFP_HIGHUSER_MOVABLE,
7280                 .reclaim_idx = MAX_NR_ZONES - 1,
7281                 .priority = DEF_PRIORITY,
7282                 .may_writepage = 1,
7283                 .may_unmap = 1,
7284                 .may_swap = 1,
7285                 .hibernation_mode = 1,
7286         };
7287         struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
7288         unsigned long nr_reclaimed;
7289         unsigned int noreclaim_flag;
7290
7291         fs_reclaim_acquire(sc.gfp_mask);
7292         noreclaim_flag = memalloc_noreclaim_save();
7293         set_task_reclaim_state(current, &sc.reclaim_state);
7294
7295         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
7296
7297         set_task_reclaim_state(current, NULL);
7298         memalloc_noreclaim_restore(noreclaim_flag);
7299         fs_reclaim_release(sc.gfp_mask);
7300
7301         return nr_reclaimed;
7302 }
7303 #endif /* CONFIG_HIBERNATION */
7304
7305 /*
7306  * This kswapd start function will be called by init and node-hot-add.
7307  */
7308 void __meminit kswapd_run(int nid)
7309 {
7310         pg_data_t *pgdat = NODE_DATA(nid);
7311
7312         pgdat_kswapd_lock(pgdat);
7313         if (!pgdat->kswapd) {
7314                 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
7315                 if (IS_ERR(pgdat->kswapd)) {
7316                         /* failure at boot is fatal */
7317                         pr_err("Failed to start kswapd on node %d，ret=%ld\n",
7318                                    nid, PTR_ERR(pgdat->kswapd));
7319                         BUG_ON(system_state < SYSTEM_RUNNING);
7320                         pgdat->kswapd = NULL;
7321                 }
7322         }
7323         pgdat_kswapd_unlock(pgdat);
7324 }
7325
7326 /*
7327  * Called by memory hotplug when all memory in a node is offlined.  Caller must
7328  * be holding mem_hotplug_begin/done().
7329  */
7330 void __meminit kswapd_stop(int nid)
7331 {
7332         pg_data_t *pgdat = NODE_DATA(nid);
7333         struct task_struct *kswapd;
7334
7335         pgdat_kswapd_lock(pgdat);
7336         kswapd = pgdat->kswapd;
7337         if (kswapd) {
7338                 kthread_stop(kswapd);
7339                 pgdat->kswapd = NULL;
7340         }
7341         pgdat_kswapd_unlock(pgdat);
7342 }
7343
7344 static int __init kswapd_init(void)
7345 {
7346         int nid;
7347
7348         swap_setup();
7349         for_each_node_state(nid, N_MEMORY)
7350                 kswapd_run(nid);
7351         return 0;
7352 }
7353
7354 module_init(kswapd_init)
7355
7356 #ifdef CONFIG_NUMA
7357 /*
7358  * Node reclaim mode
7359  *
7360  * If non-zero call node_reclaim when the number of free pages falls below
7361  * the watermarks.
7362  */
7363 int node_reclaim_mode __read_mostly;
7364
7365 /*
7366  * Priority for NODE_RECLAIM. This determines the fraction of pages
7367  * of a node considered for each zone_reclaim. 4 scans 1/16th of
7368  * a zone.
7369  */
7370 #define NODE_RECLAIM_PRIORITY 4
7371
7372 /*
7373  * Percentage of pages in a zone that must be unmapped for node_reclaim to
7374  * occur.
7375  */
7376 int sysctl_min_unmapped_ratio = 1;
7377
7378 /*
7379  * If the number of slab pages in a zone grows beyond this percentage then
7380  * slab reclaim needs to occur.
7381  */
7382 int sysctl_min_slab_ratio = 5;
7383
7384 static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
7385 {
7386         unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
7387         unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
7388                 node_page_state(pgdat, NR_ACTIVE_FILE);
7389
7390         /*
7391          * It's possible for there to be more file mapped pages than
7392          * accounted for by the pages on the file LRU lists because
7393          * tmpfs pages accounted for as ANON can also be FILE_MAPPED
7394          */
7395         return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
7396 }
7397
7398 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
7399 static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
7400 {
7401         unsigned long nr_pagecache_reclaimable;
7402         unsigned long delta = 0;
7403
7404         /*
7405          * If RECLAIM_UNMAP is set, then all file pages are considered
7406          * potentially reclaimable. Otherwise, we have to worry about
7407          * pages like swapcache and node_unmapped_file_pages() provides
7408          * a better estimate
7409          */
7410         if (node_reclaim_mode & RECLAIM_UNMAP)
7411                 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
7412         else
7413                 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
7414
7415         /* If we can't clean pages, remove dirty pages from consideration */
7416         if (!(node_reclaim_mode & RECLAIM_WRITE))
7417                 delta += node_page_state(pgdat, NR_FILE_DIRTY);
7418
7419         /* Watch for any possible underflows due to delta */
7420         if (unlikely(delta > nr_pagecache_reclaimable))
7421                 delta = nr_pagecache_reclaimable;
7422
7423         return nr_pagecache_reclaimable - delta;
7424 }
7425
7426 /*
7427  * Try to free up some pages from this node through reclaim.
7428  */
7429 static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
7430 {
7431         /* Minimum pages needed in order to stay on node */
7432         const unsigned long nr_pages = 1 << order;
7433         struct task_struct *p = current;
7434         unsigned int noreclaim_flag;
7435         struct scan_control sc = {
7436                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7437                 .gfp_mask = current_gfp_context(gfp_mask),
7438                 .order = order,
7439                 .priority = NODE_RECLAIM_PRIORITY,
7440                 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
7441                 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
7442                 .may_swap = 1,
7443                 .reclaim_idx = gfp_zone(gfp_mask),
7444         };
7445         unsigned long pflags;
7446
7447         trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
7448                                            sc.gfp_mask);
7449
7450         cond_resched();
7451         psi_memstall_enter(&pflags);
7452         delayacct_freepages_start();
7453         fs_reclaim_acquire(sc.gfp_mask);
7454         /*
7455          * We need to be able to allocate from the reserves for RECLAIM_UNMAP
7456          */
7457         noreclaim_flag = memalloc_noreclaim_save();
7458         set_task_reclaim_state(p, &sc.reclaim_state);
7459
7460         if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
7461             node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
7462                 /*
7463                  * Free memory by calling shrink node with increasing
7464                  * priorities until we have enough memory freed.
7465                  */
7466                 do {
7467                         shrink_node(pgdat, &sc);
7468                 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
7469         }
7470
7471         set_task_reclaim_state(p, NULL);
7472         memalloc_noreclaim_restore(noreclaim_flag);
7473         fs_reclaim_release(sc.gfp_mask);
7474         psi_memstall_leave(&pflags);
7475         delayacct_freepages_end();
7476
7477         trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
7478
7479         return sc.nr_reclaimed >= nr_pages;
7480 }
7481
7482 int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
7483 {
7484         int ret;
7485
7486         /*
7487          * Node reclaim reclaims unmapped file backed pages and
7488          * slab pages if we are over the defined limits.
7489          *
7490          * A small portion of unmapped file backed pages is needed for
7491          * file I/O otherwise pages read by file I/O will be immediately
7492          * thrown out if the node is overallocated. So we do not reclaim
7493          * if less than a specified percentage of the node is used by
7494          * unmapped file backed pages.
7495          */
7496         if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
7497             node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
7498             pgdat->min_slab_pages)
7499                 return NODE_RECLAIM_FULL;
7500
7501         /*
7502          * Do not scan if the allocation should not be delayed.
7503          */
7504         if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
7505                 return NODE_RECLAIM_NOSCAN;
7506
7507         /*
7508          * Only run node reclaim on the local node or on nodes that do not
7509          * have associated processors. This will favor the local processor
7510          * over remote processors and spread off node memory allocations
7511          * as wide as possible.
7512          */
7513         if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
7514                 return NODE_RECLAIM_NOSCAN;
7515
7516         if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
7517                 return NODE_RECLAIM_NOSCAN;
7518
7519         ret = __node_reclaim(pgdat, gfp_mask, order);
7520         clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
7521
7522         if (!ret)
7523                 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
7524
7525         return ret;
7526 }
7527 #endif
7528
7529 /**
7530  * check_move_unevictable_folios - Move evictable folios to appropriate zone
7531  * lru list
7532  * @fbatch: Batch of lru folios to check.
7533  *
7534  * Checks folios for evictability, if an evictable folio is in the unevictable
7535  * lru list, moves it to the appropriate evictable lru list. This function
7536  * should be only used for lru folios.
7537  */
7538 void check_move_unevictable_folios(struct folio_batch *fbatch)
7539 {
7540         struct lruvec *lruvec = NULL;
7541         int pgscanned = 0;
7542         int pgrescued = 0;
7543         int i;
7544
7545         for (i = 0; i < fbatch->nr; i++) {
7546                 struct folio *folio = fbatch->folios[i];
7547                 int nr_pages = folio_nr_pages(folio);
7548
7549                 pgscanned += nr_pages;
7550
7551                 /* block memcg migration while the folio moves between lrus */
7552                 if (!folio_test_clear_lru(folio))
7553                         continue;
7554
7555                 lruvec = folio_lruvec_relock_irq(folio, lruvec);
7556                 if (folio_evictable(folio) && folio_test_unevictable(folio)) {
7557                         lruvec_del_folio(lruvec, folio);
7558                         folio_clear_unevictable(folio);
7559                         lruvec_add_folio(lruvec, folio);
7560                         pgrescued += nr_pages;
7561                 }
7562                 folio_set_lru(folio);
7563         }
7564
7565         if (lruvec) {
7566                 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
7567                 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
7568                 unlock_page_lruvec_irq(lruvec);
7569         } else if (pgscanned) {
7570                 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
7571         }
7572 }
7573 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);