Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache
author     Linus Torvalds <[email protected]>
Wed, 23 Mar 2022 00:03:12 +0000 (17:03 -0700)
committer  Linus Torvalds <[email protected]>
Wed, 23 Mar 2022 00:03:12 +0000 (17:03 -0700)
Pull folio updates from Matthew Wilcox:

 - Rewrite how munlock works to massively reduce the contention on
   i_mmap_rwsem (Hugh Dickins):

     https://lore.kernel.org/linux-mm/8e4356d-9622-a7f0-b2c-f116b5f2efea@google.com/

 - Sort out the page refcount mess for ZONE_DEVICE pages (Christoph
   Hellwig):

     https://lore.kernel.org/linux-mm/20220210072828.2930359[email protected]/

 - Convert GUP to use folios and make pincount available for order-1
   pages. (Matthew Wilcox)

 - Convert a few more truncation functions to use folios (Matthew
   Wilcox)

 - Convert page_vma_mapped_walk to use PFNs instead of pages (Matthew
   Wilcox)

 - Convert rmap_walk to use folios (Matthew Wilcox)

 - Convert most of shrink_page_list() to use a folio (Matthew Wilcox)

 - Add support for creating large folios in readahead (Matthew Wilcox)
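
   The conversion pattern these bullets describe is visible in the
   include/linux/mm.h hunks further down: put_page(), total_mapcount() and
   page_maybe_dma_pinned() become thin wrappers that resolve the folio once
   with page_folio() and delegate to a folio-taking function.  A minimal
   illustrative sketch of that shape (the example_* names below are
   hypothetical, not functions from this pull):

       #include <linux/mm.h>

       /* Hypothetical folio-first helper: one operation per folio. */
       static void example_mark_accessed(struct folio *folio)
       {
               /* a single flag test-and-set covers the whole folio */
               if (!folio_test_referenced(folio))
                       folio_set_referenced(folio);
       }

       /* Legacy struct page entry point kept as a one-line wrapper. */
       static void example_mark_accessed_page(struct page *page)
       {
               /* resolve the folio once, then delegate */
               example_mark_accessed(page_folio(page));
       }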

* tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache: (114 commits)
  mm/damon: minor cleanup for damon_pa_young
  selftests/vm/transhuge-stress: Support file-backed PMD folios
  mm/filemap: Support VM_HUGEPAGE for file mappings
  mm/readahead: Switch to page_cache_ra_order
  mm/readahead: Align file mappings for non-DAX
  mm/readahead: Add large folio readahead
  mm: Support arbitrary THP sizes
  mm: Make large folios depend on THP
  mm: Fix READ_ONLY_THP warning
  mm/filemap: Allow large folios to be added to the page cache
  mm: Turn can_split_huge_page() into can_split_folio()
  mm/vmscan: Convert pageout() to take a folio
  mm/vmscan: Turn page_check_references() into folio_check_references()
  mm/vmscan: Account large folios correctly
  mm/vmscan: Optimise shrink_page_list for non-PMD-sized folios
  mm/vmscan: Free non-shmem folios without splitting them
  mm/rmap: Constify the rmap_walk_control argument
  mm/rmap: Convert rmap_walk() to take a folio
  mm: Turn page_anon_vma() into folio_anon_vma()
  mm/rmap: Turn page_lock_anon_vma_read() into folio_lock_anon_vma_read()
  ...

41 files changed:
arch/arm64/mm/mmu.c
arch/parisc/include/asm/pgtable.h
drivers/nvme/host/pci.c
drivers/nvme/target/io-cmd-bdev.c
fs/Kconfig
fs/nfsd/filecache.c
fs/nfsd/vfs.c
include/linux/fs.h
include/linux/hugetlb.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/pagemap.h
include/linux/swap.h
mm/Kconfig
mm/damon/paddr.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/internal.h
mm/ksm.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory.c
mm/memory_hotplug.c
mm/memremap.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/mmzone.c
mm/oom_kill.c
mm/page_alloc.c
mm/readahead.c
mm/rmap.c
mm/swap.c
mm/userfaultfd.c
mm/util.c
mm/vmscan.c
mm/workingset.c

diff --combined arch/arm64/mm/mmu.c
index 0b7d25887ec37cef524da7234c2f4d1746860944,580abae6c0b93f1156761ee1db6e51b4eba6112a..626ec32873c6c36bb6d21085bc7340dd73b055c6
@@@ -17,6 -17,7 +17,7 @@@
  #include <linux/mman.h>
  #include <linux/nodemask.h>
  #include <linux/memblock.h>
+ #include <linux/memremap.h>
  #include <linux/memory.h>
  #include <linux/fs.h>
  #include <linux/io.h>
@@@ -63,7 -64,6 +64,7 @@@ static pmd_t bm_pmd[PTRS_PER_PMD] __pag
  static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
  
  static DEFINE_SPINLOCK(swapper_pgdir_lock);
 +static DEFINE_MUTEX(fixmap_lock);
  
  void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
  {
@@@ -295,6 -295,18 +296,6 @@@ static void alloc_init_cont_pmd(pud_t *
        } while (addr = next, addr != end);
  }
  
 -static inline bool use_1G_block(unsigned long addr, unsigned long next,
 -                      unsigned long phys)
 -{
 -      if (PAGE_SHIFT != 12)
 -              return false;
 -
 -      if (((addr | next | phys) & ~PUD_MASK) != 0)
 -              return false;
 -
 -      return true;
 -}
 -
  static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
                           phys_addr_t phys, pgprot_t prot,
                           phys_addr_t (*pgtable_alloc)(int),
        }
        BUG_ON(p4d_bad(p4d));
  
 +      /*
 +       * No need for locking during early boot. And it doesn't work as
 +       * expected with KASLR enabled.
 +       */
 +      if (system_state != SYSTEM_BOOTING)
 +              mutex_lock(&fixmap_lock);
        pudp = pud_set_fixmap_offset(p4dp, addr);
        do {
                pud_t old_pud = READ_ONCE(*pudp);
                /*
                 * For 4K granule only, attempt to put down a 1GB block
                 */
 -              if (use_1G_block(addr, next, phys) &&
 +              if (pud_sect_supported() &&
 +                 ((addr | next | phys) & ~PUD_MASK) == 0 &&
                    (flags & NO_BLOCK_MAPPINGS) == 0) {
                        pud_set_huge(pudp, phys, prot);
  
        } while (pudp++, addr = next, addr != end);
  
        pud_clear_fixmap();
 +      if (system_state != SYSTEM_BOOTING)
 +              mutex_unlock(&fixmap_lock);
  }
  
  static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
@@@ -515,7 -518,7 +516,7 @@@ static void __init map_mem(pgd_t *pgdp
         */
        BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
  
 -      if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
 +      if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
                flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
  
        /*
         */
        memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
  
 +#ifdef CONFIG_KEXEC_CORE
 +      if (crash_mem_map) {
 +              if (IS_ENABLED(CONFIG_ZONE_DMA) ||
 +                  IS_ENABLED(CONFIG_ZONE_DMA32))
 +                      flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 +              else if (crashk_res.end)
 +                      memblock_mark_nomap(crashk_res.start,
 +                          resource_size(&crashk_res));
 +      }
 +#endif
 +
        /* map all the memory banks */
        for_each_mem_range(i, &start, &end) {
                if (start >= end)
        __map_memblock(pgdp, kernel_start, kernel_end,
                       PAGE_KERNEL, NO_CONT_MAPPINGS);
        memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 +
 +      /*
 +       * Use page-level mappings here so that we can shrink the region
 +       * in page granularity and put back unused memory to buddy system
 +       * through /sys/kernel/kexec_crash_size interface.
 +       */
 +#ifdef CONFIG_KEXEC_CORE
 +      if (crash_mem_map &&
 +          !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
 +              if (crashk_res.end) {
 +                      __map_memblock(pgdp, crashk_res.start,
 +                                     crashk_res.end + 1,
 +                                     PAGE_KERNEL,
 +                                     NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 +                      memblock_clear_nomap(crashk_res.start,
 +                                           resource_size(&crashk_res));
 +              }
 +      }
 +#endif
  }
  
  void mark_rodata_ro(void)
@@@ -645,8 -618,6 +646,8 @@@ early_param("rodata", parse_rodata)
  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
  static int __init map_entry_trampoline(void)
  {
 +      int i;
 +
        pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
        phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
  
  
        /* Map only the text into the trampoline page table */
        memset(tramp_pg_dir, 0, PGD_SIZE);
 -      __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
 -                           prot, __pgd_pgtable_alloc, 0);
 +      __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
 +                           entry_tramp_text_size(), prot,
 +                           __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
  
        /* Map both the text and data into the kernel page table */
 -      __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
 +      for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
 +              __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
 +                           pa_start + i * PAGE_SIZE, prot);
 +
        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
                extern char __entry_tramp_data_start[];
  
diff --combined arch/parisc/include/asm/pgtable.h
index 7dff736936d00062fa1f60c9a32f094fb594868f,48d49f207f84378c1cda1eaa34ea0b677d1a478b..939db6fe620bdb9af6c58c75d700147589bd844e
@@@ -70,9 -70,9 +70,9 @@@ static inline void purge_tlb_entries(st
        unsigned long flags;
  
        purge_tlb_start(flags);
 -      mtsp(mm->context1);
 -      pdtlb(addr);
 -      pitlb(addr);
 +      mtsp(mm->context.space_id, SR_TEMP1);
 +      pdtlb(SR_TEMP1, addr);
 +      pitlb(SR_TEMP1, addr);
        purge_tlb_end(flags);
  }
  
@@@ -219,10 -219,9 +219,10 @@@ extern void __update_cache(pte_t pte)
  #define _PAGE_PRESENT  (1 << xlate_pabit(_PAGE_PRESENT_BIT))
  #define _PAGE_HUGE     (1 << xlate_pabit(_PAGE_HPAGE_BIT))
  #define _PAGE_USER     (1 << xlate_pabit(_PAGE_USER_BIT))
 +#define _PAGE_SPECIAL  (_PAGE_DMB)
  
  #define _PAGE_TABLE   (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)
 -#define _PAGE_CHG_MASK        (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 +#define _PAGE_CHG_MASK        (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_SPECIAL)
  #define _PAGE_KERNEL_RO       (_PAGE_PRESENT | _PAGE_READ | _PAGE_DIRTY | _PAGE_ACCESSED)
  #define _PAGE_KERNEL_EXEC     (_PAGE_KERNEL_RO | _PAGE_EXEC)
  #define _PAGE_KERNEL_RWX      (_PAGE_KERNEL_EXEC | _PAGE_WRITE)
@@@ -349,7 -348,6 +349,7 @@@ static inline void pud_clear(pud_t *pud
  static inline int pte_dirty(pte_t pte)                { return pte_val(pte) & _PAGE_DIRTY; }
  static inline int pte_young(pte_t pte)                { return pte_val(pte) & _PAGE_ACCESSED; }
  static inline int pte_write(pte_t pte)                { return pte_val(pte) & _PAGE_WRITE; }
 +static inline int pte_special(pte_t pte)      { return pte_val(pte) & _PAGE_SPECIAL; }
  
  static inline pte_t pte_mkclean(pte_t pte)    { pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
  static inline pte_t pte_mkold(pte_t pte)      { pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
@@@ -357,7 -355,6 +357,7 @@@ static inline pte_t pte_wrprotect(pte_
  static inline pte_t pte_mkdirty(pte_t pte)    { pte_val(pte) |= _PAGE_DIRTY; return pte; }
  static inline pte_t pte_mkyoung(pte_t pte)    { pte_val(pte) |= _PAGE_ACCESSED; return pte; }
  static inline pte_t pte_mkwrite(pte_t pte)    { pte_val(pte) |= _PAGE_WRITE; return pte; }
 +static inline pte_t pte_mkspecial(pte_t pte)  { pte_val(pte) |= _PAGE_SPECIAL; return pte; }
  
  /*
   * Huge pte definitions.
@@@ -408,6 -405,7 +408,7 @@@ static inline unsigned long pmd_page_va
        return ((unsigned long) __va(pmd_address(pmd)));
  }
  
+ #define pmd_pfn(pmd)  (pmd_address(pmd) >> PAGE_SHIFT)
  #define __pmd_page(pmd) ((unsigned long) __va(pmd_address(pmd)))
  #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd))
  
diff --combined drivers/nvme/host/pci.c
index 9f4f3884fefe9161d15cce39f650e339d8125f54,ab15bc72710dbeb78a05e1765a6c7351e1e191f5..2e98ac3f3ad684feaa5ece1cd424306e7d91bd08
@@@ -15,6 -15,7 +15,7 @@@
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/io.h>
+ #include <linux/memremap.h>
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
@@@ -424,9 -425,8 +425,9 @@@ static int nvme_init_hctx(struct blk_mq
        return 0;
  }
  
 -static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
 -              unsigned int hctx_idx, unsigned int numa_node)
 +static int nvme_pci_init_request(struct blk_mq_tag_set *set,
 +              struct request *req, unsigned int hctx_idx,
 +              unsigned int numa_node)
  {
        struct nvme_dev *dev = set->driver_data;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@@ -1429,13 -1429,12 +1430,13 @@@ static enum blk_eh_timer_return nvme_ti
                "I/O %d QID %d timeout, aborting\n",
                 req->tag, nvmeq->qid);
  
 -      abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
 -                      BLK_MQ_REQ_NOWAIT);
 +      abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
 +                                       BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(abort_req)) {
                atomic_inc(&dev->ctrl.abort_limit);
                return BLK_EH_RESET_TIMER;
        }
 +      nvme_init_request(abort_req, &cmd);
  
        abort_req->end_io_data = NULL;
        blk_execute_rq_nowait(abort_req, false, abort_endio);
@@@ -1724,7 -1723,7 +1725,7 @@@ static const struct blk_mq_ops nvme_mq_
        .queue_rq       = nvme_queue_rq,
        .complete       = nvme_pci_complete_rq,
        .init_hctx      = nvme_admin_init_hctx,
 -      .init_request   = nvme_init_request,
 +      .init_request   = nvme_pci_init_request,
        .timeout        = nvme_timeout,
  };
  
@@@ -1734,7 -1733,7 +1735,7 @@@ static const struct blk_mq_ops nvme_mq_
        .complete       = nvme_pci_complete_rq,
        .commit_rqs     = nvme_commit_rqs,
        .init_hctx      = nvme_init_hctx,
 -      .init_request   = nvme_init_request,
 +      .init_request   = nvme_pci_init_request,
        .map_queues     = nvme_pci_map_queues,
        .timeout        = nvme_timeout,
        .poll           = nvme_poll,
@@@ -2477,10 -2476,9 +2478,10 @@@ static int nvme_delete_queue(struct nvm
        cmd.delete_queue.opcode = opcode;
        cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
  
 -      req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
 +      req = blk_mq_alloc_request(q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(req))
                return PTR_ERR(req);
 +      nvme_init_request(req, &cmd);
  
        req->end_io_data = nvmeq;
  
diff --combined drivers/nvme/target/io-cmd-bdev.c
index e9194804ddee4577903fb35c45b425fa2751c423,a141446db1bea34e4cee0412588be3a472c051a9..d886c2c59554f69fdc438aa4596e6a26c01584e9
@@@ -6,6 -6,7 +6,7 @@@
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/blkdev.h>
  #include <linux/blk-integrity.h>
+ #include <linux/memremap.h>
  #include <linux/module.h>
  #include "nvmet.h"
  
@@@ -76,14 -77,6 +77,14 @@@ int nvmet_bdev_ns_enable(struct nvmet_n
  {
        int ret;
  
 +      /*
 +       * When the buffered_io namespace attribute is enabled, the user wants
 +       * this block device to be used as a file, so the block device can
 +       * take advantage of the cache.
 +       */
 +      if (ns->buffered_io)
 +              return -ENOTBLK;
 +
        ns->bdev = blkdev_get_by_path(ns->device_path,
                        FMODE_READ | FMODE_WRITE, NULL);
        if (IS_ERR(ns->bdev)) {
@@@ -275,15 -268,15 +276,15 @@@ static void nvmet_bdev_execute_rw(struc
  
        if (nvmet_use_inline_bvec(req)) {
                bio = &req->b.inline_bio;
 -              bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 +              bio_init(bio, req->ns->bdev, req->inline_bvec,
 +                       ARRAY_SIZE(req->inline_bvec), op);
        } else {
 -              bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
 +              bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), op,
 +                              GFP_KERNEL);
        }
 -      bio_set_dev(bio, req->ns->bdev);
        bio->bi_iter.bi_sector = sector;
        bio->bi_private = req;
        bio->bi_end_io = nvmet_bio_done;
 -      bio->bi_opf = op;
  
        blk_start_plug(&plug);
        if (req->metadata_len)
                                }
                        }
  
 -                      bio = bio_alloc(GFP_KERNEL, bio_max_segs(sg_cnt));
 -                      bio_set_dev(bio, req->ns->bdev);
 +                      bio = bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt),
 +                                      op, GFP_KERNEL);
                        bio->bi_iter.bi_sector = sector;
 -                      bio->bi_opf = op;
  
                        bio_chain(bio, prev);
                        submit_bio(prev);
@@@ -335,10 -329,11 +336,10 @@@ static void nvmet_bdev_execute_flush(st
        if (!nvmet_check_transfer_len(req, 0))
                return;
  
 -      bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
 -      bio_set_dev(bio, req->ns->bdev);
 +      bio_init(bio, req->ns->bdev, req->inline_bvec,
 +               ARRAY_SIZE(req->inline_bvec), REQ_OP_WRITE | REQ_PREFLUSH);
        bio->bi_private = req;
        bio->bi_end_io = nvmet_bio_done;
 -      bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
  
        submit_bio(bio);
  }
diff --combined fs/Kconfig
index 90fdb62545e0b42fcdc8a563e1b969cff1ca0d94,7f2455e8e18ae26ee0af1a5db228e3cdac7c2140..30b751c7f11a63db6719e31a0d797471dae33b1b
@@@ -48,7 -48,7 +48,7 @@@ config FS_DA
        bool "File system based Direct Access (DAX) support"
        depends on MMU
        depends on !(ARM || MIPS || SPARC)
-       select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
+       depends on ZONE_DEVICE || FS_DAX_LIMITED
        select FS_IOMAP
        select DAX
        help
@@@ -344,7 -344,7 +344,7 @@@ config LOCK
  
  config LOCKD_V4
        bool
 -      depends on NFSD_V3 || NFS_V3
 +      depends on NFSD || NFS_V3
        depends on FILE_LOCKING
        default y
  
diff --combined fs/nfsd/filecache.c
index cc2831cec66954075549e638664b3196e65e8d07,47f804e0ec9306c12066942398cb0234844c54ef..c08882f5867b2a3268cc0358f7e13704840df183
@@@ -7,6 -7,7 +7,7 @@@
  #include <linux/hash.h>
  #include <linux/slab.h>
  #include <linux/file.h>
+ #include <linux/pagemap.h>
  #include <linux/sched.h>
  #include <linux/list_lru.h>
  #include <linux/fsnotify_backend.h>
@@@ -632,7 -633,7 +633,7 @@@ nfsd_file_cache_init(void
        if (!nfsd_filecache_wq)
                goto out;
  
 -      nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
 +      nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
                                sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
        if (!nfsd_file_hashtbl) {
                pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
@@@ -700,7 -701,7 +701,7 @@@ out_err
        nfsd_file_slab = NULL;
        kmem_cache_destroy(nfsd_file_mark_slab);
        nfsd_file_mark_slab = NULL;
 -      kfree(nfsd_file_hashtbl);
 +      kvfree(nfsd_file_hashtbl);
        nfsd_file_hashtbl = NULL;
        destroy_workqueue(nfsd_filecache_wq);
        nfsd_filecache_wq = NULL;
@@@ -811,7 -812,7 +812,7 @@@ nfsd_file_cache_shutdown(void
        fsnotify_wait_marks_destroyed();
        kmem_cache_destroy(nfsd_file_mark_slab);
        nfsd_file_mark_slab = NULL;
 -      kfree(nfsd_file_hashtbl);
 +      kvfree(nfsd_file_hashtbl);
        nfsd_file_hashtbl = NULL;
        destroy_workqueue(nfsd_filecache_wq);
        nfsd_filecache_wq = NULL;
diff --combined fs/nfsd/vfs.c
index 166eb0ba3e7133f4c5a2aea6accb1b208255dcd2,fe0d7abbc1b13c815ed1bf9e3e8266fb027be478..c22ad0532e8ee730f77398dac70a608464b35262
  #include <linux/xattr.h>
  #include <linux/jhash.h>
  #include <linux/ima.h>
+ #include <linux/pagemap.h>
  #include <linux/slab.h>
  #include <linux/uaccess.h>
  #include <linux/exportfs.h>
  #include <linux/writeback.h>
  #include <linux/security.h>
  
 -#ifdef CONFIG_NFSD_V3
  #include "xdr3.h"
 -#endif /* CONFIG_NFSD_V3 */
  
  #ifdef CONFIG_NFSD_V4
  #include "../internal.h"
@@@ -606,6 -609,7 +607,6 @@@ __be32 nfsd4_vfs_fallocate(struct svc_r
  }
  #endif /* defined(CONFIG_NFSD_V4) */
  
 -#ifdef CONFIG_NFSD_V3
  /*
   * Check server access rights to a file system object
   */
@@@ -717,6 -721,7 +718,6 @@@ nfsd_access(struct svc_rqst *rqstp, str
   out:
        return error;
  }
 -#endif /* CONFIG_NFSD_V3 */
  
  int nfsd_open_break_lease(struct inode *inode, int access)
  {
@@@ -1109,6 -1114,7 +1110,6 @@@ out
        return err;
  }
  
 -#ifdef CONFIG_NFSD_V3
  /**
   * nfsd_commit - Commit pending writes to stable storage
   * @rqstp: RPC request being processed
@@@ -1185,6 -1191,7 +1186,6 @@@ nfsd_commit(struct svc_rqst *rqstp, str
  out:
        return err;
  }
 -#endif /* CONFIG_NFSD_V3 */
  
  static __be32
  nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
@@@ -1374,6 -1381,8 +1375,6 @@@ nfsd_create(struct svc_rqst *rqstp, str
                                        rdev, resfhp);
  }
  
 -#ifdef CONFIG_NFSD_V3
 -
  /*
   * NFSv3 and NFSv4 version of nfsd_create
   */
@@@ -1539,6 -1548,7 +1540,6 @@@ do_nfsd_create(struct svc_rqst *rqstp, 
        err = nfserrno(host_err);
        goto out;
  }
 -#endif /* CONFIG_NFSD_V3 */
  
  /*
   * Read a symlink. On entry, *lenp must contain the maximum path length that
diff --combined include/linux/fs.h
index 4fb6d5a50be708defbcfe05fa52bdb241c6e3bd2,0961c979e949f23542ba5b44610fa4024b16e194..60462181e9b22f31844891aa9db26b97486d1d9e
@@@ -42,7 -42,6 +42,7 @@@
  #include <linux/mount.h>
  #include <linux/cred.h>
  #include <linux/mnt_idmapping.h>
 +#include <linux/slab.h>
  
  #include <asm/byteorder.h>
  #include <uapi/linux/fs.h>
@@@ -931,15 -930,10 +931,15 @@@ struct fown_struct 
   * struct file_ra_state - Track a file's readahead state.
   * @start: Where the most recent readahead started.
   * @size: Number of pages read in the most recent readahead.
 - * @async_size: Start next readahead when this many pages are left.
 - * @ra_pages: Maximum size of a readahead request.
 + * @async_size: Number of pages that were/are not needed immediately
 + *      and so were/are genuinely "ahead".  Start next readahead when
 + *      the first of these pages is accessed.
 + * @ra_pages: Maximum size of a readahead request, copied from the bdi.
   * @mmap_miss: How many mmap accesses missed in the page cache.
   * @prev_pos: The last byte in the most recent read request.
 + *
 + * When this structure is passed to ->readahead(), the "most recent"
 + * readahead means the current readahead.
   */
  struct file_ra_state {
        pgoff_t start;
@@@ -1441,7 -1435,6 +1441,7 @@@ extern int send_sigurg(struct fown_stru
  
  #define SB_I_SKIP_SYNC        0x00000100      /* Skip superblock at global sync */
  #define SB_I_PERSB_BDI        0x00000200      /* has a per-sb bdi */
 +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
  
  /* Possible states of 'frozen' field */
  enum {
@@@ -2753,54 -2746,6 +2753,6 @@@ extern void init_special_inode(struct i
  extern void make_bad_inode(struct inode *);
  extern bool is_bad_inode(struct inode *);
  
- unsigned long invalidate_mapping_pages(struct address_space *mapping,
-                                       pgoff_t start, pgoff_t end);
- void invalidate_mapping_pagevec(struct address_space *mapping,
-                               pgoff_t start, pgoff_t end,
-                               unsigned long *nr_pagevec);
- static inline void invalidate_remote_inode(struct inode *inode)
- {
-       if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-           S_ISLNK(inode->i_mode))
-               invalidate_mapping_pages(inode->i_mapping, 0, -1);
- }
- extern int invalidate_inode_pages2(struct address_space *mapping);
- extern int invalidate_inode_pages2_range(struct address_space *mapping,
-                                        pgoff_t start, pgoff_t end);
- extern int write_inode_now(struct inode *, int);
- extern int filemap_fdatawrite(struct address_space *);
- extern int filemap_flush(struct address_space *);
- extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
- extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
-                                  loff_t lend);
- extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
-               loff_t start_byte, loff_t end_byte);
- static inline int filemap_fdatawait(struct address_space *mapping)
- {
-       return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
- }
- extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
-                                 loff_t lend);
- extern int filemap_write_and_wait_range(struct address_space *mapping,
-                                       loff_t lstart, loff_t lend);
- extern int __filemap_fdatawrite_range(struct address_space *mapping,
-                               loff_t start, loff_t end, int sync_mode);
- extern int filemap_fdatawrite_range(struct address_space *mapping,
-                               loff_t start, loff_t end);
- extern int filemap_check_errors(struct address_space *mapping);
- extern void __filemap_set_wb_err(struct address_space *mapping, int err);
- int filemap_fdatawrite_wbc(struct address_space *mapping,
-                          struct writeback_control *wbc);
- static inline int filemap_write_and_wait(struct address_space *mapping)
- {
-       return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
- }
  extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
  extern int __must_check file_check_and_advance_wb_err(struct file *file);
@@@ -2812,67 -2757,6 +2764,6 @@@ static inline int file_write_and_wait(s
        return file_write_and_wait_range(file, 0, LLONG_MAX);
  }
  
- /**
-  * filemap_set_wb_err - set a writeback error on an address_space
-  * @mapping: mapping in which to set writeback error
-  * @err: error to be set in mapping
-  *
-  * When writeback fails in some way, we must record that error so that
-  * userspace can be informed when fsync and the like are called.  We endeavor
-  * to report errors on any file that was open at the time of the error.  Some
-  * internal callers also need to know when writeback errors have occurred.
-  *
-  * When a writeback error occurs, most filesystems will want to call
-  * filemap_set_wb_err to record the error in the mapping so that it will be
-  * automatically reported whenever fsync is called on the file.
-  */
- static inline void filemap_set_wb_err(struct address_space *mapping, int err)
- {
-       /* Fastpath for common case of no error */
-       if (unlikely(err))
-               __filemap_set_wb_err(mapping, err);
- }
- /**
-  * filemap_check_wb_err - has an error occurred since the mark was sampled?
-  * @mapping: mapping to check for writeback errors
-  * @since: previously-sampled errseq_t
-  *
-  * Grab the errseq_t value from the mapping, and see if it has changed "since"
-  * the given value was sampled.
-  *
-  * If it has then report the latest error set, otherwise return 0.
-  */
- static inline int filemap_check_wb_err(struct address_space *mapping,
-                                       errseq_t since)
- {
-       return errseq_check(&mapping->wb_err, since);
- }
- /**
-  * filemap_sample_wb_err - sample the current errseq_t to test for later errors
-  * @mapping: mapping to be sampled
-  *
-  * Writeback errors are always reported relative to a particular sample point
-  * in the past. This function provides those sample points.
-  */
- static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
- {
-       return errseq_sample(&mapping->wb_err);
- }
- /**
-  * file_sample_sb_err - sample the current errseq_t to test for later errors
-  * @file: file pointer to be sampled
-  *
-  * Grab the most current superblock-level errseq_t value for the given
-  * struct file.
-  */
- static inline errseq_t file_sample_sb_err(struct file *file)
- {
-       return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
- }
  extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
  extern int vfs_fsync(struct file *file, int datasync);
@@@ -3115,16 -2999,6 +3006,16 @@@ extern void free_inode_nonrcu(struct in
  extern int should_remove_suid(struct dentry *);
  extern int file_remove_privs(struct file *);
  
 +/*
 + * This must be used for allocating filesystems specific inodes to set
 + * up the inode reclaim context correctly.
 + */
 +static inline void *
 +alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp)
 +{
 +      return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
 +}
 +
  extern void __insert_inode_hash(struct inode *, unsigned long hashval);
  static inline void insert_inode_hash(struct inode *inode)
  {
@@@ -3147,7 -3021,6 +3038,7 @@@ extern int sb_min_blocksize(struct supe
  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 +int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
  extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
  extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
@@@ -3191,7 -3064,6 +3082,7 @@@ extern loff_t fixed_size_llseek(struct 
                int whence, loff_t size);
  extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
  extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
 +int rw_verify_area(int, struct file *, const loff_t *, size_t);
  extern int generic_file_open(struct inode * inode, struct file * filp);
  extern int nonseekable_open(struct inode * inode, struct file * filp);
  extern int stream_open(struct inode * inode, struct file * filp);
@@@ -3627,15 -3499,4 +3518,4 @@@ extern int vfs_fadvise(struct file *fil
  extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);
  
- /*
-  * Flush file data before changing attributes.  Caller must hold any locks
-  * required to prevent further writes to this file until we're done setting
-  * flags.
-  */
- static inline int inode_drain_writes(struct inode *inode)
- {
-       inode_dio_wait(inode);
-       return filemap_write_and_wait(inode->i_mapping);
- }
  #endif /* _LINUX_FS_H */
diff --combined include/linux/hugetlb.h
index 08357b4c7be73c15221cd07e9b4e7fb0631f2acd,6ba2f8e74fbba9ad11e6cf59496cb67f23d35774..53c1b6082a4cd9f42fc14e8259e4818b8c7e3e20
@@@ -754,7 -754,7 +754,7 @@@ static inline void arch_clear_hugepage_
  static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
                                       vm_flags_t flags)
  {
 -      return entry;
 +      return pte_mkhuge(entry);
  }
  #endif
  
@@@ -970,6 -970,11 +970,11 @@@ static inline struct hstate *page_hstat
        return NULL;
  }
  
+ static inline struct hstate *size_to_hstate(unsigned long size)
+ {
+       return NULL;
+ }
  static inline unsigned long huge_page_size(struct hstate *h)
  {
        return PAGE_SIZE;
@@@ -1075,6 -1080,12 +1080,6 @@@ static inline void set_huge_swap_pte_at
  }
  #endif        /* CONFIG_HUGETLB_PAGE */
  
 -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 -extern bool hugetlb_free_vmemmap_enabled;
 -#else
 -#define hugetlb_free_vmemmap_enabled  false
 -#endif
 -
  static inline spinlock_t *huge_pte_lock(struct hstate *h,
                                        struct mm_struct *mm, pte_t *pte)
  {
diff --combined include/linux/mm.h
index 0e4fd101616e282533a8b1013c451097cc2c36f1,c1966ad34142ddff148ee1f55590b9d11cf5537c..b8f9ba93a162f98f17c41eae671a760fb0bc582c
@@@ -3,9 -3,6 +3,6 @@@
  #define _LINUX_MM_H
  
  #include <linux/errno.h>
- #ifdef __KERNEL__
  #include <linux/mmdebug.h>
  #include <linux/gfp.h>
  #include <linux/bug.h>
@@@ -26,7 -23,6 +23,6 @@@
  #include <linux/err.h>
  #include <linux/page-flags.h>
  #include <linux/page_ref.h>
- #include <linux/memremap.h>
  #include <linux/overflow.h>
  #include <linux/sizes.h>
  #include <linux/sched.h>
@@@ -216,8 -212,10 +212,10 @@@ int overcommit_policy_handler(struct ct
  
  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
  #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
+ #define folio_page_idx(folio, p)      (page_to_pfn(p) - folio_pfn(folio))
  #else
  #define nth_page(page,n) ((page) + (n))
+ #define folio_page_idx(folio, p)      ((p) - &(folio)->page)
  #endif
  
  /* to align the pointer to the (next) page boundary */
  #define PAGE_ALIGNED(addr)    IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
  
  #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
+ static inline struct folio *lru_to_folio(struct list_head *head)
+ {
+       return list_entry((head)->prev, struct folio, lru);
+ }
  
  void setup_initial_init_mm(void *start_code, void *end_code,
                           void *end_data, void *brk);
@@@ -478,8 -480,7 +480,8 @@@ struct vm_fault 
                struct vm_area_struct *vma;     /* Target VMA */
                gfp_t gfp_mask;                 /* gfp mask to be used for allocations */
                pgoff_t pgoff;                  /* Logical page offset based on vma */
 -              unsigned long address;          /* Faulting virtual address */
 +              unsigned long address;          /* Faulting virtual address - masked */
 +              unsigned long real_address;     /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;          /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
@@@ -775,21 -776,26 +777,26 @@@ static inline int is_vmalloc_or_module_
  }
  #endif
  
- static inline int head_compound_mapcount(struct page *head)
+ /*
+  * How many times the entire folio is mapped as a single unit (eg by a
+  * PMD or PUD entry).  This is probably not what you want, except for
+  * debugging purposes; look at folio_mapcount() or page_mapcount()
+  * instead.
+  */
+ static inline int folio_entire_mapcount(struct folio *folio)
  {
-       return atomic_read(compound_mapcount_ptr(head)) + 1;
+       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+       return atomic_read(folio_mapcount_ptr(folio)) + 1;
  }
  
  /*
   * Mapcount of compound page as a whole, does not include mapped sub-pages.
   *
-  * Must be called only for compound pages or any their tail sub-pages.
+  * Must be called only for compound pages.
   */
  static inline int compound_mapcount(struct page *page)
  {
-       VM_BUG_ON_PAGE(!PageCompound(page), page);
-       page = compound_head(page);
-       return head_compound_mapcount(page);
+       return folio_entire_mapcount(page_folio(page));
  }
  
  /*
@@@ -819,8 -825,14 +826,14 @@@ static inline int page_mapcount(struct 
        return atomic_read(&page->_mapcount) + 1;
  }
  
+ int folio_mapcount(struct folio *folio);
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
- int total_mapcount(struct page *page);
+ static inline int total_mapcount(struct page *page)
+ {
+       return folio_mapcount(page_folio(page));
+ }
  int page_trans_huge_mapcount(struct page *page);
  #else
  static inline int total_mapcount(struct page *page)
@@@ -890,33 -902,17 +903,17 @@@ static inline void destroy_compound_pag
        compound_page_dtors[page[1].compound_dtor](page);
  }
  
- static inline bool hpage_pincount_available(struct page *page)
- {
-       /*
-        * Can the page->hpage_pinned_refcount field be used? That field is in
-        * the 3rd page of the compound page, so the smallest (2-page) compound
-        * pages cannot support it.
-        */
-       page = compound_head(page);
-       return PageCompound(page) && compound_order(page) > 1;
- }
  static inline int head_compound_pincount(struct page *head)
  {
        return atomic_read(compound_pincount_ptr(head));
  }
  
- static inline int compound_pincount(struct page *page)
- {
-       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
-       page = compound_head(page);
-       return head_compound_pincount(page);
- }
  static inline void set_compound_order(struct page *page, unsigned int order)
  {
        page[1].compound_order = order;
+ #ifdef CONFIG_64BIT
        page[1].compound_nr = 1U << order;
+ #endif
  }
  
  /* Returns the number of pages in this potentially compound page. */
@@@ -924,7 -920,11 +921,11 @@@ static inline unsigned long compound_nr
  {
        if (!PageHead(page))
                return 1;
+ #ifdef CONFIG_64BIT
        return page[1].compound_nr;
+ #else
+       return 1UL << compound_order(page);
+ #endif
  }
  
  /* Returns the number of bytes in this potentially compound page. */
@@@ -939,6 -939,37 +940,37 @@@ static inline unsigned int page_shift(s
        return PAGE_SHIFT + compound_order(page);
  }
  
+ /**
+  * thp_order - Order of a transparent huge page.
+  * @page: Head page of a transparent huge page.
+  */
+ static inline unsigned int thp_order(struct page *page)
+ {
+       VM_BUG_ON_PGFLAGS(PageTail(page), page);
+       return compound_order(page);
+ }
+ /**
+  * thp_nr_pages - The number of regular pages in this huge page.
+  * @page: The head page of a huge page.
+  */
+ static inline int thp_nr_pages(struct page *page)
+ {
+       VM_BUG_ON_PGFLAGS(PageTail(page), page);
+       return compound_nr(page);
+ }
+ /**
+  * thp_size - Size of a transparent huge page.
+  * @page: Head page of a transparent huge page.
+  *
+  * Return: Number of bytes in this page.
+  */
+ static inline unsigned long thp_size(struct page *page)
+ {
+       return PAGE_SIZE << thp_order(page);
+ }
  void free_compound_page(struct page *page);
  
  #ifdef CONFIG_MMU
@@@ -1090,59 -1121,35 +1122,35 @@@ static inline bool is_zone_device_page(
  }
  #endif
  
+ static inline bool folio_is_zone_device(const struct folio *folio)
+ {
+       return is_zone_device_page(&folio->page);
+ }
  static inline bool is_zone_movable_page(const struct page *page)
  {
        return page_zonenum(page) == ZONE_MOVABLE;
  }
  
- #ifdef CONFIG_DEV_PAGEMAP_OPS
- void free_devmap_managed_page(struct page *page);
+ #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
  
- static inline bool page_is_devmap_managed(struct page *page)
+ bool __put_devmap_managed_page(struct page *page);
+ static inline bool put_devmap_managed_page(struct page *page)
  {
        if (!static_branch_unlikely(&devmap_managed_key))
                return false;
        if (!is_zone_device_page(page))
                return false;
-       switch (page->pgmap->type) {
-       case MEMORY_DEVICE_PRIVATE:
-       case MEMORY_DEVICE_FS_DAX:
-               return true;
-       default:
-               break;
-       }
-       return false;
+       return __put_devmap_managed_page(page);
  }
  
- void put_devmap_managed_page(struct page *page);
- #else /* CONFIG_DEV_PAGEMAP_OPS */
- static inline bool page_is_devmap_managed(struct page *page)
+ #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+ static inline bool put_devmap_managed_page(struct page *page)
  {
        return false;
  }
- static inline void put_devmap_managed_page(struct page *page)
- {
- }
- #endif /* CONFIG_DEV_PAGEMAP_OPS */
- static inline bool is_device_private_page(const struct page *page)
- {
-       return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
-               IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
-               is_zone_device_page(page) &&
-               page->pgmap->type == MEMORY_DEVICE_PRIVATE;
- }
- static inline bool is_pci_p2pdma_page(const struct page *page)
- {
-       return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
-               IS_ENABLED(CONFIG_PCI_P2PDMA) &&
-               is_zone_device_page(page) &&
-               page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA;
- }
+ #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
  
  /* 127: arbitrary random number, small enough to assemble well */
  #define folio_ref_zero_or_close_to_overflow(folio) \
@@@ -1168,9 -1175,6 +1176,6 @@@ static inline void get_page(struct pag
  }
  
  bool __must_check try_grab_page(struct page *page, unsigned int flags);
- struct page *try_grab_compound_head(struct page *page, int refs,
-                                   unsigned int flags);
  
  static inline __must_check bool try_get_page(struct page *page)
  {
@@@ -1225,16 -1229,11 +1230,11 @@@ static inline void put_page(struct pag
        struct folio *folio = page_folio(page);
  
        /*
-        * For devmap managed pages we need to catch refcount transition from
-        * 2 to 1, when refcount reach one it means the page is free and we
-        * need to inform the device driver through callback. See
-        * include/linux/memremap.h and HMM for details.
+        * For some devmap managed pages we need to catch refcount transition
+        * from 2 to 1:
         */
-       if (page_is_devmap_managed(&folio->page)) {
-               put_devmap_managed_page(&folio->page);
+       if (put_devmap_managed_page(&folio->page))
                return;
-       }
        folio_put(folio);
  }
  
   * applications that don't have huge page reference counts, this won't be an
   * issue.
   *
-  * Locking: the lockless algorithm described in page_cache_get_speculative()
-  * and page_cache_gup_pin_speculative() provides safe operation for
-  * get_user_pages and page_mkclean and other calls that race to set up page
-  * table entries.
+  * Locking: the lockless algorithm described in folio_try_get_rcu()
+  * provides safe operation for get_user_pages(), page_mkclean() and
+  * other calls that race to set up page table entries.
   */
  #define GUP_PIN_COUNTING_BIAS (1U << 10)
  
@@@ -1278,70 -1276,11 +1277,11 @@@ void unpin_user_page_range_dirty_lock(s
                                      bool make_dirty);
  void unpin_user_pages(struct page **pages, unsigned long npages);
  
- /**
-  * page_maybe_dma_pinned - Report if a page is pinned for DMA.
-  * @page: The page.
-  *
-  * This function checks if a page has been pinned via a call to
-  * a function in the pin_user_pages() family.
-  *
-  * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
-  * because it means "definitely not pinned for DMA", but true means "probably
-  * pinned for DMA, but possibly a false positive due to having at least
-  * GUP_PIN_COUNTING_BIAS worth of normal page references".
-  *
-  * False positives are OK, because: a) it's unlikely for a page to get that many
-  * refcounts, and b) all the callers of this routine are expected to be able to
-  * deal gracefully with a false positive.
-  *
-  * For huge pages, the result will be exactly correct. That's because we have
-  * more tracking data available: the 3rd struct page in the compound page is
-  * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS
-  * scheme).
-  *
-  * For more information, please see Documentation/core-api/pin_user_pages.rst.
-  *
-  * Return: True, if it is likely that the page has been "dma-pinned".
-  * False, if the page is definitely not dma-pinned.
-  */
- static inline bool page_maybe_dma_pinned(struct page *page)
- {
-       if (hpage_pincount_available(page))
-               return compound_pincount(page) > 0;
-       /*
-        * page_ref_count() is signed. If that refcount overflows, then
-        * page_ref_count() returns a negative value, and callers will avoid
-        * further incrementing the refcount.
-        *
-        * Here, for that overflow case, use the signed bit to count a little
-        * bit higher via unsigned math, and thus still get an accurate result.
-        */
-       return ((unsigned int)page_ref_count(compound_head(page))) >=
-               GUP_PIN_COUNTING_BIAS;
- }
  static inline bool is_cow_mapping(vm_flags_t flags)
  {
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  }
  
- /*
-  * This should most likely only be called during fork() to see whether we
-  * should break the cow immediately for a page on the src mm.
-  */
- static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
-                                         struct page *page)
- {
-       if (!is_cow_mapping(vma->vm_flags))
-               return false;
-       if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
-               return false;
-       return page_maybe_dma_pinned(page);
- }
  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
  #define SECTION_IN_PAGE_FLAGS
  #endif
@@@ -1586,6 -1525,74 +1526,74 @@@ static inline unsigned long folio_pfn(s
        return page_to_pfn(&folio->page);
  }
  
+ static inline atomic_t *folio_pincount_ptr(struct folio *folio)
+ {
+       return &folio_page(folio, 1)->compound_pincount;
+ }
+ /**
+  * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
+  * @folio: The folio.
+  *
+  * This function checks if a folio has been pinned via a call to
+  * a function in the pin_user_pages() family.
+  *
+  * For small folios, the return value is partially fuzzy: false is not fuzzy,
+  * because it means "definitely not pinned for DMA", but true means "probably
+  * pinned for DMA, but possibly a false positive due to having at least
+  * GUP_PIN_COUNTING_BIAS worth of normal folio references".
+  *
+  * False positives are OK, because: a) it's unlikely for a folio to
+  * get that many refcounts, and b) all the callers of this routine are
+  * expected to be able to deal gracefully with a false positive.
+  *
+  * For large folios, the result will be exactly correct. That's because
+  * we have more tracking data available: the compound_pincount is used
+  * instead of the GUP_PIN_COUNTING_BIAS scheme.
+  *
+  * For more information, please see Documentation/core-api/pin_user_pages.rst.
+  *
+  * Return: True, if it is likely that the page has been "dma-pinned".
+  * False, if the page is definitely not dma-pinned.
+  */
+ static inline bool folio_maybe_dma_pinned(struct folio *folio)
+ {
+       if (folio_test_large(folio))
+               return atomic_read(folio_pincount_ptr(folio)) > 0;
+       /*
+        * folio_ref_count() is signed. If that refcount overflows, then
+        * folio_ref_count() returns a negative value, and callers will avoid
+        * further incrementing the refcount.
+        *
+        * Here, for that overflow case, use the sign bit to count a little
+        * bit higher via unsigned math, and thus still get an accurate result.
+        */
+       return ((unsigned int)folio_ref_count(folio)) >=
+               GUP_PIN_COUNTING_BIAS;
+ }
+ static inline bool page_maybe_dma_pinned(struct page *page)
+ {
+       return folio_maybe_dma_pinned(page_folio(page));
+ }
+ /*
+  * This should most likely only be called during fork() to see whether we
+  * should break the cow immediately for a page on the src mm.
+  */
+ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
+                                         struct page *page)
+ {
+       if (!is_cow_mapping(vma->vm_flags))
+               return false;
+       if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
+               return false;
+       return page_maybe_dma_pinned(page);
+ }
  /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
  #ifdef CONFIG_MIGRATION
  static inline bool is_pinnable_page(struct page *page)
@@@ -1600,6 -1607,11 +1608,11 @@@ static inline bool is_pinnable_page(str
  }
  #endif
  
+ static inline bool folio_is_pinnable(struct folio *folio)
+ {
+       return is_pinnable_page(&folio->page);
+ }
  static inline void set_page_zone(struct page *page, enum zone_type zone)
  {
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
@@@ -1749,7 -1761,6 +1762,6 @@@ static inline void *folio_address(cons
  }
  
  extern void *page_rmapping(struct page *page);
- extern struct anon_vma *page_anon_vma(struct page *page);
  extern pgoff_t __page_file_index(struct page *page);
  
  /*
@@@ -1855,7 -1866,6 +1867,6 @@@ extern void truncate_setsize(struct ino
  void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
  void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
  int generic_error_remove_page(struct address_space *mapping, struct page *page);
- int invalidate_inode_page(struct page *page);
  
  #ifdef CONFIG_MMU
  extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
@@@ -1917,6 -1927,10 +1928,6 @@@ long get_user_pages(unsigned long start
  long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages,
                    struct vm_area_struct **vmas);
 -long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 -                  unsigned int gup_flags, struct page **pages, int *locked);
 -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
 -                  unsigned int gup_flags, struct page **pages, int *locked);
  long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
  long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
@@@ -2450,6 -2464,7 +2461,6 @@@ static inline spinlock_t *pud_lock(stru
  }
  
  extern void __init pagecache_init(void);
 -extern void __init free_area_init_memoryless_node(int nid);
  extern void free_initmem(void);
  
  /*
@@@ -2622,7 -2637,7 +2633,7 @@@ static inline int vma_adjust(struct vm_
  extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
 -      struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
 +      struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *);
  extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
  extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
        unsigned long addr, int new_below);
@@@ -2921,13 -2936,11 +2932,11 @@@ struct page *follow_page(struct vm_area
  #define FOLL_FORCE    0x10    /* get_user_pages read/write w/o permission */
  #define FOLL_NOWAIT   0x20    /* if a disk transfer is needed, start the IO
                                 * and return without waiting upon it */
- #define FOLL_POPULATE 0x40    /* fault in pages (with FOLL_MLOCK) */
  #define FOLL_NOFAULT  0x80    /* do not fault in pages */
  #define FOLL_HWPOISON 0x100   /* check page is hwpoisoned */
  #define FOLL_NUMA     0x200   /* force NUMA hinting page fault */
  #define FOLL_MIGRATION        0x400   /* wait for page to replace migration entry */
  #define FOLL_TRIED    0x800   /* a retry, previous pass started an IO */
- #define FOLL_MLOCK    0x1000  /* lock present pages */
  #define FOLL_REMOTE   0x2000  /* we are working on non-current tsk/mm */
  #define FOLL_COW      0x4000  /* internal GUP flag */
  #define FOLL_ANON     0x8000  /* don't do file mappings */
@@@ -3147,12 -3160,10 +3156,12 @@@ static inline void print_vma_addr(char 
  }
  #endif
  
 +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
  int vmemmap_remap_free(unsigned long start, unsigned long end,
                       unsigned long reuse);
  int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                        unsigned long reuse, gfp_t gfp_mask);
 +#endif
  
  void *sparse_buffer_alloc(unsigned long size);
  struct page * __populate_section_memmap(unsigned long pfn,
@@@ -3242,7 -3253,6 +3251,7 @@@ enum mf_action_page_type 
        MF_MSG_BUDDY,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
 +      MF_MSG_DIFFERENT_PAGE_SIZE,
        MF_MSG_UNKNOWN,
  };
  
@@@ -3371,15 -3381,13 +3380,14 @@@ static inline int seal_check_future_wri
  
  #ifdef CONFIG_ANON_VMA_NAME
  int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 -                        unsigned long len_in, const char *name);
 +                        unsigned long len_in,
 +                        struct anon_vma_name *anon_name);
  #else
  static inline int
  madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 -                    unsigned long len_in, const char *name) {
 +                    unsigned long len_in, struct anon_vma_name *anon_name) {
        return 0;
  }
  #endif
  
- #endif /* __KERNEL__ */
  #endif /* _LINUX_MM_H */
diff --combined include/linux/mm_inline.h
index cf90b1fa2c60c6d9862329926e6762ccbd5af120,884d6f6af05bb6c0236de5efb3eedc0afd928c02..ac32125745abc03f2373dd6930ec29783047b6cc
@@@ -99,7 -99,8 +99,8 @@@ void lruvec_add_folio(struct lruvec *lr
  
        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));
-       list_add(&folio->lru, &lruvec->lists[lru]);
+       if (lru != LRU_UNEVICTABLE)
+               list_add(&folio->lru, &lruvec->lists[lru]);
  }
  
  static __always_inline void add_page_to_lru_list(struct page *page,
@@@ -115,6 -116,7 +116,7 @@@ void lruvec_add_folio_tail(struct lruve
  
        update_lru_size(lruvec, lru, folio_zonenum(folio),
                        folio_nr_pages(folio));
+       /* This is not expected to be used on LRU_UNEVICTABLE */
        list_add_tail(&folio->lru, &lruvec->lists[lru]);
  }
  
@@@ -127,8 -129,11 +129,11 @@@ static __always_inline void add_page_to
  static __always_inline
  void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
  {
-       list_del(&folio->lru);
-       update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio),
+       enum lru_list lru = folio_lru_list(folio);
+       if (lru != LRU_UNEVICTABLE)
+               list_del(&folio->lru);
+       update_lru_size(lruvec, lru, folio_zonenum(folio),
                        -folio_nr_pages(folio));
  }
  
@@@ -140,91 -145,50 +145,91 @@@ static __always_inline void del_page_fr
  
  #ifdef CONFIG_ANON_VMA_NAME
  /*
 - * mmap_lock should be read-locked when calling vma_anon_name() and while using
 - * the returned pointer.
 + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
 + * either keep holding the lock while using the returned pointer or it should
 + * raise anon_vma_name refcount before releasing the lock.
   */
 -extern const char *vma_anon_name(struct vm_area_struct *vma);
 +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
 +extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
 +extern void anon_vma_name_free(struct kref *kref);
  
 -/*
 - * mmap_lock should be read-locked for orig_vma->vm_mm.
 - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
 - * isolated.
 - */
 -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
 -                            struct vm_area_struct *new_vma);
 +/* mmap_lock should be read-locked */
 +static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
 +{
 +      if (anon_name)
 +              kref_get(&anon_name->kref);
 +}
  
 -/*
 - * mmap_lock should be write-locked or vma should have been isolated under
 - * write-locked mmap_lock protection.
 - */
 -extern void free_vma_anon_name(struct vm_area_struct *vma);
 +static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
 +{
 +      if (anon_name)
 +              kref_put(&anon_name->kref, anon_vma_name_free);
 +}
  
 -/* mmap_lock should be read-locked */
 -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
 -                                       const char *name)
 +static inline
 +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
  {
 -      const char *vma_name = vma_anon_name(vma);
 +      /* Prevent anon_name refcount saturation early on */
 +      if (kref_read(&anon_name->kref) < REFCOUNT_MAX) {
 +              anon_vma_name_get(anon_name);
 +              return anon_name;
 +
 +      }
 +      return anon_vma_name_alloc(anon_name->name);
 +}
  
 -      /* either both NULL, or pointers to same string */
 -      if (vma_name == name)
 +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
 +                                   struct vm_area_struct *new_vma)
 +{
 +      struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
 +
 +      if (anon_name)
 +              new_vma->anon_name = anon_vma_name_reuse(anon_name);
 +}
 +
 +static inline void free_anon_vma_name(struct vm_area_struct *vma)
 +{
 +      /*
 +       * Not using anon_vma_name because it generates a warning if mmap_lock
 +       * is not held, which might be the case here.
 +       */
 +      if (!vma->vm_file)
 +              anon_vma_name_put(vma->anon_name);
 +}
 +
 +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 +                                  struct anon_vma_name *anon_name2)
 +{
 +      if (anon_name1 == anon_name2)
                return true;
  
 -      return name && vma_name && !strcmp(name, vma_name);
 +      return anon_name1 && anon_name2 &&
 +              !strcmp(anon_name1->name, anon_name2->name);
  }
 +
  #else /* CONFIG_ANON_VMA_NAME */
 -static inline const char *vma_anon_name(struct vm_area_struct *vma)
 +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
  {
        return NULL;
  }
 -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
 -                            struct vm_area_struct *new_vma) {}
 -static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
 -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
 -                                       const char *name)
 +
 +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
 +{
 +      return NULL;
 +}
 +
 +static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
 +static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
 +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
 +                                   struct vm_area_struct *new_vma) {}
 +static inline void free_anon_vma_name(struct vm_area_struct *vma) {}
 +
 +static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 +                                  struct anon_vma_name *anon_name2)
  {
        return true;
  }
 +
  #endif  /* CONFIG_ANON_VMA_NAME */
  
  static inline void init_tlb_flush_pending(struct mm_struct *mm)
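
A minimal caller-side sketch of the refcounted naming API declared above (illustrative only, not part of this diff; the helper names are hypothetical). With mmap_lock held for read a comparison needs no extra reference, while keeping a name past the lock requires an explicit get/put pair:

/*
 * Hypothetical helpers, for illustration only.  Callers must hold
 * mmap_lock for read, as required by anon_vma_name().
 */
static bool example_vmas_share_anon_name(struct vm_area_struct *a,
                                         struct vm_area_struct *b)
{
        return anon_vma_name_eq(anon_vma_name(a), anon_vma_name(b));
}

static struct anon_vma_name *example_get_vma_name(struct vm_area_struct *vma)
{
        struct anon_vma_name *anon_name = anon_vma_name(vma);

        /* Pin the name so it stays valid after mmap_lock is dropped. */
        anon_vma_name_get(anon_name);
        return anon_name;       /* release later with anon_vma_name_put() */
}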
diff --combined include/linux/mm_types.h
index 5f7a33890b0fa718b3930d76c06afcb9741d396b,0e274c9b934e58e319d53e0648243e630db3d5b2..8834e38c06a4fc163bb40f01370b3457e11af532
@@@ -85,7 -85,16 +85,16 @@@ struct page 
                         * lruvec->lru_lock.  Sometimes used as a generic list
                         * by the page owner.
                         */
-                       struct list_head lru;
+                       union {
+                               struct list_head lru;
+                               /* Or, for the Unevictable "LRU list" slot */
+                               struct {
+                                       /* Always even, to negate PageTail */
+                                       void *__filler;
+                                       /* Count page's or folio's mlocks */
+                                       unsigned int mlock_count;
+                               };
+                       };
                        /* See page-flags.h for PAGE_MAPPING_FLAGS */
                        struct address_space *mapping;
                        pgoff_t index;          /* Our offset within mapping. */
                        unsigned char compound_dtor;
                        unsigned char compound_order;
                        atomic_t compound_mapcount;
+                       atomic_t compound_pincount;
+ #ifdef CONFIG_64BIT
                        unsigned int compound_nr; /* 1 << compound_order */
+ #endif
                };
                struct {        /* Second tail page of compound page */
                        unsigned long _compound_pad_1;  /* compound_head */
-                       atomic_t hpage_pinned_refcount;
+                       unsigned long _compound_pad_2;
                        /* For both global and memcg */
                        struct list_head deferred_list;
                };
@@@ -241,7 -253,13 +253,13 @@@ struct folio 
                struct {
        /* public: */
                        unsigned long flags;
-                       struct list_head lru;
+                       union {
+                               struct list_head lru;
+                               struct {
+                                       void *__filler;
+                                       unsigned int mlock_count;
+                               };
+                       };
                        struct address_space *mapping;
                        pgoff_t index;
                        void *private;
@@@ -285,7 -303,7 +303,7 @@@ static inline atomic_t *compound_mapcou
  
  static inline atomic_t *compound_pincount_ptr(struct page *page)
  {
-       return &page[2].hpage_pinned_refcount;
+       return &page[1].compound_pincount;
  }
  
  /*
@@@ -416,10 -434,7 +434,10 @@@ struct vm_area_struct 
                        struct rb_node rb;
                        unsigned long rb_subtree_last;
                } shared;
 -              /* Serialized by mmap_sem. */
 +              /*
 +               * Serialized by mmap_sem. Never use directly because it is
 +               * valid only when vm_file is NULL. Use anon_vma_name instead.
 +               */
                struct anon_vma_name *anon_name;
        };
  
@@@ -634,7 -649,7 +652,7 @@@ struct mm_struct 
  #endif
                struct work_struct async_put_work;
  
 -#ifdef CONFIG_IOMMU_SUPPORT
 +#ifdef CONFIG_IOMMU_SVA
                u32 pasid;
  #endif
        } __randomize_layout;
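
Illustrative only (not part of this diff): with the pin count moved from page[2].hpage_pinned_refcount to page[1].compound_pincount, callers still go through the same accessor. The helper below is a hypothetical sketch of that.

/* Hypothetical sketch: read the pin count of a compound head page. */
static inline int example_compound_pincount(struct page *page)
{
        VM_BUG_ON_PAGE(!PageHead(page), page);
        return atomic_read(compound_pincount_ptr(page));
}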
diff --combined include/linux/pagemap.h
index dc31eb981ea2bda55a127d11876cb35d35e9d218,20d7cbabf6546df5d9428bebdf0479c276611da8..eaedcef43a7f4865c9fe3249133ce2a23d6738fe
  
  struct folio_batch;
  
+ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+                                       pgoff_t start, pgoff_t end);
+ static inline void invalidate_remote_inode(struct inode *inode)
+ {
+       if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode))
+               invalidate_mapping_pages(inode->i_mapping, 0, -1);
+ }
+ int invalidate_inode_pages2(struct address_space *mapping);
+ int invalidate_inode_pages2_range(struct address_space *mapping,
+               pgoff_t start, pgoff_t end);
+ int write_inode_now(struct inode *, int sync);
+ int filemap_fdatawrite(struct address_space *);
+ int filemap_flush(struct address_space *);
+ int filemap_fdatawait_keep_errors(struct address_space *mapping);
+ int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
+ int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+               loff_t start_byte, loff_t end_byte);
+ static inline int filemap_fdatawait(struct address_space *mapping)
+ {
+       return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
+ }
+ bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
+ int filemap_write_and_wait_range(struct address_space *mapping,
+               loff_t lstart, loff_t lend);
+ int __filemap_fdatawrite_range(struct address_space *mapping,
+               loff_t start, loff_t end, int sync_mode);
+ int filemap_fdatawrite_range(struct address_space *mapping,
+               loff_t start, loff_t end);
+ int filemap_check_errors(struct address_space *mapping);
+ void __filemap_set_wb_err(struct address_space *mapping, int err);
+ int filemap_fdatawrite_wbc(struct address_space *mapping,
+                          struct writeback_control *wbc);
+ static inline int filemap_write_and_wait(struct address_space *mapping)
+ {
+       return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
+ }
+ /**
+  * filemap_set_wb_err - set a writeback error on an address_space
+  * @mapping: mapping in which to set writeback error
+  * @err: error to be set in mapping
+  *
+  * When writeback fails in some way, we must record that error so that
+  * userspace can be informed when fsync and the like are called.  We endeavor
+  * to report errors on any file that was open at the time of the error.  Some
+  * internal callers also need to know when writeback errors have occurred.
+  *
+  * When a writeback error occurs, most filesystems will want to call
+  * filemap_set_wb_err to record the error in the mapping so that it will be
+  * automatically reported whenever fsync is called on the file.
+  */
+ static inline void filemap_set_wb_err(struct address_space *mapping, int err)
+ {
+       /* Fastpath for common case of no error */
+       if (unlikely(err))
+               __filemap_set_wb_err(mapping, err);
+ }
+ /**
+  * filemap_check_wb_err - has an error occurred since the mark was sampled?
+  * @mapping: mapping to check for writeback errors
+  * @since: previously-sampled errseq_t
+  *
+  * Grab the errseq_t value from the mapping, and see if it has changed "since"
+  * the given value was sampled.
+  *
+  * If it has then report the latest error set, otherwise return 0.
+  */
+ static inline int filemap_check_wb_err(struct address_space *mapping,
+                                       errseq_t since)
+ {
+       return errseq_check(&mapping->wb_err, since);
+ }
+ /**
+  * filemap_sample_wb_err - sample the current errseq_t to test for later errors
+  * @mapping: mapping to be sampled
+  *
+  * Writeback errors are always reported relative to a particular sample point
+  * in the past. This function provides those sample points.
+  */
+ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
+ {
+       return errseq_sample(&mapping->wb_err);
+ }
+ /**
+  * file_sample_sb_err - sample the current errseq_t to test for later errors
+  * @file: file pointer to be sampled
+  *
+  * Grab the most current superblock-level errseq_t value for the given
+  * struct file.
+  */
+ static inline errseq_t file_sample_sb_err(struct file *file)
+ {
+       return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
+ }
+ /*
+  * Flush file data before changing attributes.  Caller must hold any locks
+  * required to prevent further writes to this file until we're done setting
+  * flags.
+  */
+ static inline int inode_drain_writes(struct inode *inode)
+ {
+       inode_dio_wait(inode);
+       return filemap_write_and_wait(inode->i_mapping);
+ }
  static inline bool mapping_empty(struct address_space *mapping)
  {
        return xa_empty(&mapping->i_pages);
@@@ -192,9 -306,14 +306,14 @@@ static inline void mapping_set_large_fo
        __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
  }
  
+ /*
+  * Large folio support currently depends on THP.  These dependencies are
+  * being worked on but are not yet fixed.
+  */
  static inline bool mapping_large_folio_support(struct address_space *mapping)
  {
-       return test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
+       return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+               test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
  }
  
  static inline int filemap_nr_thps(struct address_space *mapping)
@@@ -212,7 -331,7 +331,7 @@@ static inline void filemap_nr_thps_inc(
        if (!mapping_large_folio_support(mapping))
                atomic_inc(&mapping->nr_thps);
  #else
-       WARN_ON_ONCE(1);
+       WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
  #endif
  }
  
@@@ -222,7 -341,7 +341,7 @@@ static inline void filemap_nr_thps_dec(
        if (!mapping_large_folio_support(mapping))
                atomic_dec(&mapping->nr_thps);
  #else
-       WARN_ON_ONCE(1);
+       WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
  #endif
  }
  
@@@ -283,16 -402,6 +402,6 @@@ static inline struct inode *folio_inode
        return folio->mapping->host;
  }
  
- static inline bool page_cache_add_speculative(struct page *page, int count)
- {
-       return folio_ref_try_add_rcu((struct folio *)page, count);
- }
- static inline bool page_cache_get_speculative(struct page *page)
- {
-       return page_cache_add_speculative(page, 1);
- }
  /**
   * folio_attach_private - Attach private data to a folio.
   * @folio: Folio to attach data to.
@@@ -594,6 -703,13 +703,6 @@@ static inline struct page *find_subpage
  unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
                        pgoff_t end, unsigned int nr_pages,
                        struct page **pages);
 -static inline unsigned find_get_pages(struct address_space *mapping,
 -                      pgoff_t *start, unsigned int nr_pages,
 -                      struct page **pages)
 -{
 -      return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages,
 -                                  pages);
 -}
  unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
                               unsigned int nr_pages, struct page **pages);
  unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
@@@ -706,6 -822,17 +815,17 @@@ static inline loff_t folio_file_pos(str
        return page_file_offset(&folio->page);
  }
  
+ /*
+  * Get the offset in PAGE_SIZE (even for hugetlb folios).
+  * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
+  */
+ static inline pgoff_t folio_pgoff(struct folio *folio)
+ {
+       if (unlikely(folio_test_hugetlb(folio)))
+               return hugetlb_basepage_index(&folio->page);
+       return folio->index;
+ }
  extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
                                     unsigned long address);
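
Illustrative only (not part of this diff): the sample/check pattern described by the filemap_sample_wb_err()/filemap_check_wb_err() kernel-doc above, as a simplified fsync-style helper might use it (the function name is hypothetical).

/* Hypothetical sketch of the writeback-error sample/check pattern. */
static int example_flush_and_check(struct address_space *mapping)
{
        errseq_t since = filemap_sample_wb_err(mapping);
        int err;

        err = filemap_write_and_wait(mapping);
        if (err)
                return err;

        /* Report any error recorded since the sample point. */
        return filemap_check_wb_err(mapping, since);
}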
  
diff --combined include/linux/swap.h
index a246c137678e6594e67c096e807174b3e77aba82,064e60e9f63f7951a8c2875b30a645def6b0d90d..40c618422753dd63484b57adcf325d1fe87d8f7b
@@@ -328,18 -328,15 +328,18 @@@ static inline swp_entry_t folio_swap_en
  
  /* linux/mm/workingset.c */
  void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
- void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
+ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
  void workingset_refault(struct folio *folio, void *shadow);
  void workingset_activation(struct folio *folio);
  
  /* Only track the nodes of mappings with shadow entries */
  void workingset_update_node(struct xa_node *node);
 +extern struct list_lru shadow_nodes;
  #define mapping_set_update(xas, mapping) do {                         \
 -      if (!dax_mapping(mapping) && !shmem_mapping(mapping))           \
 +      if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {         \
                xas_set_update(xas, workingset_update_node);            \
 +              xas_set_lru(xas, &shadow_nodes);                        \
 +      }                                                               \
  } while (0)
  
  /* linux/mm/page_alloc.c */
@@@ -375,7 -372,6 +375,6 @@@ extern void lru_add_drain(void)
  extern void lru_add_drain_cpu(int cpu);
  extern void lru_add_drain_cpu_zone(struct zone *zone);
  extern void lru_add_drain_all(void);
- extern void deactivate_file_page(struct page *page);
  extern void deactivate_page(struct page *page);
  extern void mark_page_lazyfree(struct page *page);
  extern void swap_setup(void);
@@@ -387,6 -383,7 +386,6 @@@ extern void lru_cache_add_inactive_or_u
  extern unsigned long zone_reclaimable_pages(struct zone *zone);
  extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);
 -extern bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
@@@ -397,7 -394,7 +396,7 @@@ extern unsigned long mem_cgroup_shrink_
                                                unsigned long *nr_scanned);
  extern unsigned long shrink_all_memory(unsigned long nr_pages);
  extern int vm_swappiness;
- extern int remove_mapping(struct address_space *mapping, struct page *page);
+ long remove_mapping(struct address_space *mapping, struct folio *folio);
  
  extern unsigned long reclaim_pages(struct list_head *page_list);
  #ifdef CONFIG_NUMA
@@@ -743,7 -740,7 +742,7 @@@ static inline void cgroup_throttle_swap
  #endif
  
  #ifdef CONFIG_MEMCG_SWAP
- extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
+ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
  extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
  static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
  {
@@@ -763,7 -760,7 +762,7 @@@ static inline void mem_cgroup_uncharge_
  extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
  extern bool mem_cgroup_swap_full(struct page *page);
  #else
- static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+ static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
  {
  }
  
diff --combined mm/Kconfig
index 5ffb79162a72fa28475f285812398ab8135eb75f,95d4aa3acaefe08fef4fc3bada7bdaed8296330c..761f5021ba511345ad5aeb1deb182e118cb2060f
@@@ -249,6 -249,9 +249,9 @@@ config MIGRATIO
          pages as migration can relocate pages to satisfy a huge page
          allocation instead of reclaiming.
  
+ config DEVICE_MIGRATION
+       def_bool MIGRATION && ZONE_DEVICE
  config ARCH_ENABLE_HUGEPAGE_MIGRATION
        bool
  
@@@ -262,9 -265,6 +265,9 @@@ config HUGETLB_PAGE_SIZE_VARIABL
          HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
          on a platform.
  
 +        Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
 +        clamped down to MAX_ORDER - 1.
 +
  config CONTIG_ALLOC
        def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
  
@@@ -414,9 -414,6 +417,9 @@@ choic
          benefit.
  endchoice
  
 +config ARCH_WANT_GENERAL_HUGETLB
 +      bool
 +
  config ARCH_WANTS_THP_SWAP
        def_bool n
  
@@@ -750,15 -747,6 +753,15 @@@ config IDLE_PAGE_TRACKIN
  config ARCH_HAS_CACHE_LINE_SIZE
        bool
  
 +config ARCH_HAS_CURRENT_STACK_POINTER
 +      bool
 +      help
 +        In support of HARDENED_USERCOPY performing stack variable lifetime
 +        checking, an architecture-agnostic way to find the stack pointer
 +        is needed. Once an architecture defines an unsigned long global
 +        register alias named "current_stack_pointer", this config can be
 +        selected.
 +
  config ARCH_HAS_PTE_DEVMAP
        bool
  
@@@ -791,9 -779,6 +794,6 @@@ config ZONE_DEVIC
  
          If FS_DAX is enabled, then say Y.
  
- config DEV_PAGEMAP_OPS
-       bool
  #
  # Helpers to mirror range of the CPU page tables of a process into device page
  # tables.
@@@ -805,7 -790,6 +805,6 @@@ config HMM_MIRRO
  config DEVICE_PRIVATE
        bool "Unaddressable device memory (GPU memory, ...)"
        depends on ZONE_DEVICE
-       select DEV_PAGEMAP_OPS
  
        help
          Allows creation of struct pages to represent unaddressable device
diff --combined mm/damon/paddr.c
index 7c263797a9a9c99a573586f620e2b4522d258f2d,74c2b6e1ca489643c69e41740c6e8d146a7f05fc..21474ae63bc7a8807a79f074b89c9b5e998daafe
  #include <linux/swap.h>
  
  #include "../internal.h"
 -#include "prmtv-common.h"
 +#include "ops-common.h"
  
- static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
+ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
                unsigned long addr, void *arg)
  {
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = addr,
-       };
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
  
        while (page_vma_mapped_walk(&pvmw)) {
                addr = pvmw.address;
  
  static void damon_pa_mkold(unsigned long paddr)
  {
+       struct folio *folio;
        struct page *page = damon_get_page(PHYS_PFN(paddr));
        struct rmap_walk_control rwc = {
                .rmap_one = __damon_pa_mkold,
-               .anon_lock = page_lock_anon_vma_read,
+               .anon_lock = folio_lock_anon_vma_read,
        };
        bool need_lock;
  
        if (!page)
                return;
+       folio = page_folio(page);
  
-       if (!page_mapped(page) || !page_rmapping(page)) {
-               set_page_idle(page);
+       if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+               folio_set_idle(folio);
                goto out;
        }
  
-       need_lock = !PageAnon(page) || PageKsm(page);
-       if (need_lock && !trylock_page(page))
+       need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+       if (need_lock && !folio_trylock(folio))
                goto out;
  
-       rmap_walk(page, &rwc);
+       rmap_walk(folio, &rwc);
  
        if (need_lock)
-               unlock_page(page);
+               folio_unlock(folio);
  
  out:
-       put_page(page);
+       folio_put(folio);
  }
  
  static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
@@@ -89,15 -87,11 +87,11 @@@ struct damon_pa_access_chk_result 
        bool accessed;
  };
  
- static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
+ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
                unsigned long addr, void *arg)
  {
        struct damon_pa_access_chk_result *result = arg;
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = addr,
-       };
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
  
        result->accessed = false;
        result->page_sz = PAGE_SIZE;
                addr = pvmw.address;
                if (pvmw.pte) {
                        result->accessed = pte_young(*pvmw.pte) ||
-                               !page_is_idle(page) ||
+                               !folio_test_idle(folio) ||
                                mmu_notifier_test_young(vma->vm_mm, addr);
                } else {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        result->accessed = pmd_young(*pvmw.pmd) ||
-                               !page_is_idle(page) ||
+                               !folio_test_idle(folio) ||
                                mmu_notifier_test_young(vma->vm_mm, addr);
                        result->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
  #else
  
  static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
  {
+       struct folio *folio;
        struct page *page = damon_get_page(PHYS_PFN(paddr));
        struct damon_pa_access_chk_result result = {
                .page_sz = PAGE_SIZE,
        struct rmap_walk_control rwc = {
                .arg = &result,
                .rmap_one = __damon_pa_young,
-               .anon_lock = page_lock_anon_vma_read,
+               .anon_lock = folio_lock_anon_vma_read,
        };
        bool need_lock;
  
        if (!page)
                return false;
+       folio = page_folio(page);
  
-       if (!page_mapped(page) || !page_rmapping(page)) {
-               if (page_is_idle(page))
+       if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+               if (folio_test_idle(folio))
                        result.accessed = false;
                else
                        result.accessed = true;
-               put_page(page);
+               folio_put(folio);
                goto out;
        }
  
-       need_lock = !PageAnon(page) || PageKsm(page);
-       if (need_lock && !trylock_page(page)) {
-               put_page(page);
-               return NULL;
+       need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+       if (need_lock && !folio_trylock(folio)) {
+               folio_put(folio);
+               return false;
        }
  
-       rmap_walk(page, &rwc);
+       rmap_walk(folio, &rwc);
  
        if (need_lock)
-               unlock_page(page);
-       put_page(page);
+               folio_unlock(folio);
+       folio_put(folio);
  
  out:
        *page_sz = result.page_sz;
@@@ -208,6 -204,11 +204,6 @@@ static unsigned int damon_pa_check_acce
        return max_nr_accesses;
  }
  
 -bool damon_pa_target_valid(void *t)
 -{
 -      return true;
 -}
 -
  static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
                struct damon_target *t, struct damon_region *r,
                struct damos *scheme)
@@@ -256,22 -257,15 +252,22 @@@ static int damon_pa_scheme_score(struc
        return DAMOS_MAX_SCORE;
  }
  
 -void damon_pa_set_primitives(struct damon_ctx *ctx)
 +static int __init damon_pa_initcall(void)
  {
 -      ctx->primitive.init = NULL;
 -      ctx->primitive.update = NULL;
 -      ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks;
 -      ctx->primitive.check_accesses = damon_pa_check_accesses;
 -      ctx->primitive.reset_aggregated = NULL;
 -      ctx->primitive.target_valid = damon_pa_target_valid;
 -      ctx->primitive.cleanup = NULL;
 -      ctx->primitive.apply_scheme = damon_pa_apply_scheme;
 -      ctx->primitive.get_scheme_score = damon_pa_scheme_score;
 -}
 +      struct damon_operations ops = {
 +              .id = DAMON_OPS_PADDR,
 +              .init = NULL,
 +              .update = NULL,
 +              .prepare_access_checks = damon_pa_prepare_access_checks,
 +              .check_accesses = damon_pa_check_accesses,
 +              .reset_aggregated = NULL,
 +              .target_valid = NULL,
 +              .cleanup = NULL,
 +              .apply_scheme = damon_pa_apply_scheme,
 +              .get_scheme_score = damon_pa_scheme_score,
 +      };
 +
 +      return damon_register_ops(&ops);
 +};
 +
 +subsys_initcall(damon_pa_initcall);
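
Illustrative only (not part of this diff): after this conversion an rmap_one callback receives a folio and builds its walk state with DEFINE_FOLIO_VMA_WALK(), mirroring __damon_pa_mkold()/__damon_pa_young() above. The callback below is a hypothetical sketch.

/* Hypothetical sketch of a folio-based rmap_one callback. */
static bool example_rmap_one(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long addr, void *arg)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
        bool *referenced = arg;

        while (page_vma_mapped_walk(&pvmw)) {
                if (pvmw.pte && pte_young(*pvmw.pte))
                        *referenced = true;
        }

        /* Returning false would stop the rmap walk early. */
        return true;
}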
diff --combined mm/filemap.c
index bd788bbe41b0c64804d0353b2695d2d1c9bd61e6,7608ee03066298ff22a0465cbc3252c0623af794..1752ef1266f3c2110e9fdbf32f6f437f9da26ba1
@@@ -842,26 -842,27 +842,27 @@@ noinline int __filemap_add_folio(struc
  {
        XA_STATE(xas, &mapping->i_pages, index);
        int huge = folio_test_hugetlb(folio);
-       int error;
        bool charged = false;
+       long nr = 1;
  
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
        mapping_set_update(&xas, mapping);
  
-       folio_get(folio);
-       folio->mapping = mapping;
-       folio->index = index;
        if (!huge) {
-               error = mem_cgroup_charge(folio, NULL, gfp);
+               int error = mem_cgroup_charge(folio, NULL, gfp);
                VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
                if (error)
-                       goto error;
+                       return error;
                charged = true;
+               xas_set_order(&xas, index, folio_order(folio));
+               nr = folio_nr_pages(folio);
        }
  
        gfp &= GFP_RECLAIM_MASK;
+       folio_ref_add(folio, nr);
+       folio->mapping = mapping;
+       folio->index = xas.xa_index;
  
        do {
                unsigned int order = xa_get_order(xas.xa, xas.xa_index);
                        /* entry may have been split before we acquired lock */
                        order = xa_get_order(xas.xa, xas.xa_index);
                        if (order > folio_order(folio)) {
+                               /* How to handle large swap entries? */
+                               BUG_ON(shmem_mapping(mapping));
                                xas_split(&xas, old, order);
                                xas_reset(&xas);
                        }
                if (xas_error(&xas))
                        goto unlock;
  
-               mapping->nrpages++;
+               mapping->nrpages += nr;
  
                /* hugetlb pages do not participate in page cache accounting */
-               if (!huge)
-                       __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
+               if (!huge) {
+                       __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+                       if (folio_test_pmd_mappable(folio))
+                               __lruvec_stat_mod_folio(folio,
+                                               NR_FILE_THPS, nr);
+               }
  unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));
  
-       if (xas_error(&xas)) {
-               error = xas_error(&xas);
-               if (charged)
-                       mem_cgroup_uncharge(folio);
+       if (xas_error(&xas))
                goto error;
-       }
  
        trace_mm_filemap_add_to_page_cache(folio);
        return 0;
  error:
+       if (charged)
+               mem_cgroup_uncharge(folio);
        folio->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
-       folio_put(folio);
-       return error;
+       folio_put_refs(folio, nr);
+       return xas_error(&xas);
  }
  ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
  
@@@ -1054,12 -1059,6 +1059,12 @@@ void __init pagecache_init(void
                init_waitqueue_head(&folio_wait_table[i]);
  
        page_writeback_init();
 +
 +      /*
 +       * tmpfs uses the ZERO_PAGE for reading holes: it is up-to-date,
 +       * and splice's page_cache_pipe_buf_confirm() needs to see that.
 +       */
 +      SetPageUptodate(ZERO_PAGE(0));
  }
  
  /*
@@@ -2235,9 -2234,8 +2240,9 @@@ out
   * @nr_pages: The maximum number of pages
   * @pages:    Where the resulting pages are placed
   *
 - * find_get_pages_contig() works exactly like find_get_pages(), except
 - * that the returned number of pages are guaranteed to be contiguous.
 + * find_get_pages_contig() works exactly like find_get_pages_range(),
 + * except that the returned number of pages are guaranteed to be
 + * contiguous.
   *
   * Return: the number of pages which were found.
   */
@@@ -2297,9 -2295,9 +2302,9 @@@ EXPORT_SYMBOL(find_get_pages_contig)
   * @nr_pages: the maximum number of pages
   * @pages:    where the resulting pages are placed
   *
 - * Like find_get_pages(), except we only return head pages which are tagged
 - * with @tag.  @index is updated to the index immediately after the last
 - * page we return, ready for the next iteration.
 + * Like find_get_pages_range(), except we only return head pages which are
 + * tagged with @tag.  @index is updated to the index immediately after the
 + * last page we return, ready for the next iteration.
   *
   * Return: the number of pages which were found.
   */
@@@ -2997,6 -2995,24 +3002,24 @@@ static struct file *do_sync_mmap_readah
        struct file *fpin = NULL;
        unsigned int mmap_miss;
  
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* Use the readahead code, even if readahead is disabled */
+       if (vmf->vma->vm_flags & VM_HUGEPAGE) {
+               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+               ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
+               ra->size = HPAGE_PMD_NR;
+               /*
+                * Fetch two PMD folios, so we get the chance to actually
+                * readahead, unless we've been told not to.
+                */
+               if (!(vmf->vma->vm_flags & VM_RAND_READ))
+                       ra->size *= 2;
+               ra->async_size = HPAGE_PMD_NR;
+               page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+               return fpin;
+       }
+ #endif
        /* If we don't want any read-ahead, don't bother */
        if (vmf->vma->vm_flags & VM_RAND_READ)
                return fpin;
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ractl._index = ra->start;
-       do_page_cache_ra(&ractl, ra->size, ra->async_size);
+       page_cache_ra_order(&ractl, ra, 0);
        return fpin;
  }
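
Illustrative only (not part of this diff): a userspace sketch of how a file mapping can opt in to the VM_HUGEPAGE readahead path added above. Whether the hint takes effect depends on kernel configuration, and the function name is hypothetical.

/* Hypothetical userspace sketch: map a file and hint for huge pages. */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static void *example_map_with_hugepage_hint(const char *path, size_t len)
{
        int fd = open(path, O_RDONLY);
        void *p;

        if (fd < 0)
                return MAP_FAILED;
        p = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
        close(fd);
        if (p == MAP_FAILED)
                return MAP_FAILED;
        /* Sets VM_HUGEPAGE on the VMA when the kernel supports it. */
        madvise(p, len, MADV_HUGEPAGE);
        return p;
}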
  
diff --combined mm/gup.c
index 85d59dc08644ea38f281b6767a1c0ec28aefadfb,35d550dde7ff39cbdc8dd173d893985cea2469c6..271fbe8195d712d8babc3375859de1271b472687
+++ b/mm/gup.c
@@@ -29,107 -29,71 +29,71 @@@ struct follow_page_context 
        unsigned int page_mask;
  };
  
- static void hpage_pincount_add(struct page *page, int refs)
- {
-       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
-       VM_BUG_ON_PAGE(page != compound_head(page), page);
-       atomic_add(refs, compound_pincount_ptr(page));
- }
- static void hpage_pincount_sub(struct page *page, int refs)
- {
-       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
-       VM_BUG_ON_PAGE(page != compound_head(page), page);
-       atomic_sub(refs, compound_pincount_ptr(page));
- }
- /* Equivalent to calling put_page() @refs times. */
- static void put_page_refs(struct page *page, int refs)
- {
- #ifdef CONFIG_DEBUG_VM
-       if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
-               return;
- #endif
-       /*
-        * Calling put_page() for each ref is unnecessarily slow. Only the last
-        * ref needs a put_page().
-        */
-       if (refs > 1)
-               page_ref_sub(page, refs - 1);
-       put_page(page);
- }
  /*
-  * Return the compound head page with ref appropriately incremented,
+  * Return the folio with ref appropriately incremented,
   * or NULL if that failed.
   */
- static inline struct page *try_get_compound_head(struct page *page, int refs)
+ static inline struct folio *try_get_folio(struct page *page, int refs)
  {
-       struct page *head = compound_head(page);
+       struct folio *folio;
  
-       if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ retry:
+       folio = page_folio(page);
+       if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
                return NULL;
-       if (unlikely(!page_cache_add_speculative(head, refs)))
+       if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
                return NULL;
  
        /*
-        * At this point we have a stable reference to the head page; but it
-        * could be that between the compound_head() lookup and the refcount
-        * increment, the compound page was split, in which case we'd end up
-        * holding a reference on a page that has nothing to do with the page
+        * At this point we have a stable reference to the folio; but it
+        * could be that between calling page_folio() and the refcount
+        * increment, the folio was split, in which case we'd end up
+        * holding a reference on a folio that has nothing to do with the page
         * we were given anymore.
-        * So now that the head page is stable, recheck that the pages still
-        * belong together.
+        * So now that the folio is stable, recheck that the page still
+        * belongs to this folio.
         */
-       if (unlikely(compound_head(page) != head)) {
-               put_page_refs(head, refs);
-               return NULL;
+       if (unlikely(page_folio(page) != folio)) {
+               folio_put_refs(folio, refs);
+               goto retry;
        }
  
-       return head;
+       return folio;
  }
  
  /**
-  * try_grab_compound_head() - attempt to elevate a page's refcount, by a
-  * flags-dependent amount.
-  *
-  * Even though the name includes "compound_head", this function is still
-  * appropriate for callers that have a non-compound @page to get.
-  *
+  * try_grab_folio() - Attempt to get or pin a folio.
   * @page:  pointer to page to be grabbed
-  * @refs:  the value to (effectively) add to the page's refcount
+  * @refs:  the value to (effectively) add to the folio's refcount
   * @flags: gup flags: these are the FOLL_* flag values.
   *
   * "grab" names in this file mean, "look at flags to decide whether to use
-  * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+  * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
   *
   * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
   * same time. (That's true throughout the get_user_pages*() and
   * pin_user_pages*() APIs.) Cases:
   *
-  *    FOLL_GET: page's refcount will be incremented by @refs.
+  *    FOLL_GET: folio's refcount will be incremented by @refs.
   *
-  *    FOLL_PIN on compound pages that are > two pages long: page's refcount will
-  *    be incremented by @refs, and page[2].hpage_pinned_refcount will be
-  *    incremented by @refs * GUP_PIN_COUNTING_BIAS.
+  *    FOLL_PIN on large folios: folio's refcount will be incremented by
+  *    @refs, and its compound_pincount will be incremented by @refs.
   *
-  *    FOLL_PIN on normal pages, or compound pages that are two pages long:
-  *    page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
+  *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
+  *    @refs * GUP_PIN_COUNTING_BIAS.
   *
-  * Return: head page (with refcount appropriately incremented) for success, or
-  * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
-  * considered failure, and furthermore, a likely bug in the caller, so a warning
-  * is also emitted.
+  * Return: The folio containing @page (with refcount appropriately
+  * incremented) for success, or NULL upon failure. If neither FOLL_GET
+  * nor FOLL_PIN was set, that's considered failure, and furthermore,
+  * a likely bug in the caller, so a warning is also emitted.
   */
- __maybe_unused struct page *try_grab_compound_head(struct page *page,
-                                                  int refs, unsigned int flags)
+ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
  {
        if (flags & FOLL_GET)
-               return try_get_compound_head(page, refs);
+               return try_get_folio(page, refs);
        else if (flags & FOLL_PIN) {
+               struct folio *folio;
                /*
                 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
                 * right zone, so fail and let the caller fall back to the slow
                 * CAUTION: Don't use compound_head() on the page before this
                 * point, the result won't be stable.
                 */
-               page = try_get_compound_head(page, refs);
-               if (!page)
+               folio = try_get_folio(page, refs);
+               if (!folio)
                        return NULL;
  
                /*
-                * When pinning a compound page of order > 1 (which is what
-                * hpage_pincount_available() checks for), use an exact count to
-                * track it, via hpage_pincount_add/_sub().
+                * When pinning a large folio, use an exact count to track it.
                 *
-                * However, be sure to *also* increment the normal page refcount
-                * field at least once, so that the page really is pinned.
-                * That's why the refcount from the earlier
-                * try_get_compound_head() is left intact.
+                * However, be sure to *also* increment the normal folio
+                * refcount field at least once, so that the folio really
+                * is pinned.  That's why the refcount from the earlier
+                * try_get_folio() is left intact.
                 */
-               if (hpage_pincount_available(page))
-                       hpage_pincount_add(page, refs);
+               if (folio_test_large(folio))
+                       atomic_add(refs, folio_pincount_ptr(folio));
                else
-                       page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
+                       folio_ref_add(folio,
+                                       refs * (GUP_PIN_COUNTING_BIAS - 1));
+               node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
  
-               mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
-                                   refs);
-               return page;
+               return folio;
        }
  
        WARN_ON_ONCE(1);
        return NULL;
  }
  
- static void put_compound_head(struct page *page, int refs, unsigned int flags)
+ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
  {
        if (flags & FOLL_PIN) {
-               mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
-                                   refs);
-               if (hpage_pincount_available(page))
-                       hpage_pincount_sub(page, refs);
+               node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
+               if (folio_test_large(folio))
+                       atomic_sub(refs, folio_pincount_ptr(folio));
                else
                        refs *= GUP_PIN_COUNTING_BIAS;
        }
  
-       put_page_refs(page, refs);
+       folio_put_refs(folio, refs);
  }
  
  /**
   * try_grab_page() - elevate a page's refcount by a flag-dependent amount
+  * @page:    pointer to page to be grabbed
+  * @flags:   gup flags: these are the FOLL_* flag values.
   *
   * This might not do anything at all, depending on the flags argument.
   *
   * "grab" names in this file mean, "look at flags to decide whether to use
   * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
   *
-  * @page:    pointer to page to be grabbed
-  * @flags:   gup flags: these are the FOLL_* flag values.
-  *
   * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
-  * time. Cases: please see the try_grab_compound_head() documentation, with
+  * time. Cases: please see the try_grab_folio() documentation, with
   * "refs=1".
   *
   * Return: true for success, or if no action was required (if neither FOLL_PIN
   */
  bool __must_check try_grab_page(struct page *page, unsigned int flags)
  {
+       struct folio *folio = page_folio(page);
        WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+       if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
+               return false;
  
        if (flags & FOLL_GET)
-               return try_get_page(page);
+               folio_ref_inc(folio);
        else if (flags & FOLL_PIN) {
-               int refs = 1;
-               page = compound_head(page);
-               if (WARN_ON_ONCE(page_ref_count(page) <= 0))
-                       return false;
-               if (hpage_pincount_available(page))
-                       hpage_pincount_add(page, 1);
-               else
-                       refs = GUP_PIN_COUNTING_BIAS;
                /*
-                * Similar to try_grab_compound_head(): even if using the
-                * hpage_pincount_add/_sub() routines, be sure to
-                * *also* increment the normal page refcount field at least
-                * once, so that the page really is pinned.
+                * Similar to try_grab_folio(): be sure to *also*
+                * increment the normal page refcount field at least once,
+                * so that the page really is pinned.
                 */
-               page_ref_add(page, refs);
+               if (folio_test_large(folio)) {
+                       folio_ref_add(folio, 1);
+                       atomic_add(1, folio_pincount_ptr(folio));
+               } else {
+                       folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+               }
  
-               mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+               node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
        }
  
        return true;
   */
  void unpin_user_page(struct page *page)
  {
-       put_compound_head(compound_head(page), 1, FOLL_PIN);
+       gup_put_folio(page_folio(page), 1, FOLL_PIN);
  }
  EXPORT_SYMBOL(unpin_user_page);
  
- static inline void compound_range_next(unsigned long i, unsigned long npages,
-                                      struct page **list, struct page **head,
-                                      unsigned int *ntails)
+ static inline struct folio *gup_folio_range_next(struct page *start,
+               unsigned long npages, unsigned long i, unsigned int *ntails)
  {
-       struct page *next, *page;
+       struct page *next = nth_page(start, i);
+       struct folio *folio = page_folio(next);
        unsigned int nr = 1;
  
-       if (i >= npages)
-               return;
-       next = *list + i;
-       page = compound_head(next);
-       if (PageCompound(page) && compound_order(page) >= 1)
-               nr = min_t(unsigned int,
-                          page + compound_nr(page) - next, npages - i);
+       if (folio_test_large(folio))
+               nr = min_t(unsigned int, npages - i,
+                          folio_nr_pages(folio) - folio_page_idx(folio, next));
  
-       *head = page;
        *ntails = nr;
+       return folio;
  }
  
- #define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
-       for (__i = 0, \
-            compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
-            __i < __npages; __i += __ntails, \
-            compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
- static inline void compound_next(unsigned long i, unsigned long npages,
-                                struct page **list, struct page **head,
-                                unsigned int *ntails)
+ static inline struct folio *gup_folio_next(struct page **list,
+               unsigned long npages, unsigned long i, unsigned int *ntails)
  {
-       struct page *page;
+       struct folio *folio = page_folio(list[i]);
        unsigned int nr;
  
-       if (i >= npages)
-               return;
-       page = compound_head(list[i]);
        for (nr = i + 1; nr < npages; nr++) {
-               if (compound_head(list[nr]) != page)
+               if (page_folio(list[nr]) != folio)
                        break;
        }
  
-       *head = page;
        *ntails = nr - i;
+       return folio;
  }
  
- #define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
-       for (__i = 0, \
-            compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
-            __i < __npages; __i += __ntails, \
-            compound_next(__i, __npages, __list, &(__head), &(__ntails)))
  /**
   * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
   * @pages:  array of pages to be maybe marked dirty, and definitely released.
  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty)
  {
-       unsigned long index;
-       struct page *head;
-       unsigned int ntails;
+       unsigned long i;
+       struct folio *folio;
+       unsigned int nr;
  
        if (!make_dirty) {
                unpin_user_pages(pages, npages);
                return;
        }
  
-       for_each_compound_head(index, pages, npages, head, ntails) {
+       for (i = 0; i < npages; i += nr) {
+               folio = gup_folio_next(pages, npages, i, &nr);
                /*
                 * Checking PageDirty at this point may race with
                 * clear_page_dirty_for_io(), but that's OK. Two key
                 * written back, so it gets written back again in the
                 * next writeback cycle. This is harmless.
                 */
-               if (!PageDirty(head))
-                       set_page_dirty_lock(head);
-               put_compound_head(head, ntails, FOLL_PIN);
+               if (!folio_test_dirty(folio)) {
+                       folio_lock(folio);
+                       folio_mark_dirty(folio);
+                       folio_unlock(folio);
+               }
+               gup_put_folio(folio, nr, FOLL_PIN);
        }
  }
  EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
  void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty)
  {
-       unsigned long index;
-       struct page *head;
-       unsigned int ntails;
-       for_each_compound_range(index, &page, npages, head, ntails) {
-               if (make_dirty && !PageDirty(head))
-                       set_page_dirty_lock(head);
-               put_compound_head(head, ntails, FOLL_PIN);
+       unsigned long i;
+       struct folio *folio;
+       unsigned int nr;
+       for (i = 0; i < npages; i += nr) {
+               folio = gup_folio_range_next(page, npages, i, &nr);
+               if (make_dirty && !folio_test_dirty(folio)) {
+                       folio_lock(folio);
+                       folio_mark_dirty(folio);
+                       folio_unlock(folio);
+               }
+               gup_put_folio(folio, nr, FOLL_PIN);
        }
  }
  EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
   */
  void unpin_user_pages(struct page **pages, unsigned long npages)
  {
-       unsigned long index;
-       struct page *head;
-       unsigned int ntails;
+       unsigned long i;
+       struct folio *folio;
+       unsigned int nr;
  
        /*
         * If this WARN_ON() fires, then the system *might* be leaking pages (by
        if (WARN_ON(IS_ERR_VALUE(npages)))
                return;
  
-       for_each_compound_head(index, pages, npages, head, ntails)
-               put_compound_head(head, ntails, FOLL_PIN);
+       for (i = 0; i < npages; i += nr) {
+               folio = gup_folio_next(pages, npages, i, &nr);
+               gup_put_folio(folio, nr, FOLL_PIN);
+       }
  }
  EXPORT_SYMBOL(unpin_user_pages);
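
Illustrative only (not part of this diff): a typical driver-side pairing of pin and unpin, whose internals the hunks above convert to folios. pin_user_pages_fast() is not shown in this diff, and the function name below is hypothetical.

/* Hypothetical sketch: pin user pages for DMA, then dirty and release them. */
static int example_pin_for_dma(unsigned long uaddr, int nr_pages,
                               struct page **pages)
{
        int pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);

        if (pinned < 0)
                return pinned;

        /* ... device writes into the pinned pages here ... */

        unpin_user_pages_dirty_lock(pages, pinned, true);
        return 0;
}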
  
@@@ -464,6 -406,10 +406,6 @@@ static struct page *no_page_table(struc
  static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t *pte, unsigned int flags)
  {
 -      /* No page to get reference */
 -      if (flags & FOLL_GET)
 -              return -EFAULT;
 -
        if (flags & FOLL_TOUCH) {
                pte_t entry = *pte;
  
@@@ -593,32 -539,6 +535,6 @@@ retry
                 */
                mark_page_accessed(page);
        }
-       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-               /* Do not mlock pte-mapped THP */
-               if (PageTransCompound(page))
-                       goto out;
-               /*
-                * The preliminary mapping check is mainly to avoid the
-                * pointless overhead of lock_page on the ZERO_PAGE
-                * which might bounce very badly if there is contention.
-                *
-                * If the page is already locked, we don't need to
-                * handle it now - vmscan will handle it later if and
-                * when it attempts to reclaim the page.
-                */
-               if (page->mapping && trylock_page(page)) {
-                       lru_add_drain();  /* push cached pages to LRU */
-                       /*
-                        * Because we lock page here, and migration is
-                        * blocked by the pte's page reference, and we
-                        * know the page is still mapped, we don't even
-                        * need to check for file-cache page truncation.
-                        */
-                       mlock_vma_page(page);
-                       unlock_page(page);
-               }
-       }
  out:
        pte_unmap_unlock(ptep, ptl);
        return page;
@@@ -941,9 -861,6 +857,6 @@@ static int faultin_page(struct vm_area_
        unsigned int fault_flags = 0;
        vm_fault_t ret;
  
-       /* mlock all present pages, but do not fault in new pages */
-       if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
-               return -ENOENT;
        if (*flags & FOLL_NOFAULT)
                return -EFAULT;
        if (*flags & FOLL_WRITE)
@@@ -1194,22 -1111,13 +1107,20 @@@ retry
                        case -ENOMEM:
                        case -EHWPOISON:
                                goto out;
-                       case -ENOENT:
-                               goto next_page;
                        }
                        BUG();
                } else if (PTR_ERR(page) == -EEXIST) {
                        /*
                         * Proper page table entry exists, but no corresponding
 -                       * struct page.
 +                       * struct page. If the caller expects **pages to be
 +                       * filled in, bail out now, because that can't be done
 +                       * for this page.
                         */
 +                      if (pages) {
 +                              ret = PTR_ERR(page);
 +                              goto out;
 +                      }
 +
                        goto next_page;
                } else if (IS_ERR(page)) {
                        ret = PTR_ERR(page);
@@@ -1500,9 -1408,14 +1411,14 @@@ long populate_vma_page_range(struct vm_
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        mmap_assert_locked(mm);
  
-       gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+       /*
+        * Rightly or wrongly, the VM_LOCKONFAULT case has never used
+        * faultin_page() to break COW, so it has no work to do here.
+        */
        if (vma->vm_flags & VM_LOCKONFAULT)
-               gup_flags &= ~FOLL_POPULATE;
+               return nr_pages;
+       gup_flags = FOLL_TOUCH;
        /*
         * We want to touch writable mappings with a write fault in order
         * to break COW, except for shared mappings because these don't COW
@@@ -1569,10 -1482,9 +1485,9 @@@ long faultin_vma_page_range(struct vm_a
         *             in the page table.
         * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
         *                a poisoned page.
-        * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
         * !FOLL_FORCE: Require proper access permissions.
         */
-       gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
+       gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
        if (write)
                gup_flags |= FOLL_WRITE;
  
@@@ -1732,11 -1644,11 +1647,11 @@@ EXPORT_SYMBOL(fault_in_writeable)
   * @uaddr: start of address range
   * @size: length of address range
   *
 - * Faults in an address range using get_user_pages, i.e., without triggering
 - * hardware page faults.  This is primarily useful when we already know that
 - * some or all of the pages in the address range aren't in memory.
 + * Faults in an address range for writing.  This is primarily useful when we
 + * already know that some or all of the pages in the address range aren't in
 + * memory.
   *
 - * Other than fault_in_writeable(), this function is non-destructive.
 + * Unlike fault_in_writeable(), this function is non-destructive.
   *
   * Note that we don't pin or otherwise hold the pages referenced that we fault
   * in.  There's no guarantee that they'll stay in memory for any duration of
   */
  size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
  {
 -      unsigned long start = (unsigned long)untagged_addr(uaddr);
 -      unsigned long end, nstart, nend;
 +      unsigned long start = (unsigned long)uaddr, end;
        struct mm_struct *mm = current->mm;
 -      struct vm_area_struct *vma = NULL;
 -      int locked = 0;
 +      bool unlocked = false;
  
 -      nstart = start & PAGE_MASK;
 +      if (unlikely(size == 0))
 +              return 0;
        end = PAGE_ALIGN(start + size);
 -      if (end < nstart)
 +      if (end < start)
                end = 0;
 -      for (; nstart != end; nstart = nend) {
 -              unsigned long nr_pages;
 -              long ret;
  
 -              if (!locked) {
 -                      locked = 1;
 -                      mmap_read_lock(mm);
 -                      vma = find_vma(mm, nstart);
 -              } else if (nstart >= vma->vm_end)
 -                      vma = vma->vm_next;
 -              if (!vma || vma->vm_start >= end)
 -                      break;
 -              nend = end ? min(end, vma->vm_end) : vma->vm_end;
 -              if (vma->vm_flags & (VM_IO | VM_PFNMAP))
 -                      continue;
 -              if (nstart < vma->vm_start)
 -                      nstart = vma->vm_start;
 -              nr_pages = (nend - nstart) / PAGE_SIZE;
 -              ret = __get_user_pages_locked(mm, nstart, nr_pages,
 -                                            NULL, NULL, &locked,
 -                                            FOLL_TOUCH | FOLL_WRITE);
 -              if (ret <= 0)
 +      mmap_read_lock(mm);
 +      do {
 +              if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
                        break;
 -              nend = nstart + ret * PAGE_SIZE;
 -      }
 -      if (locked)
 -              mmap_read_unlock(mm);
 -      if (nstart == end)
 -              return 0;
 -      return size - min_t(size_t, nstart - start, size);
 +              start = (start + PAGE_SIZE) & PAGE_MASK;
 +      } while (start != end);
 +      mmap_read_unlock(mm);
 +
 +      if (size > (unsigned long)uaddr - start)
 +              return size - ((unsigned long)uaddr - start);
 +      return 0;
  }
  EXPORT_SYMBOL(fault_in_safe_writeable);
  
@@@ -1852,72 -1783,80 +1767,80 @@@ static long check_and_migrate_movable_p
                                            struct page **pages,
                                            unsigned int gup_flags)
  {
-       unsigned long i;
-       unsigned long isolation_error_count = 0;
-       bool drain_allow = true;
+       unsigned long isolation_error_count = 0, i;
+       struct folio *prev_folio = NULL;
        LIST_HEAD(movable_page_list);
-       long ret = 0;
-       struct page *prev_head = NULL;
-       struct page *head;
-       struct migration_target_control mtc = {
-               .nid = NUMA_NO_NODE,
-               .gfp_mask = GFP_USER | __GFP_NOWARN,
-       };
+       bool drain_allow = true;
+       int ret = 0;
  
        for (i = 0; i < nr_pages; i++) {
-               head = compound_head(pages[i]);
-               if (head == prev_head)
+               struct folio *folio = page_folio(pages[i]);
+               if (folio == prev_folio)
                        continue;
-               prev_head = head;
+               prev_folio = folio;
+               if (folio_is_pinnable(folio))
+                       continue;
                /*
-                * If we get a movable page, since we are going to be pinning
-                * these entries, try to move them out if possible.
+                * Try to move out any movable page before pinning the range.
                 */
-               if (!is_pinnable_page(head)) {
-                       if (PageHuge(head)) {
-                               if (!isolate_huge_page(head, &movable_page_list))
-                                       isolation_error_count++;
-                       } else {
-                               if (!PageLRU(head) && drain_allow) {
-                                       lru_add_drain_all();
-                                       drain_allow = false;
-                               }
+               if (folio_test_hugetlb(folio)) {
+                       if (!isolate_huge_page(&folio->page,
+                                               &movable_page_list))
+                               isolation_error_count++;
+                       continue;
+               }
  
-                               if (isolate_lru_page(head)) {
-                                       isolation_error_count++;
-                                       continue;
-                               }
-                               list_add_tail(&head->lru, &movable_page_list);
-                               mod_node_page_state(page_pgdat(head),
-                                                   NR_ISOLATED_ANON +
-                                                   page_is_file_lru(head),
-                                                   thp_nr_pages(head));
-                       }
+               if (!folio_test_lru(folio) && drain_allow) {
+                       lru_add_drain_all();
+                       drain_allow = false;
+               }
+               if (folio_isolate_lru(folio)) {
+                       isolation_error_count++;
+                       continue;
                }
+               list_add_tail(&folio->lru, &movable_page_list);
+               node_stat_mod_folio(folio,
+                                   NR_ISOLATED_ANON + folio_is_file_lru(folio),
+                                   folio_nr_pages(folio));
        }
  
+       if (!list_empty(&movable_page_list) || isolation_error_count)
+               goto unpin_pages;
        /*
         * If list is empty, and no isolation errors, means that all pages are
         * in the correct zone.
         */
-       if (list_empty(&movable_page_list) && !isolation_error_count)
-               return nr_pages;
+       return nr_pages;
  
+ unpin_pages:
        if (gup_flags & FOLL_PIN) {
                unpin_user_pages(pages, nr_pages);
        } else {
                for (i = 0; i < nr_pages; i++)
                        put_page(pages[i]);
        }
        if (!list_empty(&movable_page_list)) {
+               struct migration_target_control mtc = {
+                       .nid = NUMA_NO_NODE,
+                       .gfp_mask = GFP_USER | __GFP_NOWARN,
+               };
                ret = migrate_pages(&movable_page_list, alloc_migration_target,
                                    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
                                    MR_LONGTERM_PIN, NULL);
-               if (ret && !list_empty(&movable_page_list))
-                       putback_movable_pages(&movable_page_list);
+               if (ret > 0) /* number of pages not migrated */
+                       ret = -ENOMEM;
        }
  
-       return ret > 0 ? -ENOMEM : ret;
+       if (ret && !list_empty(&movable_page_list))
+               putback_movable_pages(&movable_page_list);
+       return ret;
  }
  #else
  static long check_and_migrate_movable_pages(unsigned long nr_pages,
@@@ -2126,6 -2065,65 +2049,6 @@@ long get_user_pages(unsigned long start
  }
  EXPORT_SYMBOL(get_user_pages);
  
 -/**
 - * get_user_pages_locked() - variant of get_user_pages()
 - *
 - * @start:      starting user address
 - * @nr_pages:   number of pages from start to pin
 - * @gup_flags:  flags modifying lookup behaviour
 - * @pages:      array that receives pointers to the pages pinned.
 - *              Should be at least nr_pages long. Or NULL, if caller
 - *              only intends to ensure the pages are faulted in.
 - * @locked:     pointer to lock flag indicating whether lock is held and
 - *              subsequently whether VM_FAULT_RETRY functionality can be
 - *              utilised. Lock must initially be held.
 - *
 - * It is suitable to replace the form:
 - *
 - *      mmap_read_lock(mm);
 - *      do_something()
 - *      get_user_pages(mm, ..., pages, NULL);
 - *      mmap_read_unlock(mm);
 - *
 - *  to:
 - *
 - *      int locked = 1;
 - *      mmap_read_lock(mm);
 - *      do_something()
 - *      get_user_pages_locked(mm, ..., pages, &locked);
 - *      if (locked)
 - *          mmap_read_unlock(mm);
 - *
 - * We can leverage the VM_FAULT_RETRY functionality in the page fault
 - * paths better by using either get_user_pages_locked() or
 - * get_user_pages_unlocked().
 - *
 - */
 -long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 -                         unsigned int gup_flags, struct page **pages,
 -                         int *locked)
 -{
 -      /*
 -       * FIXME: Current FOLL_LONGTERM behavior is incompatible with
 -       * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
 -       * vmas.  As there are no users of this flag in this call we simply
 -       * disallow this option for now.
 -       */
 -      if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
 -              return -EINVAL;
 -      /*
 -       * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
 -       * never directly by the caller, so enforce that:
 -       */
 -      if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
 -              return -EINVAL;
 -
 -      return __get_user_pages_locked(current->mm, start, nr_pages,
 -                                     pages, NULL, locked,
 -                                     gup_flags | FOLL_TOUCH);
 -}
 -EXPORT_SYMBOL(get_user_pages_locked);
 -
  /*
   * get_user_pages_unlocked() is suitable to replace the form:
   *
@@@ -2227,7 -2225,8 +2150,8 @@@ static int gup_pte_range(pmd_t pmd, uns
        ptem = ptep = pte_offset_map(&pmd, addr);
        do {
                pte_t pte = ptep_get_lockless(ptep);
-               struct page *head, *page;
+               struct page *page;
+               struct folio *folio;
  
                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
  
-               head = try_grab_compound_head(page, 1, flags);
-               if (!head)
+               folio = try_grab_folio(page, 1, flags);
+               if (!folio)
                        goto pte_unmap;
  
                if (unlikely(page_is_secretmem(page))) {
-                       put_compound_head(head, 1, flags);
+                       gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }
  
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-                       put_compound_head(head, 1, flags);
+                       gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }
  
-               VM_BUG_ON_PAGE(compound_head(page) != head, page);
                /*
                 * We need to make the page accessible if and only if we are
                 * going to access its content (the FOLL_PIN case).  Please
                if (flags & FOLL_PIN) {
                        ret = arch_make_page_accessible(page);
                        if (ret) {
-                               unpin_user_page(page);
+                               gup_put_folio(folio, 1, flags);
                                goto pte_unmap;
                        }
                }
-               SetPageReferenced(page);
+               folio_set_referenced(folio);
                pages[*nr] = page;
                (*nr)++;
        } while (ptep++, addr += PAGE_SIZE, addr != end);
  
        ret = 1;
@@@ -2403,8 -2399,8 +2324,8 @@@ static int record_subpages(struct page 
  {
        int nr;
  
-       for (nr = 0; addr != end; addr += PAGE_SIZE)
-               pages[nr++] = page++;
+       for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
+               pages[nr] = nth_page(page, nr);
  
        return nr;
  }
@@@ -2422,7 -2418,8 +2343,8 @@@ static int gup_hugepte(pte_t *ptep, uns
                       struct page **pages, int *nr)
  {
        unsigned long pte_end;
-       struct page *head, *page;
+       struct page *page;
+       struct folio *folio;
        pte_t pte;
        int refs;
  
        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
  
-       head = pte_page(pte);
-       page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+       page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
  
-       head = try_grab_compound_head(head, refs, flags);
-       if (!head)
+       folio = try_grab_folio(page, refs, flags);
+       if (!folio)
                return 0;
  
        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-               put_compound_head(head, refs, flags);
+               gup_put_folio(folio, refs, flags);
                return 0;
        }
  
        *nr += refs;
-       SetPageReferenced(head);
+       folio_set_referenced(folio);
        return 1;
  }
  
@@@ -2486,7 -2482,8 +2407,8 @@@ static int gup_huge_pmd(pmd_t orig, pmd
                        unsigned long end, unsigned int flags,
                        struct page **pages, int *nr)
  {
-       struct page *head, *page;
+       struct page *page;
+       struct folio *folio;
        int refs;
  
        if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
                                             pages, nr);
        }
  
-       page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+       page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
  
-       head = try_grab_compound_head(pmd_page(orig), refs, flags);
-       if (!head)
+       folio = try_grab_folio(page, refs, flags);
+       if (!folio)
                return 0;
  
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-               put_compound_head(head, refs, flags);
+               gup_put_folio(folio, refs, flags);
                return 0;
        }
  
        *nr += refs;
-       SetPageReferenced(head);
+       folio_set_referenced(folio);
        return 1;
  }
  
@@@ -2520,7 -2517,8 +2442,8 @@@ static int gup_huge_pud(pud_t orig, pud
                        unsigned long end, unsigned int flags,
                        struct page **pages, int *nr)
  {
-       struct page *head, *page;
+       struct page *page;
+       struct folio *folio;
        int refs;
  
        if (!pud_access_permitted(orig, flags & FOLL_WRITE))
                                             pages, nr);
        }
  
-       page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+       page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
  
-       head = try_grab_compound_head(pud_page(orig), refs, flags);
-       if (!head)
+       folio = try_grab_folio(page, refs, flags);
+       if (!folio)
                return 0;
  
        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-               put_compound_head(head, refs, flags);
+               gup_put_folio(folio, refs, flags);
                return 0;
        }
  
        *nr += refs;
-       SetPageReferenced(head);
+       folio_set_referenced(folio);
        return 1;
  }
  
@@@ -2555,27 -2553,28 +2478,28 @@@ static int gup_huge_pgd(pgd_t orig, pgd
                        struct page **pages, int *nr)
  {
        int refs;
-       struct page *head, *page;
+       struct page *page;
+       struct folio *folio;
  
        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;
  
        BUILD_BUG_ON(pgd_devmap(orig));
  
-       page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+       page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
        refs = record_subpages(page, addr, end, pages + *nr);
  
-       head = try_grab_compound_head(pgd_page(orig), refs, flags);
-       if (!head)
+       folio = try_grab_folio(page, refs, flags);
+       if (!folio)
                return 0;
  
        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
-               put_compound_head(head, refs, flags);
+               gup_put_folio(folio, refs, flags);
                return 0;
        }
  
        *nr += refs;
-       SetPageReferenced(head);
+       folio_set_referenced(folio);
        return 1;
  }
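The pte, pmd, pud and pgd fast paths above now share one folio-based shape: grab the folio for all subpages at once, re-read the page table entry to detect a race, and only then record the refs. A condensed illustration follows; the helper and its callback parameter are invented for this sketch and do not exist in the patch.

/*
 * Illustration only: the common shape of the gup-fast helpers after the
 * folio conversion.  @entry_unchanged stands in for the pte/pmd/pud/pgd
 * re-read that each real helper performs after the grab.
 */
static int gup_fast_grab_sketch(struct page *page, int refs, unsigned int flags,
				bool (*entry_unchanged)(void *ctx), void *ctx,
				int *nr)
{
	struct folio *folio = try_grab_folio(page, refs, flags);

	if (!folio)
		return 0;

	/* Re-check the entry only after the grab, as the real helpers do. */
	if (!entry_unchanged(ctx)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}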
  
@@@ -3068,3 -3067,32 +2992,3 @@@ long pin_user_pages_unlocked(unsigned l
        return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
  }
  EXPORT_SYMBOL(pin_user_pages_unlocked);
 -
 -/*
 - * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
 - * Behavior is the same, except that this one sets FOLL_PIN and rejects
 - * FOLL_GET.
 - */
 -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
 -                         unsigned int gup_flags, struct page **pages,
 -                         int *locked)
 -{
 -      /*
 -       * FIXME: Current FOLL_LONGTERM behavior is incompatible with
 -       * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
 -       * vmas.  As there are no users of this flag in this call we simply
 -       * disallow this option for now.
 -       */
 -      if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
 -              return -EINVAL;
 -
 -      /* FOLL_GET and FOLL_PIN are mutually exclusive. */
 -      if (WARN_ON_ONCE(gup_flags & FOLL_GET))
 -              return -EINVAL;
 -
 -      gup_flags |= FOLL_PIN;
 -      return __get_user_pages_locked(current->mm, start, nr_pages,
 -                                     pages, NULL, locked,
 -                                     gup_flags | FOLL_TOUCH);
 -}
 -EXPORT_SYMBOL(pin_user_pages_locked);
diff --combined mm/huge_memory.c
index 88c83c84325c05dbd1af48fdc517c6c495ac82e2,f85b04b31bd121d88f6ebbddf827517f37808a33..005fab2f3b73a56966e2ab9c5e81701b402746be
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/oom.h>
  #include <linux/numa.h>
  #include <linux/page_owner.h>
 +#include <linux/sched/sysctl.h>
  
  #include <asm/tlb.h>
  #include <asm/pgalloc.h>
@@@ -583,13 -582,10 +583,10 @@@ unsigned long thp_get_unmapped_area(str
        unsigned long ret;
        loff_t off = (loff_t)pgoff << PAGE_SHIFT;
  
-       if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
-               goto out;
        ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
        if (ret)
                return ret;
- out:
        return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
  }
  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@@ -1381,39 -1377,6 +1378,6 @@@ struct page *follow_trans_huge_pmd(stru
        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags);
  
-       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-               /*
-                * We don't mlock() pte-mapped THPs. This way we can avoid
-                * leaking mlocked pages into non-VM_LOCKED VMAs.
-                *
-                * For anon THP:
-                *
-                * In most cases the pmd is the only mapping of the page as we
-                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
-                * writable private mappings in populate_vma_page_range().
-                *
-                * The only scenario when we have the page shared here is if we
-                * mlocking read-only mapping shared over fork(). We skip
-                * mlocking such pages.
-                *
-                * For file THP:
-                *
-                * We can expect PageDoubleMap() to be stable under page lock:
-                * for file pages we set it in page_add_file_rmap(), which
-                * requires page to be locked.
-                */
-               if (PageAnon(page) && compound_mapcount(page) != 1)
-                       goto skip_mlock;
-               if (PageDoubleMap(page) || !page->mapping)
-                       goto skip_mlock;
-               if (!trylock_page(page))
-                       goto skip_mlock;
-               if (page->mapping && !PageDoubleMap(page))
-                       mlock_vma_page(page);
-               unlock_page(page);
-       }
- skip_mlock:
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
  
@@@ -1611,7 -1574,7 +1575,7 @@@ int zap_huge_pmd(struct mmu_gather *tlb
  
                if (pmd_present(orig_pmd)) {
                        page = pmd_page(orig_pmd);
-                       page_remove_rmap(page, true);
+                       page_remove_rmap(page, vma, true);
                        VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                } else if (thp_migration_supported()) {
@@@ -1767,28 -1730,17 +1731,28 @@@ int change_huge_pmd(struct vm_area_stru
        }
  #endif
  
 -      /*
 -       * Avoid trapping faults against the zero page. The read-only
 -       * data is likely to be read-cached on the local CPU and
 -       * local/remote hits to the zero page are not interesting.
 -       */
 -      if (prot_numa && is_huge_zero_pmd(*pmd))
 -              goto unlock;
 +      if (prot_numa) {
 +              struct page *page;
 +              /*
 +               * Avoid trapping faults against the zero page. The read-only
 +               * data is likely to be read-cached on the local CPU and
 +               * local/remote hits to the zero page are not interesting.
 +               */
 +              if (is_huge_zero_pmd(*pmd))
 +                      goto unlock;
  
 -      if (prot_numa && pmd_protnone(*pmd))
 -              goto unlock;
 +              if (pmd_protnone(*pmd))
 +                      goto unlock;
  
 +              page = pmd_page(*pmd);
 +              /*
 +               * Skip scanning top tier node if normal numa
 +               * balancing is disabled
 +               */
 +              if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
 +                  node_is_toptier(page_to_nid(page)))
 +                      goto unlock;
 +      }
        /*
         * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
@@@ -2007,7 -1959,7 +1971,7 @@@ static void __split_huge_pmd_locked(str
                                set_page_dirty(page);
                        if (!PageReferenced(page) && pmd_young(old_pmd))
                                SetPageReferenced(page);
-                       page_remove_rmap(page, true);
+                       page_remove_rmap(page, vma, true);
                        put_page(page);
                }
                add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
                young = pmd_young(old_pmd);
                soft_dirty = pmd_soft_dirty(old_pmd);
                uffd_wp = pmd_uffd_wp(old_pmd);
 +              VM_BUG_ON_PAGE(!page_count(page), page);
 +              page_ref_add(page, HPAGE_PMD_NR - 1);
        }
 -      VM_BUG_ON_PAGE(!page_count(page), page);
 -      page_ref_add(page, HPAGE_PMD_NR - 1);
  
        /*
         * Withdraw the table only after we mark the pmd entry invalid.
                        }
                }
                unlock_page_memcg(page);
+               /* Above is effectively page_remove_rmap(page, vma, true) */
+               munlock_vma_page(page, vma, true);
        }
  
        smp_wmb(); /* make pte visible before pmd */
  
        if (freeze) {
                for (i = 0; i < HPAGE_PMD_NR; i++) {
-                       page_remove_rmap(page + i, false);
+                       page_remove_rmap(page + i, vma, false);
                        put_page(page + i);
                }
        }
  }
  
  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-               unsigned long address, bool freeze, struct page *page)
+               unsigned long address, bool freeze, struct folio *folio)
  {
        spinlock_t *ptl;
        struct mmu_notifier_range range;
-       bool do_unlock_page = false;
+       bool do_unlock_folio = false;
        pmd_t _pmd;
  
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
        ptl = pmd_lock(vma->vm_mm, pmd);
  
        /*
-        * If caller asks to setup a migration entries, we need a page to check
-        * pmd against. Otherwise we can end up replacing wrong page.
+        * If caller asks to setup a migration entry, we need a folio to check
+        * pmd against. Otherwise we can end up replacing wrong folio.
         */
-       VM_BUG_ON(freeze && !page);
-       if (page) {
-               VM_WARN_ON_ONCE(!PageLocked(page));
-               if (page != pmd_page(*pmd))
+       VM_BUG_ON(freeze && !folio);
+       if (folio) {
+               VM_WARN_ON_ONCE(!folio_test_locked(folio));
+               if (folio != page_folio(pmd_page(*pmd)))
                        goto out;
        }
  
  repeat:
        if (pmd_trans_huge(*pmd)) {
-               if (!page) {
-                       page = pmd_page(*pmd);
+               if (!folio) {
+                       folio = page_folio(pmd_page(*pmd));
                        /*
                         * An anonymous page must be locked, to ensure that a
                         * concurrent reuse_swap_page() sees stable mapcount;
                         * and page lock must not be taken when zap_pmd_range()
                         * calls __split_huge_pmd() while i_mmap_lock is held.
                         */
-                       if (PageAnon(page)) {
-                               if (unlikely(!trylock_page(page))) {
-                                       get_page(page);
+                       if (folio_test_anon(folio)) {
+                               if (unlikely(!folio_trylock(folio))) {
+                                       folio_get(folio);
                                        _pmd = *pmd;
                                        spin_unlock(ptl);
-                                       lock_page(page);
+                                       folio_lock(folio);
                                        spin_lock(ptl);
                                        if (unlikely(!pmd_same(*pmd, _pmd))) {
-                                               unlock_page(page);
-                                               put_page(page);
-                                               page = NULL;
+                                               folio_unlock(folio);
+                                               folio_put(folio);
+                                               folio = NULL;
                                                goto repeat;
                                        }
-                                       put_page(page);
+                                       folio_put(folio);
                                }
-                               do_unlock_page = true;
+                               do_unlock_folio = true;
                        }
                }
-               if (PageMlocked(page))
-                       clear_page_mlock(page);
        } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
                goto out;
        __split_huge_pmd_locked(vma, pmd, range.start, freeze);
  out:
        spin_unlock(ptl);
-       if (do_unlock_page)
-               unlock_page(page);
+       if (do_unlock_folio)
+               folio_unlock(folio);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback.
         * They are 3 cases to consider inside __split_huge_pmd_locked():
  }
  
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
-               bool freeze, struct page *page)
+               bool freeze, struct folio *folio)
  {
        pgd_t *pgd;
        p4d_t *p4d;
  
        pmd = pmd_offset(pud, address);
  
-       __split_huge_pmd(vma, pmd, address, freeze, page);
+       __split_huge_pmd(vma, pmd, address, freeze, folio);
  }
  
  static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@@ -2295,6 -2248,7 +2260,7 @@@ void vma_adjust_trans_huge(struct vm_ar
  
  static void unmap_page(struct page *page)
  {
+       struct folio *folio = page_folio(page);
        enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
                TTU_SYNC;
  
         * pages can simply be left unmapped, then faulted back on demand.
         * If that is ever changed (perhaps for mlock), update remap_page().
         */
-       if (PageAnon(page))
-               try_to_migrate(page, ttu_flags);
+       if (folio_test_anon(folio))
+               try_to_migrate(folio, ttu_flags);
        else
-               try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
+               try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
  
        VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
  }
  
- static void remap_page(struct page *page, unsigned int nr)
+ static void remap_page(struct folio *folio, unsigned long nr)
  {
-       int i;
+       int i = 0;
  
        /* If unmap_page() uses try_to_migrate() on file, remove this check */
-       if (!PageAnon(page))
+       if (!folio_test_anon(folio))
                return;
-       if (PageTransHuge(page)) {
-               remove_migration_ptes(page, page, true);
-       } else {
-               for (i = 0; i < nr; i++)
-                       remove_migration_ptes(page + i, page + i, true);
+       for (;;) {
+               remove_migration_ptes(folio, folio, true);
+               i += folio_nr_pages(folio);
+               if (i >= nr)
+                       break;
+               folio = folio_next(folio);
        }
  }
  
@@@ -2344,8 -2299,11 +2311,11 @@@ static void lru_add_page_tail(struct pa
        } else {
                /* head is still on lru (and we have it frozen) */
                VM_WARN_ON(!PageLRU(head));
+               if (PageUnevictable(tail))
+                       tail->mlock_count = 0;
+               else
+                       list_add_tail(&tail->lru, &head->lru);
                SetPageLRU(tail);
-               list_add_tail(&tail->lru, &head->lru);
        }
  }
  
@@@ -2481,7 -2439,7 +2451,7 @@@ static void __split_huge_page(struct pa
        }
        local_irq_enable();
  
-       remap_page(head, nr);
+       remap_page(folio, nr);
  
        if (PageSwapCache(head)) {
                swp_entry_t entry = { .val = page_private(head) };
        }
  }
  
- int total_mapcount(struct page *page)
- {
-       int i, compound, nr, ret;
-       VM_BUG_ON_PAGE(PageTail(page), page);
-       if (likely(!PageCompound(page)))
-               return atomic_read(&page->_mapcount) + 1;
-       compound = compound_mapcount(page);
-       nr = compound_nr(page);
-       if (PageHuge(page))
-               return compound;
-       ret = compound;
-       for (i = 0; i < nr; i++)
-               ret += atomic_read(&page[i]._mapcount) + 1;
-       /* File pages has compound_mapcount included in _mapcount */
-       if (!PageAnon(page))
-               return ret - compound * nr;
-       if (PageDoubleMap(page))
-               ret -= nr;
-       return ret;
- }
  /*
   * This calculates accurately how many mappings a transparent hugepage
   * has (unlike page_mapcount() which isn't fully accurate). This full
@@@ -2579,18 -2513,19 +2525,19 @@@ int page_trans_huge_mapcount(struct pag
  }
  
  /* Racy check whether the huge page can be split */
- bool can_split_huge_page(struct page *page, int *pextra_pins)
+ bool can_split_folio(struct folio *folio, int *pextra_pins)
  {
        int extra_pins;
  
        /* Additional pins from page cache */
-       if (PageAnon(page))
-               extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
+       if (folio_test_anon(folio))
+               extra_pins = folio_test_swapcache(folio) ?
+                               folio_nr_pages(folio) : 0;
        else
-               extra_pins = thp_nr_pages(page);
+               extra_pins = folio_nr_pages(folio);
        if (pextra_pins)
                *pextra_pins = extra_pins;
-       return total_mapcount(page) == page_count(page) - extra_pins - 1;
+       return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
  }
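A worked example of that check, with assumed reference counts (illustration only, not taken from the patch):

/*
 * Assumed numbers: a PMD-mapped file-backed THP with folio_nr_pages() == 512,
 * mapped by a single task, with no GUP pins:
 *
 *   extra_pins        = 512                  (page-cache references)
 *   folio_ref_count() = 512 + 1 + 1 = 514    (cache + mapping + caller)
 *   folio_mapcount()  = 1
 *
 *   1 == 514 - 512 - 1   ->  the folio may be split
 *
 * An additional pin (e.g. from pin_user_pages()) raises the refcount without
 * raising the mapcount, the equality fails, and the split is refused.
 */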
  
  /*
   */
  int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
-       struct page *head = compound_head(page);
+       struct folio *folio = page_folio(page);
+       struct page *head = &folio->page;
        struct deferred_split *ds_queue = get_deferred_split_queue(head);
        XA_STATE(xas, &head->mapping->i_pages, head->index);
        struct anon_vma *anon_vma = NULL;
                 * The caller does not necessarily hold an mmap_lock that would
                 * prevent the anon_vma disappearing so we first we take a
                 * reference to it and then lock the anon_vma for write. This
-                * is similar to page_lock_anon_vma_read except the write lock
+                * is similar to folio_lock_anon_vma_read except the write lock
                 * is taken to serialise against parallel split or collapse
                 * operations.
                 */
         * Racy check if we can split the page, before unmap_page() will
         * split PMDs
         */
-       if (!can_split_huge_page(head, &extra_pins)) {
+       if (!can_split_folio(folio, &extra_pins)) {
                ret = -EBUSY;
                goto out_unlock;
        }
@@@ -2731,7 -2667,7 +2679,7 @@@ fail
                if (mapping)
                        xas_unlock(&xas);
                local_irq_enable();
-               remap_page(head, thp_nr_pages(head));
+               remap_page(folio, folio_nr_pages(folio));
                ret = -EBUSY;
        }
  
@@@ -2965,6 -2901,7 +2913,6 @@@ static int split_huge_pages_pid(int pid
         */
        for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
                struct vm_area_struct *vma = find_vma(mm, addr);
 -              unsigned int follflags;
                struct page *page;
  
                if (!vma || addr < vma->vm_start)
                }
  
                /* FOLL_DUMP to ignore special (like zero) pages */
 -              follflags = FOLL_GET | FOLL_DUMP;
 -              page = follow_page(vma, addr, follflags);
 +              page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
  
                if (IS_ERR(page))
                        continue;
                        goto next;
  
                total++;
-               if (!can_split_huge_page(compound_head(page), NULL))
+               if (!can_split_folio(page_folio(page), NULL))
                        goto next;
  
                if (!trylock_page(page))
@@@ -3181,7 -3119,7 +3129,7 @@@ void set_pmd_migration_entry(struct pag
        if (pmd_soft_dirty(pmdval))
                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
-       page_remove_rmap(page, true);
+       page_remove_rmap(page, vma, true);
        put_page(page);
  }
  
@@@ -3207,15 -3145,12 +3155,13 @@@ void remove_migration_pmd(struct page_v
        if (pmd_swp_uffd_wp(*pvmw->pmd))
                pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
  
 -      flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
        if (PageAnon(new))
                page_add_anon_rmap(new, vma, mmun_start, true);
        else
-               page_add_file_rmap(new, true);
+               page_add_file_rmap(new, vma, true);
        set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
-       if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
-               mlock_vma_page(new);
 +
 +      /* No need to invalidate - it was non-present before */
        update_mmu_cache_pmd(vma, address, pvmw->pmd);
  }
  #endif
diff --combined mm/hugetlb.c
index 75b41879e9e97fb367c084950957d4b0ffa7d518,10203f3b1ccfd9a60ab4f9d81a9958ad54fa7a65..b34f50156f7ec29cdfa23006be920afac8954a2b
@@@ -31,7 -31,6 +31,7 @@@
  #include <linux/llist.h>
  #include <linux/cma.h>
  #include <linux/migrate.h>
 +#include <linux/nospec.h>
  
  #include <asm/page.h>
  #include <asm/pgalloc.h>
@@@ -1321,7 -1320,9 +1321,9 @@@ static void __destroy_compound_gigantic
        }
  
        set_compound_order(page, 0);
+ #ifdef CONFIG_64BIT
        page[1].compound_nr = 0;
+ #endif
        __ClearPageHead(page);
  }
  
@@@ -1813,7 -1814,9 +1815,9 @@@ out_error
        for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
                __ClearPageReserved(p);
        set_compound_order(page, 0);
+ #ifdef CONFIG_64BIT
        page[1].compound_nr = 0;
+ #endif
        __ClearPageHead(page);
        return false;
  }
@@@ -1855,7 -1858,6 +1859,7 @@@ int PageHeadHuge(struct page *page_head
  
        return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
  }
 +EXPORT_SYMBOL_GPL(PageHeadHuge);
  
  /*
   * Find and lock address space (mapping) in write mode.
@@@ -3500,7 -3502,8 +3504,7 @@@ static int demote_pool_huge_page(struc
        static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
  
  #define HSTATE_ATTR(_name) \
 -      static struct kobj_attribute _name##_attr = \
 -              __ATTR(_name, 0644, _name##_show, _name##_store)
 +      static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
  
  static struct kobject *hugepages_kobj;
  static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
@@@ -4160,10 -4163,10 +4164,10 @@@ static int __init hugepages_setup(char 
                                pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
                                return 0;
                        }
 -                      node = tmp;
 -                      p += count + 1;
 -                      if (node < 0 || node >= nr_online_nodes)
 +                      if (tmp >= nr_online_nodes)
                                goto invalid;
 +                      node = array_index_nospec(tmp, nr_online_nodes);
 +                      p += count + 1;
                        /* Parse hugepages */
                        if (sscanf(p, "%lu%n", &tmp, &count) != 1)
                                goto invalid;
@@@ -4638,6 -4641,7 +4642,6 @@@ static pte_t make_huge_pte(struct vm_ar
                                           vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
 -      entry = pte_mkhuge(entry);
        entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
  
        return entry;
@@@ -4851,13 -4855,14 +4855,13 @@@ again
  }
  
  static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 -                        unsigned long new_addr, pte_t *src_pte)
 +                        unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
  {
        struct hstate *h = hstate_vma(vma);
        struct mm_struct *mm = vma->vm_mm;
 -      pte_t *dst_pte, pte;
        spinlock_t *src_ptl, *dst_ptl;
 +      pte_t pte;
  
 -      dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
        dst_ptl = huge_pte_lock(h, mm, dst_pte);
        src_ptl = huge_pte_lockptr(h, mm, src_pte);
  
@@@ -4916,7 -4921,7 +4920,7 @@@ int move_hugetlb_page_tables(struct vm_
                if (!dst_pte)
                        break;
  
 -              move_huge_pte(vma, old_addr, new_addr, src_pte);
 +              move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
        }
        flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
@@@ -5013,7 -5018,7 +5017,7 @@@ static void __unmap_hugepage_range(stru
                        set_page_dirty(page);
  
                hugetlb_count_sub(pages_per_huge_page(h), mm);
-               page_remove_rmap(page, true);
+               page_remove_rmap(page, vma, true);
  
                spin_unlock(ptl);
                tlb_remove_page_size(tlb, page, huge_page_size(h));
@@@ -5258,7 -5263,7 +5262,7 @@@ retry_avoidcopy
                /* Break COW */
                huge_ptep_clear_flush(vma, haddr, ptep);
                mmu_notifier_invalidate_range(mm, range.start, range.end);
-               page_remove_rmap(old_page, true);
+               page_remove_rmap(old_page, vma, true);
                hugepage_add_new_anon_rmap(new_page, vma, haddr);
                set_huge_pte_at(mm, haddr, ptep,
                                make_huge_pte(vma, new_page, 1));
@@@ -5341,7 -5346,6 +5345,7 @@@ static inline vm_fault_t hugetlb_handle
                                                  pgoff_t idx,
                                                  unsigned int flags,
                                                  unsigned long haddr,
 +                                                unsigned long addr,
                                                  unsigned long reason)
  {
        vm_fault_t ret;
        struct vm_fault vmf = {
                .vma = vma,
                .address = haddr,
 +              .real_address = addr,
                .flags = flags,
  
                /*
@@@ -5418,7 -5421,7 +5422,7 @@@ retry
                /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
                        ret = hugetlb_handle_userfault(vma, mapping, idx,
 -                                                     flags, haddr,
 +                                                     flags, haddr, address,
                                                       VM_UFFD_MISSING);
                        goto out;
                }
                        unlock_page(page);
                        put_page(page);
                        ret = hugetlb_handle_userfault(vma, mapping, idx,
 -                                                     flags, haddr,
 +                                                     flags, haddr, address,
                                                       VM_UFFD_MINOR);
                        goto out;
                }
@@@ -5819,8 -5822,7 +5823,8 @@@ int hugetlb_mcopy_atomic_pte(struct mm_
                        *pagep = NULL;
                        goto out;
                }
 -              folio_copy(page_folio(page), page_folio(*pagep));
 +              copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
 +                                  pages_per_huge_page(h));
                put_page(*pagep);
                *pagep = NULL;
        }
@@@ -6074,7 -6076,7 +6078,7 @@@ long follow_hugetlb_page(struct mm_stru
  
                if (pages) {
                        /*
-                        * try_grab_compound_head() should always succeed here,
+                        * try_grab_folio() should always succeed here,
                         * because: a) we hold the ptl lock, and b) we've just
                         * checked that the huge page is present in the page
                         * tables. If the huge page is present, then the tail
                         * pages must also be present. The ptl prevents the
                         * head page and tail pages from being rearranged in
                         * any way. So this page must be available at this
                         * point, unless the page refcount overflowed:
                         */
-                       if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
-                                                                refs,
-                                                                flags))) {
+                       if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
+                                                        flags))) {
                                spin_unlock(ptl);
                                remainder = 0;
                                err = -ENOMEM;
@@@ -6174,7 -6175,7 +6177,7 @@@ unsigned long hugetlb_change_protection
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
  
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
 -                      pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
 +                      pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
@@@ -6892,9 -6893,9 +6895,9 @@@ static int __init cmdline_parse_hugetlb
                        break;
  
                if (s[count] == ':') {
 -                      nid = tmp;
 -                      if (nid < 0 || nid >= MAX_NUMNODES)
 +                      if (tmp >= MAX_NUMNODES)
                                break;
 +                      nid = array_index_nospec(tmp, MAX_NUMNODES);
  
                        s += count + 1;
                        tmp = memparse(s, &s);
diff --combined mm/internal.h
index 00d6e3e3ec45d473ccd100a6847166007c016316,293eca1360dc3290b339c5a12d948220d5d7d23d..58dc6adc19c5edd0e5aeb987b6155cfb8326530e
@@@ -10,6 -10,7 +10,7 @@@
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
+ #include <linux/rmap.h>
  #include <linux/tracepoint-defs.h>
  
  struct folio_batch;
@@@ -66,24 -67,20 +67,20 @@@ static inline void wake_throttle_isolat
  vm_fault_t do_swap_page(struct vm_fault *vmf);
  void folio_rotate_reclaimable(struct folio *folio);
  bool __folio_end_writeback(struct folio *folio);
+ void deactivate_file_folio(struct folio *folio);
  
  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
  void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
  
- static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
- {
-       return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
- }
  struct zap_details;
  void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);
  
- void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
-               unsigned long lookahead_size);
+ void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
+               unsigned int order);
  void force_page_cache_ra(struct readahead_control *, unsigned long nr);
  static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
@@@ -100,6 -97,9 +97,9 @@@ void filemap_free_folio(struct address_
  int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
  bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
                loff_t end);
+ long invalidate_inode_page(struct page *page);
+ unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
+               pgoff_t start, pgoff_t end, unsigned long *nr_pagevec);
  
  /**
   * folio_evictable - Test whether a folio is evictable.
@@@ -154,17 -154,13 +154,19 @@@ extern unsigned long highest_memmap_pfn
   */
  #define MAX_RECLAIM_RETRIES 16
  
 +/*
 + * in mm/early_ioremap.c
 + */
 +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
 +                                      unsigned long size, pgprot_t prot);
 +
  /*
   * in mm/vmscan.c:
   */
- extern int isolate_lru_page(struct page *page);
- extern void putback_lru_page(struct page *page);
+ int isolate_lru_page(struct page *page);
+ int folio_isolate_lru(struct folio *folio);
+ void putback_lru_page(struct page *page);
+ void folio_putback_lru(struct folio *folio);
  extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
  
  /*
@@@ -396,6 -392,7 +398,7 @@@ static inline bool is_data_mapping(vm_f
  void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
                struct vm_area_struct *prev);
  void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
+ struct anon_vma *folio_anon_vma(struct folio *folio);
  
  #ifdef CONFIG_MMU
  void unmap_mapping_folio(struct folio *folio);
@@@ -404,32 -401,56 +407,56 @@@ extern long populate_vma_page_range(str
  extern long faultin_vma_page_range(struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
                                   bool write, int *locked);
- extern void munlock_vma_pages_range(struct vm_area_struct *vma,
-                       unsigned long start, unsigned long end);
- static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
- {
-       munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
- }
- /*
-  * must be called with vma's mmap_lock held for read or write, and page locked.
-  */
- extern void mlock_vma_page(struct page *page);
- extern unsigned int munlock_vma_page(struct page *page);
  extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
                              unsigned long len);
  /*
-  * Clear the page's PageMlocked().  This can be useful in a situation where
-  * we want to unconditionally remove a page from the pagecache -- e.g.,
-  * on truncation or freeing.
+  * mlock_vma_page() and munlock_vma_page():
+  * should be called with vma's mmap_lock held for read or write,
+  * under page table lock for the pte/pmd being added or removed.
   *
-  * It is legal to call this function for any page, mlocked or not.
-  * If called for a page that is still mapped by mlocked vmas, all we do
-  * is revert to lazy LRU behaviour -- semantics are not broken.
+  * mlock is usually called at the end of page_add_*_rmap(),
+  * munlock at the end of page_remove_rmap(); but new anon
+  * pages are managed by lru_cache_add_inactive_or_unevictable()
+  * calling mlock_new_page().
+  *
+  * @compound is used to include pmd mappings of THPs, but filter out
+  * pte mappings of THPs, which cannot be consistently counted: a pte
+  * mapping of the THP head cannot be distinguished by the page alone.
   */
- extern void clear_page_mlock(struct page *page);
+ void mlock_folio(struct folio *folio);
+ static inline void mlock_vma_folio(struct folio *folio,
+                       struct vm_area_struct *vma, bool compound)
+ {
+       /*
+        * The VM_SPECIAL check here serves two purposes.
+        * 1) VM_IO check prevents migration from double-counting during mlock.
+        * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
+        *    is never left set on a VM_SPECIAL vma, there is an interval while
+        *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
+        *    still be set while VM_SPECIAL bits are added: so ignore it then.
+        */
+       if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
+           (compound || !folio_test_large(folio)))
+               mlock_folio(folio);
+ }
+ static inline void mlock_vma_page(struct page *page,
+                       struct vm_area_struct *vma, bool compound)
+ {
+       mlock_vma_folio(page_folio(page), vma, compound);
+ }
+ void munlock_page(struct page *page);
+ static inline void munlock_vma_page(struct page *page,
+                       struct vm_area_struct *vma, bool compound)
+ {
+       if (unlikely(vma->vm_flags & VM_LOCKED) &&
+           (compound || !PageTransCompound(page)))
+               munlock_page(page);
+ }
+ void mlock_new_page(struct page *page);
+ bool need_mlock_page_drain(int cpu);
+ void mlock_page_drain(int cpu);
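As the comment above says, mlock_vma_page() is meant to run at the end of page_add_*_rmap() and munlock_vma_page() at the end of page_remove_rmap(). A simplified sketch of that call shape follows; the function body is hypothetical and only the final call reflects this series.

/*
 * Sketch only: how a file rmap helper is expected to end after this series.
 * The mapcount/statistics part is elided and not taken from the patch.
 */
void page_add_file_rmap_sketch(struct page *page,
			       struct vm_area_struct *vma, bool compound)
{
	/* ... bump the compound or per-page mapcount, update NR_FILE_MAPPED ... */

	/*
	 * New mapping into a VM_LOCKED vma: mlock the page now, under the
	 * same page table lock that added the mapping.
	 */
	mlock_vma_page(page, vma, compound);
}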
  
  extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
  
@@@ -463,18 -484,20 +490,20 @@@ vma_address(struct page *page, struct v
  }
  
  /*
-  * Then at what user virtual address will none of the page be found in vma?
+  * Then at what user virtual address will none of the range be found in vma?
   * Assumes that vma_address() already returned a good starting address.
-  * If page is a compound head, the entire compound page is considered.
   */
- static inline unsigned long
- vma_address_end(struct page *page, struct vm_area_struct *vma)
+ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
  {
+       struct vm_area_struct *vma = pvmw->vma;
        pgoff_t pgoff;
        unsigned long address;
  
-       VM_BUG_ON_PAGE(PageKsm(page), page);    /* KSM page->index unusable */
-       pgoff = page_to_pgoff(page) + compound_nr(page);
+       /* Common case, plus ->pgoff is invalid for KSM */
+       if (pvmw->nr_pages == 1)
+               return pvmw->address + PAGE_SIZE;
+       pgoff = pvmw->pgoff + pvmw->nr_pages;
        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        /* Check for address beyond vma (or wrapped through 0?) */
        if (address < vma->vm_start || address > vma->vm_end)
@@@ -504,8 -527,13 +533,13 @@@ static inline struct file *maybe_unlock
  }
  #else /* !CONFIG_MMU */
  static inline void unmap_mapping_folio(struct folio *folio) { }
- static inline void clear_page_mlock(struct page *page) { }
- static inline void mlock_vma_page(struct page *page) { }
+ static inline void mlock_vma_page(struct page *page,
+                       struct vm_area_struct *vma, bool compound) { }
+ static inline void munlock_vma_page(struct page *page,
+                       struct vm_area_struct *vma, bool compound) { }
+ static inline void mlock_new_page(struct page *page) { }
+ static inline bool need_mlock_page_drain(int cpu) { return false; }
+ static inline void mlock_page_drain(int cpu) { }
  static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
  {
  }
@@@ -578,6 -606,17 +612,6 @@@ static inline void mminit_verify_zoneli
  }
  #endif /* CONFIG_DEBUG_MEMORY_INIT */
  
 -/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
 -#if defined(CONFIG_SPARSEMEM)
 -extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 -                              unsigned long *end_pfn);
 -#else
 -static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 -                              unsigned long *end_pfn)
 -{
 -}
 -#endif /* CONFIG_SPARSEMEM */
 -
  #define NODE_RECLAIM_NOSCAN   -2
  #define NODE_RECLAIM_FULL     -1
  #define NODE_RECLAIM_SOME     0
@@@ -713,6 -752,11 +747,13 @@@ void vunmap_range_noflush(unsigned lon
  int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                      unsigned long addr, int page_nid, int *flags);
  
+ void free_zone_device_page(struct page *page);
+ /*
+  * mm/gup.c
+  */
+ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
 +DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 +
  #endif        /* __MM_INTERNAL_H */
diff --combined mm/ksm.c
index 03d599bbc155407c46030f2922c7236c46532247,eed2ff25a2fb64cd8c303abc96d5e2d225ea83fc..063a48eeb5eee9727c0f7438f5d449f95f282472
+++ b/mm/ksm.c
@@@ -1034,10 -1034,7 +1034,7 @@@ static int write_protect_page(struct vm
                              pte_t *orig_pte)
  {
        struct mm_struct *mm = vma->vm_mm;
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-       };
+       DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
        int swapped;
        int err = -EFAULT;
        struct mmu_notifier_range range;
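DEFINE_PAGE_VMA_WALK() replaces the open-coded initializer removed above. A rough expansion is sketched below; it is paraphrased from the fields used elsewhere in this merge (pfn, nr_pages, pgoff, vma, address, flags), not a verbatim quote of the macro.

/* Rough sketch of what DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0) sets up: */
struct page_vma_mapped_walk pvmw = {
	.pfn		= page_to_pfn(page),	/* walk is keyed by PFN, not struct page */
	.nr_pages	= compound_nr(page),
	.pgoff		= page_to_pgoff(page),
	.vma		= vma,
	.address	= 0,
	.flags		= 0,
};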
@@@ -1177,7 -1174,7 +1174,7 @@@ static int replace_page(struct vm_area_
        ptep_clear_flush(vma, addr, ptep);
        set_pte_at_notify(mm, addr, ptep, newpte);
  
-       page_remove_rmap(page, false);
+       page_remove_rmap(page, vma, false);
        if (!page_mapped(page))
                try_to_free_swap(page);
        put_page(page);
@@@ -1252,16 -1249,6 +1249,6 @@@ static int try_to_merge_one_page(struc
                        err = replace_page(vma, page, kpage, orig_pte);
        }
  
-       if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
-               munlock_vma_page(page);
-               if (!PageMlocked(kpage)) {
-                       unlock_page(page);
-                       lock_page(kpage);
-                       mlock_vma_page(kpage);
-                       page = kpage;           /* for final unlock */
-               }
-       }
  out_unlock:
        unlock_page(page);
  out:
@@@ -2567,7 -2554,8 +2554,8 @@@ void __ksm_exit(struct mm_struct *mm
  struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address)
  {
-       struct anon_vma *anon_vma = page_anon_vma(page);
+       struct folio *folio = page_folio(page);
+       struct anon_vma *anon_vma = folio_anon_vma(folio);
        struct page *new_page;
  
        if (PageKsm(page)) {
                SetPageDirty(new_page);
                __SetPageUptodate(new_page);
                __SetPageLocked(new_page);
 +#ifdef CONFIG_SWAP
 +              count_vm_event(KSM_SWPIN_COPY);
 +#endif
        }
  
        return new_page;
  }
  
- void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+ void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc)
  {
        struct stable_node *stable_node;
        struct rmap_item *rmap_item;
        int search_new_forks = 0;
  
-       VM_BUG_ON_PAGE(!PageKsm(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
  
        /*
         * Rely on the page lock to protect against concurrent modifications
         * to that page's node of the stable tree.
         */
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  
-       stable_node = page_stable_node(page);
+       stable_node = folio_stable_node(folio);
        if (!stable_node)
                return;
  again:
                        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                                continue;
  
-                       if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
+                       if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
-                       if (rwc->done && rwc->done(page)) {
+                       if (rwc->done && rwc->done(folio)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
@@@ -2829,7 -2814,8 +2817,7 @@@ static void wait_while_offlining(void
  #define KSM_ATTR_RO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
  #define KSM_ATTR(_name) \
 -      static struct kobj_attribute _name##_attr = \
 -              __ATTR(_name, 0644, _name##_show, _name##_store)
 +      static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
  
  static ssize_t sleep_millisecs_show(struct kobject *kobj,
                                    struct kobj_attribute *attr, char *buf)
diff --combined mm/madvise.c
index 89490c859c3ffa1d887476008cedf3b7814ad7fb,ae35d72627efab9074288c57bfda5a02880c8396..39b712fd83000361c9f6e1d4d377514c27d04eea
@@@ -65,7 -65,7 +65,7 @@@ static int madvise_need_mmap_write(int 
  }
  
  #ifdef CONFIG_ANON_VMA_NAME
 -static struct anon_vma_name *anon_vma_name_alloc(const char *name)
 +struct anon_vma_name *anon_vma_name_alloc(const char *name)
  {
        struct anon_vma_name *anon_name;
        size_t count;
        return anon_name;
  }
  
 -static void vma_anon_name_free(struct kref *kref)
 +void anon_vma_name_free(struct kref *kref)
  {
        struct anon_vma_name *anon_name =
                        container_of(kref, struct anon_vma_name, kref);
        kfree(anon_name);
  }
  
 -static inline bool has_vma_anon_name(struct vm_area_struct *vma)
 +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
  {
 -      return !vma->vm_file && vma->anon_name;
 -}
 -
 -const char *vma_anon_name(struct vm_area_struct *vma)
 -{
 -      if (!has_vma_anon_name(vma))
 -              return NULL;
 -
        mmap_assert_locked(vma->vm_mm);
  
 -      return vma->anon_name->name;
 -}
 -
 -void dup_vma_anon_name(struct vm_area_struct *orig_vma,
 -                     struct vm_area_struct *new_vma)
 -{
 -      if (!has_vma_anon_name(orig_vma))
 -              return;
 -
 -      kref_get(&orig_vma->anon_name->kref);
 -      new_vma->anon_name = orig_vma->anon_name;
 -}
 -
 -void free_vma_anon_name(struct vm_area_struct *vma)
 -{
 -      struct anon_vma_name *anon_name;
 -
 -      if (!has_vma_anon_name(vma))
 -              return;
 +      if (vma->vm_file)
 +              return NULL;
  
 -      anon_name = vma->anon_name;
 -      vma->anon_name = NULL;
 -      kref_put(&anon_name->kref, vma_anon_name_free);
 +      return vma->anon_name;
  }
  
  /* mmap_lock should be write-locked */
 -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
 +static int replace_anon_vma_name(struct vm_area_struct *vma,
 +                               struct anon_vma_name *anon_name)
  {
 -      const char *anon_name;
 +      struct anon_vma_name *orig_name = anon_vma_name(vma);
  
 -      if (!name) {
 -              free_vma_anon_name(vma);
 +      if (!anon_name) {
 +              vma->anon_name = NULL;
 +              anon_vma_name_put(orig_name);
                return 0;
        }
  
 -      anon_name = vma_anon_name(vma);
 -      if (anon_name) {
 -              /* Same name, nothing to do here */
 -              if (!strcmp(name, anon_name))
 -                      return 0;
 +      if (anon_vma_name_eq(orig_name, anon_name))
 +              return 0;
  
 -              free_vma_anon_name(vma);
 -      }
 -      vma->anon_name = anon_vma_name_alloc(name);
 -      if (!vma->anon_name)
 -              return -ENOMEM;
 +      vma->anon_name = anon_vma_name_reuse(anon_name);
 +      anon_vma_name_put(orig_name);
  
        return 0;
  }
  #else /* CONFIG_ANON_VMA_NAME */
 -static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
 +static int replace_anon_vma_name(struct vm_area_struct *vma,
 +                               struct anon_vma_name *anon_name)
  {
 -      if (name)
 +      if (anon_name)
                return -EINVAL;
  
        return 0;
  /*
   * Update the vm_flags on a region of a vma, splitting it or merging it as
   * necessary.  Must be called with mmap_sem held for writing.
  + * The caller should ensure anon_name stability by raising its refcount even
  + * when anon_name belongs to a valid vma, because this function might free
  + * that vma.
   */
  static int madvise_update_vma(struct vm_area_struct *vma,
                              struct vm_area_struct **prev, unsigned long start,
                              unsigned long end, unsigned long new_flags,
 -                            const char *name)
 +                            struct anon_vma_name *anon_name)
  {
        struct mm_struct *mm = vma->vm_mm;
        int error;
        pgoff_t pgoff;
  
 -      if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
 +      if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
                *prev = vma;
                return 0;
        }
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
 -                        vma->vm_userfaultfd_ctx, name);
 +                        vma->vm_userfaultfd_ctx, anon_name);
        if (*prev) {
                vma = *prev;
                goto success;
@@@ -181,7 -209,7 +181,7 @@@ success
         */
        vma->vm_flags = new_flags;
        if (!vma->vm_file) {
 -              error = replace_vma_anon_name(vma, name);
 +              error = replace_anon_vma_name(vma, anon_name);
                if (error)
                        return error;
        }
@@@ -502,6 -530,11 +502,11 @@@ static void madvise_cold_page_range(str
        tlb_end_vma(tlb, vma);
  }
  
+ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+ {
+       return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+ }
  static long madvise_cold(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
@@@ -849,8 -882,8 +854,8 @@@ static long madvise_populate(struct vm_
                 * our VMA might have been split.
                 */
                if (!vma || start >= vma->vm_end) {
 -                      vma = find_vma(mm, start);
 -                      if (!vma || start < vma->vm_start)
 +                      vma = vma_lookup(mm, start);
 +                      if (!vma)
                                return -ENOMEM;
                }
  
@@@ -947,7 -980,6 +952,7 @@@ static int madvise_vma_behavior(struct 
                                unsigned long behavior)
  {
        int error;
 +      struct anon_vma_name *anon_name;
        unsigned long new_flags = vma->vm_flags;
  
        switch (behavior) {
                break;
        }
  
 +      anon_name = anon_vma_name(vma);
 +      anon_vma_name_get(anon_name);
        error = madvise_update_vma(vma, prev, start, end, new_flags,
 -                                 vma_anon_name(vma));
 +                                 anon_name);
 +      anon_vma_name_put(anon_name);
  
  out:
        /*
@@@ -1067,8 -1096,6 +1072,8 @@@ static int madvise_inject_error(int beh
                        pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = memory_failure(pfn, MF_COUNT_INCREASED);
 +                      if (ret == -EOPNOTSUPP)
 +                              ret = 0;
                }
  
                if (ret)
@@@ -1203,7 -1230,7 +1208,7 @@@ int madvise_walk_vmas(struct mm_struct 
  static int madvise_vma_anon_name(struct vm_area_struct *vma,
                                 struct vm_area_struct **prev,
                                 unsigned long start, unsigned long end,
 -                               unsigned long name)
 +                               unsigned long anon_name)
  {
        int error;
  
                return -EBADF;
  
        error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
 -                                 (const char *)name);
 +                                 (struct anon_vma_name *)anon_name);
  
        /*
         * madvise() returns EAGAIN if kernel resources, such as
  }
  
  int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 -                        unsigned long len_in, const char *name)
 +                        unsigned long len_in, struct anon_vma_name *anon_name)
  {
        unsigned long end;
        unsigned long len;
        if (end == start)
                return 0;
  
 -      return madvise_walk_vmas(mm, start, end, (unsigned long)name,
 +      return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
                                 madvise_vma_anon_name);
  }
  #endif /* CONFIG_ANON_VMA_NAME */
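
The anon_vma_name refactor above backs the existing PR_SET_VMA_ANON_NAME interface. As a hedged userspace sketch (not part of this diff; assumes uapi headers that define PR_SET_VMA and PR_SET_VMA_ANON_NAME, and a kernel built with CONFIG_ANON_VMA_NAME), naming an anonymous mapping looks roughly like this:

#include <stddef.h>
#include <sys/mman.h>
#include <sys/prctl.h>		/* pulls in the uapi PR_SET_VMA definitions */

static void *alloc_named(size_t len, const char *name)
{
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return NULL;

	/* Best effort: on success the name shows up in /proc/<pid>/maps. */
	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
	      (unsigned long)p, len, (unsigned long)name);
	return p;
}
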
@@@ -1428,21 -1455,15 +1433,21 @@@ SYSCALL_DEFINE5(process_madvise, int, p
  
        while (iov_iter_count(&iter)) {
                iovec = iov_iter_iovec(&iter);
 +              /*
  +               * do_madvise() returns -ENOMEM if unmapped holes are present
  +               * in the passed address range. process_madvise() is expected
  +               * to skip unmapped holes passed to it in the 'struct iovec'
  +               * list and not fail because of them. Thus treat an -ENOMEM
  +               * return from do_madvise() as valid and continue processing.
 +               */
                ret = do_madvise(mm, (unsigned long)iovec.iov_base,
                                        iovec.iov_len, behavior);
 -              if (ret < 0)
 +              if (ret < 0 && ret != -ENOMEM)
                        break;
                iov_iter_advance(&iter, iovec.iov_len);
        }
  
 -      if (ret == 0)
 -              ret = total_len - iov_iter_count(&iter);
 +      ret = (total_len - iov_iter_count(&iter)) ? : ret;
  
  release_mm:
        mmput(mm);
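
A hedged userspace sketch of the process_madvise() behaviour changed above (not part of this diff; assumes the libc exposes SYS_pidfd_open and SYS_process_madvise, that MADV_COLD is available in <sys/mman.h>, and that the caller has the required privileges over the target process). After this change an unmapped hole inside one of the passed ranges no longer aborts the call; the return value is the number of bytes that were advised.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

static long advise_cold(pid_t pid, struct iovec *vec, size_t nr_vec)
{
	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	long ret;

	if (pidfd < 0)
		return -1;

	/* Returns the number of bytes advised, or -1 with errno set. */
	ret = syscall(SYS_process_madvise, pidfd, vec, nr_vec, MADV_COLD, 0);
	close(pidfd);
	return ret;
}
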
diff --combined mm/memcontrol.c
index f5ad1a6804949bf7c40acf4a932bcbe41874599b,f7fbd5f91e3d9f3f3fb7040b9deb43b97aa72f3c..d495c2acb9f0e916be6870d7a2f43c4236099d10
@@@ -53,6 -53,7 +53,7 @@@
  #include <linux/fs.h>
  #include <linux/seq_file.h>
  #include <linux/vmpressure.h>
+ #include <linux/memremap.h>
  #include <linux/mm_inline.h>
  #include <linux/swap_cgroup.h>
  #include <linux/cpu.h>
@@@ -347,6 -348,48 +348,6 @@@ static void memcg_reparent_objcgs(struc
        percpu_ref_kill(&objcg->refcnt);
  }
  
 -/*
 - * This will be used as a shrinker list's index.
 - * The main reason for not using cgroup id for this:
 - *  this works better in sparse environments, where we have a lot of memcgs,
 - *  but only a few kmem-limited. Or also, if we have, for instance, 200
 - *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 - *  200 entry array for that.
 - *
 - * The current size of the caches array is stored in memcg_nr_cache_ids. It
 - * will double each time we have to increase it.
 - */
 -static DEFINE_IDA(memcg_cache_ida);
 -int memcg_nr_cache_ids;
 -
 -/* Protects memcg_nr_cache_ids */
 -static DECLARE_RWSEM(memcg_cache_ids_sem);
 -
 -void memcg_get_cache_ids(void)
 -{
 -      down_read(&memcg_cache_ids_sem);
 -}
 -
 -void memcg_put_cache_ids(void)
 -{
 -      up_read(&memcg_cache_ids_sem);
 -}
 -
 -/*
 - * MIN_SIZE is different than 1, because we would like to avoid going through
 - * the alloc/free process all the time. In a small machine, 4 kmem-limited
 - * cgroups is a reasonable guess. In the future, it could be a parameter or
 - * tunable, but that is strictly not necessary.
 - *
 - * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 - * this constant directly from cgroup, but it is understandable that this is
 - * better kept as an internal representation in cgroup.c. In any case, the
 - * cgrp_id space is not getting any smaller, and we don't have to necessarily
 - * increase ours as well if it increases.
 - */
 -#define MEMCG_CACHES_MIN_SIZE 4
 -#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 -
  /*
   * A lot of the calls to the cache allocation functions are expected to be
   * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
@@@ -587,35 -630,6 +588,35 @@@ static DEFINE_SPINLOCK(stats_flush_lock
  static DEFINE_PER_CPU(unsigned int, stats_updates);
  static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
  
 +/*
  + * Accessors to ensure that preemption is disabled on PREEMPT_RT, because the
  + * code cannot rely on preemption being disabled as part of acquiring a
  + * spinlock_t lock there. These functions are never used in hardirq context
  + * on PREEMPT_RT, so disabling preemption is sufficient.
 + */
 +static void memcg_stats_lock(void)
 +{
 +#ifdef CONFIG_PREEMPT_RT
 +      preempt_disable();
 +#else
 +      VM_BUG_ON(!irqs_disabled());
 +#endif
 +}
 +
 +static void __memcg_stats_lock(void)
 +{
 +#ifdef CONFIG_PREEMPT_RT
 +      preempt_disable();
 +#endif
 +}
 +
 +static void memcg_stats_unlock(void)
 +{
 +#ifdef CONFIG_PREEMPT_RT
 +      preempt_enable();
 +#endif
 +}
 +
  static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
  {
        unsigned int x;
@@@ -692,27 -706,6 +693,27 @@@ void __mod_memcg_lruvec_state(struct lr
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
  
 +      /*
  +       * The callers from rmap rely on disabled preemption because they never
  +       * update their counters from in-interrupt context. For these counters
  +       * we check that the update is never performed from an interrupt
  +       * context, while other callers need to have interrupts disabled.
 +       */
 +      __memcg_stats_lock();
 +      if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
 +              switch (idx) {
 +              case NR_ANON_MAPPED:
 +              case NR_FILE_MAPPED:
 +              case NR_ANON_THPS:
 +              case NR_SHMEM_PMDMAPPED:
 +              case NR_FILE_PMDMAPPED:
 +                      WARN_ON_ONCE(!in_task());
 +                      break;
 +              default:
 +                      WARN_ON_ONCE(!irqs_disabled());
 +              }
 +      }
 +
        /* Update memcg */
        __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
  
        __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
  
        memcg_rstat_updated(memcg, val);
 +      memcg_stats_unlock();
  }
  
  /**
@@@ -803,10 -795,8 +804,10 @@@ void __count_memcg_events(struct mem_cg
        if (mem_cgroup_disabled())
                return;
  
 +      memcg_stats_lock();
        __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
        memcg_rstat_updated(memcg, count);
 +      memcg_stats_unlock();
  }
  
  static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@@ -869,9 -859,6 +870,9 @@@ static bool mem_cgroup_event_ratelimit(
   */
  static void memcg_check_events(struct mem_cgroup *memcg, int nid)
  {
 +      if (IS_ENABLED(CONFIG_PREEMPT_RT))
 +              return;
 +
        /* threshold event is triggered in finer grain than soft limit */
        if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                MEM_CGROUP_TARGET_THRESH))) {
@@@ -1271,8 -1258,7 +1272,7 @@@ struct lruvec *folio_lruvec_lock_irqsav
   * @nr_pages: positive when adding or negative when removing
   *
   * This function must be called under lru_lock, just before a page is added
-  * to or just after a page is removed from an lru list (that ordering being
-  * so as to allow it to check that lru_size 0 is consistent with list_empty).
+  * to or just after a page is removed from an lru list.
   */
  void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                                int zid, int nr_pages)
@@@ -1385,7 -1371,6 +1385,7 @@@ struct memory_stat 
  static const struct memory_stat memory_stats[] = {
        { "anon",                       NR_ANON_MAPPED                  },
        { "file",                       NR_FILE_PAGES                   },
 +      { "kernel",                     MEMCG_KMEM                      },
        { "kernel_stack",               NR_KERNEL_STACK_KB              },
        { "pagetables",                 NR_PAGETABLE                    },
        { "percpu",                     MEMCG_PERCPU_B                  },
@@@ -1810,16 -1795,20 +1810,16 @@@ static void memcg_oom_recover(struct me
                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
  }
  
 -enum oom_status {
 -      OOM_SUCCESS,
 -      OOM_FAILED,
 -      OOM_ASYNC,
 -      OOM_SKIPPED
 -};
 -
 -static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 +/*
 + * Returns true if successfully killed one or more processes. Though in some
 + * corner cases it can return true even without killing any process.
 + */
 +static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  {
 -      enum oom_status ret;
 -      bool locked;
 +      bool locked, ret;
  
        if (order > PAGE_ALLOC_COSTLY_ORDER)
 -              return OOM_SKIPPED;
 +              return false;
  
        memcg_memory_event(memcg, MEMCG_OOM);
  
         * victim and then we have to bail out from the charge path.
         */
        if (memcg->oom_kill_disable) {
 -              if (!current->in_user_fault)
 -                      return OOM_SKIPPED;
 -              css_get(&memcg->css);
 -              current->memcg_in_oom = memcg;
 -              current->memcg_oom_gfp_mask = mask;
 -              current->memcg_oom_order = order;
 -
 -              return OOM_ASYNC;
 +              if (current->in_user_fault) {
 +                      css_get(&memcg->css);
 +                      current->memcg_in_oom = memcg;
 +                      current->memcg_oom_gfp_mask = mask;
 +                      current->memcg_oom_order = order;
 +              }
 +              return false;
        }
  
        mem_cgroup_mark_under_oom(memcg);
                mem_cgroup_oom_notify(memcg);
  
        mem_cgroup_unmark_under_oom(memcg);
 -      if (mem_cgroup_out_of_memory(memcg, mask, order))
 -              ret = OOM_SUCCESS;
 -      else
 -              ret = OOM_FAILED;
 +      ret = mem_cgroup_out_of_memory(memcg, mask, order);
  
        if (locked)
                mem_cgroup_oom_unlock(memcg);
@@@ -2092,47 -2085,45 +2092,47 @@@ void unlock_page_memcg(struct page *pag
        folio_memcg_unlock(page_folio(page));
  }
  
 -struct obj_stock {
 +struct memcg_stock_pcp {
 +      local_lock_t stock_lock;
  +      struct mem_cgroup *cached; /* this is never the root cgroup */
 +      unsigned int nr_pages;
 +
  #ifdef CONFIG_MEMCG_KMEM
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
        unsigned int nr_bytes;
        int nr_slab_reclaimable_b;
        int nr_slab_unreclaimable_b;
 -#else
 -      int dummy[0];
  #endif
 -};
 -
 -struct memcg_stock_pcp {
 -      struct mem_cgroup *cached; /* this never be root cgroup */
 -      unsigned int nr_pages;
 -      struct obj_stock task_obj;
 -      struct obj_stock irq_obj;
  
        struct work_struct work;
        unsigned long flags;
  #define FLUSHING_CACHED_CHARGE        0
  };
 -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
 +      .stock_lock = INIT_LOCAL_LOCK(stock_lock),
 +};
  static DEFINE_MUTEX(percpu_charge_mutex);
  
  #ifdef CONFIG_MEMCG_KMEM
 -static void drain_obj_stock(struct obj_stock *stock);
 +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg);
 +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
  
  #else
 -static inline void drain_obj_stock(struct obj_stock *stock)
 +static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
  {
 +      return NULL;
  }
  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg)
  {
        return false;
  }
 +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
 +{
 +}
  #endif
  
  /**
@@@ -2155,7 -2146,7 +2155,7 @@@ static bool consume_stock(struct mem_cg
        if (nr_pages > MEMCG_CHARGE_BATCH)
                return ret;
  
 -      local_irq_save(flags);
 +      local_lock_irqsave(&memcg_stock.stock_lock, flags);
  
        stock = this_cpu_ptr(&memcg_stock);
        if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
                ret = true;
        }
  
 -      local_irq_restore(flags);
 +      local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
  
        return ret;
  }
@@@ -2192,7 -2183,6 +2192,7 @@@ static void drain_stock(struct memcg_st
  static void drain_local_stock(struct work_struct *dummy)
  {
        struct memcg_stock_pcp *stock;
 +      struct obj_cgroup *old = NULL;
        unsigned long flags;
  
        /*
         * drain_stock races is that we always operate on local CPU stock
         * here with IRQ disabled
         */
 -      local_irq_save(flags);
 +      local_lock_irqsave(&memcg_stock.stock_lock, flags);
  
        stock = this_cpu_ptr(&memcg_stock);
 -      drain_obj_stock(&stock->irq_obj);
 -      if (in_task())
 -              drain_obj_stock(&stock->task_obj);
 +      old = drain_obj_stock(stock);
        drain_stock(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
  
 -      local_irq_restore(flags);
 +      local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 +      if (old)
 +              obj_cgroup_put(old);
  }
  
  /*
   * Cache charges(val) to local per_cpu area.
   * This will be consumed by consume_stock() function, later.
   */
 -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 +static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
        struct memcg_stock_pcp *stock;
 -      unsigned long flags;
 -
 -      local_irq_save(flags);
  
        stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached != memcg) { /* reset if necessary */
  
        if (stock->nr_pages > MEMCG_CHARGE_BATCH)
                drain_stock(stock);
 +}
  
 -      local_irq_restore(flags);
 +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 +{
 +      unsigned long flags;
 +
 +      local_lock_irqsave(&memcg_stock.stock_lock, flags);
 +      __refill_stock(memcg, nr_pages);
 +      local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
  }
  
  /*
@@@ -2258,8 -2244,7 +2258,8 @@@ static void drain_all_stock(struct mem_
         * as well as workers from this path always operate on the local
         * per-cpu data. CPU up doesn't touch memcg_stock at all.
         */
 -      curcpu = get_cpu();
 +      migrate_disable();
 +      curcpu = smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *memcg;
                                schedule_work_on(cpu, &stock->work);
                }
        }
 -      put_cpu();
 +      migrate_enable();
        mutex_unlock(&percpu_charge_mutex);
  }
  
@@@ -2556,6 -2541,7 +2556,6 @@@ static int try_charge_memcg(struct mem_
        int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
 -      enum oom_status oom_status;
        unsigned long nr_reclaimed;
        bool passed_oom = false;
        bool may_swap = true;
@@@ -2583,6 -2569,15 +2583,6 @@@ retry
                goto retry;
        }
  
 -      /*
 -       * Memcg doesn't have a dedicated reserve for atomic
 -       * allocations. But like the global atomic pool, we need to
 -       * put the burden of reclaim on regular allocation requests
 -       * and let these go through as privileged allocations.
 -       */
 -      if (gfp_mask & __GFP_ATOMIC)
 -              goto force;
 -
        /*
         * Prevent unbounded recursion when reclaim operations need to
         * allocate memory. This might exceed the limits temporarily,
         * a forward progress or bypass the charge if the oom killer
         * couldn't make any progress.
         */
 -      oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
 -                     get_order(nr_pages * PAGE_SIZE));
 -      if (oom_status == OOM_SUCCESS) {
 +      if (mem_cgroup_oom(mem_over_limit, gfp_mask,
 +                         get_order(nr_pages * PAGE_SIZE))) {
                passed_oom = true;
                nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
        }
  nomem:
 -      if (!(gfp_mask & __GFP_NOFAIL))
 +      /*
 +       * Memcg doesn't have a dedicated reserve for atomic
 +       * allocations. But like the global atomic pool, we need to
 +       * put the burden of reclaim on regular allocation requests
 +       * and let these go through as privileged allocations.
 +       */
 +      if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
                return -ENOMEM;
  force:
        /*
@@@ -2698,7 -2688,7 +2698,7 @@@ done_restock
                        READ_ONCE(memcg->swap.high);
  
                /* Don't bother a random interrupted task */
 -              if (in_interrupt()) {
 +              if (!in_task()) {
                        if (mem_high) {
                                schedule_work(&memcg->high_work);
                                break;
                }
        } while ((memcg = parent_mem_cgroup(memcg)));
  
 +      if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
 +          !(current->flags & PF_MEMALLOC) &&
 +          gfpflags_allow_blocking(gfp_mask)) {
 +              mem_cgroup_handle_over_high();
 +      }
        return 0;
  }
  
@@@ -2763,6 -2748,20 +2763,6 @@@ static void commit_charge(struct folio 
        folio->memcg_data = (unsigned long)memcg;
  }
  
 -static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
 -{
 -      struct mem_cgroup *memcg;
 -
 -      rcu_read_lock();
 -retry:
 -      memcg = obj_cgroup_memcg(objcg);
 -      if (unlikely(!css_tryget(&memcg->css)))
 -              goto retry;
 -      rcu_read_unlock();
 -
 -      return memcg;
 -}
 -
  #ifdef CONFIG_MEMCG_KMEM
  /*
   * The allocated objcg pointers array is not accounted directly.
   */
  #define OBJCGS_CLEAR_MASK     (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
  
 -/*
 - * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
 - * sequence used in this case to access content from object stock is slow.
 - * To optimize for user context access, there are now two object stocks for
 - * task context and interrupt context access respectively.
 - *
 - * The task context object stock can be accessed by disabling preemption only
 - * which is cheap in non-preempt kernel. The interrupt context object stock
 - * can only be accessed after disabling interrupt. User context code can
 - * access interrupt object stock, but not vice versa.
 - */
 -static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
 -{
 -      struct memcg_stock_pcp *stock;
 -
 -      if (likely(in_task())) {
 -              *pflags = 0UL;
 -              preempt_disable();
 -              stock = this_cpu_ptr(&memcg_stock);
 -              return &stock->task_obj;
 -      }
 -
 -      local_irq_save(*pflags);
 -      stock = this_cpu_ptr(&memcg_stock);
 -      return &stock->irq_obj;
 -}
 -
 -static inline void put_obj_stock(unsigned long flags)
 -{
 -      if (likely(in_task()))
 -              preempt_enable();
 -      else
 -              local_irq_restore(flags);
 -}
 -
  /*
   * mod_objcg_mlstate() may be called with irq enabled, so
   * mod_memcg_lruvec_state() should be used.
@@@ -2902,17 -2936,48 +2902,17 @@@ __always_inline struct obj_cgroup *get_
        return objcg;
  }
  
 -static int memcg_alloc_cache_id(void)
 +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
  {
 -      int id, size;
 -      int err;
 -
 -      id = ida_simple_get(&memcg_cache_ida,
 -                          0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
 -      if (id < 0)
 -              return id;
 -
 -      if (id < memcg_nr_cache_ids)
 -              return id;
 -
 -      /*
 -       * There's no space for the new id in memcg_caches arrays,
 -       * so we have to grow them.
 -       */
 -      down_write(&memcg_cache_ids_sem);
 -
 -      size = 2 * (id + 1);
 -      if (size < MEMCG_CACHES_MIN_SIZE)
 -              size = MEMCG_CACHES_MIN_SIZE;
 -      else if (size > MEMCG_CACHES_MAX_SIZE)
 -              size = MEMCG_CACHES_MAX_SIZE;
 -
 -      err = memcg_update_all_list_lrus(size);
 -      if (!err)
 -              memcg_nr_cache_ids = size;
 -
 -      up_write(&memcg_cache_ids_sem);
 -
 -      if (err) {
 -              ida_simple_remove(&memcg_cache_ida, id);
 -              return err;
 +      mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
 +      if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
 +              if (nr_pages > 0)
 +                      page_counter_charge(&memcg->kmem, nr_pages);
 +              else
 +                      page_counter_uncharge(&memcg->kmem, -nr_pages);
        }
 -      return id;
  }
  
 -static void memcg_free_cache_id(int id)
 -{
 -      ida_simple_remove(&memcg_cache_ida, id);
 -}
  
  /*
   * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
@@@ -2926,7 -2991,8 +2926,7 @@@ static void obj_cgroup_uncharge_pages(s
  
        memcg = get_mem_cgroup_from_objcg(objcg);
  
 -      if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 -              page_counter_uncharge(&memcg->kmem, nr_pages);
 +      memcg_account_kmem(memcg, -nr_pages);
        refill_stock(memcg, nr_pages);
  
        css_put(&memcg->css);
@@@ -2952,7 -3018,8 +2952,7 @@@ static int obj_cgroup_charge_pages(stru
        if (ret)
                goto out;
  
 -      if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 -              page_counter_charge(&memcg->kmem, nr_pages);
 +      memcg_account_kmem(memcg, nr_pages);
  out:
        css_put(&memcg->css);
  
@@@ -3008,21 -3075,17 +3008,21 @@@ void __memcg_kmem_uncharge_page(struct 
  void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
                     enum node_stat_item idx, int nr)
  {
 +      struct memcg_stock_pcp *stock;
 +      struct obj_cgroup *old = NULL;
        unsigned long flags;
 -      struct obj_stock *stock = get_obj_stock(&flags);
        int *bytes;
  
 +      local_lock_irqsave(&memcg_stock.stock_lock, flags);
 +      stock = this_cpu_ptr(&memcg_stock);
 +
        /*
         * Save vmstat data in stock and skip vmstat array update unless
         * accumulating over a page of vmstat data or when pgdat or idx
         * changes.
         */
        if (stock->cached_objcg != objcg) {
 -              drain_obj_stock(stock);
 +              old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
                                ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
        if (nr)
                mod_objcg_mlstate(objcg, pgdat, idx, nr);
  
 -      put_obj_stock(flags);
 +      local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 +      if (old)
 +              obj_cgroup_put(old);
  }
  
  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
  {
 +      struct memcg_stock_pcp *stock;
        unsigned long flags;
 -      struct obj_stock *stock = get_obj_stock(&flags);
        bool ret = false;
  
 +      local_lock_irqsave(&memcg_stock.stock_lock, flags);
 +
 +      stock = this_cpu_ptr(&memcg_stock);
        if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
                stock->nr_bytes -= nr_bytes;
                ret = true;
        }
  
 -      put_obj_stock(flags);
 +      local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
  
        return ret;
  }
  
 -static void drain_obj_stock(struct obj_stock *stock)
 +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
  {
        struct obj_cgroup *old = stock->cached_objcg;
  
        if (!old)
 -              return;
 +              return NULL;
  
        if (stock->nr_bytes) {
                unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
                unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
  
 -              if (nr_pages)
 -                      obj_cgroup_uncharge_pages(old, nr_pages);
 +              if (nr_pages) {
 +                      struct mem_cgroup *memcg;
 +
 +                      memcg = get_mem_cgroup_from_objcg(old);
 +
 +                      memcg_account_kmem(memcg, -nr_pages);
 +                      __refill_stock(memcg, nr_pages);
 +
 +                      css_put(&memcg->css);
 +              }
  
                /*
                 * The leftover is flushed to the centralized per-memcg value.
                stock->cached_pgdat = NULL;
        }
  
 -      obj_cgroup_put(old);
        stock->cached_objcg = NULL;
 +      /*
  +       * The `old' objcg needs to be released by the caller via
  +       * obj_cgroup_put(), outside of memcg_stock_pcp::stock_lock.
 +       */
 +      return old;
  }
  
  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
  {
        struct mem_cgroup *memcg;
  
 -      if (in_task() && stock->task_obj.cached_objcg) {
 -              memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
 -              if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
 -                      return true;
 -      }
 -      if (stock->irq_obj.cached_objcg) {
 -              memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
 +      if (stock->cached_objcg) {
 +              memcg = obj_cgroup_memcg(stock->cached_objcg);
                if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
                        return true;
        }
  static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
                             bool allow_uncharge)
  {
 +      struct memcg_stock_pcp *stock;
 +      struct obj_cgroup *old = NULL;
        unsigned long flags;
 -      struct obj_stock *stock = get_obj_stock(&flags);
        unsigned int nr_pages = 0;
  
 +      local_lock_irqsave(&memcg_stock.stock_lock, flags);
 +
 +      stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached_objcg != objcg) { /* reset if necessary */
 -              drain_obj_stock(stock);
 +              old = drain_obj_stock(stock);
                obj_cgroup_get(objcg);
                stock->cached_objcg = objcg;
                stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
                stock->nr_bytes &= (PAGE_SIZE - 1);
        }
  
 -      put_obj_stock(flags);
 +      local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 +      if (old)
 +              obj_cgroup_put(old);
  
        if (nr_pages)
                obj_cgroup_uncharge_pages(objcg, nr_pages);
@@@ -3580,23 -3625,28 +3580,23 @@@ static u64 mem_cgroup_read_u64(struct c
  static int memcg_online_kmem(struct mem_cgroup *memcg)
  {
        struct obj_cgroup *objcg;
 -      int memcg_id;
  
        if (cgroup_memory_nokmem)
                return 0;
  
 -      BUG_ON(memcg->kmemcg_id >= 0);
 -
 -      memcg_id = memcg_alloc_cache_id();
 -      if (memcg_id < 0)
 -              return memcg_id;
 +      if (unlikely(mem_cgroup_is_root(memcg)))
 +              return 0;
  
        objcg = obj_cgroup_alloc();
 -      if (!objcg) {
 -              memcg_free_cache_id(memcg_id);
 +      if (!objcg)
                return -ENOMEM;
 -      }
 +
        objcg->memcg = memcg;
        rcu_assign_pointer(memcg->objcg, objcg);
  
        static_branch_enable(&memcg_kmem_enabled_key);
  
 -      memcg->kmemcg_id = memcg_id;
 +      memcg->kmemcg_id = memcg->id.id;
  
        return 0;
  }
  static void memcg_offline_kmem(struct mem_cgroup *memcg)
  {
        struct mem_cgroup *parent;
 -      int kmemcg_id;
  
 -      if (memcg->kmemcg_id == -1)
 +      if (cgroup_memory_nokmem)
 +              return;
 +
 +      if (unlikely(mem_cgroup_is_root(memcg)))
                return;
  
        parent = parent_mem_cgroup(memcg);
  
        memcg_reparent_objcgs(memcg, parent);
  
 -      kmemcg_id = memcg->kmemcg_id;
 -      BUG_ON(kmemcg_id < 0);
 -
        /*
         * After we have finished memcg_reparent_objcgs(), all list_lrus
         * corresponding to this cgroup are guaranteed to remain empty.
         * The ordering is imposed by list_lru_node->lock taken by
 -       * memcg_drain_all_list_lrus().
 +       * memcg_reparent_list_lrus().
         */
 -      memcg_drain_all_list_lrus(kmemcg_id, parent);
 -
 -      memcg_free_cache_id(kmemcg_id);
 -      memcg->kmemcg_id = -1;
 +      memcg_reparent_list_lrus(memcg, parent);
  }
  #else
  static int memcg_online_kmem(struct mem_cgroup *memcg)
@@@ -3709,12 -3763,8 +3709,12 @@@ static ssize_t mem_cgroup_write(struct 
                }
                break;
        case RES_SOFT_LIMIT:
 -              memcg->soft_limit = nr_pages;
 -              ret = 0;
 +              if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
 +                      ret = -EOPNOTSUPP;
 +              } else {
 +                      memcg->soft_limit = nr_pages;
 +                      ret = 0;
 +              }
                break;
        }
        return ret ?: nbytes;
@@@ -4690,9 -4740,6 +4690,9 @@@ static ssize_t memcg_write_event_contro
        char *endp;
        int ret;
  
 +      if (IS_ENABLED(CONFIG_PREEMPT_RT))
 +              return -EOPNOTSUPP;
 +
        buf = strstrip(buf);
  
        efd = simple_strtoul(buf, &endp, 10);
@@@ -5020,8 -5067,18 +5020,8 @@@ struct mem_cgroup *mem_cgroup_from_id(u
  static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
  {
        struct mem_cgroup_per_node *pn;
 -      int tmp = node;
 -      /*
 -       * This routine is called against possible nodes.
 -       * But it's BUG to call kmalloc() against offline node.
 -       *
 -       * TODO: this routine can waste much memory for nodes which will
 -       *       never be onlined. It's better to use memory hotplug callback
 -       *       function.
 -       */
 -      if (!node_state(node, N_NORMAL_MEMORY))
 -              tmp = -1;
 -      pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
 +
 +      pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
        if (!pn)
                return 1;
  
        }
  
        lruvec_init(&pn->lruvec);
 -      pn->usage_in_excess = 0;
 -      pn->on_tree = false;
        pn->memcg = memcg;
  
        memcg->nodeinfo[node] = pn;
@@@ -5078,7 -5137,8 +5078,7 @@@ static struct mem_cgroup *mem_cgroup_al
                return ERR_PTR(error);
  
        memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
 -                               1, MEM_CGROUP_ID_MAX,
 -                               GFP_KERNEL);
 +                               1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
        if (memcg->id.id < 0) {
                error = memcg->id.id;
                goto fail;
@@@ -5132,6 -5192,7 +5132,6 @@@ mem_cgroup_css_alloc(struct cgroup_subs
  {
        struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
        struct mem_cgroup *memcg, *old_memcg;
 -      long error = -ENOMEM;
  
        old_memcg = set_active_memcg(parent);
        memcg = mem_cgroup_alloc();
                return &memcg->css;
        }
  
 -      /* The following stuff does not apply to the root */
 -      error = memcg_online_kmem(memcg);
 -      if (error)
 -              goto fail;
 -
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                static_branch_inc(&memcg_sockets_enabled_key);
  
        return &memcg->css;
 -fail:
 -      mem_cgroup_id_remove(memcg);
 -      mem_cgroup_free(memcg);
 -      return ERR_PTR(error);
  }
  
  static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
  {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
 +      if (memcg_online_kmem(memcg))
 +              goto remove_id;
 +
        /*
         * A memcg must be visible for expand_shrinker_info()
         * by the time the maps are allocated. So, we allocate maps
         * here, when for_each_mem_cgroup() can't skip it.
         */
 -      if (alloc_shrinker_info(memcg)) {
 -              mem_cgroup_id_remove(memcg);
 -              return -ENOMEM;
 -      }
 +      if (alloc_shrinker_info(memcg))
 +              goto offline_kmem;
  
        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
                queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
                                   2UL*HZ);
        return 0;
 +offline_kmem:
 +      memcg_offline_kmem(memcg);
 +remove_id:
 +      mem_cgroup_id_remove(memcg);
 +      return -ENOMEM;
  }
  
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
@@@ -5251,6 -5315,9 +5251,6 @@@ static void mem_cgroup_css_free(struct 
        cancel_work_sync(&memcg->high_work);
        mem_cgroup_remove_from_trees(memcg);
        free_shrinker_info(memcg);
 -
 -      /* Need to offline kmem if online_css() fails */
 -      memcg_offline_kmem(memcg);
        mem_cgroup_free(memcg);
  }
  
@@@ -5436,17 -5503,12 +5436,12 @@@ static struct page *mc_handle_swap_pte(
                return NULL;
  
        /*
-        * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
-        * a device and because they are not accessible by CPU they are store
-        * as special swap entry in the CPU page table.
+        * Handle device private pages that are not accessible by the CPU, but
+        * stored as special swap entries in the page table.
         */
        if (is_device_private_entry(ent)) {
                page = pfn_swap_entry_to_page(ent);
-               /*
-                * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
-                * a refcount of 1 when free (unlike normal page)
-                */
-               if (!page_ref_add_unless(page, 1, 1))
+               if (!get_page_unless_zero(page))
                        return NULL;
                return page;
        }
@@@ -6734,8 -6796,8 +6729,8 @@@ static void uncharge_batch(const struc
                page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
                if (do_memsw_account())
                        page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
 -              if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
 -                      page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
 +              if (ug->nr_kmem)
 +                      memcg_account_kmem(ug->memcg, -ug->nr_kmem);
                memcg_oom_recover(ug->memcg);
        }
  
@@@ -6754,6 -6816,7 +6749,6 @@@ static void uncharge_folio(struct foli
        long nr_pages;
        struct mem_cgroup *memcg;
        struct obj_cgroup *objcg;
 -      bool use_objcg = folio_memcg_kmem(folio);
  
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
  
         * folio memcg or objcg at this point, we have fully
         * exclusive access to the folio.
         */
 -      if (use_objcg) {
 +      if (folio_memcg_kmem(folio)) {
                objcg = __folio_objcg(folio);
                /*
                 * This get matches the put at the end of the function and
  
        nr_pages = folio_nr_pages(folio);
  
 -      if (use_objcg) {
 +      if (folio_memcg_kmem(folio)) {
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;
  
@@@ -6900,7 -6963,7 +6895,7 @@@ void mem_cgroup_sk_alloc(struct sock *s
                return;
  
        /* Do not associate the sock with unrelated interrupted task's memcg. */
 -      if (in_interrupt())
 +      if (!in_task())
                return;
  
        rcu_read_lock();
@@@ -6985,7 -7048,7 +6980,7 @@@ static int __init cgroup_memory(char *s
                if (!strcmp(token, "nokmem"))
                        cgroup_memory_nokmem = true;
        }
 -      return 0;
 +      return 1;
  }
  __setup("cgroup.memory=", cgroup_memory);
  
@@@ -7053,19 -7116,19 +7048,19 @@@ static struct mem_cgroup *mem_cgroup_id
  
  /**
   * mem_cgroup_swapout - transfer a memsw charge to swap
-  * @page: page whose memsw charge to transfer
+  * @folio: folio whose memsw charge to transfer
   * @entry: swap entry to move the charge to
   *
-  * Transfer the memsw charge of @page to @entry.
+  * Transfer the memsw charge of @folio to @entry.
   */
- void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
  {
        struct mem_cgroup *memcg, *swap_memcg;
        unsigned int nr_entries;
        unsigned short oldid;
  
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON_PAGE(page_count(page), page);
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+       VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
  
        if (mem_cgroup_disabled())
                return;
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return;
  
-       memcg = page_memcg(page);
+       memcg = folio_memcg(folio);
  
-       VM_WARN_ON_ONCE_PAGE(!memcg, page);
+       VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
        if (!memcg)
                return;
  
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
-       nr_entries = thp_nr_pages(page);
+       nr_entries = folio_nr_pages(folio);
        /* Get references for the tail pages, too */
        if (nr_entries > 1)
                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
                                   nr_entries);
-       VM_BUG_ON_PAGE(oldid, page);
+       VM_BUG_ON_FOLIO(oldid, folio);
        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
  
-       page->memcg_data = 0;
+       folio->memcg_data = 0;
  
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, nr_entries);
         * important here to have the interrupts disabled because it is the
         * only synchronisation we have for updating the per-CPU variables.
         */
 -      VM_BUG_ON(!irqs_disabled());
 +      memcg_stats_lock();
        mem_cgroup_charge_statistics(memcg, -nr_entries);
-       memcg_check_events(memcg, page_to_nid(page));
 +      memcg_stats_unlock();
+       memcg_check_events(memcg, folio_nid(folio));
  
        css_put(&memcg->css);
  }
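
The memcg_stock conversion above replaces open-coded local_irq_save()/local_irq_restore() around the per-CPU stock with a local_lock_t, which disables interrupts on non-PREEMPT_RT kernels and becomes a per-CPU sleeping lock on PREEMPT_RT. A stripped-down sketch of the same pattern (illustrative only; the structure and function names are made up):

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct pcp_cache {
	local_lock_t	lock;
	unsigned int	nr;
};

static DEFINE_PER_CPU(struct pcp_cache, pcp_counts) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void pcp_cache_add(unsigned int nr)
{
	struct pcp_cache *cache;
	unsigned long flags;

	/* !RT: disables interrupts; RT: takes a per-CPU spinlock instead. */
	local_lock_irqsave(&pcp_counts.lock, flags);
	cache = this_cpu_ptr(&pcp_counts);
	cache->nr += nr;
	local_unlock_irqrestore(&pcp_counts.lock, flags);
}
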
diff --combined mm/memory-failure.c
index 1434e0608d5a69774e36bbfc7c129724f3571e23,aa8236848949f5fe337dc7762253e27c1808a8cc..dcb6bb9cf73152f99824cdb5db0c2e2028c3e1b4
@@@ -130,6 -130,12 +130,6 @@@ static int hwpoison_filter_dev(struct p
            hwpoison_filter_dev_minor == ~0U)
                return 0;
  
 -      /*
 -       * page_mapping() does not accept slab pages.
 -       */
 -      if (PageSlab(p))
 -              return -EINVAL;
 -
        mapping = page_mapping(p);
        if (mapping == NULL || mapping->host == NULL)
                return -EINVAL;
@@@ -252,13 -258,16 +252,13 @@@ static int kill_proc(struct to_kill *tk
        pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
                        pfn, t->comm, t->pid);
  
 -      if (flags & MF_ACTION_REQUIRED) {
 -              if (t == current)
 -                      ret = force_sig_mceerr(BUS_MCEERR_AR,
 -                                       (void __user *)tk->addr, addr_lsb);
 -              else
 -                      /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */
 -                      ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
 -                              addr_lsb, t);
 -      } else {
 +      if ((flags & MF_ACTION_REQUIRED) && (t == current))
 +              ret = force_sig_mceerr(BUS_MCEERR_AR,
 +                               (void __user *)tk->addr, addr_lsb);
 +      else
                /*
 +               * Signal other processes sharing the page if they have
 +               * PF_MCE_EARLY set.
                 * Don't use force here, it's convenient if the signal
                 * can be temporarily blocked.
                 * This could cause a loop when the user sets SIGBUS
                 */
                ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
                                      addr_lsb, t);  /* synchronous? */
 -      }
        if (ret < 0)
                pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
                        t->comm, t->pid, ret);
@@@ -305,7 -315,6 +305,7 @@@ static unsigned long dev_pagemap_mappin
        pmd_t *pmd;
        pte_t *pte;
  
 +      VM_BUG_ON_VMA(address == -EFAULT, vma);
        pgd = pgd_offset(vma->vm_mm, address);
        if (!pgd_present(*pgd))
                return 0;
@@@ -478,12 -487,13 +478,13 @@@ static struct task_struct *task_early_k
  static void collect_procs_anon(struct page *page, struct list_head *to_kill,
                                int force_early)
  {
+       struct folio *folio = page_folio(page);
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct anon_vma *av;
        pgoff_t pgoff;
  
-       av = page_lock_anon_vma_read(page);
+       av = folio_lock_anon_vma_read(folio);
        if (av == NULL) /* Not actually mapped anymore */
                return;
  
@@@ -698,10 -708,8 +699,10 @@@ static int kill_accessing_process(struc
                              (void *)&priv);
        if (ret == 1 && priv.tk.addr)
                kill_proc(&priv.tk, pfn, flags);
 +      else
 +              ret = 0;
        mmap_read_unlock(p->mm);
 -      return ret ? -EFAULT : -EHWPOISON;
 +      return ret > 0 ? -EHWPOISON : -EFAULT;
  }
  
  static const char *action_name[] = {
@@@ -732,7 -740,6 +733,7 @@@ static const char * const action_page_t
        [MF_MSG_BUDDY]                  = "free buddy page",
        [MF_MSG_DAX]                    = "dax page",
        [MF_MSG_UNSPLIT_THP]            = "unsplit thp",
 +      [MF_MSG_DIFFERENT_PAGE_SIZE]    = "different page size",
        [MF_MSG_UNKNOWN]                = "unknown page",
  };
  
@@@ -1176,18 -1183,12 +1177,18 @@@ void ClearPageHWPoisonTakenOff(struct p
   * does not return true for hugetlb or device memory pages, so it's assumed
   * to be called only in the context where we never have such pages.
   */
 -static inline bool HWPoisonHandlable(struct page *page)
 +static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
  {
 -      return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page);
 +      bool movable = false;
 +
  +      /* Soft offline could migrate non-LRU movable pages */
 +      if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
 +              movable = true;
 +
 +      return movable || PageLRU(page) || is_free_buddy_page(page);
  }
  
 -static int __get_hwpoison_page(struct page *page)
 +static int __get_hwpoison_page(struct page *page, unsigned long flags)
  {
        struct page *head = compound_head(page);
        int ret = 0;
         * for any unsupported type of page in order to reduce the risk of
         * unexpected races caused by taking a page refcount.
         */
 -      if (!HWPoisonHandlable(head))
 +      if (!HWPoisonHandlable(head, flags))
                return -EBUSY;
  
        if (get_page_unless_zero(head)) {
@@@ -1227,7 -1228,7 +1228,7 @@@ static int get_any_page(struct page *p
  
  try_again:
        if (!count_increased) {
 -              ret = __get_hwpoison_page(p);
 +              ret = __get_hwpoison_page(p, flags);
                if (!ret) {
                        if (page_count(p)) {
                                /* We raced with an allocation, retry. */
                }
        }
  
 -      if (PageHuge(p) || HWPoisonHandlable(p)) {
 +      if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
                ret = 1;
        } else {
                /*
@@@ -1347,6 -1348,7 +1348,7 @@@ static int get_hwpoison_page(struct pag
  static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                                  int flags, struct page *hpage)
  {
+       struct folio *folio = page_folio(hpage);
        enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
        struct address_space *mapping;
        LIST_HEAD(tokill);
        if (kill)
                collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
  
 -      if (!PageHuge(hpage)) {
 -              try_to_unmap(folio, ttu);
 +      if (PageHuge(hpage) && !PageAnon(hpage)) {
 +              /*
 +               * For hugetlb pages in shared mappings, try_to_unmap
 +               * could potentially call huge_pmd_unshare.  Because of
 +               * this, take semaphore in write mode here and set
 +               * TTU_RMAP_LOCKED to indicate we have taken the lock
 +               * at this higher level.
 +               */
 +              mapping = hugetlb_page_mapping_lock_write(hpage);
 +              if (mapping) {
-                       try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
++                      try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
 +                      i_mmap_unlock_write(mapping);
 +              } else
 +                      pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
        } else {
-               try_to_unmap(hpage, ttu);
 -              if (!PageAnon(hpage)) {
 -                      /*
 -                       * For hugetlb pages in shared mappings, try_to_unmap
 -                       * could potentially call huge_pmd_unshare.  Because of
 -                       * this, take semaphore in write mode here and set
 -                       * TTU_RMAP_LOCKED to indicate we have taken the lock
 -                       * at this higher level.
 -                       */
 -                      mapping = hugetlb_page_mapping_lock_write(hpage);
 -                      if (mapping) {
 -                              try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
 -                              i_mmap_unlock_write(mapping);
 -                      } else
 -                              pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
 -              } else {
 -                      try_to_unmap(folio, ttu);
 -              }
++              try_to_unmap(folio, ttu);
        }
  
        unmap_success = !page_mapped(hpage);
@@@ -1522,7 -1528,7 +1524,7 @@@ static int memory_failure_hugetlb(unsig
                                if (TestClearPageHWPoison(head))
                                        num_poisoned_pages_dec();
                                unlock_page(head);
 -                              return 0;
 +                              return -EOPNOTSUPP;
                        }
                        unlock_page(head);
                        res = MF_FAILED;
        }
  
        lock_page(head);
 +
 +      /*
  +       * The page could have changed which compound page it belongs to
  +       * due to a race window. If this happens, just bail out.
 +       */
 +      if (!PageHuge(p) || compound_head(p) != head) {
 +              action_result(pfn, MF_MSG_DIFFERENT_PAGE_SIZE, MF_IGNORED);
 +              res = -EBUSY;
 +              goto out;
 +      }
 +
        page_flags = head->flags;
  
 +      if (hwpoison_filter(p)) {
 +              if (TestClearPageHWPoison(head))
 +                      num_poisoned_pages_dec();
 +              put_page(p);
 +              res = -EOPNOTSUPP;
 +              goto out;
 +      }
 +
        /*
         * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
         * simply disable it. In order to make it work properly, we need
@@@ -1628,7 -1615,7 +1630,7 @@@ static int memory_failure_dev_pagemap(u
                goto out;
  
        if (hwpoison_filter(page)) {
 -              rc = 0;
 +              rc = -EOPNOTSUPP;
                goto unlock;
        }
  
         * SIGBUS (i.e. MF_MUST_KILL)
         */
        flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
 -      collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
 +      collect_procs(page, &tokill, true);
  
        list_for_each_entry(tk, &tokill, nd)
                if (tk->size_shift)
                start = (page->index << PAGE_SHIFT) & ~(size - 1);
                unmap_mapping_range(page->mapping, start, size, 0);
        }
 -      kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags);
 +      kill_procs(&tokill, true, false, pfn, flags);
        rc = 0;
  unlock:
        dax_unlock_page(page, cookie);
@@@ -1697,15 -1684,12 +1699,15 @@@ static DEFINE_MUTEX(mf_mutex)
   *
   * Must run in process context (e.g. a work queue) with interrupts
   * enabled and no spinlocks hold.
 + *
  + * Return: 0 if the memory error was successfully handled,
  + *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
  + *         < 0 (other than -EOPNOTSUPP) on failure.
   */
  int memory_failure(unsigned long pfn, int flags)
  {
        struct page *p;
        struct page *hpage;
 -      struct page *orig_head;
        struct dev_pagemap *pgmap;
        int res = 0;
        unsigned long page_flags;
@@@ -1751,7 -1735,7 +1753,7 @@@ try_again
                goto unlock_mutex;
        }
  
 -      orig_head = hpage = compound_head(p);
 +      hpage = compound_head(p);
        num_poisoned_pages_inc();
  
        /*
        lock_page(p);
  
        /*
 -       * The page could have changed compound pages during the locking.
 -       * If this happens just bail out.
  +       * We only intend to deal with non-compound pages here. However,
  +       * the page could have become part of a compound page due to a
  +       * race window. If this happens, we can try again in the hope of
  +       * handling the page in the next round.
         */
 -      if (PageCompound(p) && compound_head(p) != orig_head) {
 +      if (PageCompound(p)) {
 +              if (retry) {
 +                      if (TestClearPageHWPoison(p))
 +                              num_poisoned_pages_dec();
 +                      unlock_page(p);
 +                      put_page(p);
 +                      flags &= ~MF_COUNT_INCREASED;
 +                      retry = false;
 +                      goto try_again;
 +              }
                action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
                res = -EBUSY;
                goto unlock_page;
                        num_poisoned_pages_dec();
                unlock_page(p);
                put_page(p);
 +              res = -EOPNOTSUPP;
                goto unlock_mutex;
        }
  
         * page_lock. We need wait writeback completion for this page or it
         * may trigger vfs BUG while evict inode.
         */
 -      if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
 +      if (!PageLRU(p) && !PageWriteback(p))
                goto identify_page_state;
  
        /*
@@@ -2169,7 -2141,7 +2171,7 @@@ static bool isolate_page(struct page *p
   */
  static int __soft_offline_page(struct page *page)
  {
-       int ret = 0;
+       long ret = 0;
        unsigned long pfn = page_to_pfn(page);
        struct page *hpage = compound_head(page);
        char const *msg_page[] = {"page", "hugepage"};
                .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
        };
  
 -      /*
 -       * Check PageHWPoison again inside page lock because PageHWPoison
 -       * is set by memory_failure() outside page lock. Note that
 -       * memory_failure() also double-checks PageHWPoison inside page lock,
 -       * so there's no race between soft_offline_page() and memory_failure().
 -       */
        lock_page(page);
        if (!PageHuge(page))
                wait_on_page_writeback(page);
                return 0;
        }
  
 -      if (!PageHuge(page))
 +      if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
                /*
                 * Try to invalidate first. This should work for
                 * non dirty unmapped page cache pages.
                ret = invalidate_inode_page(page);
        unlock_page(page);
  
 -      /*
 -       * RED-PEN would be better to keep it isolated here, but we
 -       * would need to fix isolation locking first.
 -       */
        if (ret) {
                pr_info("soft_offline: %#lx: invalidated\n", pfn);
                page_handle_poison(page, false, true);
                        if (!list_empty(&pagelist))
                                putback_movable_pages(&pagelist);
  
-                       pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
+                       pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
                                pfn, msg_page[huge], ret, &page->flags);
                        if (ret > 0)
                                ret = -EBUSY;
@@@ -2308,7 -2290,7 +2310,7 @@@ int soft_offline_page(unsigned long pfn
  
  retry:
        get_online_mems();
 -      ret = get_hwpoison_page(page, flags);
 +      ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
        put_online_mems();
  
        if (ret > 0) {
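
The hunks above give memory_failure() a three-way return convention: 0 when the error was handled, -EOPNOTSUPP when hwpoison_filter() filtered the event, and any other negative value for a real failure. A minimal userspace sketch of how a caller might branch on that convention; fake_memory_failure() and handle_mce_result() are illustrative stand-ins, not kernel API:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for memory_failure()'s return value. */
static int fake_memory_failure(int scenario)
{
	switch (scenario) {
	case 0:  return 0;            /* handled */
	case 1:  return -EOPNOTSUPP;  /* filtered by hwpoison_filter() */
	default: return -EBUSY;       /* e.g. page changed under us */
	}
}

static void handle_mce_result(unsigned long pfn, int rc)
{
	if (rc == 0)
		printf("pfn %#lx: recovered\n", pfn);
	else if (rc == -EOPNOTSUPP)
		printf("pfn %#lx: event filtered, not a recovery failure\n", pfn);
	else
		printf("pfn %#lx: recovery failed (%d)\n", pfn, rc);
}

int main(void)
{
	for (int s = 0; s < 3; s++)
		handle_mce_result(0x1234, fake_memory_failure(s));
	return 0;
}
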
diff --combined mm/memory.c
index e0f3410fa70cb1e96e9e9fc40c1c4d01e5eda12c,53bd9e5f2e33adfc48516ac4a24cc54501b5a0d2..7c40850b7124eb190f1dc345b801e1234b740d78
@@@ -735,9 -735,6 +735,6 @@@ static void restore_exclusive_pte(struc
  
        set_pte_at(vma->vm_mm, address, ptep, pte);
  
-       if (vma->vm_flags & VM_LOCKED)
-               mlock_vma_page(page);
        /*
         * No need to invalidate - it was non-present before. However
         * secondary CPUs may have mappings that need invalidating.
@@@ -1309,34 -1306,22 +1306,34 @@@ copy_page_range(struct vm_area_struct *
   * Parameter block passed down to zap_pte_range in exceptional cases.
   */
  struct zap_details {
 -      struct address_space *zap_mapping;      /* Check page->mapping if set */
        struct folio *single_folio;     /* Locked folio to be unmapped */
 +      bool even_cows;                 /* Zap COWed private pages too? */
  };
  
 -/*
 - * We set details->zap_mapping when we want to unmap shared but keep private
 - * pages. Return true if skip zapping this page, false otherwise.
 - */
 -static inline bool
 -zap_skip_check_mapping(struct zap_details *details, struct page *page)
 +/* Whether we should zap all COWed (private) pages too */
 +static inline bool should_zap_cows(struct zap_details *details)
  {
 -      if (!details || !page)
 -              return false;
 +      /* By default, zap all pages */
 +      if (!details)
 +              return true;
 +
 +      /* Or, we zap COWed pages only if the caller wants to */
 +      return details->even_cows;
 +}
 +
 +/* Decides whether we should zap this page with the page pointer specified */
 +static inline bool should_zap_page(struct zap_details *details, struct page *page)
 +{
 +      /* If we can make a decision without *page.. */
 +      if (should_zap_cows(details))
 +              return true;
 +
 +      /* E.g. the caller passes NULL for the case of a zero page */
 +      if (!page)
 +              return true;
  
 -      return details->zap_mapping &&
 -              (details->zap_mapping != page_rmapping(page));
 +      /* Otherwise we should only zap non-anon pages */
 +      return !PageAnon(page);
  }
  
  static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@@ -1361,8 -1346,6 +1358,8 @@@ again
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = *pte;
 +              struct page *page;
 +
                if (pte_none(ptent))
                        continue;
  
                        break;
  
                if (pte_present(ptent)) {
 -                      struct page *page;
 -
                        page = vm_normal_page(vma, addr, ptent);
 -                      if (unlikely(zap_skip_check_mapping(details, page)))
 +                      if (unlikely(!should_zap_page(details, page)))
                                continue;
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                                        mark_page_accessed(page);
                        }
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page, false);
+                       page_remove_rmap(page, vma, false);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(__tlb_remove_page(tlb, page))) {
                entry = pte_to_swp_entry(ptent);
                if (is_device_private_entry(entry) ||
                    is_device_exclusive_entry(entry)) {
 -                      struct page *page = pfn_swap_entry_to_page(entry);
 -
 -                      if (unlikely(zap_skip_check_mapping(details, page)))
 +                      page = pfn_swap_entry_to_page(entry);
 +                      if (unlikely(!should_zap_page(details, page)))
                                continue;
 -                      pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
                        if (is_device_private_entry(entry))
-                               page_remove_rmap(page, false);
+                               page_remove_rmap(page, vma, false);
                        put_page(page);
 -                      continue;
 -              }
 -
 -              /* If details->check_mapping, we leave swap entries. */
 -              if (unlikely(details))
 -                      continue;
 -
 -              if (!non_swap_entry(entry))
 +              } else if (!non_swap_entry(entry)) {
 +                      /* Genuine swap entry, hence a private anon page */
 +                      if (!should_zap_cows(details))
 +                              continue;
                        rss[MM_SWAPENTS]--;
 -              else if (is_migration_entry(entry)) {
 -                      struct page *page;
 -
 +                      if (unlikely(!free_swap_and_cache(entry)))
 +                              print_bad_pte(vma, addr, ptent, NULL);
 +              } else if (is_migration_entry(entry)) {
                        page = pfn_swap_entry_to_page(entry);
 +                      if (!should_zap_page(details, page))
 +                              continue;
                        rss[mm_counter(page)]--;
 +              } else if (is_hwpoison_entry(entry)) {
 +                      if (!should_zap_cows(details))
 +                              continue;
 +              } else {
 +                      /* We should have covered all the swap entry types */
 +                      WARN_ON_ONCE(1);
                }
 -              if (unlikely(!free_swap_and_cache(entry)))
 -                      print_bad_pte(vma, addr, ptent, NULL);
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);
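
The even_cows flag above replaces the old zap_mapping check: with no details the caller zaps everything; with details, anonymous (COWed) pages are zapped only when even_cows is set, while file-backed pages are always zapped. A small userspace model of that decision table; the struct and the boolean parameters are simplified stand-ins for the kernel types:

#include <stdbool.h>
#include <stdio.h>

struct zap_details {		/* simplified stand-in */
	bool even_cows;		/* zap COWed private pages too? */
};

static bool should_zap_cows(const struct zap_details *details)
{
	return !details || details->even_cows;	/* no details: zap everything */
}

static bool should_zap_page(const struct zap_details *details,
			    bool have_page, bool page_is_anon)
{
	if (should_zap_cows(details))
		return true;		/* decision possible without the page */
	if (!have_page)
		return true;		/* e.g. zero page: NULL page pointer */
	return !page_is_anon;		/* otherwise only zap non-anon pages */
}

int main(void)
{
	struct zap_details keep_cows = { .even_cows = false };

	printf("no details, anon page: %d\n", should_zap_page(NULL, true, true));
	printf("keep cows,  anon page: %d\n", should_zap_page(&keep_cows, true, true));
	printf("keep cows,  file page: %d\n", should_zap_page(&keep_cows, true, false));
	return 0;
}
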
  
@@@ -1715,7 -1700,7 +1712,7 @@@ static void zap_page_range_single(struc
  void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
  {
 -      if (address < vma->vm_start || address + size > vma->vm_end ||
 +      if (!range_in_vma(vma, address, address + size) ||
                        !(vma->vm_flags & VM_PFNMAP))
                return;
  
@@@ -1763,16 -1748,16 +1760,16 @@@ static int validate_page_before_insert(
        return 0;
  }
  
- static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
  {
        if (!pte_none(*pte))
                return -EBUSY;
        /* Ok, finally just insert the thing.. */
        get_page(page);
-       inc_mm_counter_fast(mm, mm_counter_file(page));
-       page_add_file_rmap(page, false);
-       set_pte_at(mm, addr, pte, mk_pte(page, prot));
+       inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+       page_add_file_rmap(page, vma, false);
+       set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
        return 0;
  }
  
  static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot)
  {
-       struct mm_struct *mm = vma->vm_mm;
        int retval;
        pte_t *pte;
        spinlock_t *ptl;
        if (retval)
                goto out;
        retval = -ENOMEM;
-       pte = get_locked_pte(mm, addr, &ptl);
+       pte = get_locked_pte(vma->vm_mm, addr, &ptl);
        if (!pte)
                goto out;
-       retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
+       retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
        pte_unmap_unlock(pte, ptl);
  out:
        return retval;
  }
  
  #ifdef pte_index
- static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
  {
        int err;
        err = validate_page_before_insert(page);
        if (err)
                return err;
-       return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+       return insert_page_into_pte_locked(vma, pte, addr, page, prot);
  }
  
  /* insert_pages() amortizes the cost of spinlock operations
@@@ -1852,7 -1836,7 +1848,7 @@@ more
  
                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
-                       int err = insert_page_in_batch_locked(mm, pte,
+                       int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
@@@ -3108,7 -3092,7 +3104,7 @@@ static vm_fault_t wp_page_copy(struct v
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
-                       page_remove_rmap(old_page, false);
+                       page_remove_rmap(old_page, vma, false);
                }
  
                /* Free the old page.. */
         */
        mmu_notifier_invalidate_range_only_end(&range);
        if (old_page) {
-               /*
-                * Don't let another task, with possibly unlocked vma,
-                * keep the mlocked page.
-                */
-               if (page_copied && (vma->vm_flags & VM_LOCKED)) {
-                       lock_page(old_page);    /* LRU manipulation */
-                       if (PageMlocked(old_page))
-                               munlock_vma_page(old_page);
-                       unlock_page(old_page);
-               }
                if (page_copied)
                        free_swap_cache(old_page);
                put_page(old_page);
@@@ -3350,8 -3324,12 +3336,8 @@@ static inline void unmap_mapping_range_
        vma_interval_tree_foreach(vma, root, first_index, last_index) {
                vba = vma->vm_pgoff;
                vea = vba + vma_pages(vma) - 1;
 -              zba = first_index;
 -              if (zba < vba)
 -                      zba = vba;
 -              zea = last_index;
 -              if (zea > vea)
 -                      zea = vea;
 +              zba = max(first_index, vba);
 +              zea = min(last_index, vea);
  
                unmap_mapping_range_vma(vma,
                        ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
@@@ -3383,7 -3361,7 +3369,7 @@@ void unmap_mapping_folio(struct folio *
        first_index = folio->index;
        last_index = folio->index + folio_nr_pages(folio) - 1;
  
 -      details.zap_mapping = mapping;
 +      details.even_cows = false;
        details.single_folio = folio;
  
        i_mmap_lock_write(mapping);
@@@ -3412,7 -3390,7 +3398,7 @@@ void unmap_mapping_pages(struct address
        pgoff_t first_index = start;
        pgoff_t last_index = start + nr - 1;
  
 -      details.zap_mapping = even_cows ? NULL : mapping;
 +      details.even_cows = even_cows;
        if (last_index < first_index)
                last_index = ULONG_MAX;
  
@@@ -3877,16 -3855,11 +3863,16 @@@ static vm_fault_t __do_fault(struct vm_
                return ret;
  
        if (unlikely(PageHWPoison(vmf->page))) {
 -              if (ret & VM_FAULT_LOCKED)
 +              vm_fault_t poisonret = VM_FAULT_HWPOISON;
 +              if (ret & VM_FAULT_LOCKED) {
 +                      /* Retry if a clean page was removed from the cache. */
 +                      if (invalidate_inode_page(vmf->page))
 +                              poisonret = 0;
                        unlock_page(vmf->page);
 +              }
                put_page(vmf->page);
                vmf->page = NULL;
 -              return VM_FAULT_HWPOISON;
 +              return poisonret;
        }
  
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
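
The hunk above changes the HWPoison path in __do_fault(): when the fault handler returned a locked, poisoned page, it now tries invalidate_inode_page() and, if the clean page could be dropped from the page cache, returns 0 so the fault is retried instead of raising VM_FAULT_HWPOISON. A rough userspace model of that decision; the function names and return codes here are illustrative stand-ins:

#include <stdbool.h>
#include <stdio.h>

#define FAULT_RETRY     0	/* caller re-faults and reads a fresh page */
#define FAULT_HWPOISON  1	/* deliver a SIGBUS-style failure */

/* Illustrative stand-in: invalidation works only for clean, unmapped pages. */
static bool try_invalidate(bool page_dirty, bool page_mapped)
{
	return !page_dirty && !page_mapped;
}

static int fault_in(bool poisoned, bool locked, bool dirty, bool mapped)
{
	if (!poisoned)
		return FAULT_RETRY;	/* nothing to do in this model */
	if (locked && try_invalidate(dirty, mapped))
		return FAULT_RETRY;	/* clean copy dropped, retry the fault */
	return FAULT_HWPOISON;
}

int main(void)
{
	printf("clean cached page: %d\n", fault_in(true, true, false, false));
	printf("dirty cached page: %d\n", fault_in(true, true, true, false));
	return 0;
}
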
@@@ -3958,7 -3931,8 +3944,8 @@@ vm_fault_t do_set_pmd(struct vm_fault *
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
  
        add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
-       page_add_file_rmap(page, true);
+       page_add_file_rmap(page, vma, true);
        /*
         * deposit and withdraw with pmd lock held
         */
@@@ -4007,7 -3981,7 +3994,7 @@@ void do_set_pte(struct vm_fault *vmf, s
                lru_cache_add_inactive_or_unevictable(page, vma);
        } else {
                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
-               page_add_file_rmap(page, false);
+               page_add_file_rmap(page, vma, false);
        }
        set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
  }
@@@ -4633,7 -4607,6 +4620,7 @@@ static vm_fault_t __handle_mm_fault(str
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
 +              .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
@@@ -5456,8 -5429,6 +5443,8 @@@ long copy_huge_page_from_user(struct pa
                if (rc)
                        break;
  
 +              flush_dcache_page(subpage);
 +
                cond_resched();
        }
        return ret_val;
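
unmap_mapping_range_tree() above now clamps the zap range to each vma with max()/min() instead of the open-coded comparisons: zba = max(first_index, vba), zea = min(last_index, vea). The same interval-intersection arithmetic in standalone form (page-offset units; the names below are chosen only for illustration):

#include <stdio.h>

struct range { unsigned long first, last; };	/* inclusive page offsets */

/* Intersect the requested zap range with one vma's pgoff range. */
static struct range clamp_to_vma(struct range zap, struct range vma)
{
	struct range r;

	r.first = zap.first > vma.first ? zap.first : vma.first;	/* max() */
	r.last  = zap.last  < vma.last  ? zap.last  : vma.last;	/* min() */
	return r;
}

int main(void)
{
	struct range zap = { 10, 100 };		/* first_index..last_index */
	struct range vma = { 64, 127 };		/* vba..vea */
	struct range hit = clamp_to_vma(zap, vma);

	printf("zap offsets %lu..%lu within this vma\n", hit.first, hit.last);
	return 0;
}
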
diff --combined mm/memory_hotplug.c
index aee69281dad682560afbb7838786e51b07c58a4e,914057da53c7521e59e8442ebd58f7209fc195ed..416b38ca8defa3af0e3599aa2698d431c4fe8699
@@@ -295,6 -295,12 +295,6 @@@ struct page *pfn_to_online_page(unsigne
  }
  EXPORT_SYMBOL_GPL(pfn_to_online_page);
  
 -/*
 - * Reasonably generic function for adding memory.  It is
 - * expected that archs that support memory hotplug will
 - * call this function after deciding the zone to which to
 - * add the new pages.
 - */
  int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
                struct mhp_params *params)
  {
@@@ -823,7 -829,7 +823,7 @@@ static struct zone *default_kernel_zone
        struct pglist_data *pgdat = NODE_DATA(nid);
        int zid;
  
 -      for (zid = 0; zid <= ZONE_NORMAL; zid++) {
 +      for (zid = 0; zid < ZONE_NORMAL; zid++) {
                struct zone *zone = &pgdat->node_zones[zid];
  
                if (zone_intersects(zone, start_pfn, nr_pages))
@@@ -1156,20 -1162,43 +1156,20 @@@ static void reset_node_present_pages(pg
  }
  
  /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
 -static pg_data_t __ref *hotadd_new_pgdat(int nid)
 +static pg_data_t __ref *hotadd_init_pgdat(int nid)
  {
        struct pglist_data *pgdat;
  
 +      /*
 +       * NODE_DATA is preallocated (free_area_init) but its internal
 +       * state is not allocated completely. Add missing pieces.
 +       * Completely offline nodes stay around and they just need
 +       * reinitialization.

 +       */
        pgdat = NODE_DATA(nid);
 -      if (!pgdat) {
 -              pgdat = arch_alloc_nodedata(nid);
 -              if (!pgdat)
 -                      return NULL;
 -
 -              pgdat->per_cpu_nodestats =
 -                      alloc_percpu(struct per_cpu_nodestat);
 -              arch_refresh_nodedata(nid, pgdat);
 -      } else {
 -              int cpu;
 -              /*
 -               * Reset the nr_zones, order and highest_zoneidx before reuse.
 -               * Note that kswapd will init kswapd_highest_zoneidx properly
 -               * when it starts in the near future.
 -               */
 -              pgdat->nr_zones = 0;
 -              pgdat->kswapd_order = 0;
 -              pgdat->kswapd_highest_zoneidx = 0;
 -              for_each_online_cpu(cpu) {
 -                      struct per_cpu_nodestat *p;
 -
 -                      p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 -                      memset(p, 0, sizeof(*p));
 -              }
 -      }
 -
 -      /* we can use NODE_DATA(nid) from here */
 -      pgdat->node_id = nid;
 -      pgdat->node_start_pfn = 0;
  
        /* init node's zones as empty zones, we don't have any present pages.*/
 -      free_area_init_core_hotplug(nid);
 +      free_area_init_core_hotplug(pgdat);
  
        /*
         * The node we allocated has no zone fallback lists. For avoiding
         * When memory is hot-added, all the memory is in offline state. So
         * clear all zones' present_pages because they will be updated in
         * online_pages() and offline_pages().
 +       * TODO: should be in free_area_init_core_hotplug?
         */
        reset_node_managed_pages(pgdat);
        reset_node_present_pages(pgdat);
        return pgdat;
  }
  
 -static void rollback_node_hotadd(int nid)
 -{
 -      pg_data_t *pgdat = NODE_DATA(nid);
 -
 -      arch_refresh_nodedata(nid, NULL);
 -      free_percpu(pgdat->per_cpu_nodestats);
 -      arch_free_nodedata(pgdat);
 -}
 -
 -
  /*
   * __try_online_node - online a node if offlined
   * @nid: the node ID
@@@ -1208,7 -1246,7 +1208,7 @@@ static int __try_online_node(int nid, b
        if (node_online(nid))
                return 0;
  
 -      pgdat = hotadd_new_pgdat(nid);
 +      pgdat = hotadd_init_pgdat(nid);
        if (!pgdat) {
                pr_err("Cannot online node %d due to NULL pgdat\n", nid);
                ret = -ENOMEM;
@@@ -1289,7 -1327,7 +1289,7 @@@ bool mhp_supports_memmap_on_memory(unsi
         *       populate a single PMD.
         */
        return memmap_on_memory &&
 -             !hugetlb_free_vmemmap_enabled &&
 +             !hugetlb_free_vmemmap_enabled() &&
               IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
               size == memory_block_size_bytes() &&
               IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
@@@ -1383,9 -1421,9 +1383,9 @@@ int __ref add_memory_resource(int nid, 
                BUG_ON(ret);
        }
  
 -      /* link memory sections under this node.*/
 -      link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
 -                        MEMINIT_HOTPLUG);
 +      register_memory_blocks_under_node(nid, PFN_DOWN(start),
 +                                        PFN_UP(start + size - 1),
 +                                        MEMINIT_HOTPLUG);
  
        /* create new memmap entry */
        if (!strcmp(res->name, "System RAM"))
  
        return ret;
  error:
 -      /* rollback pgdat allocation and others */
 -      if (new_node)
 -              rollback_node_hotadd(nid);
        if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
                memblock_remove(start, size);
  error_mem_hotplug_end:
@@@ -1548,6 -1589,38 +1548,6 @@@ bool mhp_range_allowed(u64 start, u64 s
  }
  
  #ifdef CONFIG_MEMORY_HOTREMOVE
 -/*
 - * Confirm all pages in a range [start, end) belong to the same zone (skipping
 - * memory holes). When true, return the zone.
 - */
 -struct zone *test_pages_in_a_zone(unsigned long start_pfn,
 -                                unsigned long end_pfn)
 -{
 -      unsigned long pfn, sec_end_pfn;
 -      struct zone *zone = NULL;
 -      struct page *page;
 -
 -      for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
 -           pfn < end_pfn;
 -           pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
 -              /* Make sure the memory section is present first */
 -              if (!present_section_nr(pfn_to_section_nr(pfn)))
 -                      continue;
 -              for (; pfn < sec_end_pfn && pfn < end_pfn;
 -                   pfn += MAX_ORDER_NR_PAGES) {
 -                      /* Check if we got outside of the zone */
 -                      if (zone && !zone_spans_pfn(zone, pfn))
 -                              return NULL;
 -                      page = pfn_to_page(pfn);
 -                      if (zone && page_zone(page) != zone)
 -                              return NULL;
 -                      zone = page_zone(page);
 -              }
 -      }
 -
 -      return zone;
 -}
 -
  /*
   * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
   * non-lru movable pages and hugepages). Will skip over most unmovable
@@@ -1617,10 -1690,13 +1617,13 @@@ do_migrate_range(unsigned long start_pf
                                      DEFAULT_RATELIMIT_BURST);
  
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               struct folio *folio;
                if (!pfn_valid(pfn))
                        continue;
                page = pfn_to_page(pfn);
-               head = compound_head(page);
+               folio = page_folio(page);
+               head = &folio->page;
  
                if (PageHuge(page)) {
                        pfn = page_to_pfn(head) + compound_nr(head) - 1;
                 * the unmap as the catch all safety net).
                 */
                if (PageHWPoison(page)) {
-                       if (WARN_ON(PageLRU(page)))
-                               isolate_lru_page(page);
-                       if (page_mapped(page))
-                               try_to_unmap(page, TTU_IGNORE_MLOCK);
+                       if (WARN_ON(folio_test_lru(folio)))
+                               folio_isolate_lru(folio);
+                       if (folio_mapped(folio))
+                               try_to_unmap(folio, TTU_IGNORE_MLOCK);
                        continue;
                }
  
@@@ -1771,15 -1847,15 +1774,15 @@@ static int count_system_ram_pages_cb(un
  }
  
  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 -                      struct memory_group *group)
 +                      struct zone *zone, struct memory_group *group)
  {
        const unsigned long end_pfn = start_pfn + nr_pages;
        unsigned long pfn, system_ram_pages = 0;
 +      const int node = zone_to_nid(zone);
        unsigned long flags;
 -      struct zone *zone;
        struct memory_notify arg;
 -      int ret, node;
        char *reason;
 +      int ret;
  
        /*
         * {on,off}lining is constrained to full memory sections (or more
                goto failed_removal;
        }
  
 -      /* This makes hotplug much easier...and readable.
 -         we assume this for now. .*/
 -      zone = test_pages_in_a_zone(start_pfn, end_pfn);
 -      if (!zone) {
 +      /*
 +       * We only support offlining of memory blocks managed by a single zone,
 +       * checked by calling code. This is just a sanity check that we might
 +       * want to remove in the future.
 +       */
 +      if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone ||
 +                       page_zone(pfn_to_page(end_pfn - 1)) != zone)) {
                ret = -EINVAL;
                reason = "multizone range";
                goto failed_removal;
        }
 -      node = zone_to_nid(zone);
  
        /*
         * Disable pcplists so that page isolation cannot race with freeing
        return 0;
  
  failed_removal_isolated:
 +      /* pushback to free area */
        undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        memory_notify(MEM_CANCEL_OFFLINE, &arg);
  failed_removal_pcplists_disabled:
@@@ -1944,6 -2017,7 +1947,6 @@@ failed_removal
                 (unsigned long long) start_pfn << PAGE_SHIFT,
                 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
                 reason);
 -      /* pushback to free area */
        mem_hotplug_done();
        return ret;
  }
@@@ -1975,12 -2049,12 +1978,12 @@@ static int get_nr_vmemmap_pages_cb(stru
        return mem->nr_vmemmap_pages;
  }
  
 -static int check_cpu_on_node(pg_data_t *pgdat)
 +static int check_cpu_on_node(int nid)
  {
        int cpu;
  
        for_each_present_cpu(cpu) {
 -              if (cpu_to_node(cpu) == pgdat->node_id)
 +              if (cpu_to_node(cpu) == nid)
                        /*
                         * the cpu on this node isn't removed, and we can't
                         * offline this node.
@@@ -2014,6 -2088,7 +2017,6 @@@ static int check_no_memblock_for_node_c
   */
  void try_offline_node(int nid)
  {
 -      pg_data_t *pgdat = NODE_DATA(nid);
        int rc;
  
        /*
         * offline it. A node spans memory after move_pfn_range_to_zone(),
         * e.g., after the memory block was onlined.
         */
 -      if (pgdat->node_spanned_pages)
 +      if (node_spanned_pages(nid))
                return;
  
        /*
        if (rc)
                return;
  
 -      if (check_cpu_on_node(pgdat))
 +      if (check_cpu_on_node(nid))
                return;
  
        /*
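
offline_pages() above no longer walks the whole range with test_pages_in_a_zone(); the calling code guarantees the range sits in a single zone, and the function merely sanity-checks that the first and last pfn map to the zone it was handed. A toy model of that check; zone_of_pfn() is a made-up lookup, not a kernel helper:

#include <stdbool.h>
#include <stdio.h>

/* Made-up lookup: pretend pfns below 0x40000 are zone 0, the rest zone 1. */
static int zone_of_pfn(unsigned long pfn)
{
	return pfn < 0x40000 ? 0 : 1;
}

/* Mirrors the sanity check: only the range's two end points are examined. */
static bool range_in_zone(unsigned long start_pfn, unsigned long nr_pages, int zone)
{
	return zone_of_pfn(start_pfn) == zone &&
	       zone_of_pfn(start_pfn + nr_pages - 1) == zone;
}

int main(void)
{
	printf("single zone:  %d\n", range_in_zone(0x30000, 0x8000, 0));
	printf("crosses zone: %d\n", range_in_zone(0x3c000, 0x8000, 0));
	return 0;
}
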
diff --combined mm/memremap.c
index 4d73533d8ca79a51550c333267890f436bf37d8c,e00ffcdba7b632dee3845d67f2128ba5def6c947..c17eca4a48ca68a1bd4471d5a03996e881895044
@@@ -4,7 -4,7 +4,7 @@@
  #include <linux/io.h>
  #include <linux/kasan.h>
  #include <linux/memory_hotplug.h>
- #include <linux/mm.h>
+ #include <linux/memremap.h>
  #include <linux/pfn_t.h>
  #include <linux/swap.h>
  #include <linux/mmzone.h>
@@@ -12,6 -12,7 +12,7 @@@
  #include <linux/types.h>
  #include <linux/wait_bit.h>
  #include <linux/xarray.h>
+ #include "internal.h"
  
  static DEFINE_XARRAY(pgmap_array);
  
@@@ -37,21 -38,19 +38,19 @@@ unsigned long memremap_compat_align(voi
  EXPORT_SYMBOL_GPL(memremap_compat_align);
  #endif
  
- #ifdef CONFIG_DEV_PAGEMAP_OPS
+ #ifdef CONFIG_FS_DAX
  DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
  EXPORT_SYMBOL(devmap_managed_key);
  
  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
  {
-       if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
-           pgmap->type == MEMORY_DEVICE_FS_DAX)
+       if (pgmap->type == MEMORY_DEVICE_FS_DAX)
                static_branch_dec(&devmap_managed_key);
  }
  
  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
  {
-       if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
-           pgmap->type == MEMORY_DEVICE_FS_DAX)
+       if (pgmap->type == MEMORY_DEVICE_FS_DAX)
                static_branch_inc(&devmap_managed_key);
  }
  #else
@@@ -61,7 -60,7 +60,7 @@@ static void devmap_managed_enable_get(s
  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
  {
  }
- #endif /* CONFIG_DEV_PAGEMAP_OPS */
+ #endif /* CONFIG_FS_DAX */
  
  static void pgmap_array_delete(struct range *range)
  {
@@@ -102,23 -101,12 +101,12 @@@ static unsigned long pfn_end(struct dev
        return (range->start + range_len(range)) >> PAGE_SHIFT;
  }
  
- static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
- {
-       if (pfn % (1024 << pgmap->vmemmap_shift))
-               cond_resched();
-       return pfn + pgmap_vmemmap_nr(pgmap);
- }
  static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
  {
        return (pfn_end(pgmap, range_id) -
                pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
  }
  
- #define for_each_device_pfn(pfn, map, i) \
-       for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
-            pfn = pfn_next(map, pfn))
  static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
  {
        struct range *range = &pgmap->ranges[range_id];
  
  void memunmap_pages(struct dev_pagemap *pgmap)
  {
-       unsigned long pfn;
        int i;
  
        percpu_ref_kill(&pgmap->ref);
        for (i = 0; i < pgmap->nr_range; i++)
-               for_each_device_pfn(pfn, pgmap, i)
-                       put_page(pfn_to_page(pfn));
+               percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
        wait_for_completion(&pgmap->done);
        percpu_ref_exit(&pgmap->ref);
  
@@@ -282,8 -268,7 +268,8 @@@ static int pagemap_range(struct dev_pag
        return 0;
  
  err_add_memory:
 -      kasan_remove_zero_shadow(__va(range->start), range_len(range));
 +      if (!is_private)
 +              kasan_remove_zero_shadow(__va(range->start), range_len(range));
  err_kasan:
        untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
  err_pfn_remap:
@@@ -329,8 -314,7 +315,7 @@@ void *memremap_pages(struct dev_pagema
                }
                break;
        case MEMORY_DEVICE_FS_DAX:
-               if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
-                   IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+               if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
                        WARN(1, "File system DAX not supported\n");
                        return ERR_PTR(-EINVAL);
                }
@@@ -466,21 -450,17 +451,17 @@@ struct dev_pagemap *get_dev_pagemap(uns
  }
  EXPORT_SYMBOL_GPL(get_dev_pagemap);
  
- #ifdef CONFIG_DEV_PAGEMAP_OPS
- void free_devmap_managed_page(struct page *page)
+ void free_zone_device_page(struct page *page)
  {
-       /* notify page idle for dax */
-       if (!is_device_private_page(page)) {
-               wake_up_var(&page->_refcount);
+       if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
                return;
-       }
  
        __ClearPageWaiters(page);
  
        mem_cgroup_uncharge(page_folio(page));
  
        /*
-        * When a device_private page is freed, the page->mapping field
+        * When a device managed page is freed, the page->mapping field
         * may still contain a (stale) mapping value. For example, the
         * lower bits of page->mapping may still identify the page as an
         * anonymous page. Ultimately, this entire field is just stale
         */
        page->mapping = NULL;
        page->pgmap->ops->page_free(page);
+       /*
+        * Reset the page count to 1 to prepare for handing out the page again.
+        */
+       set_page_count(page, 1);
+ }
+ #ifdef CONFIG_FS_DAX
+ bool __put_devmap_managed_page(struct page *page)
+ {
+       if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+               return false;
+       /*
+        * fsdax page refcounts are 1-based, rather than 0-based: if
+        * refcount is 1, then the page is free and the refcount is
+        * stable because nobody holds a reference on the page.
+        */
+       if (page_ref_dec_return(page) == 1)
+               wake_up_var(&page->_refcount);
+       return true;
  }
- #endif /* CONFIG_DEV_PAGEMAP_OPS */
+ EXPORT_SYMBOL(__put_devmap_managed_page);
+ #endif /* CONFIG_FS_DAX */
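
The new __put_devmap_managed_page() above encodes the fsdax convention that a refcount of 1, not 0, means "free": the put path decrements and, when the count lands on 1, wakes anyone waiting for the page to become idle. A minimal userspace sketch of that idea using C11 atomics; wake_waiters() is a placeholder for the kernel's wake_up_var():

#include <stdatomic.h>
#include <stdio.h>

struct fsdax_page {
	atomic_int refcount;	/* 1-based: 1 means free and idle */
};

static void wake_waiters(struct fsdax_page *page)
{
	/* placeholder for wake_up_var(&page->_refcount) */
	printf("page %p is now idle\n", (void *)page);
}

static void put_fsdax_page(struct fsdax_page *page)
{
	/* fetch_sub returns the old value, so old == 2 means new count == 1 */
	if (atomic_fetch_sub(&page->refcount, 1) == 2)
		wake_waiters(page);
}

int main(void)
{
	struct fsdax_page page = { .refcount = 3 };	/* two users + base ref */

	put_fsdax_page(&page);	/* 3 -> 2: still in use */
	put_fsdax_page(&page);	/* 2 -> 1: free, waiters woken */
	return 0;
}
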
diff --combined mm/migrate.c
index bc9da3fd01aa93ef1252325a2ff2288fdcde14e4,2defe9aa4d0ee90952a067bfd00e3c94d9794d66..4f30ed37856f0ee5a066afc69d40994682f80f37
  #include <linux/hugetlb.h>
  #include <linux/hugetlb_cgroup.h>
  #include <linux/gfp.h>
- #include <linux/pagewalk.h>
  #include <linux/pfn_t.h>
  #include <linux/memremap.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/balloon_compaction.h>
- #include <linux/mmu_notifier.h>
  #include <linux/page_idle.h>
  #include <linux/page_owner.h>
  #include <linux/sched/mm.h>
@@@ -51,7 -49,6 +49,7 @@@
  #include <linux/oom.h>
  #include <linux/memory.h>
  #include <linux/random.h>
 +#include <linux/sched/sysctl.h>
  
  #include <asm/tlbflush.h>
  
@@@ -108,7 -105,7 +106,7 @@@ int isolate_movable_page(struct page *p
  
        /* Driver shouldn't use PG_isolated bit of page->flags */
        WARN_ON_ONCE(PageIsolated(page));
 -      __SetPageIsolated(page);
 +      SetPageIsolated(page);
        unlock_page(page);
  
        return 0;
@@@ -127,7 -124,7 +125,7 @@@ static void putback_movable_page(struc
  
        mapping = page_mapping(page);
        mapping->a_ops->putback_page(page);
 -      __ClearPageIsolated(page);
 +      ClearPageIsolated(page);
  }
  
  /*
@@@ -160,7 -157,7 +158,7 @@@ void putback_movable_pages(struct list_
                        if (PageMovable(page))
                                putback_movable_page(page);
                        else
 -                              __ClearPageIsolated(page);
 +                              ClearPageIsolated(page);
                        unlock_page(page);
                        put_page(page);
                } else {
  /*
   * Restore a potential migration pte to a working pte entry
   */
- static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
-                                unsigned long addr, void *old)
+ static bool remove_migration_pte(struct folio *folio,
+               struct vm_area_struct *vma, unsigned long addr, void *old)
  {
-       struct page_vma_mapped_walk pvmw = {
-               .page = old,
-               .vma = vma,
-               .address = addr,
-               .flags = PVMW_SYNC | PVMW_MIGRATION,
-       };
-       struct page *new;
-       pte_t pte;
-       swp_entry_t entry;
+       DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
  
-       VM_BUG_ON_PAGE(PageTail(page), page);
        while (page_vma_mapped_walk(&pvmw)) {
-               if (PageKsm(page))
-                       new = page;
-               else
-                       new = page - pvmw.page->index +
-                               linear_page_index(vma, pvmw.address);
+               pte_t pte;
+               swp_entry_t entry;
+               struct page *new;
+               unsigned long idx = 0;
+               /* pgoff is invalid for ksm pages, but they are never large */
+               if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+                       idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
+               new = folio_page(folio, idx);
  
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
-                       VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+                       VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
+                                       !folio_test_pmd_mappable(folio), folio);
                        remove_migration_pmd(&pvmw, new);
                        continue;
                }
  #endif
  
-               get_page(new);
+               folio_get(folio);
                pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
                if (pte_swp_soft_dirty(*pvmw.pte))
                        pte = pte_mksoft_dirty(pte);
                }
  
  #ifdef CONFIG_HUGETLB_PAGE
-               if (PageHuge(new)) {
+               if (folio_test_hugetlb(folio)) {
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
  
                        pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
-                       if (PageAnon(new))
+                       if (folio_test_anon(folio))
                                hugepage_add_anon_rmap(new, vma, pvmw.address);
                        else
                                page_dup_rmap(new, true);
                } else
  #endif
                {
-                       if (PageAnon(new))
+                       if (folio_test_anon(folio))
                                page_add_anon_rmap(new, vma, pvmw.address, false);
                        else
-                               page_add_file_rmap(new, false);
+                               page_add_file_rmap(new, vma, false);
                        set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                }
-               if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
-                       mlock_vma_page(new);
-               if (PageTransHuge(page) && PageMlocked(page))
-                       clear_page_mlock(page);
+               if (vma->vm_flags & VM_LOCKED)
+                       mlock_page_drain(smp_processor_id());
  
                /* No need to invalidate - it was non-present before */
                update_mmu_cache(vma, pvmw.address, pvmw.pte);
   * Get rid of all migration entries and replace them by
   * references to the indicated page.
   */
- void remove_migration_ptes(struct page *old, struct page *new, bool locked)
+ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
  {
        struct rmap_walk_control rwc = {
                .rmap_one = remove_migration_pte,
-               .arg = old,
+               .arg = src,
        };
  
        if (locked)
-               rmap_walk_locked(new, &rwc);
+               rmap_walk_locked(dst, &rwc);
        else
-               rmap_walk(new, &rwc);
+               rmap_walk(dst, &rwc);
  }
  
  /*
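
With the folio conversion above, remove_migration_pte() locates the right subpage by offsetting into the folio: idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff. Stripped of the kernel types, the arithmetic is "page offset of the faulting address within the mapping, minus the folio's starting page offset"; the vm_start, vm_pgoff and folio_pgoff values below are purely illustrative:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Page offset of @addr within the mapping backing the vma. */
static unsigned long linear_page_index(unsigned long vm_start,
				       unsigned long vm_pgoff,
				       unsigned long addr)
{
	return ((addr - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	unsigned long vm_start = 0x10000000UL;		/* vma start address */
	unsigned long vm_pgoff = 16;			/* vma offset in pages */
	unsigned long folio_pgoff = 20;			/* folio's first page offset */
	unsigned long addr = vm_start + 6 * 4096;	/* faulting address */

	unsigned long idx = linear_page_index(vm_start, vm_pgoff, addr) - folio_pgoff;

	printf("address maps to subpage %lu of the folio\n", idx);
	return 0;
}
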
@@@ -342,14 -332,8 +333,8 @@@ static int expected_page_refs(struct ad
  {
        int expected_count = 1;
  
-       /*
-        * Device private pages have an extra refcount as they are
-        * ZONE_DEVICE pages.
-        */
-       expected_count += is_device_private_page(page);
        if (mapping)
                expected_count += compound_nr(page) + page_has_private(page);
        return expected_count;
  }
  
@@@ -772,6 -756,7 +757,7 @@@ int buffer_migrate_page_norefs(struct a
   */
  static int writeout(struct address_space *mapping, struct page *page)
  {
+       struct folio *folio = page_folio(page);
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_NONE,
                .nr_to_write = 1,
         * At this point we know that the migration attempt cannot
         * be successful.
         */
-       remove_migration_ptes(page, page, false);
+       remove_migration_ptes(folio, folio, false);
  
        rc = mapping->a_ops->writepage(page, &wbc);
  
@@@ -884,7 -869,7 +870,7 @@@ static int move_to_new_page(struct pag
                VM_BUG_ON_PAGE(!PageIsolated(page), page);
                if (!PageMovable(page)) {
                        rc = MIGRATEPAGE_SUCCESS;
 -                      __ClearPageIsolated(page);
 +                      ClearPageIsolated(page);
                        goto out;
                }
  
                         * We clear PG_movable under page_lock so any compactor
                         * cannot try to migrate this page.
                         */
 -                      __ClearPageIsolated(page);
 +                      ClearPageIsolated(page);
                }
  
                /*
                        page->mapping = NULL;
  
                if (likely(!is_zone_device_page(newpage)))
 -                      flush_dcache_page(newpage);
 -
 +                      flush_dcache_folio(page_folio(newpage));
        }
  out:
        return rc;
  static int __unmap_and_move(struct page *page, struct page *newpage,
                                int force, enum migrate_mode mode)
  {
+       struct folio *folio = page_folio(page);
+       struct folio *dst = page_folio(newpage);
        int rc = -EAGAIN;
        bool page_was_mapped = false;
        struct anon_vma *anon_vma = NULL;
                /* Establish migration ptes */
                VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
                                page);
-               try_to_migrate(page, 0);
+               try_to_migrate(folio, 0);
                page_was_mapped = true;
        }
  
        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page, mode);
  
+       /*
+        * When successful, push newpage to LRU immediately: so that if it
+        * turns out to be an mlocked page, remove_migration_ptes() will
+        * automatically build up the correct newpage->mlock_count for it.
+        *
+        * We would like to do something similar for the old page, when
+        * unsuccessful, and other cases when a page has been temporarily
+        * isolated from the unevictable LRU: but this case is the easiest.
+        */
+       if (rc == MIGRATEPAGE_SUCCESS) {
+               lru_cache_add(newpage);
+               if (page_was_mapped)
+                       lru_add_drain();
+       }
        if (page_was_mapped)
-               remove_migration_ptes(page,
-                       rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
+               remove_migration_ptes(folio,
+                       rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
  
  out_unlock_both:
        unlock_page(newpage);
@@@ -1050,20 -1053,12 +1053,12 @@@ out_unlock
        unlock_page(page);
  out:
        /*
-        * If migration is successful, decrease refcount of the newpage
+        * If migration is successful, decrease refcount of the newpage,
         * which will not free the page because new page owner increased
-        * refcounter. As well, if it is LRU page, add the page to LRU
-        * list in here. Use the old state of the isolated source page to
-        * determine if we migrated a LRU page. newpage was already unlocked
-        * and possibly modified by its owner - don't rely on the page
-        * state.
+        * refcounter.
         */
-       if (rc == MIGRATEPAGE_SUCCESS) {
-               if (unlikely(!is_lru))
-                       put_page(newpage);
-               else
-                       putback_lru_page(newpage);
-       }
+       if (rc == MIGRATEPAGE_SUCCESS)
+               put_page(newpage);
  
        return rc;
  }
@@@ -1092,7 -1087,7 +1087,7 @@@ static int unmap_and_move(new_page_t ge
                if (unlikely(__PageMovable(page))) {
                        lock_page(page);
                        if (!PageMovable(page))
 -                              __ClearPageIsolated(page);
 +                              ClearPageIsolated(page);
                        unlock_page(page);
                }
                goto out;
@@@ -1173,6 -1168,7 +1168,7 @@@ static int unmap_and_move_huge_page(new
                                enum migrate_mode mode, int reason,
                                struct list_head *ret)
  {
+       struct folio *dst, *src = page_folio(hpage);
        int rc = -EAGAIN;
        int page_was_mapped = 0;
        struct page *new_hpage;
        new_hpage = get_new_page(hpage, private);
        if (!new_hpage)
                return -ENOMEM;
+       dst = page_folio(new_hpage);
  
        if (!trylock_page(hpage)) {
                if (!force)
                        ttu |= TTU_RMAP_LOCKED;
                }
  
-               try_to_migrate(hpage, ttu);
+               try_to_migrate(src, ttu);
                page_was_mapped = 1;
  
                if (mapping_locked)
                rc = move_to_new_page(new_hpage, hpage, mode);
  
        if (page_was_mapped)
-               remove_migration_ptes(hpage,
-                       rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
+               remove_migration_ptes(src,
+                       rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
  
  unlock_put_anon:
        unlock_page(new_hpage);
@@@ -1351,6 -1348,7 +1348,6 @@@ int migrate_pages(struct list_head *fro
        bool is_thp = false;
        struct page *page;
        struct page *page2;
 -      int swapwrite = current->flags & PF_SWAPWRITE;
        int rc, nr_subpages;
        LIST_HEAD(ret_pages);
        LIST_HEAD(thp_split_pages);
  
        trace_mm_migrate_pages_start(mode, reason);
  
 -      if (!swapwrite)
 -              current->flags |= PF_SWAPWRITE;
 -
  thp_subpage_migration:
        for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
                retry = 0;
@@@ -1513,6 -1514,9 +1510,6 @@@ out
        trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
                               nr_thp_failed, nr_thp_split, mode, reason);
  
 -      if (!swapwrite)
 -              current->flags &= ~PF_SWAPWRITE;
 -
        if (ret_succeeded)
                *ret_succeeded = nr_succeeded;
  
@@@ -1605,6 -1609,7 +1602,6 @@@ static int add_page_for_migration(struc
  {
        struct vm_area_struct *vma;
        struct page *page;
 -      unsigned int follflags;
        int err;
  
        mmap_read_lock(mm);
                goto out;
  
        /* FOLL_DUMP to ignore special (like zero) pages */
 -      follflags = FOLL_GET | FOLL_DUMP;
 -      page = follow_page(vma, addr, follflags);
 +      page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
  
        err = PTR_ERR(page);
        if (IS_ERR(page))
@@@ -1752,13 -1758,6 +1749,13 @@@ static int do_pages_move(struct mm_stru
                        continue;
                }
  
 +              /*
 +               * The move_pages() man page does not list -EEXIST as a
 +               * possible error, so use -EFAULT instead.
 +               */
 +              if (err == -EEXIST)
 +                      err = -EFAULT;
 +
                /*
                 * If the page is already on the target node (!err), store the
                 * node, otherwise, store the err.
@@@ -2032,27 -2031,16 +2029,27 @@@ static int numamigrate_isolate_page(pg_
  {
        int page_lru;
        int nr_pages = thp_nr_pages(page);
 +      int order = compound_order(page);
  
 -      VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 +      VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
  
        /* Do not migrate THP mapped by multiple processes */
        if (PageTransHuge(page) && total_mapcount(page) > 1)
                return 0;
  
        /* Avoid migrating to a node that is nearly full */
 -      if (!migrate_balanced_pgdat(pgdat, nr_pages))
 +      if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
 +              int z;
 +
 +              if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
 +                      return 0;
 +              for (z = pgdat->nr_zones - 1; z >= 0; z--) {
 +                      if (populated_zone(pgdat->node_zones + z))
 +                              break;
 +              }
 +              wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
                return 0;
 +      }
  
        if (isolate_lru_page(page))
                return 0;
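
When the target node is nearly full and memory-tiering NUMA balancing is enabled, the hunk above scans the node's zones from the top down for the first populated one and wakes kswapd on it. The scan itself is a simple downward search; a standalone sketch where populated[] and MAX_NR_ZONES are invented placeholders for the pgdat zone array:

#include <stdbool.h>
#include <stdio.h>

#define MAX_NR_ZONES 4	/* e.g. DMA, DMA32, NORMAL, MOVABLE (illustrative) */

/* Return the index of the highest populated zone, or -1 if none. */
static int highest_populated_zone(const bool populated[], int nr_zones)
{
	for (int z = nr_zones - 1; z >= 0; z--)
		if (populated[z])
			return z;
	return -1;
}

int main(void)
{
	/* DMA and NORMAL populated, DMA32 and MOVABLE empty on this fake node */
	bool populated[MAX_NR_ZONES] = { true, false, true, false };

	int z = highest_populated_zone(populated, MAX_NR_ZONES);
	if (z >= 0)
		printf("would wake kswapd on zone %d\n", z);
	return 0;
}
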
@@@ -2081,7 -2069,6 +2078,7 @@@ int migrate_misplaced_page(struct page 
        pg_data_t *pgdat = NODE_DATA(node);
        int isolated;
        int nr_remaining;
 +      unsigned int nr_succeeded;
        LIST_HEAD(migratepages);
        new_page_t *new;
        bool compound;
  
        list_add(&page->lru, &migratepages);
        nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
 -                                   MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
 +                                   MIGRATE_ASYNC, MR_NUMA_MISPLACED,
 +                                   &nr_succeeded);
        if (nr_remaining) {
                if (!list_empty(&migratepages)) {
                        list_del(&page->lru);
                        putback_lru_page(page);
                }
                isolated = 0;
 -      } else
 -              count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
 +      }
 +      if (nr_succeeded) {
 +              count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
 +              if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
 +                      mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
 +                                          nr_succeeded);
 +      }
        BUG_ON(!list_empty(&migratepages));
        return isolated;
  
  #endif /* CONFIG_NUMA_BALANCING */
  #endif /* CONFIG_NUMA */
  
- #ifdef CONFIG_DEVICE_PRIVATE
- static int migrate_vma_collect_skip(unsigned long start,
-                                   unsigned long end,
-                                   struct mm_walk *walk)
- {
-       struct migrate_vma *migrate = walk->private;
-       unsigned long addr;
-       for (addr = start; addr < end; addr += PAGE_SIZE) {
-               migrate->dst[migrate->npages] = 0;
-               migrate->src[migrate->npages++] = 0;
-       }
-       return 0;
- }
- static int migrate_vma_collect_hole(unsigned long start,
-                                   unsigned long end,
-                                   __always_unused int depth,
-                                   struct mm_walk *walk)
- {
-       struct migrate_vma *migrate = walk->private;
-       unsigned long addr;
-       /* Only allow populating anonymous memory. */
-       if (!vma_is_anonymous(walk->vma))
-               return migrate_vma_collect_skip(start, end, walk);
-       for (addr = start; addr < end; addr += PAGE_SIZE) {
-               migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
-               migrate->dst[migrate->npages] = 0;
-               migrate->npages++;
-               migrate->cpages++;
-       }
-       return 0;
- }
- static int migrate_vma_collect_pmd(pmd_t *pmdp,
-                                  unsigned long start,
-                                  unsigned long end,
-                                  struct mm_walk *walk)
- {
-       struct migrate_vma *migrate = walk->private;
-       struct vm_area_struct *vma = walk->vma;
-       struct mm_struct *mm = vma->vm_mm;
-       unsigned long addr = start, unmapped = 0;
-       spinlock_t *ptl;
-       pte_t *ptep;
- again:
-       if (pmd_none(*pmdp))
-               return migrate_vma_collect_hole(start, end, -1, walk);
-       if (pmd_trans_huge(*pmdp)) {
-               struct page *page;
-               ptl = pmd_lock(mm, pmdp);
-               if (unlikely(!pmd_trans_huge(*pmdp))) {
-                       spin_unlock(ptl);
-                       goto again;
-               }
-               page = pmd_page(*pmdp);
-               if (is_huge_zero_page(page)) {
-                       spin_unlock(ptl);
-                       split_huge_pmd(vma, pmdp, addr);
-                       if (pmd_trans_unstable(pmdp))
-                               return migrate_vma_collect_skip(start, end,
-                                                               walk);
-               } else {
-                       int ret;
-                       get_page(page);
-                       spin_unlock(ptl);
-                       if (unlikely(!trylock_page(page)))
-                               return migrate_vma_collect_skip(start, end,
-                                                               walk);
-                       ret = split_huge_page(page);
-                       unlock_page(page);
-                       put_page(page);
-                       if (ret)
-                               return migrate_vma_collect_skip(start, end,
-                                                               walk);
-                       if (pmd_none(*pmdp))
-                               return migrate_vma_collect_hole(start, end, -1,
-                                                               walk);
-               }
-       }
-       if (unlikely(pmd_bad(*pmdp)))
-               return migrate_vma_collect_skip(start, end, walk);
-       ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-       arch_enter_lazy_mmu_mode();
-       for (; addr < end; addr += PAGE_SIZE, ptep++) {
-               unsigned long mpfn = 0, pfn;
-               struct page *page;
-               swp_entry_t entry;
-               pte_t pte;
-               pte = *ptep;
-               if (pte_none(pte)) {
-                       if (vma_is_anonymous(vma)) {
-                               mpfn = MIGRATE_PFN_MIGRATE;
-                               migrate->cpages++;
-                       }
-                       goto next;
-               }
-               if (!pte_present(pte)) {
-                       /*
-                        * Only care about unaddressable device page special
-                        * page table entry. Other special swap entries are not
-                        * migratable, and we ignore regular swapped page.
-                        */
-                       entry = pte_to_swp_entry(pte);
-                       if (!is_device_private_entry(entry))
-                               goto next;
-                       page = pfn_swap_entry_to_page(entry);
-                       if (!(migrate->flags &
-                               MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
-                           page->pgmap->owner != migrate->pgmap_owner)
-                               goto next;
-                       mpfn = migrate_pfn(page_to_pfn(page)) |
-                                       MIGRATE_PFN_MIGRATE;
-                       if (is_writable_device_private_entry(entry))
-                               mpfn |= MIGRATE_PFN_WRITE;
-               } else {
-                       if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-                               goto next;
-                       pfn = pte_pfn(pte);
-                       if (is_zero_pfn(pfn)) {
-                               mpfn = MIGRATE_PFN_MIGRATE;
-                               migrate->cpages++;
-                               goto next;
-                       }
-                       page = vm_normal_page(migrate->vma, addr, pte);
-                       mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
-                       mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
-               }
-               /* FIXME support THP */
-               if (!page || !page->mapping || PageTransCompound(page)) {
-                       mpfn = 0;
-                       goto next;
-               }
-               /*
-                * By getting a reference on the page we pin it and that blocks
-                * any kind of migration. Side effect is that it "freezes" the
-                * pte.
-                *
-                * We drop this reference after isolating the page from the lru
-                * for non device page (device page are not on the lru and thus
-                * can't be dropped from it).
-                */
-               get_page(page);
-               /*
-                * Optimize for the common case where page is only mapped once
-                * in one process. If we can lock the page, then we can safely
-                * set up a special migration page table entry now.
-                */
-               if (trylock_page(page)) {
-                       pte_t swp_pte;
-                       migrate->cpages++;
-                       ptep_get_and_clear(mm, addr, ptep);
-                       /* Setup special migration page table entry */
-                       if (mpfn & MIGRATE_PFN_WRITE)
-                               entry = make_writable_migration_entry(
-                                                       page_to_pfn(page));
-                       else
-                               entry = make_readable_migration_entry(
-                                                       page_to_pfn(page));
-                       swp_pte = swp_entry_to_pte(entry);
-                       if (pte_present(pte)) {
-                               if (pte_soft_dirty(pte))
-                                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                               if (pte_uffd_wp(pte))
-                                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
-                       } else {
-                               if (pte_swp_soft_dirty(pte))
-                                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                               if (pte_swp_uffd_wp(pte))
-                                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
-                       }
-                       set_pte_at(mm, addr, ptep, swp_pte);
-                       /*
-                        * This is like regular unmap: we remove the rmap and
-                        * drop page refcount. Page won't be freed, as we took
-                        * a reference just above.
-                        */
-                       page_remove_rmap(page, false);
-                       put_page(page);
-                       if (pte_present(pte))
-                               unmapped++;
-               } else {
-                       put_page(page);
-                       mpfn = 0;
-               }
- next:
-               migrate->dst[migrate->npages] = 0;
-               migrate->src[migrate->npages++] = mpfn;
-       }
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(ptep - 1, ptl);
-       /* Only flush the TLB if we actually modified any entries */
-       if (unmapped)
-               flush_tlb_range(walk->vma, start, end);
-       return 0;
- }
- static const struct mm_walk_ops migrate_vma_walk_ops = {
-       .pmd_entry              = migrate_vma_collect_pmd,
-       .pte_hole               = migrate_vma_collect_hole,
- };
- /*
-  * migrate_vma_collect() - collect pages over a range of virtual addresses
-  * @migrate: migrate struct containing all migration information
-  *
-  * This will walk the CPU page table. For each virtual address backed by a
-  * valid page, it updates the src array and takes a reference on the page, in
-  * order to pin the page until we lock it and unmap it.
-  */
- static void migrate_vma_collect(struct migrate_vma *migrate)
- {
-       struct mmu_notifier_range range;
-       /*
-        * Note that the pgmap_owner is passed to the mmu notifier callback so
-        * that the registered device driver can skip invalidating device
-        * private page mappings that won't be migrated.
-        */
-       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
-               migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
-               migrate->pgmap_owner);
-       mmu_notifier_invalidate_range_start(&range);
-       walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
-                       &migrate_vma_walk_ops, migrate);
-       mmu_notifier_invalidate_range_end(&range);
-       migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
- }
- /*
-  * migrate_vma_check_page() - check if page is pinned or not
-  * @page: struct page to check
-  *
-  * Pinned pages cannot be migrated. This is the same test as in
-  * folio_migrate_mapping(), except that here we allow migration of a
-  * ZONE_DEVICE page.
-  */
- static bool migrate_vma_check_page(struct page *page)
- {
-       /*
-        * One extra ref because caller holds an extra reference, either from
-        * isolate_lru_page() for a regular page, or migrate_vma_collect() for
-        * a device page.
-        */
-       int extra = 1;
-       /*
-        * FIXME support THP (transparent huge page), it is bit more complex to
-        * check them than regular pages, because they can be mapped with a pmd
-        * or with a pte (split pte mapping).
-        */
-       if (PageCompound(page))
-               return false;
-       /* Page from ZONE_DEVICE have one extra reference */
-       if (is_zone_device_page(page))
-               extra++;
-       /* For file back page */
-       if (page_mapping(page))
-               extra += 1 + page_has_private(page);
-       if ((page_count(page) - extra) > page_mapcount(page))
-               return false;
-       return true;
- }
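
As a worked example of the check above, for the common non-THP case: a
file-backed page with no private data contributes extra = 1 (the caller's
reference) + 1 (the page cache mapping) = 2, so a page mapped by a single pte
is expected to show page_count() == page_mapcount() + 2, which passes the
test.  One additional reference, for instance a get_user_pages() pin, pushes
page_count() - extra above page_mapcount() and the page is reported as pinned
and is not migrated.
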
- /*
-  * migrate_vma_unmap() - replace page mapping with special migration pte entry
-  * @migrate: migrate struct containing all migration information
-  *
-  * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
-  * special migration pte entry and check if it has been pinned. Pinned pages are
-  * restored because we cannot migrate them.
-  *
-  * This is the last step before we call the device driver callback to allocate
-  * destination memory and copy contents of original page over to new page.
-  */
- static void migrate_vma_unmap(struct migrate_vma *migrate)
- {
-       const unsigned long npages = migrate->npages;
-       unsigned long i, restore = 0;
-       bool allow_drain = true;
-       lru_add_drain();
-       for (i = 0; i < npages; i++) {
-               struct page *page = migrate_pfn_to_page(migrate->src[i]);
-               if (!page)
-                       continue;
-               /* ZONE_DEVICE pages are not on LRU */
-               if (!is_zone_device_page(page)) {
-                       if (!PageLRU(page) && allow_drain) {
-                               /* Drain CPU's pagevec */
-                               lru_add_drain_all();
-                               allow_drain = false;
-                       }
-                       if (isolate_lru_page(page)) {
-                               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                               migrate->cpages--;
-                               restore++;
-                               continue;
-                       }
-                       /* Drop the reference we took in collect */
-                       put_page(page);
-               }
-               if (page_mapped(page))
-                       try_to_migrate(page, 0);
-               if (page_mapped(page) || !migrate_vma_check_page(page)) {
-                       if (!is_zone_device_page(page)) {
-                               get_page(page);
-                               putback_lru_page(page);
-                       }
-                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                       migrate->cpages--;
-                       restore++;
-                       continue;
-               }
-       }
-       for (i = 0; i < npages && restore; i++) {
-               struct page *page = migrate_pfn_to_page(migrate->src[i]);
-               if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
-                       continue;
-               remove_migration_ptes(page, page, false);
-               migrate->src[i] = 0;
-               unlock_page(page);
-               put_page(page);
-               restore--;
-       }
- }
- /**
-  * migrate_vma_setup() - prepare to migrate a range of memory
-  * @args: contains the vma, start, and pfns arrays for the migration
-  *
-  * Returns: negative errno on failures, 0 when 0 or more pages were migrated
-  * without an error.
-  *
-  * Prepare to migrate a range of memory virtual address range by collecting all
-  * the pages backing each virtual address in the range, saving them inside the
-  * src array.  Then lock those pages and unmap them. Once the pages are locked
-  * and unmapped, check whether each page is pinned or not.  Pages that aren't
-  * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
-  * corresponding src array entry.  Then restores any pages that are pinned, by
-  * remapping and unlocking those pages.
-  *
-  * The caller should then allocate destination memory and copy source memory to
-  * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
-  * flag set).  Once these are allocated and copied, the caller must update each
-  * corresponding entry in the dst array with the pfn value of the destination
-  * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
-  * lock_page().
-  *
-  * Note that the caller does not have to migrate all the pages that are marked
-  * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
-  * device memory to system memory.  If the caller cannot migrate a device page
-  * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
-  * consequences for the userspace process, so it must be avoided if at all
-  * possible.
-  *
-  * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
-  * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
-  * allowing the caller to allocate device memory for those unbacked virtual
-  * addresses.  For this the caller simply has to allocate device memory and
-  * properly set the destination entry like for regular migration.  Note that
-  * this can still fail, and thus inside the device driver you must check if the
-  * migration was successful for those entries after calling migrate_vma_pages(),
-  * just like for regular migration.
-  *
-  * After that, the callers must call migrate_vma_pages() to go over each entry
-  * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
-  * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
-  * then migrate_vma_pages() to migrate struct page information from the source
-  * struct page to the destination struct page.  If it fails to migrate the
-  * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
-  * src array.
-  *
-  * At this point all successfully migrated pages have an entry in the src
-  * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
-  * array entry with MIGRATE_PFN_VALID flag set.
-  *
-  * Once migrate_vma_pages() returns the caller may inspect which pages were
-  * successfully migrated, and which were not.  Successfully migrated pages will
-  * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
-  *
-  * It is safe to update device page table after migrate_vma_pages() because
-  * both destination and source page are still locked, and the mmap_lock is held
-  * in read mode (hence no one can unmap the range being migrated).
-  *
-  * Once the caller is done cleaning up things and updating its page table (if it
-  * chose to do so, this is not an obligation) it finally calls
-  * migrate_vma_finalize() to update the CPU page table to point to new pages
-  * for successfully migrated pages or otherwise restore the CPU page table to
-  * point to the original source pages.
-  */
- int migrate_vma_setup(struct migrate_vma *args)
- {
-       long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
-       args->start &= PAGE_MASK;
-       args->end &= PAGE_MASK;
-       if (!args->vma || is_vm_hugetlb_page(args->vma) ||
-           (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
-               return -EINVAL;
-       if (nr_pages <= 0)
-               return -EINVAL;
-       if (args->start < args->vma->vm_start ||
-           args->start >= args->vma->vm_end)
-               return -EINVAL;
-       if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
-               return -EINVAL;
-       if (!args->src || !args->dst)
-               return -EINVAL;
-       memset(args->src, 0, sizeof(*args->src) * nr_pages);
-       args->cpages = 0;
-       args->npages = 0;
-       migrate_vma_collect(args);
-       if (args->cpages)
-               migrate_vma_unmap(args);
-       /*
-        * At this point pages are locked and unmapped, and thus they have
-        * stable content and can safely be copied to destination memory that
-        * is allocated by the drivers.
-        */
-       return 0;
- }
- EXPORT_SYMBOL(migrate_vma_setup);
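
The call sequence described in the comment above is easiest to see from the
driver side.  Below is a minimal, illustrative sketch only, not buildable on
its own: example_alloc_device_page() and example_copy_to_device() are made-up
stand-ins for driver-specific allocation and copying, and the fixed-size pfn
arrays are just for brevity.

static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end,
				 void *pgmap_owner)
{
	/* A real driver sizes these to (end - start) >> PAGE_SHIFT. */
	unsigned long src_pfns[64] = { 0 }, dst_pfns[64] = { 0 };
	struct migrate_vma args = {
		.vma		= vma,
		.start		= start,
		.end		= end,
		.src		= src_pfns,
		.dst		= dst_pfns,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	unsigned long i;
	int ret;

	if (((end - start) >> PAGE_SHIFT) > 64)
		return -EINVAL;

	ret = migrate_vma_setup(&args);		/* collect, lock and unmap */
	if (ret)
		return ret;

	for (i = 0; i < args.npages; i++) {
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = example_alloc_device_page();	/* hypothetical */
		if (!dpage)
			continue;	/* skipping a system-memory entry is allowed */
		lock_page(dpage);	/* destination pages must be locked */
		example_copy_to_device(dpage,		/* hypothetical */
				       migrate_pfn_to_page(args.src[i]));
		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
			      MIGRATE_PFN_VALID;
	}

	migrate_vma_pages(&args);	/* move struct page metadata */
	/*
	 * ... update device page tables for entries whose src still has
	 * MIGRATE_PFN_MIGRATE set ...
	 */
	migrate_vma_finalize(&args);	/* restore the CPU page table */
	return 0;
}
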
- /*
-  * This code closely matches the code in:
-  *   __handle_mm_fault()
-  *     handle_pte_fault()
-  *       do_anonymous_page()
-  * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
-  * private page.
-  */
- static void migrate_vma_insert_page(struct migrate_vma *migrate,
-                                   unsigned long addr,
-                                   struct page *page,
-                                   unsigned long *src)
- {
-       struct vm_area_struct *vma = migrate->vma;
-       struct mm_struct *mm = vma->vm_mm;
-       bool flush = false;
-       spinlock_t *ptl;
-       pte_t entry;
-       pgd_t *pgdp;
-       p4d_t *p4dp;
-       pud_t *pudp;
-       pmd_t *pmdp;
-       pte_t *ptep;
-       /* Only allow populating anonymous memory */
-       if (!vma_is_anonymous(vma))
-               goto abort;
-       pgdp = pgd_offset(mm, addr);
-       p4dp = p4d_alloc(mm, pgdp, addr);
-       if (!p4dp)
-               goto abort;
-       pudp = pud_alloc(mm, p4dp, addr);
-       if (!pudp)
-               goto abort;
-       pmdp = pmd_alloc(mm, pudp, addr);
-       if (!pmdp)
-               goto abort;
-       if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
-               goto abort;
-       /*
-        * Use pte_alloc() instead of pte_alloc_map().  We can't run
-        * pte_offset_map() on pmds where a huge pmd might be created
-        * from a different thread.
-        *
-        * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
-        * parallel threads are excluded by other means.
-        *
-        * Here we only have mmap_read_lock(mm).
-        */
-       if (pte_alloc(mm, pmdp))
-               goto abort;
-       /* See the comment in pte_alloc_one_map() */
-       if (unlikely(pmd_trans_unstable(pmdp)))
-               goto abort;
-       if (unlikely(anon_vma_prepare(vma)))
-               goto abort;
-       if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
-               goto abort;
-       /*
-        * The memory barrier inside __SetPageUptodate makes sure that
-        * preceding stores to the page contents become visible before
-        * the set_pte_at() write.
-        */
-       __SetPageUptodate(page);
-       if (is_zone_device_page(page)) {
-               if (is_device_private_page(page)) {
-                       swp_entry_t swp_entry;
-                       if (vma->vm_flags & VM_WRITE)
-                               swp_entry = make_writable_device_private_entry(
-                                                       page_to_pfn(page));
-                       else
-                               swp_entry = make_readable_device_private_entry(
-                                                       page_to_pfn(page));
-                       entry = swp_entry_to_pte(swp_entry);
-               } else {
-                       /*
-                        * For now we only support migrating to un-addressable
-                        * device memory.
-                        */
-                       pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
-                       goto abort;
-               }
-       } else {
-               entry = mk_pte(page, vma->vm_page_prot);
-               if (vma->vm_flags & VM_WRITE)
-                       entry = pte_mkwrite(pte_mkdirty(entry));
-       }
-       ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-       if (check_stable_address_space(mm))
-               goto unlock_abort;
-       if (pte_present(*ptep)) {
-               unsigned long pfn = pte_pfn(*ptep);
-               if (!is_zero_pfn(pfn))
-                       goto unlock_abort;
-               flush = true;
-       } else if (!pte_none(*ptep))
-               goto unlock_abort;
-       /*
-        * Check for userfaultfd but do not deliver the fault. Instead,
-        * just back off.
-        */
-       if (userfaultfd_missing(vma))
-               goto unlock_abort;
-       inc_mm_counter(mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, addr, false);
-       if (!is_zone_device_page(page))
-               lru_cache_add_inactive_or_unevictable(page, vma);
-       get_page(page);
-       if (flush) {
-               flush_cache_page(vma, addr, pte_pfn(*ptep));
-               ptep_clear_flush_notify(vma, addr, ptep);
-               set_pte_at_notify(mm, addr, ptep, entry);
-               update_mmu_cache(vma, addr, ptep);
-       } else {
-               /* No need to invalidate - it was non-present before */
-               set_pte_at(mm, addr, ptep, entry);
-               update_mmu_cache(vma, addr, ptep);
-       }
-       pte_unmap_unlock(ptep, ptl);
-       *src = MIGRATE_PFN_MIGRATE;
-       return;
- unlock_abort:
-       pte_unmap_unlock(ptep, ptl);
- abort:
-       *src &= ~MIGRATE_PFN_MIGRATE;
- }
- /**
-  * migrate_vma_pages() - migrate meta-data from src page to dst page
-  * @migrate: migrate struct containing all migration information
-  *
-  * This migrates struct page meta-data from source struct page to destination
-  * struct page. This effectively finishes the migration from source page to the
-  * destination page.
-  */
- void migrate_vma_pages(struct migrate_vma *migrate)
- {
-       const unsigned long npages = migrate->npages;
-       const unsigned long start = migrate->start;
-       struct mmu_notifier_range range;
-       unsigned long addr, i;
-       bool notified = false;
-       for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
-               struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
-               struct page *page = migrate_pfn_to_page(migrate->src[i]);
-               struct address_space *mapping;
-               int r;
-               if (!newpage) {
-                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                       continue;
-               }
-               if (!page) {
-                       if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
-                               continue;
-                       if (!notified) {
-                               notified = true;
-                               mmu_notifier_range_init_owner(&range,
-                                       MMU_NOTIFY_MIGRATE, 0, migrate->vma,
-                                       migrate->vma->vm_mm, addr, migrate->end,
-                                       migrate->pgmap_owner);
-                               mmu_notifier_invalidate_range_start(&range);
-                       }
-                       migrate_vma_insert_page(migrate, addr, newpage,
-                                               &migrate->src[i]);
-                       continue;
-               }
-               mapping = page_mapping(page);
-               if (is_zone_device_page(newpage)) {
-                       if (is_device_private_page(newpage)) {
-                               /*
-                                * For now only support private anonymous when
-                                * migrating to un-addressable device memory.
-                                */
-                               if (mapping) {
-                                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                                       continue;
-                               }
-                       } else {
-                               /*
-                                * Other types of ZONE_DEVICE page are not
-                                * supported.
-                                */
-                               migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-                               continue;
-                       }
-               }
-               r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
-               if (r != MIGRATEPAGE_SUCCESS)
-                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
-       }
-       /*
-        * No need to double call mmu_notifier->invalidate_range() callback as
-        * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
-        * did already call it.
-        */
-       if (notified)
-               mmu_notifier_invalidate_range_only_end(&range);
- }
- EXPORT_SYMBOL(migrate_vma_pages);
- /**
-  * migrate_vma_finalize() - restore CPU page table entry
-  * @migrate: migrate struct containing all migration information
-  *
-  * This replaces the special migration pte entry with either a mapping to the
-  * new page if migration was successful for that page, or to the original page
-  * otherwise.
-  *
-  * This also unlocks the pages and puts them back on the lru, or drops the extra
-  * refcount, for device pages.
-  */
- void migrate_vma_finalize(struct migrate_vma *migrate)
- {
-       const unsigned long npages = migrate->npages;
-       unsigned long i;
-       for (i = 0; i < npages; i++) {
-               struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
-               struct page *page = migrate_pfn_to_page(migrate->src[i]);
-               if (!page) {
-                       if (newpage) {
-                               unlock_page(newpage);
-                               put_page(newpage);
-                       }
-                       continue;
-               }
-               if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
-                       if (newpage) {
-                               unlock_page(newpage);
-                               put_page(newpage);
-                       }
-                       newpage = page;
-               }
-               remove_migration_ptes(page, newpage, false);
-               unlock_page(page);
-               if (is_zone_device_page(page))
-                       put_page(page);
-               else
-                       putback_lru_page(page);
-               if (newpage != page) {
-                       unlock_page(newpage);
-                       if (is_zone_device_page(newpage))
-                               put_page(newpage);
-                       else
-                               putback_lru_page(newpage);
-               }
-       }
- }
- EXPORT_SYMBOL(migrate_vma_finalize);
- #endif /* CONFIG_DEVICE_PRIVATE */
  /*
   * node_demotion[] example:
   *
@@@ -3098,21 -2324,18 +2340,21 @@@ static int establish_migrate_target(in
        if (best_distance != -1) {
                val = node_distance(node, migration_target);
                if (val > best_distance)
 -                      return NUMA_NO_NODE;
 +                      goto out_clear;
        }
  
        index = nd->nr;
        if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
                      "Exceeds maximum demotion target nodes\n"))
 -              return NUMA_NO_NODE;
 +              goto out_clear;
  
        nd->nodes[index] = migration_target;
        nd->nr++;
  
        return migration_target;
 +out_clear:
 +      node_clear(migration_target, *used);
 +      return NUMA_NO_NODE;
  }
  
  /*
@@@ -3209,7 -2432,7 +2451,7 @@@ again
  /*
   * For callers that do not hold get_online_mems() already.
   */
 -static void set_migration_target_nodes(void)
 +void set_migration_target_nodes(void)
  {
        get_online_mems();
        __set_migration_target_nodes();
@@@ -3273,24 -2496,51 +2515,24 @@@ static int __meminit migrate_on_reclaim
        return notifier_from_errno(0);
  }
  
 -/*
 - * React to hotplug events that might affect the migration targets
 - * like events that online or offline NUMA nodes.
 - *
 - * The ordering is also currently dependent on which nodes have
 - * CPUs.  That means we need CPU on/offline notification too.
 - */
 -static int migration_online_cpu(unsigned int cpu)
 -{
 -      set_migration_target_nodes();
 -      return 0;
 -}
 -
 -static int migration_offline_cpu(unsigned int cpu)
 -{
 -      set_migration_target_nodes();
 -      return 0;
 -}
 -
 -static int __init migrate_on_reclaim_init(void)
 +void __init migrate_on_reclaim_init(void)
  {
 -      int ret;
 -
        node_demotion = kmalloc_array(nr_node_ids,
                                      sizeof(struct demotion_nodes),
                                      GFP_KERNEL);
        WARN_ON(!node_demotion);
  
 -      ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
 -                                      NULL, migration_offline_cpu);
 +      hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
        /*
 -       * In the unlikely case that this fails, the automatic
 -       * migration targets may become suboptimal for nodes
 -       * where N_CPU changes.  With such a small impact in a
 -       * rare case, do not bother trying to do anything special.
 +       * At this point, all numa nodes with memory/CPUs have their state
 +       * properly set, so we can build the demotion order now.
 +       * Let us hold the cpu_hotplug lock just in case, as we could possibly have
 +       * CPU hotplug events during boot.
         */
 -      WARN_ON(ret < 0);
 -      ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
 -                              migration_online_cpu, NULL);
 -      WARN_ON(ret < 0);
 -
 -      hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
 -      return 0;
 +      cpus_read_lock();
 +      set_migration_target_nodes();
 +      cpus_read_unlock();
  }
 -late_initcall(migrate_on_reclaim_init);
  #endif /* CONFIG_HOTPLUG_CPU */
  
  bool numa_demotion_enabled = false;
diff --combined mm/mlock.c
index 349e2cda8c50caf50a176d903cc5e13a10b52032,833d482746d9284d305d1c7ce6b843aed16d42b9..efd2dd2943decfb384c36d2ab1d4a9c1cf152a66
@@@ -14,6 -14,7 +14,7 @@@
  #include <linux/swapops.h>
  #include <linux/pagemap.h>
  #include <linux/pagevec.h>
+ #include <linux/pagewalk.h>
  #include <linux/mempolicy.h>
  #include <linux/syscalls.h>
  #include <linux/sched.h>
@@@ -27,6 -28,8 +28,8 @@@
  
  #include "internal.h"
  
+ static DEFINE_PER_CPU(struct pagevec, mlock_pvec);
  bool can_do_mlock(void)
  {
        if (rlimit(RLIMIT_MEMLOCK) != 0)
@@@ -46,441 -49,320 +49,320 @@@ EXPORT_SYMBOL(can_do_mlock)
   * be placed on the LRU "unevictable" list, rather than the [in]active lists.
   * The unevictable list is an LRU sibling list to the [in]active lists.
   * PageUnevictable is set to indicate the unevictable state.
-  *
-  * When lazy mlocking via vmscan, it is important to ensure that the
-  * vma's VM_LOCKED status is not concurrently being modified, otherwise we
-  * may have mlocked a page that is being munlocked. So lazy mlock must take
-  * the mmap_lock for read, and verify that the vma really is locked
-  * (see mm/rmap.c).
   */
  
- /*
-  *  LRU accounting for clear_page_mlock()
-  */
- void clear_page_mlock(struct page *page)
+ static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec)
  {
-       int nr_pages;
+       /* There is nothing more we can do while it's off LRU */
+       if (!TestClearPageLRU(page))
+               return lruvec;
  
-       if (!TestClearPageMlocked(page))
-               return;
+       lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
  
-       nr_pages = thp_nr_pages(page);
-       mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-       count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
-       /*
-        * The previous TestClearPageMlocked() corresponds to the smp_mb()
-        * in __pagevec_lru_add_fn().
-        *
-        * See __pagevec_lru_add_fn for more explanation.
-        */
-       if (!isolate_lru_page(page)) {
-               putback_lru_page(page);
-       } else {
+       if (unlikely(page_evictable(page))) {
                /*
-                * We lost the race. the page already moved to evictable list.
+                * This is a little surprising, but quite possible:
+                * PageMlocked must have got cleared already by another CPU.
+                * Could this page be on the Unevictable LRU?  I'm not sure,
+                * but move it now if so.
                 */
-               if (PageUnevictable(page))
-                       count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+               if (PageUnevictable(page)) {
+                       del_page_from_lru_list(page, lruvec);
+                       ClearPageUnevictable(page);
+                       add_page_to_lru_list(page, lruvec);
+                       __count_vm_events(UNEVICTABLE_PGRESCUED,
+                                         thp_nr_pages(page));
+               }
+               goto out;
        }
+       if (PageUnevictable(page)) {
+               if (PageMlocked(page))
+                       page->mlock_count++;
+               goto out;
+       }
+       del_page_from_lru_list(page, lruvec);
+       ClearPageActive(page);
+       SetPageUnevictable(page);
+       page->mlock_count = !!PageMlocked(page);
+       add_page_to_lru_list(page, lruvec);
+       __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+ out:
+       SetPageLRU(page);
+       return lruvec;
  }
  
- /*
-  * Mark page as mlocked if not already.
-  * If page on LRU, isolate and putback to move to unevictable list.
-  */
- void mlock_vma_page(struct page *page)
+ static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
  {
-       /* Serialize with page migration */
-       BUG_ON(!PageLocked(page));
+       VM_BUG_ON_PAGE(PageLRU(page), page);
  
-       VM_BUG_ON_PAGE(PageTail(page), page);
-       VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+       lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
  
-       if (!TestSetPageMlocked(page)) {
-               int nr_pages = thp_nr_pages(page);
+       /* As above, this is a little surprising, but possible */
+       if (unlikely(page_evictable(page)))
+               goto out;
  
-               mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
-               count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
-               if (!isolate_lru_page(page))
-                       putback_lru_page(page);
-       }
+       SetPageUnevictable(page);
+       page->mlock_count = !!PageMlocked(page);
+       __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+ out:
+       add_page_to_lru_list(page, lruvec);
+       SetPageLRU(page);
+       return lruvec;
  }
  
- /*
-  * Finish munlock after successful page isolation
-  *
-  * Page must be locked. This is a wrapper for page_mlock()
-  * and putback_lru_page() with munlock accounting.
-  */
- static void __munlock_isolated_page(struct page *page)
+ static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
  {
-       /*
-        * Optimization: if the page was mapped just once, that's our mapping
-        * and we don't need to check all the other vmas.
-        */
-       if (page_mapcount(page) > 1)
-               page_mlock(page);
+       int nr_pages = thp_nr_pages(page);
+       bool isolated = false;
+       if (!TestClearPageLRU(page))
+               goto munlock;
  
-       /* Did try_to_unlock() succeed or punt? */
-       if (!PageMlocked(page))
-               count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
+       isolated = true;
+       lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
  
-       putback_lru_page(page);
+       if (PageUnevictable(page)) {
+               /* Then mlock_count is maintained, but might undercount */
+               if (page->mlock_count)
+                       page->mlock_count--;
+               if (page->mlock_count)
+                       goto out;
+       }
+       /* else assume that was the last mlock: reclaim will fix it if not */
+ munlock:
+       if (TestClearPageMlocked(page)) {
+               __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+               if (isolated || !PageUnevictable(page))
+                       __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+               else
+                       __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+       }
+       /* page_evictable() has to be checked *after* clearing Mlocked */
+       if (isolated && PageUnevictable(page) && page_evictable(page)) {
+               del_page_from_lru_list(page, lruvec);
+               ClearPageUnevictable(page);
+               add_page_to_lru_list(page, lruvec);
+               __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+       }
+ out:
+       if (isolated)
+               SetPageLRU(page);
+       return lruvec;
  }
  
  /*
-  * Accounting for page isolation fail during munlock
-  *
-  * Performs accounting when page isolation fails in munlock. There is nothing
-  * else to do because it means some other task has already removed the page
-  * from the LRU. putback_lru_page() will take care of removing the page from
-  * the unevictable list, if necessary. vmscan [page_referenced()] will move
-  * the page back to the unevictable list if some other vma has it mlocked.
+  * Flags held in the low bits of a struct page pointer on the mlock_pvec.
   */
- static void __munlock_isolation_failed(struct page *page)
+ #define LRU_PAGE 0x1
+ #define NEW_PAGE 0x2
+ static inline struct page *mlock_lru(struct page *page)
  {
-       int nr_pages = thp_nr_pages(page);
+       return (struct page *)((unsigned long)page + LRU_PAGE);
+ }
  
-       if (PageUnevictable(page))
-               __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
-       else
-               __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+ static inline struct page *mlock_new(struct page *page)
+ {
+       return (struct page *)((unsigned long)page + NEW_PAGE);
  }
  
- /**
-  * munlock_vma_page - munlock a vma page
-  * @page: page to be unlocked, either a normal page or THP page head
-  *
-  * returns the size of the page as a page mask (0 for normal page,
-  *         HPAGE_PMD_NR - 1 for THP head page)
-  *
-  * called from munlock()/munmap() path with page supposedly on the LRU.
-  * When we munlock a page, because the vma where we found the page is being
-  * munlock()ed or munmap()ed, we want to check whether other vmas hold the
-  * page locked so that we can leave it on the unevictable lru list and not
-  * bother vmscan with it.  However, to walk the page's rmap list in
-  * page_mlock() we must isolate the page from the LRU.  If some other
-  * task has removed the page from the LRU, we won't be able to do that.
-  * So we clear the PageMlocked as we might not get another chance.  If we
-  * can't isolate the page, we leave it for putback_lru_page() and vmscan
-  * [page_referenced()/try_to_unmap()] to deal with.
+ /*
+  * mlock_pagevec() is derived from pagevec_lru_move_fn():
+  * perhaps that can make use of such page pointer flags in future,
+  * but for now just keep it for mlock.  We could use three separate
+  * pagevecs instead, but one feels better (munlocking a full pagevec
+  * does not need to drain mlocking pagevecs first).
   */
- unsigned int munlock_vma_page(struct page *page)
+ static void mlock_pagevec(struct pagevec *pvec)
  {
-       int nr_pages;
-       /* For page_mlock() and to serialize with page migration */
-       BUG_ON(!PageLocked(page));
-       VM_BUG_ON_PAGE(PageTail(page), page);
+       struct lruvec *lruvec = NULL;
+       unsigned long mlock;
+       struct page *page;
+       int i;
  
-       if (!TestClearPageMlocked(page)) {
-               /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
-               return 0;
+       for (i = 0; i < pagevec_count(pvec); i++) {
+               page = pvec->pages[i];
+               mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
+               page = (struct page *)((unsigned long)page - mlock);
+               pvec->pages[i] = page;
+               if (mlock & LRU_PAGE)
+                       lruvec = __mlock_page(page, lruvec);
+               else if (mlock & NEW_PAGE)
+                       lruvec = __mlock_new_page(page, lruvec);
+               else
+                       lruvec = __munlock_page(page, lruvec);
        }
  
-       nr_pages = thp_nr_pages(page);
-       mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+       if (lruvec)
+               unlock_page_lruvec_irq(lruvec);
+       release_pages(pvec->pages, pvec->nr);
+       pagevec_reinit(pvec);
+ }
  
-       if (!isolate_lru_page(page))
-               __munlock_isolated_page(page);
-       else
-               __munlock_isolation_failed(page);
+ void mlock_page_drain(int cpu)
+ {
+       struct pagevec *pvec;
  
-       return nr_pages - 1;
+       pvec = &per_cpu(mlock_pvec, cpu);
+       if (pagevec_count(pvec))
+               mlock_pagevec(pvec);
  }
  
- /*
-  * convert get_user_pages() return value to posix mlock() error
-  */
- static int __mlock_posix_error_return(long retval)
+ bool need_mlock_page_drain(int cpu)
  {
-       if (retval == -EFAULT)
-               retval = -ENOMEM;
-       else if (retval == -ENOMEM)
-               retval = -EAGAIN;
-       return retval;
+       return pagevec_count(&per_cpu(mlock_pvec, cpu));
  }
  
- /*
-  * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
-  *
-  * The fast path is available only for evictable pages with single mapping.
-  * Then we can bypass the per-cpu pvec and get better performance.
-  * when mapcount > 1 we need page_mlock() which can fail.
-  * when !page_evictable(), we need the full redo logic of putback_lru_page to
-  * avoid leaving evictable page in unevictable list.
-  *
-  * In case of success, @page is added to @pvec and @pgrescued is incremented
-  * in case that the page was previously unevictable. @page is also unlocked.
+ /**
+  * mlock_folio - mlock a folio already on (or temporarily off) LRU
+  * @folio: folio to be mlocked.
   */
- static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
-               int *pgrescued)
+ void mlock_folio(struct folio *folio)
  {
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       struct pagevec *pvec = &get_cpu_var(mlock_pvec);
  
-       if (page_mapcount(page) <= 1 && page_evictable(page)) {
-               pagevec_add(pvec, page);
-               if (TestClearPageUnevictable(page))
-                       (*pgrescued)++;
-               unlock_page(page);
-               return true;
+       if (!folio_test_set_mlocked(folio)) {
+               int nr_pages = folio_nr_pages(folio);
+               zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
+               __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
        }
  
-       return false;
+       folio_get(folio);
+       if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
+           folio_test_large(folio) || lru_cache_disabled())
+               mlock_pagevec(pvec);
+       put_cpu_var(mlock_pvec);
  }
  
- /*
-  * Putback multiple evictable pages to the LRU
-  *
-  * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
-  * the pages might have meanwhile become unevictable but that is OK.
+ /**
+  * mlock_new_page - mlock a newly allocated page not yet on LRU
+  * @page: page to be mlocked, either a normal page or a THP head.
   */
- static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+ void mlock_new_page(struct page *page)
  {
-       count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
-       /*
-        *__pagevec_lru_add() calls release_pages() so we don't call
-        * put_page() explicitly
-        */
-       __pagevec_lru_add(pvec);
-       count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+       struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+       int nr_pages = thp_nr_pages(page);
+       SetPageMlocked(page);
+       mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+       __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+       get_page(page);
+       if (!pagevec_add(pvec, mlock_new(page)) ||
+           PageHead(page) || lru_cache_disabled())
+               mlock_pagevec(pvec);
+       put_cpu_var(mlock_pvec);
  }
  
- /*
-  * Munlock a batch of pages from the same zone
-  *
-  * The work is split to two main phases. First phase clears the Mlocked flag
-  * and attempts to isolate the pages, all under a single zone lru lock.
-  * The second phase finishes the munlock only for pages where isolation
-  * succeeded.
-  *
-  * Note that the pagevec may be modified during the process.
+ /**
+  * munlock_page - munlock a page
+  * @page: page to be munlocked, either a normal page or a THP head.
   */
- static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+ void munlock_page(struct page *page)
  {
-       int i;
-       int nr = pagevec_count(pvec);
-       int delta_munlocked = -nr;
-       struct pagevec pvec_putback;
-       struct lruvec *lruvec = NULL;
-       int pgrescued = 0;
-       pagevec_init(&pvec_putback);
-       /* Phase 1: page isolation */
-       for (i = 0; i < nr; i++) {
-               struct page *page = pvec->pages[i];
-               struct folio *folio = page_folio(page);
-               if (TestClearPageMlocked(page)) {
-                       /*
-                        * We already have pin from follow_page_mask()
-                        * so we can spare the get_page() here.
-                        */
-                       if (TestClearPageLRU(page)) {
-                               lruvec = folio_lruvec_relock_irq(folio, lruvec);
-                               del_page_from_lru_list(page, lruvec);
-                               continue;
-                       } else
-                               __munlock_isolation_failed(page);
-               } else {
-                       delta_munlocked++;
-               }
-               /*
-                * We won't be munlocking this page in the next phase
-                * but we still need to release the follow_page_mask()
-                * pin. We cannot do it under lru_lock however. If it's
-                * the last pin, __page_cache_release() would deadlock.
-                */
-               pagevec_add(&pvec_putback, pvec->pages[i]);
-               pvec->pages[i] = NULL;
-       }
-       if (lruvec) {
-               __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-               unlock_page_lruvec_irq(lruvec);
-       } else if (delta_munlocked) {
-               mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
-       }
-       /* Now we can release pins of pages that we are not munlocking */
-       pagevec_release(&pvec_putback);
-       /* Phase 2: page munlock */
-       for (i = 0; i < nr; i++) {
-               struct page *page = pvec->pages[i];
-               if (page) {
-                       lock_page(page);
-                       if (!__putback_lru_fast_prepare(page, &pvec_putback,
-                                       &pgrescued)) {
-                               /*
-                                * Slow path. We don't want to lose the last
-                                * pin before unlock_page()
-                                */
-                               get_page(page); /* for putback_lru_page() */
-                               __munlock_isolated_page(page);
-                               unlock_page(page);
-                               put_page(page); /* from follow_page_mask() */
-                       }
-               }
-       }
+       struct pagevec *pvec = &get_cpu_var(mlock_pvec);
  
        /*
-        * Phase 3: page putback for pages that qualified for the fast path
-        * This will also call put_page() to return pin from follow_page_mask()
+        * TestClearPageMlocked(page) must be left to __munlock_page(),
+        * which will check whether the page is multiply mlocked.
         */
-       if (pagevec_count(&pvec_putback))
-               __putback_lru_fast(&pvec_putback, pgrescued);
+       get_page(page);
+       if (!pagevec_add(pvec, page) ||
+           PageHead(page) || lru_cache_disabled())
+               mlock_pagevec(pvec);
+       put_cpu_var(mlock_pvec);
  }
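
mlock_folio(), mlock_new_page() and munlock_page() above all follow the same
shape: take a reference, queue the page on the per-CPU pagevec, and process
the whole batch once pagevec_add() reports it full (or immediately for
compound pages and when the LRU cache is disabled).  A self-contained sketch
of that batch-and-flush pattern in plain C, assuming the kernel's
PAGEVEC_SIZE of 15:

#include <stdio.h>

#define BATCH_SIZE 15			/* mirrors PAGEVEC_SIZE */

struct batch {
	int nr;
	int items[BATCH_SIZE];
};

/* Process and empty the batch, like mlock_pagevec() + pagevec_reinit(). */
static void process_batch(struct batch *b)
{
	printf("processing %d queued items\n", b->nr);
	b->nr = 0;
}

/* Queue one item; flush when the batch fills (pagevec_add() returning 0). */
static void queue_item(struct batch *b, int item)
{
	b->items[b->nr++] = item;
	if (b->nr == BATCH_SIZE)
		process_batch(b);
}

int main(void)
{
	struct batch b = { .nr = 0 };
	int i;

	for (i = 0; i < 40; i++)
		queue_item(&b, i);
	if (b.nr)			/* like mlock_page_drain() */
		process_batch(&b);
	return 0;
}
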
  
- /*
-  * Fill up pagevec for __munlock_pagevec using pte walk
-  *
-  * The function expects that the struct page corresponding to @start address is
-  * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
-  *
-  * The rest of @pvec is filled by subsequent pages within the same pmd and same
-  * zone, as long as the pte's are present and vm_normal_page() succeeds. These
-  * pages also get pinned.
-  *
-  * Returns the address of the next page that should be scanned. This equals
-  * @start + PAGE_SIZE when no page could be added by the pte walk.
-  */
- static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
-                       struct vm_area_struct *vma, struct zone *zone,
-                       unsigned long start, unsigned long end)
+ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
+                          unsigned long end, struct mm_walk *walk)
  {
-       pte_t *pte;
+       struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
+       pte_t *start_pte, *pte;
+       struct page *page;
  
-       /*
-        * Initialize pte walk starting at the already pinned page where we
-        * are sure that there is a pte, as it was pinned under the same
-        * mmap_lock write op.
-        */
-       pte = get_locked_pte(vma->vm_mm, start, &ptl);
-       /* Make sure we do not cross the page table boundary */
-       end = pgd_addr_end(start, end);
-       end = p4d_addr_end(start, end);
-       end = pud_addr_end(start, end);
-       end = pmd_addr_end(start, end);
-       /* The page next to the pinned page is the first we will try to get */
-       start += PAGE_SIZE;
-       while (start < end) {
-               struct page *page = NULL;
-               pte++;
-               if (pte_present(*pte))
-                       page = vm_normal_page(vma, start, *pte);
-               /*
-                * Break if page could not be obtained or the page's node+zone does not
-                * match
-                */
-               if (!page || page_zone(page) != zone)
-                       break;
+       ptl = pmd_trans_huge_lock(pmd, vma);
+       if (ptl) {
+               if (!pmd_present(*pmd))
+                       goto out;
+               if (is_huge_zero_pmd(*pmd))
+                       goto out;
+               page = pmd_page(*pmd);
+               if (vma->vm_flags & VM_LOCKED)
+                       mlock_folio(page_folio(page));
+               else
+                       munlock_page(page);
+               goto out;
+       }
  
-               /*
-                * Do not use pagevec for PTE-mapped THP,
-                * munlock_vma_pages_range() will handle them.
-                */
+       start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+       for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
+               if (!pte_present(*pte))
+                       continue;
+               page = vm_normal_page(vma, addr, *pte);
+               if (!page)
+                       continue;
                if (PageTransCompound(page))
-                       break;
-               get_page(page);
-               /*
-                * Increase the address that will be returned *before* the
-                * eventual break due to pvec becoming full by adding the page
-                */
-               start += PAGE_SIZE;
-               if (pagevec_add(pvec, page) == 0)
-                       break;
+                       continue;
+               if (vma->vm_flags & VM_LOCKED)
+                       mlock_folio(page_folio(page));
+               else
+                       munlock_page(page);
        }
-       pte_unmap_unlock(pte, ptl);
-       return start;
+       pte_unmap(start_pte);
+ out:
+       spin_unlock(ptl);
+       cond_resched();
+       return 0;
  }
  
  /*
-  * munlock_vma_pages_range() - munlock all pages in the vma range.'
-  * @vma - vma containing range to be munlock()ed.
+  * mlock_vma_pages_range() - mlock any pages already in the range,
+  *                           or munlock all pages in the range.
+  * @vma - vma containing range to be mlock()ed or munlock()ed
   * @start - start address in @vma of the range
-  * @end - end of range in @vma.
-  *
-  *  For mremap(), munmap() and exit().
-  *
-  * Called with @vma VM_LOCKED.
+  * @end - end of range in @vma
+  * @newflags - the new set of flags for @vma.
   *
-  * Returns with VM_LOCKED cleared.  Callers must be prepared to
-  * deal with this.
-  *
-  * We don't save and restore VM_LOCKED here because pages are
-  * still on lru.  In unmap path, pages might be scanned by reclaim
-  * and re-mlocked by page_mlock/try_to_unmap before we unmap and
-  * free them.  This will result in freeing mlocked pages.
+  * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
+  * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
   */
- void munlock_vma_pages_range(struct vm_area_struct *vma,
-                            unsigned long start, unsigned long end)
+ static void mlock_vma_pages_range(struct vm_area_struct *vma,
+       unsigned long start, unsigned long end, vm_flags_t newflags)
  {
-       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+       static const struct mm_walk_ops mlock_walk_ops = {
+               .pmd_entry = mlock_pte_range,
+       };
  
-       while (start < end) {
-               struct page *page;
-               unsigned int page_mask = 0;
-               unsigned long page_increm;
-               struct pagevec pvec;
-               struct zone *zone;
+       /*
+        * There is a slight chance that concurrent page migration,
+        * or page reclaim finding a page of this now-VM_LOCKED vma,
+        * will call mlock_vma_page() and raise page's mlock_count:
+        * double counting, leaving the page unevictable indefinitely.
+        * Communicate this danger to mlock_vma_page() with VM_IO,
+        * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
+        * mmap_lock is held in write mode here, so this weird
+        * combination should not be visible to other mmap_lock users;
+        * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
+        */
+       if (newflags & VM_LOCKED)
+               newflags |= VM_IO;
+       WRITE_ONCE(vma->vm_flags, newflags);
  
-               pagevec_init(&pvec);
-               /*
-                * Although FOLL_DUMP is intended for get_dump_page(),
-                * it just so happens that its special treatment of the
-                * ZERO_PAGE (returning an error instead of doing get_page)
-                * suits munlock very well (and if somehow an abnormal page
-                * has sneaked into the range, we won't oops here: great).
-                */
-               page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
-               if (page && !IS_ERR(page)) {
-                       if (PageTransTail(page)) {
-                               VM_BUG_ON_PAGE(PageMlocked(page), page);
-                               put_page(page); /* follow_page_mask() */
-                       } else if (PageTransHuge(page)) {
-                               lock_page(page);
-                               /*
-                                * Any THP page found by follow_page_mask() may
-                                * have gotten split before reaching
-                                * munlock_vma_page(), so we need to compute
-                                * the page_mask here instead.
-                                */
-                               page_mask = munlock_vma_page(page);
-                               unlock_page(page);
-                               put_page(page); /* follow_page_mask() */
-                       } else {
-                               /*
-                                * Non-huge pages are handled in batches via
-                                * pagevec. The pin from follow_page_mask()
-                                * prevents them from collapsing by THP.
-                                */
-                               pagevec_add(&pvec, page);
-                               zone = page_zone(page);
-                               /*
-                                * Try to fill the rest of pagevec using fast
-                                * pte walk. This will also update start to
-                                * the next page to process. Then munlock the
-                                * pagevec.
-                                */
-                               start = __munlock_pagevec_fill(&pvec, vma,
-                                               zone, start, end);
-                               __munlock_pagevec(&pvec, zone);
-                               goto next;
-                       }
-               }
-               page_increm = 1 + page_mask;
-               start += page_increm * PAGE_SIZE;
- next:
-               cond_resched();
+       lru_add_drain();
+       walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
+       lru_add_drain();
+       if (newflags & VM_IO) {
+               newflags &= ~VM_IO;
+               WRITE_ONCE(vma->vm_flags, newflags);
        }
  }
  
@@@ -500,10 -382,9 +382,9 @@@ static int mlock_fixup(struct vm_area_s
        pgoff_t pgoff;
        int nr_pages;
        int ret = 0;
-       int lock = !!(newflags & VM_LOCKED);
-       vm_flags_t old_flags = vma->vm_flags;
+       vm_flags_t oldflags = vma->vm_flags;
  
-       if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+       if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
            vma_is_dax(vma) || vma_is_secretmem(vma))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
 -                        vma->vm_userfaultfd_ctx, vma_anon_name(vma));
 +                        vma->vm_userfaultfd_ctx, anon_vma_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
@@@ -535,9 -416,9 +416,9 @@@ success
         * Keep track of amount of locked VM.
         */
        nr_pages = (end - start) >> PAGE_SHIFT;
-       if (!lock)
+       if (!(newflags & VM_LOCKED))
                nr_pages = -nr_pages;
-       else if (old_flags & VM_LOCKED)
+       else if (oldflags & VM_LOCKED)
                nr_pages = 0;
        mm->locked_vm += nr_pages;
  
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */
  
-       if (lock)
+       if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
+               /* No work to do, and mlocking twice would be wrong */
                vma->vm_flags = newflags;
-       else
-               munlock_vma_pages_range(vma, start, end);
+       } else {
+               mlock_vma_pages_range(vma, start, end, newflags);
+       }
  out:
        *prev = vma;
        return ret;
@@@ -645,6 -527,18 +527,18 @@@ static unsigned long count_mm_mlocked_p
        return count >> PAGE_SHIFT;
  }
  
+ /*
+  * convert get_user_pages() return value to posix mlock() error
+  */
+ static int __mlock_posix_error_return(long retval)
+ {
+       if (retval == -EFAULT)
+               retval = -ENOMEM;
+       else if (retval == -ENOMEM)
+               retval = -EAGAIN;
+       return retval;
+ }
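
This translation keeps mlock() POSIX-conformant: userspace sees ENOMEM rather
than EFAULT when part of the range cannot be faulted in.  A small userspace
check of the visible behaviour, assuming the page at address 4096 is unmapped
(the usual case with a non-zero mmap_min_addr); it simply reports whatever
errno it gets:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* An address range that should have no mapping behind it. */
	if (mlock((void *)4096, 4096) == -1)
		printf("mlock failed with %s (ENOMEM expected)\n",
		       strerror(errno));
	else
		munlock((void *)4096, 4096);
	return 0;
}
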
  static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
  {
        unsigned long locked;
@@@ -839,7 -733,6 +733,7 @@@ int user_shm_lock(size_t size, struct u
        }
        if (!get_ucounts(ucounts)) {
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 +              allowed = 0;
                goto out;
        }
        allowed = 1;
diff --combined mm/mmap.c
index bd3416eb5fbc66c5deeadb5e07c899b1188b5af4,64b5985b5295c1abe87daeb6ffe87a031dde042c..3aa839f81e63dc8645506af8c99c870b33afd6d8
+++ b/mm/mmap.c
@@@ -1031,7 -1031,7 +1031,7 @@@ again
  static inline int is_mergeable_vma(struct vm_area_struct *vma,
                                struct file *file, unsigned long vm_flags,
                                struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 -                              const char *anon_name)
 +                              struct anon_vma_name *anon_name)
  {
        /*
         * VM_SOFTDIRTY should not prevent from VMA merging, if we
                return 0;
        if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
                return 0;
 -      if (!is_same_vma_anon_name(vma, anon_name))
 +      if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
                return 0;
        return 1;
  }
@@@ -1084,7 -1084,7 +1084,7 @@@ can_vma_merge_before(struct vm_area_str
                     struct anon_vma *anon_vma, struct file *file,
                     pgoff_t vm_pgoff,
                     struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 -                   const char *anon_name)
 +                   struct anon_vma_name *anon_name)
  {
        if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@@ -1106,7 -1106,7 +1106,7 @@@ can_vma_merge_after(struct vm_area_stru
                    struct anon_vma *anon_vma, struct file *file,
                    pgoff_t vm_pgoff,
                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 -                  const char *anon_name)
 +                  struct anon_vma_name *anon_name)
  {
        if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@@ -1167,7 -1167,7 +1167,7 @@@ struct vm_area_struct *vma_merge(struc
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
                        struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
 -                      const char *anon_name)
 +                      struct anon_vma_name *anon_name)
  {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@@ -1616,6 -1616,8 +1616,6 @@@ unsigned long ksys_mmap_pgoff(unsigned 
                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
 -               * A dummy user value is used because we are not locking
 -               * memory so no accounting is necessary
                 */
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
@@@ -2555,7 -2557,7 +2555,7 @@@ static int __init cmdline_parse_stack_g
        if (!*endptr)
                stack_guard_gap = val << PAGE_SHIFT;
  
 -      return 0;
 +      return 1;
  }
  __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
  
@@@ -2672,6 -2674,8 +2672,8 @@@ detach_vmas_to_be_unmapped(struct mm_st
        vma->vm_prev = NULL;
        do {
                vma_rb_erase(vma, &mm->mm_rb);
+               if (vma->vm_flags & VM_LOCKED)
+                       mm->locked_vm -= vma_pages(vma);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
@@@ -2776,22 -2780,6 +2778,6 @@@ int split_vma(struct mm_struct *mm, str
        return __split_vma(mm, vma, addr, new_below);
  }
  
- static inline void
- unlock_range(struct vm_area_struct *start, unsigned long limit)
- {
-       struct mm_struct *mm = start->vm_mm;
-       struct vm_area_struct *tmp = start;
-       while (tmp && tmp->vm_start < limit) {
-               if (tmp->vm_flags & VM_LOCKED) {
-                       mm->locked_vm -= vma_pages(tmp);
-                       munlock_vma_pages_all(tmp);
-               }
-               tmp = tmp->vm_next;
-       }
- }
  /* Munmap is split into 2 main parts -- this part which finds
   * what needs doing, and the areas themselves, which do the
   * work.  This now handles partial unmappings.
@@@ -2872,12 -2860,6 +2858,6 @@@ int __do_munmap(struct mm_struct *mm, u
                        return error;
        }
  
-       /*
-        * unlock any mlock()ed ranges before detaching vmas
-        */
-       if (mm->locked_vm)
-               unlock_range(vma, end);
        /* Detach vmas from rbtree */
        if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
                downgrade = false;
@@@ -3145,20 -3127,12 +3125,12 @@@ void exit_mmap(struct mm_struct *mm
                 * Nothing can be holding mm->mmap_lock here and the above call
                 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
                 * __oom_reap_task_mm() will not block.
-                *
-                * This needs to be done before calling unlock_range(),
-                * which clears VM_LOCKED, otherwise the oom reaper cannot
-                * reliably test it.
                 */
                (void)__oom_reap_task_mm(mm);
                set_bit(MMF_OOM_SKIP, &mm->flags);
        }
  
        mmap_write_lock(mm);
-       if (mm->locked_vm)
-               unlock_range(mm->mmap, ULONG_MAX);
        arch_exit_mmap(mm);
  
        vma = mm->mmap;
                vma = remove_vma(vma);
                cond_resched();
        }
 +      mm->mmap = NULL;
        mmap_write_unlock(mm);
        vm_unacct_memory(nr_accounted);
  }
@@@ -3254,7 -3227,7 +3226,7 @@@ struct vm_area_struct *copy_vma(struct 
                return NULL;    /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
 -                          vma->vm_userfaultfd_ctx, vma_anon_name(vma));
 +                          vma->vm_userfaultfd_ctx, anon_vma_name(vma));
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
@@@ -3446,7 -3419,6 +3418,7 @@@ static struct vm_area_struct *__install
        vma->vm_end = addr + len;
  
        vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
 +      vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
  
        vma->vm_ops = ops;
diff --combined mm/mmzone.c
index d8a9b0e1b52670727adc618fa18ce491fcd61b52,40e1d94283004a37b6b8310ace2705903138e599..0ae7571e35abb07795037c510ec1c161ce1e8b82
@@@ -81,6 -81,13 +81,13 @@@ void lruvec_init(struct lruvec *lruvec
  
        for_each_lru(lru)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
+       /*
+        * The "Unevictable LRU" is imaginary: though its size is maintained,
+        * it is never scanned, and unevictable pages are not threaded on it
+        * (so that their lru fields can be reused to hold mlock_count).
+        * Poison its list head, so that any operations on it would crash.
+        */
+       list_del(&lruvec->lists[LRU_UNEVICTABLE]);
  }
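
The list_del() above relies on the kernel's list poisoning: once the unevictable
list head is deleted, its next/prev pointers hold LIST_POISON1/LIST_POISON2, so
any later list operation on it faults immediately instead of silently corrupting
state. A minimal sketch of that effect (illustrative only, not part of the patch):

	LIST_HEAD(head);
	struct list_head entry;

	list_del(&head);          /* head.next/prev become LIST_POISON1/2 */
	list_add(&entry, &head);  /* writes through LIST_POISON1 -> immediate oops */
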
  
  #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
@@@ -89,14 -96,13 +96,14 @@@ int page_cpupid_xchg_last(struct page *
        unsigned long old_flags, flags;
        int last_cpupid;
  
 +      old_flags = READ_ONCE(page->flags);
        do {
 -              old_flags = flags = page->flags;
 -              last_cpupid = page_cpupid_last(page);
 +              flags = old_flags;
 +              last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
  
                flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
                flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
 -      } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
 +      } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
  
        return last_cpupid;
  }
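
The rewrite above is the standard try_cmpxchg() idiom: the flags word is read
once before the loop, and on a failed exchange try_cmpxchg() updates the local
copy with the current value, so the loop body never re-reads it. A generic
sketch of the pattern (compute_new_value() is a hypothetical helper, not from
this patch):

	unsigned long old, new;

	old = READ_ONCE(*flags);                  /* single initial read */
	do {
		new = compute_new_value(old);     /* derive the replacement */
	} while (!try_cmpxchg(flags, &old, new)); /* 'old' is refreshed on failure */
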
diff --combined mm/oom_kill.c
index 4db425eedbe8b179ff740f8a70a764ac867783aa,6b875acabd1e7258ca0e1c224f991d46ca8499ed..7ec38194f8e11c927ab5f9f3b7c89ebab33ef0e7
@@@ -93,6 -93,9 +93,6 @@@ static bool oom_cpuset_eligible(struct 
        bool ret = false;
        const nodemask_t *mask = oc->nodemask;
  
 -      if (is_memcg_oom(oc))
 -              return true;
 -
        rcu_read_lock();
        for_each_thread(start, tsk) {
                if (mask) {
@@@ -523,7 -526,7 +523,7 @@@ bool __oom_reap_task_mm(struct mm_struc
        set_bit(MMF_UNSTABLE, &mm->flags);
  
        for (vma = mm->mmap ; vma; vma = vma->vm_next) {
-               if (!can_madv_lru_vma(vma))
+               if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
                        continue;
  
                /*
diff --combined mm/page_alloc.c
index 584ed4bac85efaef905c42f5171f2a2ef3052912,02283598fd140872f52d3714ee20c1134eb9dee5..6e0b4596cde9bb1a0e7d47b3d63437aadf305e43
@@@ -128,7 -128,7 +128,7 @@@ static DEFINE_MUTEX(pcp_batch_high_lock
  struct pagesets {
        local_lock_t lock;
  };
 -static DEFINE_PER_CPU(struct pagesets, pagesets) = {
 +static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
        .lock = INIT_LOCAL_LOCK(lock),
  };
  
@@@ -734,8 -734,7 +734,7 @@@ static void prep_compound_head(struct p
        set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
        set_compound_order(page, order);
        atomic_set(compound_mapcount_ptr(page), -1);
-       if (hpage_pincount_available(page))
-               atomic_set(compound_pincount_ptr(page), 0);
+       atomic_set(compound_pincount_ptr(page), 0);
  }
  
  static void prep_compound_tail(struct page *head, int tail_idx)
@@@ -1072,12 -1071,14 +1071,12 @@@ static inline void __free_one_page(stru
                int migratetype, fpi_t fpi_flags)
  {
        struct capture_control *capc = task_capc(zone);
 +      unsigned int max_order = pageblock_order;
        unsigned long buddy_pfn;
        unsigned long combined_pfn;
 -      unsigned int max_order;
        struct page *buddy;
        bool to_tail;
  
 -      max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
 -
        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
  
@@@ -1115,24 -1116,25 +1114,24 @@@ continue_merging
        }
        if (order < MAX_ORDER - 1) {
                /* If we are here, it means order is >= pageblock_order.
 -               * We want to prevent merge between freepages on isolate
 -               * pageblock and normal pageblock. Without this, pageblock
 -               * isolation could cause incorrect freepage or CMA accounting.
 +               * We want to prevent merging between freepages on a pageblock
 +               * without fallbacks and a normal pageblock. Without this,
 +               * pageblock isolation could cause incorrect freepage, CMA, or
 +               * HIGHATOMIC accounting.
                 *
                 * We don't want to hit this code for the more frequent
                 * low-order merging.
                 */
 -              if (unlikely(has_isolate_pageblock(zone))) {
 -                      int buddy_mt;
 +              int buddy_mt;
  
 -                      buddy_pfn = __find_buddy_pfn(pfn, order);
 -                      buddy = page + (buddy_pfn - pfn);
 -                      buddy_mt = get_pageblock_migratetype(buddy);
 +              buddy_pfn = __find_buddy_pfn(pfn, order);
 +              buddy = page + (buddy_pfn - pfn);
 +              buddy_mt = get_pageblock_migratetype(buddy);
  
 -                      if (migratetype != buddy_mt
 -                                      && (is_migrate_isolate(migratetype) ||
 -                                              is_migrate_isolate(buddy_mt)))
 -                              goto done_merging;
 -              }
 +              if (migratetype != buddy_mt
 +                              && (!migratetype_is_mergeable(migratetype) ||
 +                                      !migratetype_is_mergeable(buddy_mt)))
 +                      goto done_merging;
                max_order = order + 1;
                goto continue_merging;
        }
@@@ -1429,83 -1431,120 +1428,83 @@@ static bool bulkfree_pcp_prepare(struc
  }
  #endif /* CONFIG_DEBUG_VM */
  
 -static inline void prefetch_buddy(struct page *page)
 -{
 -      unsigned long pfn = page_to_pfn(page);
 -      unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
 -      struct page *buddy = page + (buddy_pfn - pfn);
 -
 -      prefetch(buddy);
 -}
 -
  /*
   * Frees a number of pages from the PCP lists
   * Assumes all pages on list are in same zone.
   * count is the number of pages to free.
   */
  static void free_pcppages_bulk(struct zone *zone, int count,
 -                                      struct per_cpu_pages *pcp)
 +                                      struct per_cpu_pages *pcp,
 +                                      int pindex)
  {
 -      int pindex = 0;
 -      int batch_free = 0;
 -      int nr_freed = 0;
 +      int min_pindex = 0;
 +      int max_pindex = NR_PCP_LISTS - 1;
        unsigned int order;
 -      int prefetch_nr = READ_ONCE(pcp->batch);
        bool isolated_pageblocks;
 -      struct page *page, *tmp;
 -      LIST_HEAD(head);
 +      struct page *page;
  
        /*
         * Ensure a proper count is passed, which otherwise would get stuck in
         * the below while (list_empty(list)) loop.
         */
        count = min(pcp->count, count);
 +
 +      /* Ensure requested pindex is drained first. */
 +      pindex = pindex - 1;
 +
 +      /*
 +       * local_lock_irq held so equivalent to spin_lock_irqsave for
 +       * both PREEMPT_RT and non-PREEMPT_RT configurations.
 +       */
 +      spin_lock(&zone->lock);
 +      isolated_pageblocks = has_isolate_pageblock(zone);
 +
        while (count > 0) {
                struct list_head *list;
 +              int nr_pages;
  
 -              /*
 -               * Remove pages from lists in a round-robin fashion. A
 -               * batch_free count is maintained that is incremented when an
 -               * empty list is encountered.  This is so more pages are freed
 -               * off fuller lists instead of spinning excessively around empty
 -               * lists
 -               */
 +              /* Remove pages from lists in a round-robin fashion. */
                do {
 -                      batch_free++;
 -                      if (++pindex == NR_PCP_LISTS)
 -                              pindex = 0;
 +                      if (++pindex > max_pindex)
 +                              pindex = min_pindex;
                        list = &pcp->lists[pindex];
 -              } while (list_empty(list));
 +                      if (!list_empty(list))
 +                              break;
  
 -              /* This is the only non-empty list. Free them all. */
 -              if (batch_free == NR_PCP_LISTS)
 -                      batch_free = count;
 +                      if (pindex == max_pindex)
 +                              max_pindex--;
 +                      if (pindex == min_pindex)
 +                              min_pindex++;
 +              } while (1);
  
                order = pindex_to_order(pindex);
 +              nr_pages = 1 << order;
                BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
                do {
 +                      int mt;
 +
                        page = list_last_entry(list, struct page, lru);
 +                      mt = get_pcppage_migratetype(page);
 +
                        /* must delete to avoid corrupting pcp list */
                        list_del(&page->lru);
 -                      nr_freed += 1 << order;
 -                      count -= 1 << order;
 +                      count -= nr_pages;
 +                      pcp->count -= nr_pages;
  
                        if (bulkfree_pcp_prepare(page))
                                continue;
  
 -                      /* Encode order with the migratetype */
 -                      page->index <<= NR_PCP_ORDER_WIDTH;
 -                      page->index |= order;
 +                      /* MIGRATE_ISOLATE page should not go to pcplists */
 +                      VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
 +                      /* Pageblock could have been isolated meanwhile */
 +                      if (unlikely(isolated_pageblocks))
 +                              mt = get_pageblock_migratetype(page);
  
 -                      list_add_tail(&page->lru, &head);
 -
 -                      /*
 -                       * We are going to put the page back to the global
 -                       * pool, prefetch its buddy to speed up later access
 -                       * under zone->lock. It is believed the overhead of
 -                       * an additional test and calculating buddy_pfn here
 -                       * can be offset by reduced memory latency later. To
 -                       * avoid excessive prefetching due to large count, only
 -                       * prefetch buddy for the first pcp->batch nr of pages.
 -                       */
 -                      if (prefetch_nr) {
 -                              prefetch_buddy(page);
 -                              prefetch_nr--;
 -                      }
 -              } while (count > 0 && --batch_free && !list_empty(list));
 +                      __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
 +                      trace_mm_page_pcpu_drain(page, order, mt);
 +              } while (count > 0 && !list_empty(list));
        }
 -      pcp->count -= nr_freed;
 -
 -      /*
 -       * local_lock_irq held so equivalent to spin_lock_irqsave for
 -       * both PREEMPT_RT and non-PREEMPT_RT configurations.
 -       */
 -      spin_lock(&zone->lock);
 -      isolated_pageblocks = has_isolate_pageblock(zone);
 -
 -      /*
 -       * Use safe version since after __free_one_page(),
 -       * page->lru.next will not point to original list.
 -       */
 -      list_for_each_entry_safe(page, tmp, &head, lru) {
 -              int mt = get_pcppage_migratetype(page);
 -
 -              /* mt has been encoded with the order (see above) */
 -              order = mt & NR_PCP_ORDER_MASK;
 -              mt >>= NR_PCP_ORDER_WIDTH;
  
 -              /* MIGRATE_ISOLATE page should not go to pcplists */
 -              VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
 -              /* Pageblock could have been isolated meanwhile */
 -              if (unlikely(isolated_pageblocks))
 -                      mt = get_pageblock_migratetype(page);
 -
 -              __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
 -              trace_mm_page_pcpu_drain(page, order, mt);
 -      }
        spin_unlock(&zone->lock);
  }
  
@@@ -2220,8 -2259,19 +2219,8 @@@ void __init init_cma_reserved_pageblock
        } while (++p, --i);
  
        set_pageblock_migratetype(page, MIGRATE_CMA);
 -
 -      if (pageblock_order >= MAX_ORDER) {
 -              i = pageblock_nr_pages;
 -              p = page;
 -              do {
 -                      set_page_refcounted(p);
 -                      __free_pages(p, MAX_ORDER - 1);
 -                      p += MAX_ORDER_NR_PAGES;
 -              } while (i -= MAX_ORDER_NR_PAGES);
 -      } else {
 -              set_page_refcounted(page);
 -              __free_pages(page, pageblock_order);
 -      }
 +      set_page_refcounted(page);
 +      __free_pages(page, pageblock_order);
  
        adjust_managed_page_count(page, pageblock_nr_pages);
        page_zone(page)->cma_pages += pageblock_nr_pages;
@@@ -2291,36 -2341,23 +2290,36 @@@ static inline int check_new_page(struc
        return 1;
  }
  
 +static bool check_new_pages(struct page *page, unsigned int order)
 +{
 +      int i;
 +      for (i = 0; i < (1 << order); i++) {
 +              struct page *p = page + i;
 +
 +              if (unlikely(check_new_page(p)))
 +                      return true;
 +      }
 +
 +      return false;
 +}
 +
  #ifdef CONFIG_DEBUG_VM
  /*
   * With DEBUG_VM enabled, order-0 pages are checked for expected state when
   * being allocated from pcp lists. With debug_pagealloc also enabled, they are
   * also checked when pcp lists are refilled from the free lists.
   */
 -static inline bool check_pcp_refill(struct page *page)
 +static inline bool check_pcp_refill(struct page *page, unsigned int order)
  {
        if (debug_pagealloc_enabled_static())
 -              return check_new_page(page);
 +              return check_new_pages(page, order);
        else
                return false;
  }
  
 -static inline bool check_new_pcp(struct page *page)
 +static inline bool check_new_pcp(struct page *page, unsigned int order)
  {
 -      return check_new_page(page);
 +      return check_new_pages(page, order);
  }
  #else
  /*
   * when pcp lists are being refilled from the free lists. With debug_pagealloc
   * enabled, they are also checked when being allocated from the pcp lists.
   */
 -static inline bool check_pcp_refill(struct page *page)
 +static inline bool check_pcp_refill(struct page *page, unsigned int order)
  {
 -      return check_new_page(page);
 +      return check_new_pages(page, order);
  }
 -static inline bool check_new_pcp(struct page *page)
 +static inline bool check_new_pcp(struct page *page, unsigned int order)
  {
        if (debug_pagealloc_enabled_static())
 -              return check_new_page(page);
 +              return check_new_pages(page, order);
        else
                return false;
  }
  #endif /* CONFIG_DEBUG_VM */
  
 -static bool check_new_pages(struct page *page, unsigned int order)
 -{
 -      int i;
 -      for (i = 0; i < (1 << order); i++) {
 -              struct page *p = page + i;
 -
 -              if (unlikely(check_new_page(p)))
 -                      return true;
 -      }
 -
 -      return false;
 -}
 -
  inline void post_alloc_hook(struct page *page, unsigned int order,
                                gfp_t gfp_flags)
  {
@@@ -2428,13 -2478,17 +2427,13 @@@ struct page *__rmqueue_smallest(struct 
  /*
   * This array describes the order lists are fallen back to when
   * the free lists for the desirable migrate type are depleted
 + *
 + * The other migratetypes do not have fallbacks.
   */
  static int fallbacks[MIGRATE_TYPES][3] = {
        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
 -#ifdef CONFIG_CMA
 -      [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
 -#endif
 -#ifdef CONFIG_MEMORY_ISOLATION
 -      [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
 -#endif
  };
  
  #ifdef CONFIG_CMA
@@@ -2740,8 -2794,8 +2739,8 @@@ static void reserve_highatomic_pagebloc
  
        /* Yoink! */
        mt = get_pageblock_migratetype(page);
 -      if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
 -          && !is_migrate_cma(mt)) {
 +      /* Only reserve normal pageblocks (i.e., they can merge with others) */
 +      if (migratetype_is_mergeable(mt)) {
                zone->nr_reserved_highatomic += pageblock_nr_pages;
                set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
                move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
@@@ -2982,7 -3036,7 +2981,7 @@@ static int rmqueue_bulk(struct zone *zo
                if (unlikely(page == NULL))
                        break;
  
 -              if (unlikely(check_pcp_refill(page)))
 +              if (unlikely(check_pcp_refill(page, order)))
                        continue;
  
                /*
@@@ -3031,7 -3085,7 +3030,7 @@@ void drain_zone_pages(struct zone *zone
        batch = READ_ONCE(pcp->batch);
        to_drain = min(pcp->count, batch);
        if (to_drain > 0)
 -              free_pcppages_bulk(zone, to_drain, pcp);
 +              free_pcppages_bulk(zone, to_drain, pcp, 0);
        local_unlock_irqrestore(&pagesets.lock, flags);
  }
  #endif
@@@ -3052,7 -3106,7 +3051,7 @@@ static void drain_pages_zone(unsigned i
  
        pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
        if (pcp->count)
 -              free_pcppages_bulk(zone, pcp->count, pcp);
 +              free_pcppages_bulk(zone, pcp->count, pcp, 0);
  
        local_unlock_irqrestore(&pagesets.lock, flags);
  }
@@@ -3275,15 -3329,10 +3274,15 @@@ static bool free_unref_page_prepare(str
        return true;
  }
  
 -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
 +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
 +                     bool free_high)
  {
        int min_nr_free, max_nr_free;
  
 +      /* Free everything if batch freeing high-order pages. */
 +      if (unlikely(free_high))
 +              return pcp->count;
 +
        /* Check for PCP disabled or boot pageset */
        if (unlikely(high < batch))
                return 1;
        return batch;
  }
  
 -static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
 +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 +                     bool free_high)
  {
        int high = READ_ONCE(pcp->high);
  
 -      if (unlikely(!high))
 +      if (unlikely(!high || free_high))
                return 0;
  
        if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
        return min(READ_ONCE(pcp->batch) << 2, high);
  }
  
 -static void free_unref_page_commit(struct page *page, unsigned long pfn,
 -                                 int migratetype, unsigned int order)
 +static void free_unref_page_commit(struct page *page, int migratetype,
 +                                 unsigned int order)
  {
        struct zone *zone = page_zone(page);
        struct per_cpu_pages *pcp;
        int high;
        int pindex;
 +      bool free_high;
  
        __count_vm_event(PGFREE);
        pcp = this_cpu_ptr(zone->per_cpu_pageset);
        pindex = order_to_pindex(migratetype, order);
        list_add(&page->lru, &pcp->lists[pindex]);
        pcp->count += 1 << order;
 -      high = nr_pcp_high(pcp, zone);
 +
 +      /*
 +       * As high-order pages other than THP's stored on PCP can contribute
 +       * to fragmentation, limit the number stored when PCP is heavily
 +       * freeing without allocation. The remainder after bulk freeing
 +       * stops will be drained from vmstat refresh context.
 +       */
 +      free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
 +
 +      high = nr_pcp_high(pcp, zone, free_high);
        if (pcp->count >= high) {
                int batch = READ_ONCE(pcp->batch);
  
 -              free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
 +              free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
        }
  }
  
@@@ -3382,7 -3420,7 +3381,7 @@@ void free_unref_page(struct page *page
        }
  
        local_lock_irqsave(&pagesets.lock, flags);
 -      free_unref_page_commit(page, pfn, migratetype, order);
 +      free_unref_page_commit(page, migratetype, order);
        local_unlock_irqrestore(&pagesets.lock, flags);
  }
  
  void free_unref_page_list(struct list_head *list)
  {
        struct page *page, *next;
 -      unsigned long flags, pfn;
 +      unsigned long flags;
        int batch_count = 0;
        int migratetype;
  
        /* Prepare pages for freeing */
        list_for_each_entry_safe(page, next, list, lru) {
 -              pfn = page_to_pfn(page);
 +              unsigned long pfn = page_to_pfn(page);
                if (!free_unref_page_prepare(page, pfn, 0)) {
                        list_del(&page->lru);
                        continue;
                        free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
                        continue;
                }
 -
 -              set_page_private(page, pfn);
        }
  
        local_lock_irqsave(&pagesets.lock, flags);
        list_for_each_entry_safe(page, next, list, lru) {
 -              pfn = page_private(page);
 -              set_page_private(page, 0);
 -
                /*
                 * Non-isolated types over MIGRATE_PCPTYPES get added
                 * to the MIGRATE_MOVABLE pcp list.
                        migratetype = MIGRATE_MOVABLE;
  
                trace_mm_page_free_batched(page);
 -              free_unref_page_commit(page, pfn, migratetype, 0);
 +              free_unref_page_commit(page, migratetype, 0);
  
                /*
                 * Guard against excessive IRQ disabled times when we get
@@@ -3501,11 -3544,8 +3500,11 @@@ int __isolate_free_page(struct page *pa
                struct page *endpage = page + (1 << order) - 1;
                for (; page < endpage; page += pageblock_nr_pages) {
                        int mt = get_pageblock_migratetype(page);
 -                      if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
 -                          && !is_migrate_highatomic(mt))
 +                      /*
 +                       * Only change normal pageblocks (i.e., they can merge
 +                       * with others)
 +                       */
 +                      if (migratetype_is_mergeable(mt))
                                set_pageblock_migratetype(page,
                                                          MIGRATE_MOVABLE);
                }
@@@ -3600,7 -3640,7 +3599,7 @@@ struct page *__rmqueue_pcplist(struct z
                page = list_first_entry(list, struct page, lru);
                list_del(&page->lru);
                pcp->count -= 1 << order;
 -      } while (check_new_pcp(page));
 +      } while (check_new_pcp(page, order));
  
        return page;
  }
@@@ -3665,10 -3705,10 +3664,10 @@@ struct page *rmqueue(struct zone *prefe
         * allocate greater than order-1 page units with __GFP_NOFAIL.
         */
        WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 -      spin_lock_irqsave(&zone->lock, flags);
  
        do {
                page = NULL;
 +              spin_lock_irqsave(&zone->lock, flags);
                /*
                 * order-0 request can reach here when the pcplist is skipped
                 * due to non-CMA allocation context. HIGHATOMIC area is
                        if (page)
                                trace_mm_page_alloc_zone_locked(page, order, migratetype);
                }
 -              if (!page)
 +              if (!page) {
                        page = __rmqueue(zone, order, migratetype, alloc_flags);
 -      } while (page && check_new_pages(page, order));
 -      if (!page)
 -              goto failed;
 -
 -      __mod_zone_freepage_state(zone, -(1 << order),
 -                                get_pcppage_migratetype(page));
 -      spin_unlock_irqrestore(&zone->lock, flags);
 +                      if (!page)
 +                              goto failed;
 +              }
 +              __mod_zone_freepage_state(zone, -(1 << order),
 +                                        get_pcppage_migratetype(page));
 +              spin_unlock_irqrestore(&zone->lock, flags);
 +      } while (check_new_pages(page, order));
  
        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
        zone_statistics(preferred_zone, zone, 1);
@@@ -4554,12 -4594,13 +4553,12 @@@ __perform_reclaim(gfp_t gfp_mask, unsig
                                        const struct alloc_context *ac)
  {
        unsigned int noreclaim_flag;
 -      unsigned long pflags, progress;
 +      unsigned long progress;
  
        cond_resched();
  
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
 -      psi_memstall_enter(&pflags);
        fs_reclaim_acquire(gfp_mask);
        noreclaim_flag = memalloc_noreclaim_save();
  
  
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(gfp_mask);
 -      psi_memstall_leave(&pflags);
  
        cond_resched();
  
@@@ -4581,13 -4623,11 +4580,13 @@@ __alloc_pages_direct_reclaim(gfp_t gfp_
                unsigned long *did_some_progress)
  {
        struct page *page = NULL;
 +      unsigned long pflags;
        bool drained = false;
  
 +      psi_memstall_enter(&pflags);
        *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
        if (unlikely(!(*did_some_progress)))
 -              return NULL;
 +              goto out;
  
  retry:
        page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
                drained = true;
                goto retry;
        }
 +out:
 +      psi_memstall_leave(&pflags);
  
        return page;
  }
@@@ -6341,7 -6379,7 +6340,7 @@@ static void per_cpu_pages_init(struct p
  #define BOOT_PAGESET_BATCH    1
  static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
  static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
 -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
 +DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
  
  static void __build_all_zonelists(void *data)
  {
        if (self && !node_online(self->node_id)) {
                build_zonelists(self);
        } else {
 -              for_each_online_node(nid) {
 +              /*
 +               * All possible nodes have pgdat preallocated
 +               * in free_area_init
 +               */
 +              for_each_node(nid) {
                        pg_data_t *pgdat = NODE_DATA(nid);
  
                        build_zonelists(pgdat);
@@@ -7354,15 -7388,16 +7353,15 @@@ static inline void setup_usemap(struct 
  /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
  void __init set_pageblock_order(void)
  {
 -      unsigned int order;
 +      unsigned int order = MAX_ORDER - 1;
  
        /* Check that pageblock_nr_pages has not already been setup */
        if (pageblock_order)
                return;
  
 -      if (HPAGE_SHIFT > PAGE_SHIFT)
 +      /* Don't let pageblocks exceed the maximum allocation granularity. */
 +      if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
                order = HUGETLB_PAGE_ORDER;
 -      else
 -              order = MAX_ORDER - 1;
  
        /*
         * Assume the largest contiguous order of interest is a huge page.
@@@ -7466,33 -7501,12 +7465,33 @@@ static void __meminit zone_init_interna
   * NOTE: this function is only called during memory hotplug
   */
  #ifdef CONFIG_MEMORY_HOTPLUG
 -void __ref free_area_init_core_hotplug(int nid)
 +void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
  {
 +      int nid = pgdat->node_id;
        enum zone_type z;
 -      pg_data_t *pgdat = NODE_DATA(nid);
 +      int cpu;
  
        pgdat_init_internals(pgdat);
 +
 +      if (pgdat->per_cpu_nodestats == &boot_nodestats)
 +              pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
 +
 +      /*
 +       * Reset the nr_zones, order and highest_zoneidx before reuse.
 +       * Note that kswapd will init kswapd_highest_zoneidx properly
 +       * when it starts in the near future.
 +       */
 +      pgdat->nr_zones = 0;
 +      pgdat->kswapd_order = 0;
 +      pgdat->kswapd_highest_zoneidx = 0;
 +      pgdat->node_start_pfn = 0;
 +      for_each_online_cpu(cpu) {
 +              struct per_cpu_nodestat *p;
 +
 +              p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 +              memset(p, 0, sizeof(*p));
 +      }
 +
        for (z = 0; z < MAX_NR_ZONES; z++)
                zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
  }
@@@ -7642,14 -7656,9 +7641,14 @@@ static void __init free_area_init_node(
        pgdat->node_start_pfn = start_pfn;
        pgdat->per_cpu_nodestats = NULL;
  
 -      pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
 -              (u64)start_pfn << PAGE_SHIFT,
 -              end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
 +      if (start_pfn != end_pfn) {
 +              pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
 +                      (u64)start_pfn << PAGE_SHIFT,
 +                      end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
 +      } else {
 +              pr_info("Initmem setup node %d as memoryless\n", nid);
 +      }
 +
        calculate_node_totalpages(pgdat, start_pfn, end_pfn);
  
        alloc_node_mem_map(pgdat);
        free_area_init_core(pgdat);
  }
  
 -void __init free_area_init_memoryless_node(int nid)
 +static void __init free_area_init_memoryless_node(int nid)
  {
        free_area_init_node(nid);
  }
@@@ -7962,17 -7971,10 +7961,17 @@@ restart
  
  out2:
        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
 -      for (nid = 0; nid < MAX_NUMNODES; nid++)
 +      for (nid = 0; nid < MAX_NUMNODES; nid++) {
 +              unsigned long start_pfn, end_pfn;
 +
                zone_movable_pfn[nid] =
                        roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
  
 +              get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
 +              if (zone_movable_pfn[nid] >= end_pfn)
 +                      zone_movable_pfn[nid] = 0;
 +      }
 +
  out:
        /* restore the node_state */
        node_states[N_MEMORY] = saved_node_state;
@@@ -8093,36 -8095,8 +8092,36 @@@ void __init free_area_init(unsigned lon
        /* Initialise every node */
        mminit_verify_pageflags_layout();
        setup_nr_node_ids();
 -      for_each_online_node(nid) {
 -              pg_data_t *pgdat = NODE_DATA(nid);
 +      for_each_node(nid) {
 +              pg_data_t *pgdat;
 +
 +              if (!node_online(nid)) {
 +                      pr_info("Initializing node %d as memoryless\n", nid);
 +
 +                      /* Allocator not initialized yet */
 +                      pgdat = arch_alloc_nodedata(nid);
 +                      if (!pgdat) {
 +                              pr_err("Cannot allocate %zuB for node %d.\n",
 +                                              sizeof(*pgdat), nid);
 +                              continue;
 +                      }
 +                      arch_refresh_nodedata(nid, pgdat);
 +                      free_area_init_memoryless_node(nid);
 +
 +                      /*
 +                       * We do not want to confuse userspace with sysfs
 +                       * files/directories for a node without any memory
 +                       * attached to it, so this node is not marked as
 +                       * N_MEMORY and not marked online so that no sysfs
 +                       * hierarchy will be created via register_one_node for
 +                       * it. The pgdat will get fully initialized by
 +                       * hotadd_init_pgdat() when memory is hotplugged into
 +                       * this node.
 +                       */
 +                      continue;
 +              }
 +
 +              pgdat = NODE_DATA(nid);
                free_area_init_node(nid);
  
                /* Any memory on that node */
@@@ -8499,8 -8473,7 +8498,8 @@@ static void __setup_per_zone_wmarks(voi
  
                zone->watermark_boost = 0;
                zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
 -              zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 +              zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
 +              zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
  
                spin_unlock_irqrestore(&zone->lock, flags);
        }
@@@ -9012,12 -8985,14 +9011,12 @@@ struct page *has_unmovable_pages(struc
  #ifdef CONFIG_CONTIG_ALLOC
  static unsigned long pfn_max_align_down(unsigned long pfn)
  {
 -      return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
 -                           pageblock_nr_pages) - 1);
 +      return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
  }
  
  static unsigned long pfn_max_align_up(unsigned long pfn)
  {
 -      return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
 -                              pageblock_nr_pages));
 +      return ALIGN(pfn, MAX_ORDER_NR_PAGES);
  }
  
  #if defined(CONFIG_DYNAMIC_DEBUG) || \
@@@ -9476,7 -9451,6 +9475,7 @@@ bool is_free_buddy_page(struct page *pa
  
        return order < MAX_ORDER;
  }
 +EXPORT_SYMBOL(is_free_buddy_page);
  
  #ifdef CONFIG_MEMORY_FAILURE
  /*
diff --combined mm/readahead.c
index f61943fd17418e9303c8b69f690bfd026baa1419,a20391d6a71bfa034783e7e2d4e5ef707de0ff46..21e5f9161cf221da0f04ef1a2b5c02f559b40222
   *            Initial version.
   */
  
 +/**
 + * DOC: Readahead Overview
 + *
 + * Readahead is used to read content into the page cache before it is
 + * explicitly requested by the application.  Readahead only ever
 + * attempts to read pages that are not yet in the page cache.  If a
 + * page is present but not up-to-date, readahead will not try to read
 + * it. In that case a simple ->readpage() will be requested.
 + *
 + * Readahead is triggered when an application read request (whether a
 + * system call or a page fault) finds that the requested page is not in
 + * the page cache, or that it is in the page cache and has the
 + * %PG_readahead flag set.  This flag indicates that the page was loaded
 + * as part of a previous read-ahead request and now that it has been
 + * accessed, it is time for the next read-ahead.
 + *
 + * Each readahead request is partly a synchronous read, and partly async
 + * read-ahead.  This is reflected in the struct file_ra_state which
 + * contains ->size being the total number of pages, and ->async_size
 + * which is the number of pages in the async section.  The first page in
 + * this async section will have %PG_readahead set as a trigger for a
 + * subsequent read ahead.  Once a series of sequential reads has been
 + * established, there should be no need for a synchronous component and
 + * all read-ahead requests will be fully asynchronous.
 + *
 + * When either of the triggers causes a readahead, three numbers need to
 + * be determined: the start of the region, the size of the region, and
 + * the size of the async tail.
 + *
 + * The start of the region is simply the first page address at or after
 + * the accessed address, which is not currently populated in the page
 + * cache.  This is found with a simple search in the page cache.
 + *
 + * The size of the async tail is determined by subtracting the size that
 + * was explicitly requested from the determined request size, unless
 + * this would be less than zero - then zero is used.  NOTE THIS
 + * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 + * PAGE.
 + *
 + * The size of the region is normally determined from the size of the
 + * previous readahead which loaded the preceding pages.  This may be
 + * discovered from the struct file_ra_state for simple sequential reads,
 + * or from examining the state of the page cache when multiple
 + * sequential reads are interleaved.  Specifically: where the readahead
 + * was triggered by the %PG_readahead flag, the size of the previous
 + * readahead is assumed to be the number of pages from the triggering
 + * page to the start of the new readahead.  In these cases, the size of
 + * the previous readahead is scaled, often doubled, for the new
 + * readahead, though see get_next_ra_size() for details.
 + *
 + * If the size of the previous read cannot be determined, the number of
 + * preceding pages in the page cache is used to estimate the size of
 + * a previous read.  This estimate could easily be misled by random
 + * reads being coincidentally adjacent, so it is ignored unless it is
 + * larger than the current request, and it is not scaled up, unless it
 + * is at the start of file.
 + *
 + * In general read ahead is accelerated at the start of the file, as
 + * reads from there are often sequential.  There are other minor
 + * adjustments to the read ahead size in various special cases and these
 + * are best discovered by reading the code.
 + *
 + * The above calculation determines the readahead, to which any requested
 + * read size may be added.
 + *
 + * Readahead requests are sent to the filesystem using the ->readahead()
 + * address space operation, for which mpage_readahead() is a canonical
 + * implementation.  ->readahead() should normally initiate reads on all
 + * pages, but may fail to read any or all pages without causing an IO
 + * error.  The page cache reading code will issue a ->readpage() request
 + * for any page which ->readahead() does not provide, and only an error
 + * from this will be final.
 + *
 + * ->readahead() will generally call readahead_page() repeatedly to get
 + * each page from those prepared for read ahead.  It may fail to read a
 + * page by:
 + *
 + * * not calling readahead_page() sufficiently many times, effectively
 + *   ignoring some pages, as might be appropriate if the path to
 + *   storage is congested.
 + *
 + * * failing to actually submit a read request for a given page,
 + *   possibly due to insufficient resources, or
 + *
 + * * getting an error during subsequent processing of a request.
 + *
 + * In the last two cases, the page should be unlocked to indicate that
 + * the read attempt has failed.  In the first case the page will be
 + * unlocked by the caller.
 + *
 + * Those pages not in the final ``async_size`` of the request should be
 + * considered to be important and ->readahead() should not fail them due
 + * to congestion or temporary resource unavailability, but should wait
 + * for necessary resources (e.g.  memory or indexing information) to
 + * become available.  Pages in the final ``async_size`` may be
 + * considered less urgent and failure to read them is more acceptable.
 + * In this case it is best to use delete_from_page_cache() to remove the
 + * pages from the page cache as is automatically done for pages that
 + * were not fetched with readahead_page().  This will allow a
 + * subsequent synchronous read ahead request to try them again.  If they
 + * are left in the page cache, then they will be read individually using
 + * ->readpage().
 + *
 + */
 +
  #include <linux/kernel.h>
  #include <linux/dax.h>
  #include <linux/gfp.h>
@@@ -232,17 -127,8 +232,17 @@@ static void read_pages(struct readahead
  
        if (aops->readahead) {
                aops->readahead(rac);
 -              /* Clean up the remaining pages */
 +              /*
 +               * Clean up the remaining pages.  The sizes in ->ra
 +               * may be used to size the next read-ahead, so make sure
 +               * they accurately reflect what happened.
 +               */
                while ((page = readahead_page(rac))) {
 +                      rac->ra->size -= 1;
 +                      if (rac->ra->async_size > 0) {
 +                              rac->ra->async_size -= 1;
 +                              delete_from_page_cache(page);
 +                      }
                        unlock_page(page);
                        put_page(page);
                }
  
        blk_finish_plug(&plug);
  
-       BUG_ON(!list_empty(pages));
+       BUG_ON(pages && !list_empty(pages));
        BUG_ON(readahead_count(rac));
  
  out:
@@@ -361,7 -247,7 +361,7 @@@ EXPORT_SYMBOL_GPL(page_cache_ra_unbound
   * behaviour which would occur if page allocations are causing VM writeback.
   * We really don't want to intermingle reads and writes like that.
   */
- void do_page_cache_ra(struct readahead_control *ractl,
+ static void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
  {
        struct inode *inode = ractl->mapping->host;
@@@ -545,11 -431,103 +545,103 @@@ static int try_context_readahead(struc
        return 1;
  }
  
+ /*
+  * There are some parts of the kernel which assume that PMD entries
+  * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
+  * limit the maximum allocation order to PMD size.  I'm not aware of any
+  * assumptions about maximum order if THP are disabled, but 8 seems like
+  * a good order (that's 1MB if you're using 4kB pages)
+  */
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ #define MAX_PAGECACHE_ORDER   HPAGE_PMD_ORDER
+ #else
+ #define MAX_PAGECACHE_ORDER   8
+ #endif
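
For scale, assuming 4KiB base pages: order 8 is 2^8 pages = 256 * 4KiB = 1MiB per
folio, while HPAGE_PMD_ORDER on x86-64 is 9 (PMD_SHIFT 21 - PAGE_SHIFT 12),
i.e. 2^9 * 4KiB = 2MiB.
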
+ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
+               pgoff_t mark, unsigned int order, gfp_t gfp)
+ {
+       int err;
+       struct folio *folio = filemap_alloc_folio(gfp, order);
+       if (!folio)
+               return -ENOMEM;
+       if (mark - index < (1UL << order))
+               folio_set_readahead(folio);
+       err = filemap_add_folio(ractl->mapping, folio, index, gfp);
+       if (err)
+               folio_put(folio);
+       else
+               ractl->_nr_pages += 1UL << order;
+       return err;
+ }
+ void page_cache_ra_order(struct readahead_control *ractl,
+               struct file_ra_state *ra, unsigned int new_order)
+ {
+       struct address_space *mapping = ractl->mapping;
+       pgoff_t index = readahead_index(ractl);
+       pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+       pgoff_t mark = index + ra->size - ra->async_size;
+       int err = 0;
+       gfp_t gfp = readahead_gfp_mask(mapping);
+       if (!mapping_large_folio_support(mapping) || ra->size < 4)
+               goto fallback;
+       limit = min(limit, index + ra->size - 1);
+       if (new_order < MAX_PAGECACHE_ORDER) {
+               new_order += 2;
+               if (new_order > MAX_PAGECACHE_ORDER)
+                       new_order = MAX_PAGECACHE_ORDER;
+               while ((1 << new_order) > ra->size)
+                       new_order--;
+       }
+       while (index <= limit) {
+               unsigned int order = new_order;
+               /* Align with smaller pages if needed */
+               if (index & ((1UL << order) - 1)) {
+                       order = __ffs(index);
+                       if (order == 1)
+                               order = 0;
+               }
+               /* Don't allocate pages past EOF */
+               while (index + (1UL << order) - 1 > limit) {
+                       if (--order == 1)
+                               order = 0;
+               }
+               err = ra_alloc_folio(ractl, index, mark, order, gfp);
+               if (err)
+                       break;
+               index += 1UL << order;
+       }
+       if (index > limit) {
+               ra->size += index - limit - 1;
+               ra->async_size += index - limit - 1;
+       }
+       read_pages(ractl, NULL, false);
+       /*
+        * If there were already pages in the page cache, then we may have
+        * left some gaps.  Let the regular readahead code take care of this
+        * situation.
+        */
+       if (!err)
+               return;
+ fallback:
+       do_page_cache_ra(ractl, ra->size, ra->async_size);
+ }
  /*
   * A minimal readahead algorithm for trivial sequential/random reads.
   */
  static void ondemand_readahead(struct readahead_control *ractl,
-               bool hit_readahead_marker, unsigned long req_size)
+               struct folio *folio, unsigned long req_size)
  {
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        struct file_ra_state *ra = ractl->ra;
        }
  
        /*
-        * Hit a marked page without valid readahead state.
+        * Hit a marked folio without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
-       if (hit_readahead_marker) {
+       if (folio) {
                pgoff_t start;
  
                rcu_read_lock();
@@@ -662,7 -640,7 +754,7 @@@ readit
        }
  
        ractl->_index = ra->start;
-       do_page_cache_ra(ractl, ra->size, ra->async_size);
+       page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
  }
  
  void page_cache_sync_ra(struct readahead_control *ractl,
        }
  
        /* do read-ahead */
-       ondemand_readahead(ractl, false, req_count);
+       ondemand_readahead(ractl, NULL, req_count);
  }
  EXPORT_SYMBOL_GPL(page_cache_sync_ra);
  
@@@ -709,11 -687,17 +801,11 @@@ void page_cache_async_ra(struct readahe
  
        folio_clear_readahead(folio);
  
 -      /*
 -       * Defer asynchronous read-ahead on IO congestion.
 -       */
 -      if (inode_read_congested(ractl->mapping->host))
 -              return;
 -
        if (blk_cgroup_congested())
                return;
  
        /* do read-ahead */
-       ondemand_readahead(ractl, true, req_count);
+       ondemand_readahead(ractl, folio, req_count);
  }
  EXPORT_SYMBOL_GPL(page_cache_async_ra);
  
diff --combined mm/rmap.c
index 66cb69c1c5dde0ed17e8a9f9b74f846b2bed2d19,8192cb5809bcccf072fd7327c431b4f0d633a751..9bdca9308e2f1c0a2a8ca90d3c63359633cf9ce0
+++ b/mm/rmap.c
@@@ -107,15 -107,15 +107,15 @@@ static inline void anon_vma_free(struc
        VM_BUG_ON(atomic_read(&anon_vma->refcount));
  
        /*
-        * Synchronize against page_lock_anon_vma_read() such that
+        * Synchronize against folio_lock_anon_vma_read() such that
         * we can safely hold the lock without the anon_vma getting
         * freed.
         *
         * Relies on the full mb implied by the atomic_dec_and_test() from
         * put_anon_vma() against the acquire barrier implied by
-        * down_read_trylock() from page_lock_anon_vma_read(). This orders:
+        * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
         *
-        * page_lock_anon_vma_read()    VS      put_anon_vma()
+        * folio_lock_anon_vma_read()   VS      put_anon_vma()
         *   down_read_trylock()                  atomic_dec_and_test()
         *   LOCK                                 MB
         *   atomic_read()                        rwsem_is_locked()
@@@ -168,7 -168,7 +168,7 @@@ static void anon_vma_chain_link(struct 
   * allocate a new one.
   *
   * Anon-vma allocations are very subtle, because we may have
-  * optimistically looked up an anon_vma in page_lock_anon_vma_read()
+  * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
   * and that may actually touch the rwsem even in the newly
   * allocated vma (it depends on RCU to make sure that the
   * anon_vma isn't actually destroyed).
@@@ -526,28 -526,28 +526,28 @@@ out
   * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
   * reference like with page_get_anon_vma() and then block on the mutex.
   */
- struct anon_vma *page_lock_anon_vma_read(struct page *page)
+ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
  {
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
        unsigned long anon_mapping;
  
        rcu_read_lock();
-       anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+       anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
-       if (!page_mapped(page))
+       if (!folio_mapped(folio))
                goto out;
  
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
-                * If the page is still mapped, then this anon_vma is still
+                * If the folio is still mapped, then this anon_vma is still
                 * its anon_vma, and holding the mutex ensures that it will
                 * not go away, see anon_vma_free().
                 */
-               if (!page_mapped(page)) {
+               if (!folio_mapped(folio)) {
                        up_read(&root_anon_vma->rwsem);
                        anon_vma = NULL;
                }
                goto out;
        }
  
-       if (!page_mapped(page)) {
+       if (!folio_mapped(folio)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
@@@ -737,8 -737,9 +737,9 @@@ static bool should_defer_flush(struct m
   */
  unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  {
-       if (PageAnon(page)) {
-               struct anon_vma *page__anon_vma = page_anon_vma(page);
+       struct folio *folio = page_folio(page);
+       if (folio_test_anon(folio)) {
+               struct anon_vma *page__anon_vma = folio_anon_vma(folio);
                /*
                 * Note: swapoff's unuse_vma() is more efficient with this
                 * check, and needs it to match anon_vma when KSM is active.
                        return -EFAULT;
        } else if (!vma->vm_file) {
                return -EFAULT;
-       } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
+       } else if (vma->vm_file->f_mapping != folio->mapping) {
                return -EFAULT;
        }
  
@@@ -789,30 -790,29 +790,29 @@@ out
        return pmd;
  }
  
- struct page_referenced_arg {
+ struct folio_referenced_arg {
        int mapcount;
        int referenced;
        unsigned long vm_flags;
        struct mem_cgroup *memcg;
  };
  /*
-  * arg: page_referenced_arg will be passed
+  * arg: folio_referenced_arg will be passed
   */
- static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
-                       unsigned long address, void *arg)
+ static bool folio_referenced_one(struct folio *folio,
+               struct vm_area_struct *vma, unsigned long address, void *arg)
  {
-       struct page_referenced_arg *pra = arg;
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = address,
-       };
+       struct folio_referenced_arg *pra = arg;
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        int referenced = 0;
  
        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;
  
-               if (vma->vm_flags & VM_LOCKED) {
+               if ((vma->vm_flags & VM_LOCKED) &&
+                   (!folio_test_large(folio) || !pvmw.pte)) {
+                       /* Restore the mlock which got missed */
+                       mlock_vma_folio(folio, vma, !pvmw.pte);
                        page_vma_mapped_walk_done(&pvmw);
                        pra->vm_flags |= VM_LOCKED;
                        return false; /* To break the loop */
                                /*
                                 * Don't treat a reference through
                                 * a sequentially read mapping as such.
-                                * If the page has been used in another mapping,
+                                * If the folio has been used in another mapping,
                                 * we will catch it; if this other mapping is
                                 * already gone, the unmap path will have set
-                                * PG_referenced or activated the page.
+                                * the referenced flag or activated the folio.
                                 */
                                if (likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        referenced++;
                                                pvmw.pmd))
                                referenced++;
                } else {
-                       /* unexpected pmd-mapped page? */
+                       /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
                }
  
        }
  
        if (referenced)
-               clear_page_idle(page);
-       if (test_and_clear_page_young(page))
+               folio_clear_idle(folio);
+       if (folio_test_clear_young(folio))
                referenced++;
  
        if (referenced) {
                pra->referenced++;
-               pra->vm_flags |= vma->vm_flags;
+               pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
        }
  
        if (!pra->mapcount)
        return true;
  }
  
- static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
  {
-       struct page_referenced_arg *pra = arg;
+       struct folio_referenced_arg *pra = arg;
        struct mem_cgroup *memcg = pra->memcg;
  
        if (!mm_match_cgroup(vma->vm_mm, memcg))
  }
  
  /**
-  * page_referenced - test if the page was referenced
-  * @page: the page to test
-  * @is_locked: caller holds lock on the page
+  * folio_referenced() - Test if the folio was referenced.
+  * @folio: The folio to test.
+  * @is_locked: Caller holds lock on the folio.
   * @memcg: target memory cgroup
-  * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+  * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
   *
-  * Quick test_and_clear_referenced for all mappings to a page,
-  * returns the number of ptes which referenced the page.
+  * Quick test_and_clear_referenced for all mappings of a folio.
+  *
+  * Return: The number of mappings which referenced the folio.
   */
- int page_referenced(struct page *page,
-                   int is_locked,
-                   struct mem_cgroup *memcg,
-                   unsigned long *vm_flags)
+ int folio_referenced(struct folio *folio, int is_locked,
+                    struct mem_cgroup *memcg, unsigned long *vm_flags)
  {
        int we_locked = 0;
-       struct page_referenced_arg pra = {
-               .mapcount = total_mapcount(page),
+       struct folio_referenced_arg pra = {
+               .mapcount = folio_mapcount(folio),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
-               .rmap_one = page_referenced_one,
+               .rmap_one = folio_referenced_one,
                .arg = (void *)&pra,
-               .anon_lock = page_lock_anon_vma_read,
+               .anon_lock = folio_lock_anon_vma_read,
        };
  
        *vm_flags = 0;
        if (!pra.mapcount)
                return 0;
  
-       if (!page_rmapping(page))
+       if (!folio_raw_mapping(folio))
                return 0;
  
-       if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
-               we_locked = trylock_page(page);
+       if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
+               we_locked = folio_trylock(folio);
                if (!we_locked)
                        return 1;
        }
         * cgroups
         */
        if (memcg) {
-               rwc.invalid_vma = invalid_page_referenced_vma;
+               rwc.invalid_vma = invalid_folio_referenced_vma;
        }
  
-       rmap_walk(page, &rwc);
+       rmap_walk(folio, &rwc);
        *vm_flags = pra.vm_flags;
  
        if (we_locked)
-               unlock_page(page);
+               folio_unlock(folio);
  
        return pra.referenced;
  }
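
For orientation, folio_referenced() keeps the page_referenced() calling convention, just typed on folios and with VM_LOCKED masked out of the returned vm_flags. A minimal sketch of a caller under that signature; the helper name and its reclaim framing are hypothetical, not taken from this commit:

#include <linux/memcontrol.h>
#include <linux/rmap.h>

/* Hypothetical helper: was this folio referenced by any of its mappings? */
static bool folio_was_referenced(struct folio *folio, struct mem_cgroup *memcg)
{
        unsigned long vm_flags;
        int refs;

        /* is_locked = 1: this sketch assumes the caller holds the folio lock. */
        refs = folio_referenced(folio, 1, memcg, &vm_flags);

        /* vm_flags now excludes VM_LOCKED, per the change above. */
        return refs > 0;
}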
  
- static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
                            unsigned long address, void *arg)
  {
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = address,
-               .flags = PVMW_SYNC,
-       };
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
        struct mmu_notifier_range range;
        int *cleaned = arg;
  
        /*
         * We have to assume the worst case, i.e. pmd, for invalidation. Note that
-        * the page can not be free from this function.
+        * the folio cannot be freed from this function.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                0, vma, vma->vm_mm, address,
-                               vma_address_end(page, vma));
+                               vma_address_end(&pvmw));
        mmu_notifier_invalidate_range_start(&range);
  
        while (page_vma_mapped_walk(&pvmw)) {
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;
  
-                       flush_cache_page(vma, address, page_to_pfn(page));
+                       flush_cache_page(vma, address, folio_pfn(folio));
                        entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
                        set_pmd_at(vma->vm_mm, address, pmd, entry);
                        ret = 1;
  #else
-                       /* unexpected pmd-mapped page? */
+                       /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
  #endif
                }
@@@ -1029,7 -1023,7 +1023,7 @@@ int folio_mkclean(struct folio *folio
        if (!mapping)
                return 0;
  
-       rmap_walk(&folio->page, &rwc);
+       rmap_walk(folio, &rwc);
  
        return cleaned;
  }
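
The conversion repeated throughout this file replaces the open-coded page_vma_mapped_walk initialiser with DEFINE_FOLIO_VMA_WALK; schematically, based only on the hunks above:

/* Before: the walk was keyed by a struct page. */
struct page_vma_mapped_walk pvmw = {
        .page = page,
        .vma = vma,
        .address = address,
        .flags = PVMW_SYNC,
};

/* After: keyed by a folio, with the flags as the final macro argument. */
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);

while (page_vma_mapped_walk(&pvmw)) {
        /* pvmw.address, pvmw.pte and pvmw.pmd are consumed as before. */
}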
@@@ -1057,8 -1051,8 +1051,8 @@@ void page_move_anon_rmap(struct page *p
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        /*
         * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
-        * simultaneously, so a concurrent reader (eg page_referenced()'s
-        * PageAnon()) will not see one without the other.
+        * simultaneously, so a concurrent reader (eg folio_referenced()'s
+        * folio_test_anon()) will not see one without the other.
         */
        WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
  }
@@@ -1108,6 -1102,7 +1102,7 @@@ static void __page_set_anon_rmap(struc
  static void __page_check_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
  {
+       struct folio *folio = page_folio(page);
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to page_add_new_anon_rmap.
         */
-       VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+       VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
+                       folio);
        VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
                       page);
  }
@@@ -1181,17 -1177,17 +1177,17 @@@ void do_page_add_anon_rmap(struct page 
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
        }
  
-       if (unlikely(PageKsm(page))) {
+       if (unlikely(PageKsm(page)))
                unlock_page_memcg(page);
-               return;
-       }
  
        /* address might be in next vma when migration races vma_adjust */
-       if (first)
+       else if (first)
                __page_set_anon_rmap(page, vma, address,
                                flags & RMAP_EXCLUSIVE);
        else
                __page_check_anon_rmap(page, vma, address);
+       mlock_vma_page(page, vma, compound);
  }
  
  /**
@@@ -1216,8 -1212,7 +1212,7 @@@ void page_add_new_anon_rmap(struct pag
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                /* increment count (starts at -1) */
                atomic_set(compound_mapcount_ptr(page), 0);
-               if (hpage_pincount_available(page))
-                       atomic_set(compound_pincount_ptr(page), 0);
+               atomic_set(compound_pincount_ptr(page), 0);
  
                __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
        } else {
  
  /**
   * page_add_file_rmap - add pte mapping to a file page
-  * @page: the page to add the mapping to
-  * @compound: charge the page as compound or small page
+  * @page:     the page to add the mapping to
+  * @vma:      the vm area in which the mapping is added
+  * @compound: charge the page as compound or small page
   *
   * The caller needs to hold the pte lock.
   */
- void page_add_file_rmap(struct page *page, bool compound)
+ void page_add_file_rmap(struct page *page,
+       struct vm_area_struct *vma, bool compound)
  {
        int i, nr = 1;
  
                }
                if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
                        goto out;
 +
 +              /*
 +               * It is racy to ClearPageDoubleMap in page_remove_file_rmap();
 +               * but page lock is held by all page_add_file_rmap() compound
 +               * callers, and SetPageDoubleMap below warns if !PageLocked:
 +               * so here is a place that DoubleMap can be safely cleared.
 +               */
 +              VM_WARN_ON_ONCE(!PageLocked(page));
 +              if (nr == nr_pages && PageDoubleMap(page))
 +                      ClearPageDoubleMap(page);
 +
                if (PageSwapBacked(page))
                        __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
                                                nr_pages);
                                                nr_pages);
        } else {
                if (PageTransCompound(page) && page_mapping(page)) {
-                       struct page *head = compound_head(page);
                        VM_WARN_ON_ONCE(!PageLocked(page));
-                       SetPageDoubleMap(head);
-                       if (PageMlocked(page))
-                               clear_page_mlock(head);
+                       SetPageDoubleMap(compound_head(page));
                }
                if (!atomic_inc_and_test(&page->_mapcount))
                        goto out;
        __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
  out:
        unlock_page_memcg(page);
+       mlock_vma_page(page, vma, compound);
  }
  
  static void page_remove_file_rmap(struct page *page, bool compound)
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
        __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
-       if (unlikely(PageMlocked(page)))
-               clear_page_mlock(page);
  }
  
  static void page_remove_anon_compound_rmap(struct page *page)
                nr = thp_nr_pages(page);
        }
  
-       if (unlikely(PageMlocked(page)))
-               clear_page_mlock(page);
        if (nr)
                __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
  }
  /**
   * page_remove_rmap - take down pte mapping from a page
   * @page:     page to remove mapping from
+  * @vma:      the vm area from which the mapping is removed
   * @compound: uncharge the page as compound or small page
   *
   * The caller needs to hold the pte lock.
   */
- void page_remove_rmap(struct page *page, bool compound)
+ void page_remove_rmap(struct page *page,
+       struct vm_area_struct *vma, bool compound)
  {
        lock_page_memcg(page);
  
         */
        __dec_lruvec_page_state(page, NR_ANON_MAPPED);
  
-       if (unlikely(PageMlocked(page)))
-               clear_page_mlock(page);
        if (PageTransCompound(page))
                deferred_split_huge_page(compound_head(page));
  
         */
  out:
        unlock_page_memcg(page);
+       munlock_vma_page(page, vma, compound);
  }
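
Both the add and remove rmap helpers now take the vma, so mlock/munlock accounting happens at map and unmap time instead of through the removed page_mlock() walk. A before/after sketch of a call site, using only the signatures shown in this diff:

/* Before this series: */
page_add_file_rmap(page, false);
page_remove_rmap(page, false);

/* After: the vma is threaded through for mlock_vma_page()/munlock_vma_page(). */
page_add_file_rmap(page, vma, false);
page_remove_rmap(page, vma, false);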
  
  /*
   * @arg: enum ttu_flags will be passed to this argument
   */
- static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
  {
        struct mm_struct *mm = vma->vm_mm;
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = address,
-       };
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
        bool ret = true;
                pvmw.flags = PVMW_SYNC;
  
        if (flags & TTU_SPLIT_HUGE_PMD)
-               split_huge_pmd_address(vma, address, false, page);
+               split_huge_pmd_address(vma, address, false, folio);
  
        /*
         * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
-        * Note that the page can not be free in this function as call of
-        * try_to_unmap() must hold a reference on the page.
+        * Note that the folio cannot be freed in this function, as the caller
+        * of try_to_unmap() must hold a reference on the folio.
         */
-       range.end = PageKsm(page) ?
-                       address + PAGE_SIZE : vma_address_end(page, vma);
+       range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                address, range.end);
-       if (PageHuge(page)) {
+       if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
        mmu_notifier_invalidate_range_start(&range);
  
        while (page_vma_mapped_walk(&pvmw)) {
+               /* Unexpected PMD-mapped THP? */
+               VM_BUG_ON_FOLIO(!pvmw.pte, folio);
                /*
-                * If the page is mlock()d, we cannot swap it out.
+                * If the folio is in an mlock()d vma, we must not swap it out.
                 */
                if (!(flags & TTU_IGNORE_MLOCK) &&
                    (vma->vm_flags & VM_LOCKED)) {
-                       /*
-                        * PTE-mapped THP are never marked as mlocked: so do
-                        * not set it on a DoubleMap THP, nor on an Anon THP
-                        * (which may still be PTE-mapped after DoubleMap was
-                        * cleared).  But stop unmapping even in those cases.
-                        */
-                       if (!PageTransCompound(page) || (PageHead(page) &&
-                            !PageDoubleMap(page) && !PageAnon(page)))
-                               mlock_vma_page(page);
+                       /* Restore the mlock which got missed */
+                       mlock_vma_folio(folio, vma, false);
                        page_vma_mapped_walk_done(&pvmw);
                        ret = false;
                        break;
                }
  
-               /* Unexpected PMD-mapped THP? */
-               VM_BUG_ON_PAGE(!pvmw.pte, page);
-               subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+               subpage = folio_page(folio,
+                                       pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
  
-               if (PageHuge(page) && !PageAnon(page)) {
+               if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                if (should_defer_flush(mm, flags)) {
                        /*
                         * We clear the PTE but do not flush so potentially
-                        * a remote CPU could still be writing to the page.
+                        * a remote CPU could still be writing to the folio.
                         * If the entry was previously clean then the
                         * architecture must guarantee that a clear->dirty
                         * transition on a cached TLB entry is written through
                        pteval = ptep_clear_flush(vma, address, pvmw.pte);
                }
  
-               /* Move the dirty bit to the page. Now the pte is gone. */
+               /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
-                       set_page_dirty(page);
+                       folio_mark_dirty(folio);
  
                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);
  
                if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
-                       if (PageHuge(page)) {
-                               hugetlb_count_sub(compound_nr(page), mm);
+                       if (folio_test_hugetlb(folio)) {
+                               hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_swap_pte_at(mm, address,
                                                     pvmw.pte, pteval,
                                                     vma_mmu_pagesize(vma));
                        } else {
-                               dec_mm_counter(mm, mm_counter(page));
+                               dec_mm_counter(mm, mm_counter(&folio->page));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
  
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
-                       dec_mm_counter(mm, mm_counter(page));
+                       dec_mm_counter(mm, mm_counter(&folio->page));
                        /* We have to invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
-               } else if (PageAnon(page)) {
+               } else if (folio_test_anon(folio)) {
                        swp_entry_t entry = { .val = page_private(subpage) };
                        pte_t swp_pte;
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
-                       if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+                       if (unlikely(folio_test_swapbacked(folio) !=
+                                       folio_test_swapcache(folio))) {
                                WARN_ON_ONCE(1);
                                ret = false;
                                /* We have to invalidate as we cleared the pte */
                        }
  
                        /* MADV_FREE page check */
-                       if (!PageSwapBacked(page)) {
-                               if (!PageDirty(page)) {
+                       if (!folio_test_swapbacked(folio)) {
+                               if (!folio_test_dirty(folio)) {
                                        /* Invalidate as we cleared the pte */
                                        mmu_notifier_invalidate_range(mm,
                                                address, address + PAGE_SIZE);
                                }
  
                                /*
-                                * If the page was redirtied, it cannot be
+                                * If the folio was redirtied, it cannot be
                                 * discarded. Remap the page to the page table.
                                 */
                                set_pte_at(mm, address, pvmw.pte, pteval);
-                               SetPageSwapBacked(page);
+                               folio_set_swapbacked(folio);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                                                      address + PAGE_SIZE);
                } else {
                        /*
-                        * This is a locked file-backed page, thus it cannot
-                        * be removed from the page cache and replaced by a new
-                        * page before mmu_notifier_invalidate_range_end, so no
-                        * concurrent thread might update its page table to
-                        * point at new page while a device still is using this
-                        * page.
+                        * This is a locked file-backed folio,
+                        * so it cannot be removed from the page
+                        * cache and replaced by a new folio before
+                        * mmu_notifier_invalidate_range_end, so no
+                        * concurrent thread might update its page table
+                        * to point at a new folio while a device is
+                        * still using this folio.
                         *
                         * See Documentation/vm/mmu_notifier.rst
                         */
-                       dec_mm_counter(mm, mm_counter_file(page));
+                       dec_mm_counter(mm, mm_counter_file(&folio->page));
                }
  discard:
                /*
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
-               page_remove_rmap(subpage, PageHuge(page));
-               put_page(page);
+               page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+               if (vma->vm_flags & VM_LOCKED)
+                       mlock_page_drain(smp_processor_id());
+               folio_put(folio);
        }
  
        mmu_notifier_invalidate_range_end(&range);
@@@ -1693,35 -1664,35 +1675,35 @@@ static bool invalid_migration_vma(struc
        return vma_is_temporary_stack(vma);
  }
  
- static int page_not_mapped(struct page *page)
+ static int page_not_mapped(struct folio *folio)
  {
-       return !page_mapped(page);
+       return !folio_mapped(folio);
  }
  
  /**
-  * try_to_unmap - try to remove all page table mappings to a page
-  * @page: the page to get unmapped
+  * try_to_unmap - Try to remove all page table mappings to a folio.
+  * @folio: The folio to unmap.
   * @flags: action and flags
   *
   * Tries to remove all the page table entries which are mapping this
-  * page, used in the pageout path.  Caller must hold the page lock.
+  * folio.  It is the caller's responsibility to check if the folio is
+  * still mapped if needed (use TTU_SYNC to prevent accounting races).
   *
-  * It is the caller's responsibility to check if the page is still
-  * mapped when needed (use TTU_SYNC to prevent accounting races).
+  * Context: Caller must hold the folio lock.
   */
- void try_to_unmap(struct page *page, enum ttu_flags flags)
+ void try_to_unmap(struct folio *folio, enum ttu_flags flags)
  {
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
                .arg = (void *)flags,
                .done = page_not_mapped,
-               .anon_lock = page_lock_anon_vma_read,
+               .anon_lock = folio_lock_anon_vma_read,
        };
  
        if (flags & TTU_RMAP_LOCKED)
-               rmap_walk_locked(page, &rwc);
+               rmap_walk_locked(folio, &rwc);
        else
-               rmap_walk(page, &rwc);
+               rmap_walk(folio, &rwc);
  }
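
Per the kernel-doc above, try_to_unmap() returns void and the caller checks the result itself, using TTU_SYNC when the answer must be race-free. A minimal sketch under those rules; the wrapper function is hypothetical:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Hypothetical pageout-style wrapper; the folio lock must already be held. */
static bool unmap_folio_for_reclaim(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* TTU_SYNC closes the accounting race mentioned in the kernel-doc. */
        try_to_unmap(folio, TTU_SYNC);

        /* No return value from try_to_unmap(); check the mapcount directly. */
        return !folio_mapped(folio);
}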
  
  /*
   * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
   * containing migration entries.
   */
- static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
+ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
  {
        struct mm_struct *mm = vma->vm_mm;
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = address,
-       };
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        pte_t pteval;
        struct page *subpage;
        bool ret = true;
         * TTU_SPLIT_HUGE_PMD and it wants to freeze.
         */
        if (flags & TTU_SPLIT_HUGE_PMD)
-               split_huge_pmd_address(vma, address, true, page);
+               split_huge_pmd_address(vma, address, true, folio);
  
        /*
         * For THP, we have to assume the worst case, i.e. pmd, for invalidation.
         * Note that the page cannot be freed in this function, as the caller
         * of try_to_unmap() must hold a reference on the page.
         */
-       range.end = PageKsm(page) ?
-                       address + PAGE_SIZE : vma_address_end(page, vma);
+       range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                address, range.end);
-       if (PageHuge(page)) {
+       if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
-                       VM_BUG_ON_PAGE(PageHuge(page) ||
-                                      !PageTransCompound(page), page);
+                       subpage = folio_page(folio,
+                               pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+                       VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
+                                       !folio_test_pmd_mappable(folio), folio);
  
-                       set_pmd_migration_entry(&pvmw, page);
+                       set_pmd_migration_entry(&pvmw, subpage);
                        continue;
                }
  #endif
  
                /* Unexpected PMD-mapped THP? */
-               VM_BUG_ON_PAGE(!pvmw.pte, page);
+               VM_BUG_ON_FOLIO(!pvmw.pte, folio);
  
-               subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+               subpage = folio_page(folio,
+                               pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
  
-               if (PageHuge(page) && !PageAnon(page)) {
+               if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
                pteval = ptep_clear_flush(vma, address, pvmw.pte);
  
-               /* Move the dirty bit to the page. Now the pte is gone. */
+               /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
-                       set_page_dirty(page);
+                       folio_mark_dirty(folio);
  
                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);
  
-               if (is_zone_device_page(page)) {
-                       unsigned long pfn = page_to_pfn(page);
+               if (folio_is_zone_device(folio)) {
+                       unsigned long pfn = folio_pfn(folio);
                        swp_entry_t entry;
                        pte_t swp_pte;
  
                         * changed when hugepage migrations to device private
                         * memory are supported.
                         */
-                       subpage = page;
+                       subpage = &folio->page;
                } else if (PageHWPoison(subpage)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
-                       if (PageHuge(page)) {
-                               hugetlb_count_sub(compound_nr(page), mm);
+                       if (folio_test_hugetlb(folio)) {
+                               hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_swap_pte_at(mm, address,
                                                     pvmw.pte, pteval,
                                                     vma_mmu_pagesize(vma));
                        } else {
-                               dec_mm_counter(mm, mm_counter(page));
+                               dec_mm_counter(mm, mm_counter(&folio->page));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
  
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
-                       dec_mm_counter(mm, mm_counter(page));
+                       dec_mm_counter(mm, mm_counter(&folio->page));
                        /* We have to invalidate as we cleared the pte */
                        mmu_notifier_invalidate_range(mm, address,
                                                      address + PAGE_SIZE);
                 *
                 * See Documentation/vm/mmu_notifier.rst
                 */
-               page_remove_rmap(subpage, PageHuge(page));
-               put_page(page);
+               page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+               if (vma->vm_flags & VM_LOCKED)
+                       mlock_page_drain(smp_processor_id());
+               folio_put(folio);
        }
  
        mmu_notifier_invalidate_range_end(&range);
  
  /**
   * try_to_migrate - try to replace all page table mappings with swap entries
-  * @page: the page to replace page table entries for
+  * @folio: the folio to replace page table entries for
   * @flags: action and flags
   *
-  * Tries to remove all the page table entries which are mapping this page and
-  * replace them with special swap entries. Caller must hold the page lock.
+  * Tries to remove all the page table entries which are mapping this folio and
+  * replace them with special swap entries. Caller must hold the folio lock.
   */
- void try_to_migrate(struct page *page, enum ttu_flags flags)
+ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
  {
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_migrate_one,
                .arg = (void *)flags,
                .done = page_not_mapped,
-               .anon_lock = page_lock_anon_vma_read,
+               .anon_lock = folio_lock_anon_vma_read,
        };
  
        /*
                                        TTU_SYNC)))
                return;
  
-       if (is_zone_device_page(page) && !is_device_private_page(page))
+       if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
                return;
  
        /*
         * locking requirements of exec(), migration skips
         * temporary VMAs until after exec() completes.
         */
-       if (!PageKsm(page) && PageAnon(page))
+       if (!folio_test_ksm(folio) && folio_test_anon(folio))
                rwc.invalid_vma = invalid_migration_vma;
  
        if (flags & TTU_RMAP_LOCKED)
-               rmap_walk_locked(page, &rwc);
+               rmap_walk_locked(folio, &rwc);
        else
-               rmap_walk(page, &rwc);
- }
- /*
-  * Walks the vma's mapping a page and mlocks the page if any locked vma's are
-  * found. Once one is found the page is locked and the scan can be terminated.
-  */
- static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
-                                unsigned long address, void *unused)
- {
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = address,
-       };
-       /* An un-locked vma doesn't have any pages to lock, continue the scan */
-       if (!(vma->vm_flags & VM_LOCKED))
-               return true;
-       while (page_vma_mapped_walk(&pvmw)) {
-               /*
-                * Need to recheck under the ptl to serialise with
-                * __munlock_pagevec_fill() after VM_LOCKED is cleared in
-                * munlock_vma_pages_range().
-                */
-               if (vma->vm_flags & VM_LOCKED) {
-                       /*
-                        * PTE-mapped THP are never marked as mlocked; but
-                        * this function is never called on a DoubleMap THP,
-                        * nor on an Anon THP (which may still be PTE-mapped
-                        * after DoubleMap was cleared).
-                        */
-                       mlock_vma_page(page);
-                       /*
-                        * No need to scan further once the page is marked
-                        * as mlocked.
-                        */
-                       page_vma_mapped_walk_done(&pvmw);
-                       return false;
-               }
-       }
-       return true;
- }
- /**
-  * page_mlock - try to mlock a page
-  * @page: the page to be mlocked
-  *
-  * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
-  * the page if any are found. The page will be returned with PG_mlocked cleared
-  * if it is not mapped by any locked vmas.
-  */
- void page_mlock(struct page *page)
- {
-       struct rmap_walk_control rwc = {
-               .rmap_one = page_mlock_one,
-               .done = page_not_mapped,
-               .anon_lock = page_lock_anon_vma_read,
-       };
-       VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
-       VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
-       /* Anon THP are only marked as mlocked when singly mapped */
-       if (PageTransCompound(page) && PageAnon(page))
-               return;
-       rmap_walk(page, &rwc);
+               rmap_walk(folio, &rwc);
  }
  
  #ifdef CONFIG_DEVICE_PRIVATE
@@@ -2085,15 -1986,11 +1997,11 @@@ struct make_exclusive_args 
        bool valid;
  };
  
- static bool page_make_device_exclusive_one(struct page *page,
+ static bool page_make_device_exclusive_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *priv)
  {
        struct mm_struct *mm = vma->vm_mm;
-       struct page_vma_mapped_walk pvmw = {
-               .page = page,
-               .vma = vma,
-               .address = address,
-       };
+       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        struct make_exclusive_args *args = priv;
        pte_t pteval;
        struct page *subpage;
  
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
                                      vma->vm_mm, address, min(vma->vm_end,
-                                     address + page_size(page)), args->owner);
+                                     address + folio_size(folio)),
+                                     args->owner);
        mmu_notifier_invalidate_range_start(&range);
  
        while (page_vma_mapped_walk(&pvmw)) {
                /* Unexpected PMD-mapped THP? */
-               VM_BUG_ON_PAGE(!pvmw.pte, page);
+               VM_BUG_ON_FOLIO(!pvmw.pte, folio);
  
                if (!pte_present(*pvmw.pte)) {
                        ret = false;
                        break;
                }
  
-               subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+               subpage = folio_page(folio,
+                               pte_pfn(*pvmw.pte) - folio_pfn(folio));
                address = pvmw.address;
  
                /* Nuke the page table entry. */
                flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
                pteval = ptep_clear_flush(vma, address, pvmw.pte);
  
-               /* Move the dirty bit to the page. Now the pte is gone. */
+               /* Set the dirty flag on the folio now the pte is gone. */
                if (pte_dirty(pteval))
-                       set_page_dirty(page);
+                       folio_mark_dirty(folio);
  
                /*
                 * Check that our target page is still mapped at the expected
                 * There is a reference on the page for the swap entry which has
                 * been removed, so shouldn't take another.
                 */
-               page_remove_rmap(subpage, false);
+               page_remove_rmap(subpage, vma, false);
        }
  
        mmu_notifier_invalidate_range_end(&range);
  }
  
  /**
-  * page_make_device_exclusive - mark the page exclusively owned by a device
-  * @page: the page to replace page table entries for
-  * @mm: the mm_struct where the page is expected to be mapped
-  * @address: address where the page is expected to be mapped
+  * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
+  * @folio: The folio to replace page table entries for.
+  * @mm: The mm_struct where the folio is expected to be mapped.
+  * @address: Address where the folio is expected to be mapped.
   * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
   *
-  * Tries to remove all the page table entries which are mapping this page and
-  * replace them with special device exclusive swap entries to grant a device
-  * exclusive access to the page. Caller must hold the page lock.
+  * Tries to remove all the page table entries which are mapping this
+  * folio and replace them with special device exclusive swap entries to
+  * grant a device exclusive access to the folio.
   *
-  * Returns false if the page is still mapped, or if it could not be unmapped
+  * Context: Caller must hold the folio lock.
+  * Return: false if the page is still mapped, or if it could not be unmapped
   * from the expected address. Otherwise returns true (success).
   */
- static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
-                               unsigned long address, void *owner)
+ static bool folio_make_device_exclusive(struct folio *folio,
+               struct mm_struct *mm, unsigned long address, void *owner)
  {
        struct make_exclusive_args args = {
                .mm = mm,
        struct rmap_walk_control rwc = {
                .rmap_one = page_make_device_exclusive_one,
                .done = page_not_mapped,
-               .anon_lock = page_lock_anon_vma_read,
+               .anon_lock = folio_lock_anon_vma_read,
                .arg = &args,
        };
  
        /*
-        * Restrict to anonymous pages for now to avoid potential writeback
-        * issues. Also tail pages shouldn't be passed to rmap_walk so skip
-        * those.
+        * Restrict to anonymous folios for now to avoid potential writeback
+        * issues.
         */
-       if (!PageAnon(page) || PageTail(page))
+       if (!folio_test_anon(folio))
                return false;
  
-       rmap_walk(page, &rwc);
+       rmap_walk(folio, &rwc);
  
-       return args.valid && !page_mapcount(page);
+       return args.valid && !folio_mapcount(folio);
  }
  
  /**
@@@ -2245,15 -2144,16 +2155,16 @@@ int make_device_exclusive_range(struct 
                return npages;
  
        for (i = 0; i < npages; i++, start += PAGE_SIZE) {
-               if (!trylock_page(pages[i])) {
-                       put_page(pages[i]);
+               struct folio *folio = page_folio(pages[i]);
+               if (PageTail(pages[i]) || !folio_trylock(folio)) {
+                       folio_put(folio);
                        pages[i] = NULL;
                        continue;
                }
  
-               if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
-                       unlock_page(pages[i]);
-                       put_page(pages[i]);
+               if (!folio_make_device_exclusive(folio, mm, start, owner)) {
+                       folio_unlock(folio);
+                       folio_put(folio);
                        pages[i] = NULL;
                }
        }
@@@ -2272,21 -2172,21 +2183,21 @@@ void __put_anon_vma(struct anon_vma *an
                anon_vma_free(root);
  }
  
- static struct anon_vma *rmap_walk_anon_lock(struct page *page,
-                                       struct rmap_walk_control *rwc)
+ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+                                       const struct rmap_walk_control *rwc)
  {
        struct anon_vma *anon_vma;
  
        if (rwc->anon_lock)
-               return rwc->anon_lock(page);
+               return rwc->anon_lock(folio);
  
        /*
-        * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
+        * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
         * because that depends on page_mapped(); but not all its usages
         * are holding mmap_lock. Users without mmap_lock are required to
         * take a reference count to prevent the anon_vma disappearing
         */
-       anon_vma = page_anon_vma(page);
+       anon_vma = folio_anon_vma(folio);
        if (!anon_vma)
                return NULL;
  
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the anon_vma struct it points to.
-  *
-  * When called from page_mlock(), the mmap_lock of the mm containing the vma
-  * where the page was found will be held for write.  So, we won't recheck
-  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
-  * LOCKED.
   */
- static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
-               bool locked)
+ static void rmap_walk_anon(struct folio *folio,
+               const struct rmap_walk_control *rwc, bool locked)
  {
        struct anon_vma *anon_vma;
        pgoff_t pgoff_start, pgoff_end;
        struct anon_vma_chain *avc;
  
        if (locked) {
-               anon_vma = page_anon_vma(page);
+               anon_vma = folio_anon_vma(folio);
                /* anon_vma disappear under us? */
-               VM_BUG_ON_PAGE(!anon_vma, page);
+               VM_BUG_ON_FOLIO(!anon_vma, folio);
        } else {
-               anon_vma = rmap_walk_anon_lock(page, rwc);
+               anon_vma = rmap_walk_anon_lock(folio, rwc);
        }
        if (!anon_vma)
                return;
  
-       pgoff_start = page_to_pgoff(page);
-       pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+       pgoff_start = folio_pgoff(folio);
+       pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
                        pgoff_start, pgoff_end) {
                struct vm_area_struct *vma = avc->vma;
-               unsigned long address = vma_address(page, vma);
+               unsigned long address = vma_address(&folio->page, vma);
  
                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
  
-               if (!rwc->rmap_one(page, vma, address, rwc->arg))
+               if (!rwc->rmap_one(folio, vma, address, rwc->arg))
                        break;
-               if (rwc->done && rwc->done(page))
+               if (rwc->done && rwc->done(folio))
                        break;
        }
  
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the address_space struct it points to.
-  *
-  * When called from page_mlock(), the mmap_lock of the mm containing the vma
-  * where the page was found will be held for write.  So, we won't recheck
-  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
-  * LOCKED.
   */
- static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
-               bool locked)
+ static void rmap_walk_file(struct folio *folio,
+               const struct rmap_walk_control *rwc, bool locked)
  {
-       struct address_space *mapping = page_mapping(page);
+       struct address_space *mapping = folio_mapping(folio);
        pgoff_t pgoff_start, pgoff_end;
        struct vm_area_struct *vma;
  
         * structure at mapping cannot be freed and reused yet,
         * so we can safely take mapping->i_mmap_rwsem.
         */
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  
        if (!mapping)
                return;
  
-       pgoff_start = page_to_pgoff(page);
-       pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+       pgoff_start = folio_pgoff(folio);
+       pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
        if (!locked)
                i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap,
                        pgoff_start, pgoff_end) {
-               unsigned long address = vma_address(page, vma);
+               unsigned long address = vma_address(&folio->page, vma);
  
                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
  
-               if (!rwc->rmap_one(page, vma, address, rwc->arg))
+               if (!rwc->rmap_one(folio, vma, address, rwc->arg))
                        goto done;
-               if (rwc->done && rwc->done(page))
+               if (rwc->done && rwc->done(folio))
                        goto done;
        }
  
@@@ -2404,25 -2294,25 +2305,25 @@@ done
                i_mmap_unlock_read(mapping);
  }
  
- void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+ void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
  {
-       if (unlikely(PageKsm(page)))
-               rmap_walk_ksm(page, rwc);
-       else if (PageAnon(page))
-               rmap_walk_anon(page, rwc, false);
+       if (unlikely(folio_test_ksm(folio)))
+               rmap_walk_ksm(folio, rwc);
+       else if (folio_test_anon(folio))
+               rmap_walk_anon(folio, rwc, false);
        else
-               rmap_walk_file(page, rwc, false);
+               rmap_walk_file(folio, rwc, false);
  }
  
  /* Like rmap_walk, but caller holds relevant rmap lock */
- void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+ void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
  {
        /* no ksm support for now */
-       VM_BUG_ON_PAGE(PageKsm(page), page);
-       if (PageAnon(page))
-               rmap_walk_anon(page, rwc, true);
+       VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
+       if (folio_test_anon(folio))
+               rmap_walk_anon(folio, rwc, true);
        else
-               rmap_walk_file(page, rwc, true);
+               rmap_walk_file(folio, rwc, true);
  }
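
With rmap_walk() taking a folio and a const rmap_walk_control, a client mirrors the initialisers earlier in this file. A sketch with a placeholder callback; the function names are illustrative, not part of this commit:

#include <linux/rmap.h>

/* Placeholder callback: count the vmas in which the folio is mapped. */
static bool count_one_mapping(struct folio *folio, struct vm_area_struct *vma,
                              unsigned long address, void *arg)
{
        int *count = arg;

        (*count)++;
        return true;            /* true: keep walking the remaining vmas */
}

/* The folio must be locked, as rmap_walk_file() asserts above. */
static int count_folio_mappings(struct folio *folio)
{
        int count = 0;
        struct rmap_walk_control rwc = {
                .rmap_one = count_one_mapping,
                .arg = &count,
        };

        rmap_walk(folio, &rwc);
        return count;
}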
  
  #ifdef CONFIG_HUGETLB_PAGE
@@@ -2450,8 -2340,7 +2351,7 @@@ void hugepage_add_new_anon_rmap(struct 
  {
        BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        atomic_set(compound_mapcount_ptr(page), 0);
-       if (hpage_pincount_available(page))
-               atomic_set(compound_pincount_ptr(page), 0);
+       atomic_set(compound_pincount_ptr(page), 0);
  
        __page_set_anon_rmap(page, vma, address, 1);
  }
diff --combined mm/swap.c
index 754520bab29998c83be96f626e1c1badd5042351,65ec5cbab78b21ad4b423d5d995cd866eef0c046..5b30045207e16e23d333ac57ddabab3846f53b04
+++ b/mm/swap.c
@@@ -74,8 -74,8 +74,8 @@@ static DEFINE_PER_CPU(struct lru_pvecs
  };
  
  /*
-  * This path almost never happens for VM activity - pages are normally
-  * freed via pagevecs.  But it gets used by networking.
+  * This path almost never happens for VM activity - pages are normally freed
+  * via pagevecs.  But it gets used by networking - and for compound pages.
   */
  static void __page_cache_release(struct page *page)
  {
                __clear_page_lru_flags(page);
                unlock_page_lruvec_irqrestore(lruvec, flags);
        }
+       /* See comment on PageMlocked in release_pages() */
+       if (unlikely(PageMlocked(page))) {
+               int nr_pages = thp_nr_pages(page);
+               __ClearPageMlocked(page);
+               mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+               count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+       }
        __ClearPageWaiters(page);
  }
  
@@@ -114,17 -122,9 +122,9 @@@ static void __put_compound_page(struct 
  
  void __put_page(struct page *page)
  {
-       if (is_zone_device_page(page)) {
-               put_dev_pagemap(page->pgmap);
-               /*
-                * The page belongs to the device that created pgmap. Do
-                * not return it to page allocator.
-                */
-               return;
-       }
-       if (unlikely(PageCompound(page)))
+       if (unlikely(is_zone_device_page(page)))
+               free_zone_device_page(page);
+       else if (unlikely(PageCompound(page)))
                __put_compound_page(page);
        else
                __put_single_page(page);
@@@ -425,7 -425,7 +425,7 @@@ void folio_mark_accessed(struct folio *
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
 -               * evictable page accessed has no effect.
 +               * unevictable page accessed has no effect.
                 */
        } else if (!folio_test_active(folio)) {
                /*
@@@ -482,22 -482,12 +482,12 @@@ EXPORT_SYMBOL(folio_add_lru)
  void lru_cache_add_inactive_or_unevictable(struct page *page,
                                         struct vm_area_struct *vma)
  {
-       bool unevictable;
        VM_BUG_ON_PAGE(PageLRU(page), page);
  
-       unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
-       if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
-               int nr_pages = thp_nr_pages(page);
-               /*
-                * We use the irq-unsafe __mod_zone_page_state because this
-                * counter is not modified from interrupt context, and the pte
-                * lock is held(spinlock), which implies preemption disabled.
-                */
-               __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
-               count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
-       }
-       lru_cache_add(page);
+       if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
+               mlock_new_page(page);
+       else
+               lru_cache_add(page);
  }
  
  /*
@@@ -636,35 -626,37 +626,37 @@@ void lru_add_drain_cpu(int cpu
                pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
  
        activate_page_drain(cpu);
+       mlock_page_drain(cpu);
  }
  
  /**
-  * deactivate_file_page - forcefully deactivate a file page
-  * @page: page to deactivate
+  * deactivate_file_folio() - Forcefully deactivate a file folio.
+  * @folio: Folio to deactivate.
   *
-  * This function hints the VM that @page is a good reclaim candidate,
-  * for example if its invalidation fails due to the page being dirty
+  * This function hints to the VM that @folio is a good reclaim candidate,
+  * for example if its invalidation fails due to the folio being dirty
   * or under writeback.
+  *
+  * Context: Caller holds a reference on the folio.
   */
- void deactivate_file_page(struct page *page)
+ void deactivate_file_folio(struct folio *folio)
  {
+       struct pagevec *pvec;
        /*
-        * In a workload with many unevictable page such as mprotect,
-        * unevictable page deactivation for accelerating reclaim is pointless.
+        * In a workload with many unevictable pages (such as one using
+        * mprotect), deactivating unevictable folios to accelerate reclaim
+        * is pointless.
         */
-       if (PageUnevictable(page))
+       if (folio_test_unevictable(folio))
                return;
  
-       if (likely(get_page_unless_zero(page))) {
-               struct pagevec *pvec;
-               local_lock(&lru_pvecs.lock);
-               pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
+       folio_get(folio);
+       local_lock(&lru_pvecs.lock);
+       pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
  
-               if (pagevec_add_and_need_flush(pvec, page))
-                       pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
-               local_unlock(&lru_pvecs.lock);
-       }
+       if (pagevec_add_and_need_flush(pvec, &folio->page))
+               pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
+       local_unlock(&lru_pvecs.lock);
  }
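
deactivate_file_folio() now takes its own transient reference with folio_get(), so a caller only has to guarantee the reference named in the Context line above. A hedged sketch of an invalidation-style caller; the helper is hypothetical and header includes are omitted:

/* Hypothetical helper run when invalidating a batch of folios fails. */
static void hint_reclaim_candidates(struct folio **folios, int nr)
{
        int i;

        for (i = 0; i < nr; i++) {
                /* The caller is assumed to hold a reference on each folio. */
                if (folio_test_dirty(folios[i]) ||
                    folio_test_writeback(folios[i]))
                        deactivate_file_folio(folios[i]);
        }
}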
  
  /*
@@@ -831,12 -823,14 +823,13 @@@ inline void __lru_add_drain_all(bool fo
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
  
 -              if (force_all_cpus ||
 -                  pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
 +              if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
                    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
                    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
                    need_activate_page_drain(cpu) ||
+                   need_mlock_page_drain(cpu) ||
                    has_bh_in_lru(cpu, NULL)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
@@@ -875,21 -869,15 +868,21 @@@ atomic_t lru_disable_count = ATOMIC_INI
  void lru_cache_disable(void)
  {
        atomic_inc(&lru_disable_count);
 -#ifdef CONFIG_SMP
        /*
 -       * lru_add_drain_all in the force mode will schedule draining on
 -       * all online CPUs so any calls of lru_cache_disabled wrapped by
 -       * local_lock or preemption disabled would be ordered by that.
 -       * The atomic operation doesn't need to have stronger ordering
 -       * requirements because that is enforced by the scheduling
 -       * guarantees.
 +       * Readers of lru_disable_count are protected by either disabling
 +       * preemption or rcu_read_lock:
 +       *
 +       * preempt_disable, local_irq_disable  [bh_lru_lock()]
 +       * rcu_read_lock                       [rt_spin_lock CONFIG_PREEMPT_RT]
 +       * preempt_disable                     [local_lock !CONFIG_PREEMPT_RT]
 +       *
 +       * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
 +       * preempt_disable() regions of code. So any CPU which sees
 +       * lru_disable_count = 0 will have exited the critical
 +       * section when synchronize_rcu() returns.
         */
 +      synchronize_rcu();
 +#ifdef CONFIG_SMP
        __lru_add_drain_all(true);
  #else
        lru_add_and_bh_lrus_drain();
@@@ -935,18 -923,10 +928,10 @@@ void release_pages(struct page **pages
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
-                       /*
-                        * ZONE_DEVICE pages that return 'false' from
-                        * page_is_devmap_managed() do not require special
-                        * processing, and instead, expect a call to
-                        * put_page_testzero().
-                        */
-                       if (page_is_devmap_managed(page)) {
-                               put_devmap_managed_page(page);
+                       if (put_devmap_managed_page(page))
                                continue;
-                       }
                        if (put_page_testzero(page))
-                               put_dev_pagemap(page->pgmap);
+                               free_zone_device_page(page);
                        continue;
                }
  
                        __clear_page_lru_flags(page);
                }
  
+               /*
+                * In rare cases, when truncation or holepunching raced with
+                * munlock after VM_LOCKED was cleared, Mlocked may still be
+                * found set here.  This does not indicate a problem, unless
+                * "unevictable_pgs_cleared" appears worryingly large.
+                */
+               if (unlikely(PageMlocked(page))) {
+                       __ClearPageMlocked(page);
+                       dec_zone_page_state(page, NR_MLOCK);
+                       count_vm_event(UNEVICTABLE_PGCLEARED);
+               }
                __ClearPageWaiters(page);
  
                list_add(&page->lru, &pages_to_free);
@@@ -1014,43 -1006,32 +1011,32 @@@ static void __pagevec_lru_add_fn(struc
  
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
  
+       folio_set_lru(folio);
        /*
-        * A folio becomes evictable in two ways:
-        * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
-        * 2) Before acquiring LRU lock to put the folio on the correct LRU
-        *    and then
-        *   a) do PageLRU check with lock [check_move_unevictable_pages]
-        *   b) do PageLRU check before lock [clear_page_mlock]
-        *
-        * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
-        * following strict ordering:
-        *
-        * #0: __pagevec_lru_add_fn             #1: clear_page_mlock
+        * Is an smp_mb__after_atomic() still required here, before
+        * folio_evictable() tests PageMlocked, to rule out the possibility
+        * of stranding an evictable folio on an unevictable LRU?  I think
+        * not, because __munlock_page() only clears PageMlocked while the LRU
+        * lock is held.
         *
-        * folio_set_lru()                      folio_test_clear_mlocked()
-        * smp_mb() // explicit ordering        // above provides strict
-        *                                      // ordering
-        * folio_test_mlocked()                 folio_test_lru()
-        *
-        *
-        * if '#1' does not observe setting of PG_lru by '#0' and
-        * fails isolation, the explicit barrier will make sure that
-        * folio_evictable check will put the folio on the correct
-        * LRU. Without smp_mb(), folio_set_lru() can be reordered
-        * after folio_test_mlocked() check and can make '#1' fail the
-        * isolation of the folio whose mlocked bit is cleared (#0 is
-        * also looking at the same folio) and the evictable folio will
-        * be stranded on an unevictable LRU.
+        * (That is not true of __page_cache_release(), and not necessarily
+        * true of release_pages(): but those only clear PageMlocked after
+        * put_page_testzero() has excluded any other users of the page.)
         */
-       folio_set_lru(folio);
-       smp_mb__after_atomic();
        if (folio_evictable(folio)) {
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
                folio_clear_active(folio);
                folio_set_unevictable(folio);
+               /*
+                * folio->mlock_count = !!folio_test_mlocked(folio)?
+                * But that leaves __mlock_page() in doubt whether another
+                * actor has already counted the mlock or not.  Err on the
+                * safe side, underestimate, let page reclaim fix it, rather
+                * than leaving a page on the unevictable LRU indefinitely.
+                */
+               folio->mlock_count = 0;
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }
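
The question raised in the comment above is whether an explicit barrier is still needed between setting the LRU flag and testing Mlocked. The store-buffering race the old smp_mb() closed can be sketched in userspace with relaxed C11 atomics; the counted outcome (each side reading the other's stale value) corresponds to the "stranded on an unevictable LRU" case. All names are illustrative, and whether the outcome reproduces depends on the CPU.

/*
 * Userspace sketch of the store-buffering race that the removed
 * smp_mb__after_atomic() used to close.  Not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int lru, mlocked, r_add, r_clear;

static void *lru_add_side(void *arg)
{
        atomic_store_explicit(&lru, 1, memory_order_relaxed);  /* folio_set_lru() */
        /* old code: smp_mb__after_atomic() here */
        atomic_store_explicit(&r_add,
                atomic_load_explicit(&mlocked, memory_order_relaxed),
                memory_order_relaxed);                          /* folio_test_mlocked() */
        return NULL;
}

static void *clear_mlock_side(void *arg)
{
        atomic_store_explicit(&mlocked, 0, memory_order_relaxed); /* clear Mlocked */
        atomic_store_explicit(&r_clear,
                atomic_load_explicit(&lru, memory_order_relaxed),
                memory_order_relaxed);                          /* folio_test_lru() */
        return NULL;
}

int main(void)
{
        long stranded = 0;

        for (int i = 0; i < 100000; i++) {
                pthread_t a, b;

                atomic_store(&lru, 0);
                atomic_store(&mlocked, 1);
                pthread_create(&a, NULL, lru_add_side, NULL);
                pthread_create(&b, NULL, clear_mlock_side, NULL);
                pthread_join(a, NULL);
                pthread_join(b, NULL);
                /* both sides saw the other's old value */
                if (atomic_load(&r_add) == 1 && atomic_load(&r_clear) == 0)
                        stranded++;
        }
        printf("stranded outcomes: %ld\n", stranded);
        return 0;
}
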
@@@ -1158,26 -1139,3 +1144,3 @@@ void __init swap_setup(void
         * _really_ don't want to cluster much more
         */
  }
- #ifdef CONFIG_DEV_PAGEMAP_OPS
- void put_devmap_managed_page(struct page *page)
- {
-       int count;
-       if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
-               return;
-       count = page_ref_dec_return(page);
-       /*
-        * devmap page refcounts are 1-based, rather than 0-based: if
-        * refcount is 1, then the page is free and the refcount is
-        * stable because nobody holds a reference on the page.
-        */
-       if (count == 1)
-               free_devmap_managed_page(page);
-       else if (!count)
-               __put_page(page);
- }
- EXPORT_SYMBOL(put_devmap_managed_page);
- #endif
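
The block deleted above documented the old convention that ZONE_DEVICE page refcounts are 1-based, with the page considered free while its refcount is 1; elsewhere in this series such pages move to ordinary 0-based refcounting and are freed through free_zone_device_page() when the count reaches zero. A small userspace model of the two conventions (the struct and helpers are inventions for this sketch):

/*
 * Userspace model of the refcount convention change for ZONE_DEVICE
 * pages.  Illustrative only.
 */
#include <stdatomic.h>
#include <stdio.h>

struct dpage { atomic_int refcount; };

/* Old scheme: 1-based.  The page is "free" while its refcount is 1. */
static void put_devmap_managed_old(struct dpage *p)
{
        int count = atomic_fetch_sub(&p->refcount, 1) - 1;

        if (count == 1)
                puts("old: back to refcount 1 -> free_devmap_managed_page()");
        else if (count == 0)
                puts("old: refcount 0 -> __put_page()");
}

/* New scheme: 0-based, like any other page. */
static void put_devmap_managed_new(struct dpage *p)
{
        if (atomic_fetch_sub(&p->refcount, 1) - 1 == 0)
                puts("new: refcount 0 -> free_zone_device_page()");
}

int main(void)
{
        struct dpage old = { 2 };       /* 1 (base) + 1 user */
        struct dpage new = { 1 };       /* just 1 user */

        put_devmap_managed_old(&old);   /* drops the only user */
        put_devmap_managed_new(&new);   /* drops the only user */
        return 0;
}
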
diff --combined mm/userfaultfd.c
index 6ccc534d1c1cb8530d17a7cbd9c91a84cb862828,15d3e97a6e04539072542898e75b3eeba0cbb182..0cb8e5ef17136743f710f0b54c4199075bb7404c
@@@ -95,10 -95,15 +95,15 @@@ int mfill_atomic_install_pte(struct mm_
        if (!pte_none(*dst_pte))
                goto out_unlock;
  
-       if (page_in_cache)
-               page_add_file_rmap(page, false);
-       else
+       if (page_in_cache) {
+               /* Usually, cache pages are already added to LRU */
+               if (newly_allocated)
+                       lru_cache_add(page);
+               page_add_file_rmap(page, dst_vma, false);
+       } else {
                page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+               lru_cache_add_inactive_or_unevictable(page, dst_vma);
+       }
  
        /*
         * Must happen after rmap, as mm_counter() checks mapping (via
         */
        inc_mm_counter(dst_mm, mm_counter(page));
  
-       if (newly_allocated)
-               lru_cache_add_inactive_or_unevictable(page, dst_vma);
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  
        /* No need to invalidate - it was non-present before */

@@@ -150,8 -152,6 +152,8 @@@ static int mcopy_atomic_pte(struct mm_s
                        /* don't free the page */
                        goto out;
                }
 +
 +              flush_dcache_page(page);
        } else {
                page = *pagep;
                *pagep = NULL;
@@@ -627,7 -627,6 +629,7 @@@ retry
                                err = -EFAULT;
                                goto out;
                        }
 +                      flush_dcache_page(page);
                        goto retry;
                } else
                        BUG_ON(page);
diff --combined mm/util.c
index d3102081add001d4986f15b6e7dc539ce769e583,13fc88ac8e70052189a6d16862e2626be8cb9219..1e2728736398688e30454e985d4895a875d47895
+++ b/mm/util.c
@@@ -587,10 -587,8 +587,10 @@@ void *kvmalloc_node(size_t size, gfp_t 
                return ret;
  
        /* Don't even allow crazy sizes */
 -      if (WARN_ON_ONCE(size > INT_MAX))
 +      if (unlikely(size > INT_MAX)) {
 +              WARN_ON_ONCE(!(flags & __GFP_NOWARN));
                return NULL;
 +      }
  
        return __vmalloc_node(size, 1, flags, node,
                        __builtin_return_address(0));
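
The kvmalloc_node() hunk above keeps rejecting absurd sizes but now warns only when the caller did not pass __GFP_NOWARN, instead of warning unconditionally. A minimal userspace model of that guard (the flag and helper names are stand-ins, not kernel definitions):

/*
 * Userspace model of the "don't even allow crazy sizes" guard above.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define MY_NOWARN 0x1u          /* stands in for __GFP_NOWARN */

static void *my_kvmalloc(size_t size, unsigned int flags)
{
        if (size > INT_MAX) {
                if (!(flags & MY_NOWARN))
                        fprintf(stderr, "warning: kvmalloc(%zu) rejected\n", size);
                return NULL;    /* fail instead of warning unconditionally */
        }
        return malloc(size);
}

int main(void)
{
        void *p = my_kvmalloc((size_t)INT_MAX + 1, 0);          /* warns, fails */
        void *q = my_kvmalloc((size_t)INT_MAX + 1, MY_NOWARN);  /* fails quietly */

        printf("p=%p q=%p\n", p, q);
        return 0;
}
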
@@@ -681,9 -679,8 +681,8 @@@ bool folio_mapped(struct folio *folio
  }
  EXPORT_SYMBOL(folio_mapped);
  
- struct anon_vma *page_anon_vma(struct page *page)
+ struct anon_vma *folio_anon_vma(struct folio *folio)
  {
-       struct folio *folio = page_folio(page);
        unsigned long mapping = (unsigned long)folio->mapping;
  
        if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
@@@ -742,6 -739,39 +741,39 @@@ int __page_mapcount(struct page *page
  }
  EXPORT_SYMBOL_GPL(__page_mapcount);
  
+ /**
+  * folio_mapcount() - Calculate the number of mappings of this folio.
+  * @folio: The folio.
+  *
+  * A large folio tracks both how many times the entire folio is mapped,
+  * and how many times each individual page in the folio is mapped.
+  * This function calculates the total number of times the folio is
+  * mapped.
+  *
+  * Return: The number of times this folio is mapped.
+  */
+ int folio_mapcount(struct folio *folio)
+ {
+       int i, compound, nr, ret;
+       if (likely(!folio_test_large(folio)))
+               return atomic_read(&folio->_mapcount) + 1;
+       compound = folio_entire_mapcount(folio);
+       nr = folio_nr_pages(folio);
+       if (folio_test_hugetlb(folio))
+               return compound;
+       ret = compound;
+       for (i = 0; i < nr; i++)
+               ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
+       /* File pages have compound_mapcount included in _mapcount */
+       if (!folio_test_anon(folio))
+               return ret - compound * nr;
+       if (folio_test_double_map(folio))
+               ret -= nr;
+       return ret;
+ }
  /**
   * folio_copy - Copy the contents of one folio to another.
   * @dst: Folio to copy to.
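
folio_mapcount() above adds the entire-folio (PMD) mapcount to the per-page mapcounts and then compensates for how file folios and double-mapped anonymous folios fold the compound mapping into _mapcount. A worked example of the arithmetic, with invented numbers for an order-2 anonymous folio that is PMD-mapped once and PTE-mapped in two of its pages; the struct and values below are illustrative, not kernel data:

/*
 * Worked example of the folio_mapcount() arithmetic above, in plain C.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR 4    /* order-2 folio */

struct fake_folio {
        int entire_mapcount;            /* number of PMD mappings */
        int page_mapcount[NR];          /* raw _mapcount, i.e. mapcount - 1 */
        bool anon;
        bool double_map;
};

static int fake_folio_mapcount(const struct fake_folio *f)
{
        int ret = f->entire_mapcount;

        for (int i = 0; i < NR; i++)
                ret += f->page_mapcount[i] + 1;
        if (!f->anon)                   /* file: compound count already in _mapcount */
                return ret - f->entire_mapcount * NR;
        if (f->double_map)              /* anon double map: remove the per-page offset */
                ret -= NR;
        return ret;
}

int main(void)
{
        struct fake_folio f = {
                .entire_mapcount = 1,
                /*
                 * Pages 0 and 1 carry one PTE mapping each; with the
                 * double-map offset, the per-page mapcounts read 2, 2, 1, 1.
                 */
                .page_mapcount = { 1, 1, 0, 0 },
                .anon = true,
                .double_map = true,
        };

        /* 1 (PMD) + (2+2+1+1) - 4 (double-map offset) = 3 mappings in total */
        printf("folio_mapcount = %d\n", fake_folio_mapcount(&f));
        return 0;
}
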
diff --combined mm/vmscan.c
index 499fa86e754a07c0c387b5e4cf11aef090cad107,7db5d0237333b96970c543387e862ca9f153ec15..1678802e03e78577c3366afeed00f23fdaa95aca
@@@ -56,7 -56,6 +56,7 @@@
  
  #include <linux/swapops.h>
  #include <linux/balloon_compaction.h>
 +#include <linux/sched/sysctl.h>
  
  #include "internal.h"
  
@@@ -979,36 -978,47 +979,36 @@@ void drop_slab(void
                drop_slab_node(nid);
  }
  
- static inline int is_page_cache_freeable(struct page *page)
+ static inline int is_page_cache_freeable(struct folio *folio)
  {
        /*
         * A freeable page cache page is referenced only by the caller
         * that isolated the page, the page cache and optional buffer
         * heads at page->private.
         */
-       int page_cache_pins = thp_nr_pages(page);
-       return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
+       return folio_ref_count(folio) - folio_test_private(folio) ==
+               1 + folio_nr_pages(folio);
  }
  
 -static int may_write_to_inode(struct inode *inode)
 -{
 -      if (current->flags & PF_SWAPWRITE)
 -              return 1;
 -      if (!inode_write_congested(inode))
 -              return 1;
 -      if (inode_to_bdi(inode) == current->backing_dev_info)
 -              return 1;
 -      return 0;
 -}
 -
  /*
-  * We detected a synchronous write error writing a page out.  Probably
+  * We detected a synchronous write error writing a folio out.  Probably
   * -ENOSPC.  We need to propagate that into the address_space for a subsequent
   * fsync(), msync() or close().
   *
   * The tricky part is that after writepage we cannot touch the mapping: nothing
-  * prevents it from being freed up.  But we have a ref on the page and once
-  * that page is locked, the mapping is pinned.
+  * prevents it from being freed up.  But we have a ref on the folio and once
+  * that folio is locked, the mapping is pinned.
   *
-  * We're allowed to run sleeping lock_page() here because we know the caller has
+  * We're allowed to run sleeping folio_lock() here because we know the caller has
   * __GFP_FS.
   */
  static void handle_write_error(struct address_space *mapping,
-                               struct page *page, int error)
+                               struct folio *folio, int error)
  {
-       lock_page(page);
-       if (page_mapping(page) == mapping)
+       folio_lock(folio);
+       if (folio_mapping(folio) == mapping)
                mapping_set_error(mapping, error);
-       unlock_page(page);
+       folio_unlock(folio);
  }
  
  static bool skip_throttle_noprogress(pg_data_t *pgdat)
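
The is_page_cache_freeable() conversion in the hunk above declares a folio freeable only when every reference is accounted for by the reclaim caller, the page cache (one per page) and, if present, the buffer-head private reference. A short worked check with illustrative numbers:

/*
 * Worked example of the is_page_cache_freeable() arithmetic above:
 * an order-2 (4 page) pagecache folio with buffer heads attached.
 */
#include <stdbool.h>
#include <stdio.h>

static bool is_freeable(int ref_count, bool has_private, int nr_pages)
{
        /* caller's isolation ref + one pagecache ref per page (+ private) */
        return ref_count - (has_private ? 1 : 0) == 1 + nr_pages;
}

int main(void)
{
        /* 1 (isolation) + 4 (pagecache) + 1 (buffer heads) = 6 refs */
        printf("expected case: %d\n", is_freeable(6, true, 4));   /* 1 */
        /* one extra reference, e.g. a concurrent pin -> not freeable */
        printf("extra ref:     %d\n", is_freeable(7, true, 4));   /* 0 */
        return 0;
}
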
@@@ -1155,35 -1165,35 +1155,35 @@@ typedef enum 
   * pageout is called by shrink_page_list() for each dirty page.
   * Calls ->writepage().
   */
- static pageout_t pageout(struct page *page, struct address_space *mapping)
+ static pageout_t pageout(struct folio *folio, struct address_space *mapping)
  {
        /*
-        * If the page is dirty, only perform writeback if that write
+        * If the folio is dirty, only perform writeback if that write
         * will be non-blocking.  To prevent this allocation from being
         * stalled by pagecache activity.  But note that there may be
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
         * If this process is currently in __generic_file_write_iter() against
-        * this page's queue, we can perform writeback even if that
+        * this folio's queue, we can perform writeback even if that
         * will block.
         *
-        * If the page is swapcache, write it back even if that would
+        * If the folio is swapcache, write it back even if that would
         * block, for some throttling. This happens by accident, because
         * swap_backing_dev_info is bust: it doesn't reflect the
         * congestion state of the swapdevs.  Easy to fix, if needed.
         */
-       if (!is_page_cache_freeable(page))
+       if (!is_page_cache_freeable(folio))
                return PAGE_KEEP;
        if (!mapping) {
                /*
-                * Some data journaling orphaned pages can have
-                * page->mapping == NULL while being dirty with clean buffers.
+                * Some data journaling orphaned folios can have
+                * folio->mapping == NULL while being dirty with clean buffers.
                 */
-               if (page_has_private(page)) {
-                       if (try_to_free_buffers(page)) {
-                               ClearPageDirty(page);
-                               pr_info("%s: orphaned page\n", __func__);
+               if (folio_test_private(folio)) {
+                       if (try_to_free_buffers(&folio->page)) {
+                               folio_clear_dirty(folio);
+                               pr_info("%s: orphaned folio\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
 -      if (!may_write_to_inode(mapping->host))
 -              return PAGE_KEEP;
  
-       if (clear_page_dirty_for_io(page)) {
+       if (folio_clear_dirty_for_io(folio)) {
                int res;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .for_reclaim = 1,
                };
  
-               SetPageReclaim(page);
-               res = mapping->a_ops->writepage(page, &wbc);
+               folio_set_reclaim(folio);
+               res = mapping->a_ops->writepage(&folio->page, &wbc);
                if (res < 0)
-                       handle_write_error(mapping, page, res);
+                       handle_write_error(mapping, folio, res);
                if (res == AOP_WRITEPAGE_ACTIVATE) {
-                       ClearPageReclaim(page);
+                       folio_clear_reclaim(folio);
                        return PAGE_ACTIVATE;
                }
  
-               if (!PageWriteback(page)) {
+               if (!folio_test_writeback(folio)) {
                        /* synchronous write or broken a_ops? */
-                       ClearPageReclaim(page);
+                       folio_clear_reclaim(folio);
                }
-               trace_mm_vmscan_writepage(page);
-               inc_node_page_state(page, NR_VMSCAN_WRITE);
+               trace_mm_vmscan_write_folio(folio);
+               node_stat_add_folio(folio, NR_VMSCAN_WRITE);
                return PAGE_SUCCESS;
        }
  
   * Same as remove_mapping, but if the page is removed from the mapping, it
   * gets returned with a refcount of 0.
   */
- static int __remove_mapping(struct address_space *mapping, struct page *page,
+ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
                            bool reclaimed, struct mem_cgroup *target_memcg)
  {
        int refcount;
        void *shadow = NULL;
  
-       BUG_ON(!PageLocked(page));
-       BUG_ON(mapping != page_mapping(page));
+       BUG_ON(!folio_test_locked(folio));
+       BUG_ON(mapping != folio_mapping(folio));
  
-       if (!PageSwapCache(page))
+       if (!folio_test_swapcache(folio))
                spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        /*
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under the i_pages lock, then this ordering is not required.
         */
-       refcount = 1 + compound_nr(page);
-       if (!page_ref_freeze(page, refcount))
+       refcount = 1 + folio_nr_pages(folio);
+       if (!folio_ref_freeze(folio, refcount))
                goto cannot_free;
        /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
-       if (unlikely(PageDirty(page))) {
-               page_ref_unfreeze(page, refcount);
+       if (unlikely(folio_test_dirty(folio))) {
+               folio_ref_unfreeze(folio, refcount);
                goto cannot_free;
        }
  
-       if (PageSwapCache(page)) {
-               swp_entry_t swap = { .val = page_private(page) };
-               mem_cgroup_swapout(page, swap);
+       if (folio_test_swapcache(folio)) {
+               swp_entry_t swap = folio_swap_entry(folio);
+               mem_cgroup_swapout(folio, swap);
                if (reclaimed && !mapping_exiting(mapping))
-                       shadow = workingset_eviction(page, target_memcg);
-               __delete_from_swap_cache(page, swap, shadow);
+                       shadow = workingset_eviction(folio, target_memcg);
+               __delete_from_swap_cache(&folio->page, swap, shadow);
                xa_unlock_irq(&mapping->i_pages);
-               put_swap_page(page, swap);
+               put_swap_page(&folio->page, swap);
        } else {
                void (*freepage)(struct page *);
  
                 * exceptional entries and shadow exceptional entries in the
                 * same address_space.
                 */
-               if (reclaimed && page_is_file_lru(page) &&
+               if (reclaimed && folio_is_file_lru(folio) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
-                       shadow = workingset_eviction(page, target_memcg);
-               __delete_from_page_cache(page, shadow);
+                       shadow = workingset_eviction(folio, target_memcg);
+               __filemap_remove_folio(folio, shadow);
                xa_unlock_irq(&mapping->i_pages);
                if (mapping_shrinkable(mapping))
                        inode_add_lru(mapping->host);
                spin_unlock(&mapping->host->i_lock);
  
                if (freepage != NULL)
-                       freepage(page);
+                       freepage(&folio->page);
        }
  
        return 1;
  
  cannot_free:
        xa_unlock_irq(&mapping->i_pages);
-       if (!PageSwapCache(page))
+       if (!folio_test_swapcache(folio))
                spin_unlock(&mapping->host->i_lock);
        return 0;
  }
  
- /*
-  * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
-  * someone else has a ref on the page, abort and return 0.  If it was
-  * successfully detached, return 1.  Assumes the caller has a single ref on
-  * this page.
+ /**
+  * remove_mapping() - Attempt to remove a folio from its mapping.
+  * @mapping: The address space.
+  * @folio: The folio to remove.
+  *
+  * If the folio is dirty, under writeback or if someone else has a ref
+  * on it, removal will fail.
+  * Return: The number of pages removed from the mapping.  0 if the folio
+  * could not be removed.
+  * Context: The caller should have a single refcount on the folio and
+  * hold its lock.
   */
- int remove_mapping(struct address_space *mapping, struct page *page)
+ long remove_mapping(struct address_space *mapping, struct folio *folio)
  {
-       if (__remove_mapping(mapping, page, false, NULL)) {
+       if (__remove_mapping(mapping, folio, false, NULL)) {
                /*
-                * Unfreezing the refcount with 1 rather than 2 effectively
+                * Unfreezing the refcount with 1 effectively
                 * drops the pagecache ref for us without requiring another
                 * atomic operation.
                 */
-               page_ref_unfreeze(page, 1);
-               return 1;
+               folio_ref_unfreeze(folio, 1);
+               return folio_nr_pages(folio);
        }
        return 0;
  }
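
__remove_mapping() above freezes the folio refcount only if it is exactly 1 (the isolation reference) plus one page-cache reference per page, and remove_mapping() then unfreezes it straight to 1 so the page-cache references are dropped without a further atomic operation. A userspace model of that arithmetic, with C11 atomics standing in for folio_ref_freeze()/folio_ref_unfreeze() (everything here is illustrative):

/*
 * Userspace model of the refcount freeze/unfreeze in __remove_mapping()
 * and remove_mapping() above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_folio { atomic_int refcount; int nr_pages; };

/* Succeeds only if the count is exactly "expected", and drops it to 0. */
static bool ref_freeze(struct fake_folio *f, int expected)
{
        int old = expected;
        return atomic_compare_exchange_strong(&f->refcount, &old, 0);
}

static void ref_unfreeze(struct fake_folio *f, int count)
{
        atomic_store(&f->refcount, count);
}

int main(void)
{
        /* 4-page folio: 1 isolation ref + 4 pagecache refs */
        struct fake_folio folio = { .refcount = 5, .nr_pages = 4 };
        int expected = 1 + folio.nr_pages;

        if (!ref_freeze(&folio, expected)) {
                puts("someone else holds a reference -> cannot_free");
                return 0;
        }
        /* ... mapping/swapcache entries are removed while frozen ... */
        ref_unfreeze(&folio, 1);        /* only the caller's ref survives */
        printf("refcount after unfreeze: %d (caller drops it next)\n",
               atomic_load(&folio.refcount));
        return 0;
}
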
  
  /**
-  * putback_lru_page - put previously isolated page onto appropriate LRU list
-  * @page: page to be put back to appropriate lru list
+  * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
+  * @folio: Folio to be returned to an LRU list.
   *
-  * Add previously isolated @page to appropriate LRU list.
-  * Page may still be unevictable for other reasons.
+  * Add previously isolated @folio to appropriate LRU list.
+  * The folio may still be unevictable for other reasons.
   *
-  * lru_lock must not be held, interrupts must be enabled.
+  * Context: lru_lock must not be held, interrupts must be enabled.
   */
- void putback_lru_page(struct page *page)
+ void folio_putback_lru(struct folio *folio)
  {
-       lru_cache_add(page);
-       put_page(page);         /* drop ref from isolate */
+       folio_add_lru(folio);
+       folio_put(folio);               /* drop ref from isolate */
  }
  
  enum page_references {
        PAGEREF_ACTIVATE,
  };
  
- static enum page_references page_check_references(struct page *page,
+ static enum page_references folio_check_references(struct folio *folio,
                                                  struct scan_control *sc)
  {
-       int referenced_ptes, referenced_page;
+       int referenced_ptes, referenced_folio;
        unsigned long vm_flags;
  
-       referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
-                                         &vm_flags);
-       referenced_page = TestClearPageReferenced(page);
+       referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
+                                          &vm_flags);
+       referenced_folio = folio_test_clear_referenced(folio);
  
        /*
-        * Mlock lost the isolation race with us.  Let try_to_unmap()
-        * move the page to the unevictable list.
+        * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
+        * Let the folio, now marked Mlocked, be moved to the unevictable list.
         */
        if (vm_flags & VM_LOCKED)
-               return PAGEREF_RECLAIM;
+               return PAGEREF_ACTIVATE;
  
        if (referenced_ptes) {
                /*
-                * All mapped pages start out with page table
+                * All mapped folios start out with page table
                 * references from the instantiating fault, so we need
-                * to look twice if a mapped file/anon page is used more
 -               * to look twice if a mapped file folio is used more
++               * to look twice if a mapped file/anon folio is used more
                 * than once.
                 *
                 * Mark it and spare it for another trip around the
                 * inactive list.  Another page table reference will
                 * lead to its activation.
                 *
-                * Note: the mark is set for activated pages as well
-                * so that recently deactivated but used pages are
+                * Note: the mark is set for activated folios as well
+                * so that recently deactivated but used folios are
                 * quickly recovered.
                 */
-               SetPageReferenced(page);
+               folio_set_referenced(folio);
  
-               if (referenced_page || referenced_ptes > 1)
+               if (referenced_folio || referenced_ptes > 1)
                        return PAGEREF_ACTIVATE;
  
                /*
-                * Activate file-backed executable pages after first usage.
+                * Activate file-backed executable folios after first usage.
                 */
-               if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
+               if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio))
                        return PAGEREF_ACTIVATE;
  
                return PAGEREF_KEEP;
        }
  
-       /* Reclaim if clean, defer dirty pages to writeback */
-       if (referenced_page && !PageSwapBacked(page))
+       /* Reclaim if clean, defer dirty folios to writeback */
+       if (referenced_folio && !folio_test_swapbacked(folio))
                return PAGEREF_RECLAIM_CLEAN;
  
        return PAGEREF_RECLAIM;
  }
  
  /* Check if a page is dirty or under writeback */
- static void page_check_dirty_writeback(struct page *page,
+ static void folio_check_dirty_writeback(struct folio *folio,
                                       bool *dirty, bool *writeback)
  {
        struct address_space *mapping;
         * Anonymous pages are not handled by flushers and must be written
         * from reclaim context. Do not stall reclaim based on them
         */
-       if (!page_is_file_lru(page) ||
-           (PageAnon(page) && !PageSwapBacked(page))) {
+       if (!folio_is_file_lru(folio) ||
+           (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
                *dirty = false;
                *writeback = false;
                return;
        }
  
-       /* By default assume that the page flags are accurate */
-       *dirty = PageDirty(page);
-       *writeback = PageWriteback(page);
+       /* By default assume that the folio flags are accurate */
+       *dirty = folio_test_dirty(folio);
+       *writeback = folio_test_writeback(folio);
  
        /* Verify dirty/writeback state if the filesystem supports it */
-       if (!page_has_private(page))
+       if (!folio_test_private(folio))
                return;
  
-       mapping = page_mapping(page);
+       mapping = folio_mapping(folio);
        if (mapping && mapping->a_ops->is_dirty_writeback)
-               mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+               mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback);
  }
  
  static struct page *alloc_demote_page(struct page *page, unsigned long node)
@@@ -1519,14 -1537,16 +1525,16 @@@ retry
        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
+               struct folio *folio;
                enum page_references references = PAGEREF_RECLAIM;
                bool dirty, writeback, may_enter_fs;
                unsigned int nr_pages;
  
                cond_resched();
  
-               page = lru_to_page(page_list);
-               list_del(&page->lru);
+               folio = lru_to_folio(page_list);
+               list_del(&folio->lru);
+               page = &folio->page;
  
                if (!trylock_page(page))
                        goto keep;
                 * reclaim_congested. kswapd will stall and start writing
                 * pages if the tail of the LRU is all dirty unqueued pages.
                 */
-               page_check_dirty_writeback(page, &dirty, &writeback);
+               folio_check_dirty_writeback(folio, &dirty, &writeback);
                if (dirty || writeback)
-                       stat->nr_dirty++;
+                       stat->nr_dirty += nr_pages;
  
                if (dirty && !writeback)
-                       stat->nr_unqueued_dirty++;
+                       stat->nr_unqueued_dirty += nr_pages;
  
                /*
                 * Treat this page as congested if the underlying BDI is or if
                 * end of the LRU a second time.
                 */
                mapping = page_mapping(page);
 -              if (((dirty || writeback) && mapping &&
 -                   inode_write_congested(mapping->host)) ||
 -                  (writeback && PageReclaim(page)))
 +              if (writeback && PageReclaim(page))
-                       stat->nr_congested++;
+                       stat->nr_congested += nr_pages;
  
                /*
                 * If a page at the tail of the LRU is under writeback, there
                        if (current_is_kswapd() &&
                            PageReclaim(page) &&
                            test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
-                               stat->nr_immediate++;
+                               stat->nr_immediate += nr_pages;
                                goto activate_locked;
  
                        /* Case 2 above */
                                 * and it's also appropriate in global reclaim.
                                 */
                                SetPageReclaim(page);
-                               stat->nr_writeback++;
+                               stat->nr_writeback += nr_pages;
                                goto activate_locked;
  
                        /* Case 3 above */
                }
  
                if (!ignore_references)
-                       references = page_check_references(page, sc);
+                       references = folio_check_references(folio, sc);
  
                switch (references) {
                case PAGEREF_ACTIVATE:
                        if (!PageSwapCache(page)) {
                                if (!(sc->gfp_mask & __GFP_IO))
                                        goto keep_locked;
-                               if (page_maybe_dma_pinned(page))
+                               if (folio_maybe_dma_pinned(folio))
                                        goto keep_locked;
                                if (PageTransHuge(page)) {
                                        /* cannot split THP, skip it */
-                                       if (!can_split_huge_page(page, NULL))
+                                       if (!can_split_folio(folio, NULL))
                                                goto activate_locked;
                                        /*
                                         * Split pages without a PMD map right
                                         * away. Chances are some or all of the
                                         * tail pages can be freed without IO.
                                         */
-                                       if (!compound_mapcount(page) &&
-                                           split_huge_page_to_list(page,
-                                                                   page_list))
+                                       if (!folio_entire_mapcount(folio) &&
+                                           split_folio_to_list(folio,
+                                                               page_list))
                                                goto activate_locked;
                                }
                                if (!add_to_swap(page)) {
                                        if (!PageTransHuge(page))
                                                goto activate_locked_split;
                                        /* Fallback to swap normal pages */
-                                       if (split_huge_page_to_list(page,
-                                                                   page_list))
+                                       if (split_folio_to_list(folio,
+                                                               page_list))
                                                goto activate_locked;
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                                        count_vm_event(THP_SWPOUT_FALLBACK);
                                /* Adding to swap updated mapping */
                                mapping = page_mapping(page);
                        }
-               } else if (unlikely(PageTransHuge(page))) {
-                       /* Split file THP */
-                       if (split_huge_page_to_list(page, page_list))
+               } else if (PageSwapBacked(page) && PageTransHuge(page)) {
+                       /* Split shmem THP */
+                       if (split_folio_to_list(folio, page_list))
                                goto keep_locked;
                }
  
                        enum ttu_flags flags = TTU_BATCH_FLUSH;
                        bool was_swapbacked = PageSwapBacked(page);
  
-                       if (unlikely(PageTransHuge(page)))
+                       if (PageTransHuge(page) &&
+                                       thp_order(page) >= HPAGE_PMD_ORDER)
                                flags |= TTU_SPLIT_HUGE_PMD;
  
-                       try_to_unmap(page, flags);
+                       try_to_unmap(folio, flags);
                        if (page_mapped(page)) {
                                stat->nr_unmap_fail += nr_pages;
                                if (!was_swapbacked && PageSwapBacked(page))
                         * starts and then write it out here.
                         */
                        try_to_unmap_flush_dirty();
-                       switch (pageout(page, mapping)) {
+                       switch (pageout(folio, mapping)) {
                        case PAGE_KEEP:
                                goto keep_locked;
                        case PAGE_ACTIVATE:
                                goto activate_locked;
                        case PAGE_SUCCESS:
-                               stat->nr_pageout += thp_nr_pages(page);
+                               stat->nr_pageout += nr_pages;
  
                                if (PageWriteback(page))
                                        goto keep;
                         */
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
-               } else if (!mapping || !__remove_mapping(mapping, page, true,
+               } else if (!mapping || !__remove_mapping(mapping, folio, true,
                                                         sc->target_mem_cgroup))
                        goto keep_locked;
  
@@@ -1999,6 -2022,69 +2008,6 @@@ unsigned int reclaim_clean_pages_from_l
        return nr_reclaimed;
  }
  
 -/*
 - * Attempt to remove the specified page from its LRU.  Only take this page
 - * if it is of the appropriate PageActive status.  Pages which are being
 - * freed elsewhere are also ignored.
 - *
 - * page:      page to consider
 - * mode:      one of the LRU isolation modes defined above
 - *
 - * returns true on success, false on failure.
 - */
 -bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 -{
 -      /* Only take pages on the LRU. */
 -      if (!PageLRU(page))
 -              return false;
 -
 -      /* Compaction should not handle unevictable pages but CMA can do so */
 -      if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
 -              return false;
 -
 -      /*
 -       * To minimise LRU disruption, the caller can indicate that it only
 -       * wants to isolate pages it will be able to operate on without
 -       * blocking - clean pages for the most part.
 -       *
 -       * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
 -       * that it is possible to migrate without blocking
 -       */
 -      if (mode & ISOLATE_ASYNC_MIGRATE) {
 -              /* All the caller can do on PageWriteback is block */
 -              if (PageWriteback(page))
 -                      return false;
 -
 -              if (PageDirty(page)) {
 -                      struct address_space *mapping;
 -                      bool migrate_dirty;
 -
 -                      /*
 -                       * Only pages without mappings or that have a
 -                       * ->migratepage callback are possible to migrate
 -                       * without blocking. However, we can be racing with
 -                       * truncation so it's necessary to lock the page
 -                       * to stabilise the mapping as truncation holds
 -                       * the page lock until after the page is removed
 -                       * from the page cache.
 -                       */
 -                      if (!trylock_page(page))
 -                              return false;
 -
 -                      mapping = page_mapping(page);
 -                      migrate_dirty = !mapping || mapping->a_ops->migratepage;
 -                      unlock_page(page);
 -                      if (!migrate_dirty)
 -                              return false;
 -              }
 -      }
 -
 -      if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
 -              return false;
 -
 -      return true;
 -}
 -
  /*
   * Update LRU sizes after isolating pages. The LRU size updates must
   * be complete before mem_cgroup_update_lru_size due to a sanity check.
@@@ -2050,11 -2136,11 +2059,11 @@@ static unsigned long isolate_lru_pages(
        unsigned long skipped = 0;
        unsigned long scan, total_scan, nr_pages;
        LIST_HEAD(pages_skipped);
 -      isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
  
        total_scan = 0;
        scan = 0;
        while (scan < nr_to_scan && !list_empty(src)) {
 +              struct list_head *move_to = src;
                struct page *page;
  
                page = lru_to_page(src);
                total_scan += nr_pages;
  
                if (page_zonenum(page) > sc->reclaim_idx) {
 -                      list_move(&page->lru, &pages_skipped);
                        nr_skipped[page_zonenum(page)] += nr_pages;
 -                      continue;
 +                      move_to = &pages_skipped;
 +                      goto move;
                }
  
                /*
                 * return with no isolated pages if the LRU mostly contains
                 * ineligible pages.  This causes the VM to not reclaim any
                 * pages, triggering a premature OOM.
 -               *
 -               * Account all tail pages of THP.  This would not cause
 -               * premature OOM since __isolate_lru_page() returns -EBUSY
 -               * only when the page is being freed somewhere else.
 +               * Account all tail pages of THP.
                 */
                scan += nr_pages;
 -              if (!__isolate_lru_page_prepare(page, mode)) {
 -                      /* It is being freed elsewhere */
 -                      list_move(&page->lru, src);
 -                      continue;
 -              }
 +
 +              if (!PageLRU(page))
 +                      goto move;
 +              if (!sc->may_unmap && page_mapped(page))
 +                      goto move;
 +
                /*
                 * Be careful not to clear PageLRU until after we're
                 * sure the page is not being freed elsewhere -- the
                 * page release code relies on it.
                 */
 -              if (unlikely(!get_page_unless_zero(page))) {
 -                      list_move(&page->lru, src);
 -                      continue;
 -              }
 +              if (unlikely(!get_page_unless_zero(page)))
 +                      goto move;
  
                if (!TestClearPageLRU(page)) {
                        /* Another thread is already isolating this page */
                        put_page(page);
 -                      list_move(&page->lru, src);
 -                      continue;
 +                      goto move;
                }
  
                nr_taken += nr_pages;
                nr_zone_taken[page_zonenum(page)] += nr_pages;
 -              list_move(&page->lru, dst);
 +              move_to = dst;
 +move:
 +              list_move(&page->lru, move_to);
        }
  
        /*
        }
        *nr_scanned = total_scan;
        trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
 -                                  total_scan, skipped, nr_taken, mode, lru);
 +                                  total_scan, skipped, nr_taken,
 +                                  sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
        update_lru_sizes(lruvec, lru, nr_zone_taken);
        return nr_taken;
  }
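
The reworked isolate_lru_pages() above routes every scanned page through a single list_move() at the "move:" label, with move_to defaulting to the source list and only switching to the destination once all checks pass. A standalone sketch of that single-exit pattern (the tiny list type and checks are illustrative, not the kernel's list API):

/*
 * Sketch of the "pick a destination, single move at the bottom"
 * structure used by isolate_lru_pages() above.
 */
#include <stdio.h>

struct item { int value; struct item *next; };

static void push(struct item **list, struct item *it)
{
        it->next = *list;
        *list = it;
}

static void scan(struct item *items, int n, struct item **src,
                 struct item **dst, struct item **skipped)
{
        for (int i = 0; i < n; i++) {
                struct item *it = &items[i];
                struct item **move_to = src;    /* default: put it back on the source list */

                if (it->value < 0) {            /* e.g. wrong zone */
                        move_to = skipped;
                        goto move;
                }
                if (it->value == 0)             /* e.g. not on the LRU, or busy */
                        goto move;

                move_to = dst;                  /* passed every check */
move:
                push(move_to, it);              /* the single "list_move" */
        }
}

int main(void)
{
        struct item items[] = { { 3 }, { 0 }, { -1 }, { 7 } };
        struct item *src = NULL, *dst = NULL, *skipped = NULL;

        scan(items, 4, &src, &dst, &skipped);
        for (struct item *it = dst; it; it = it->next)
                printf("isolated %d\n", it->value);
        return 0;
}
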
  
  /**
-  * isolate_lru_page - tries to isolate a page from its LRU list
-  * @page: page to isolate from its LRU list
-  *
-  * Isolates a @page from an LRU list, clears PageLRU and adjusts the
-  * vmstat statistic corresponding to whatever LRU list the page was on.
+  * folio_isolate_lru() - Try to isolate a folio from its LRU list.
+  * @folio: Folio to isolate from its LRU list.
   *
-  * Returns 0 if the page was removed from an LRU list.
-  * Returns -EBUSY if the page was not on an LRU list.
+  * Isolate a @folio from an LRU list and adjust the vmstat statistic
+  * corresponding to whatever LRU list the folio was on.
   *
-  * The returned page will have PageLRU() cleared.  If it was found on
-  * the active list, it will have PageActive set.  If it was found on
-  * the unevictable list, it will have the PageUnevictable bit set. That flag
+  * The folio will have its LRU flag cleared.  If it was found on the
+  * active list, it will have the Active flag set.  If it was found on the
+  * unevictable list, it will have the Unevictable flag set.  These flags
   * may need to be cleared by the caller before letting the page go.
   *
-  * The vmstat statistic corresponding to the list on which the page was
-  * found will be decremented.
-  *
-  * Restrictions:
+  * Context:
   *
   * (1) Must be called with an elevated refcount on the page. This is a
-  *     fundamental difference from isolate_lru_pages (which is called
+  *     fundamental difference from isolate_lru_pages() (which is called
   *     without a stable reference).
-  * (2) the lru_lock must not be held.
-  * (3) interrupts must be enabled.
+  * (2) The lru_lock must not be held.
+  * (3) Interrupts must be enabled.
+  *
+  * Return: 0 if the folio was removed from an LRU list.
+  * -EBUSY if the folio was not on an LRU list.
   */
- int isolate_lru_page(struct page *page)
+ int folio_isolate_lru(struct folio *folio)
  {
-       struct folio *folio = page_folio(page);
        int ret = -EBUSY;
  
-       VM_BUG_ON_PAGE(!page_count(page), page);
-       WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
+       VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
  
-       if (TestClearPageLRU(page)) {
+       if (folio_test_clear_lru(folio)) {
                struct lruvec *lruvec;
  
-               get_page(page);
+               folio_get(folio);
                lruvec = folio_lruvec_lock_irq(folio);
-               del_page_from_lru_list(page, lruvec);
+               lruvec_del_folio(lruvec, folio);
                unlock_page_lruvec_irq(lruvec);
                ret = 0;
        }
@@@ -2300,7 -2383,9 +2304,7 @@@ static unsigned int move_pages_to_lru(s
   */
  static int current_may_throttle(void)
  {
 -      return !(current->flags & PF_LOCAL_THROTTLE) ||
 -              current->backing_dev_info == NULL ||
 -              bdi_write_congested(current->backing_dev_info);
 +      return !(current->flags & PF_LOCAL_THROTTLE);
  }
  
  /*
@@@ -2406,7 -2491,7 +2410,7 @@@ shrink_inactive_list(unsigned long nr_t
   *
   * If the pages are mostly unmapped, the processing is fast and it is
   * appropriate to hold lru_lock across the whole operation.  But if
-  * the pages are mapped, the processing is slow (page_referenced()), so
+  * the pages are mapped, the processing is slow (folio_referenced()), so
   * we should drop lru_lock around each page.  It's impossible to balance
   * this, so instead we remove the pages from the LRU while processing them.
   * It is safe to rely on PG_active against the non-LRU pages in here because
@@@ -2426,7 -2511,6 +2430,6 @@@ static void shrink_active_list(unsigne
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_active);
        LIST_HEAD(l_inactive);
-       struct page *page;
        unsigned nr_deactivate, nr_activate;
        unsigned nr_rotated = 0;
        int file = is_file_lru(lru);
        spin_unlock_irq(&lruvec->lru_lock);
  
        while (!list_empty(&l_hold)) {
+               struct folio *folio;
+               struct page *page;
                cond_resched();
-               page = lru_to_page(&l_hold);
-               list_del(&page->lru);
+               folio = lru_to_folio(&l_hold);
+               list_del(&folio->lru);
+               page = &folio->page;
  
                if (unlikely(!page_evictable(page))) {
                        putback_lru_page(page);
                        }
                }
  
-               if (page_referenced(page, 0, sc->target_mem_cgroup,
-                                   &vm_flags)) {
+               if (folio_referenced(folio, 0, sc->target_mem_cgroup,
+                                    &vm_flags)) {
                        /*
                         * Identify referenced, file-backed active pages and
                         * give them one more trip around the active list. So
@@@ -3896,10 -3984,7 +3903,10 @@@ static bool pgdat_balanced(pg_data_t *p
                if (!managed_zone(zone))
                        continue;
  
 -              mark = high_wmark_pages(zone);
 +              if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
 +                      mark = wmark_pages(zone, WMARK_PROMO);
 +              else
 +                      mark = high_wmark_pages(zone);
                if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
                        return true;
        }
@@@ -4396,7 -4481,7 +4403,7 @@@ static int kswapd(void *p
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
 -      tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 +      tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
        set_freezable();
  
        WRITE_ONCE(pgdat->kswapd_order, 0);
@@@ -4447,7 -4532,7 +4454,7 @@@ kswapd_try_sleep
                        goto kswapd_try_sleep;
        }
  
 -      tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
 +      tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
  
        return 0;
  }
@@@ -4688,8 -4773,11 +4695,8 @@@ static int __node_reclaim(struct pglist
        fs_reclaim_acquire(sc.gfp_mask);
        /*
         * We need to be able to allocate from the reserves for RECLAIM_UNMAP
 -       * and we also need to be able to write out pages for RECLAIM_WRITE
 -       * and RECLAIM_UNMAP.
         */
        noreclaim_flag = memalloc_noreclaim_save();
 -      p->flags |= PF_SWAPWRITE;
        set_task_reclaim_state(p, &sc.reclaim_state);
  
        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
        }
  
        set_task_reclaim_state(p, NULL);
 -      current->flags &= ~PF_SWAPWRITE;
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);
        psi_memstall_leave(&pflags);
diff --combined mm/workingset.c
index 6f616a69eab690cbce4e4273f4d39f375e29590c,b717eae4e0dd59f8bbc051c4d120553833f2956e..8a3828acc0bfd9256bc10d3c8b5b939786fdac0b
@@@ -245,31 -245,32 +245,32 @@@ void workingset_age_nonresident(struct 
  }
  
  /**
-  * workingset_eviction - note the eviction of a page from memory
+  * workingset_eviction - note the eviction of a folio from memory
   * @target_memcg: the cgroup that is causing the reclaim
-  * @page: the page being evicted
+  * @folio: the folio being evicted
   *
-  * Return: a shadow entry to be stored in @page->mapping->i_pages in place
-  * of the evicted @page so that a later refault can be detected.
+  * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
+  * of the evicted @folio so that a later refault can be detected.
   */
- void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
+ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
  {
-       struct pglist_data *pgdat = page_pgdat(page);
+       struct pglist_data *pgdat = folio_pgdat(folio);
        unsigned long eviction;
        struct lruvec *lruvec;
        int memcgid;
  
-       /* Page is fully exclusive and pins page's memory cgroup pointer */
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON_PAGE(page_count(page), page);
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       /* Folio is fully exclusive and pins folio's memory cgroup pointer */
+       VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+       VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+       VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
  
        lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
        /* XXX: target_memcg can be NULL, go through lruvec */
        memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
        eviction = atomic_long_read(&lruvec->nonresident_age);
-       workingset_age_nonresident(lruvec, thp_nr_pages(page));
-       return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+       workingset_age_nonresident(lruvec, folio_nr_pages(folio));
+       return pack_shadow(memcgid, pgdat, eviction,
+                               folio_test_workingset(folio));
  }
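
workingset_eviction() above packs the memcg id, the node, the per-lruvec eviction counter and the workingset bit into a single shadow entry that replaces the folio in the page cache, so a later refault can be recognised and measured. A sketch of that kind of bit-packing; the field widths below are invented for the example, while the kernel derives its own from constants such as MEM_CGROUP_ID_SHIFT and NODES_SHIFT:

/*
 * Sketch of packing a shadow entry the way pack_shadow() does for
 * workingset_eviction().  Field widths are invented for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MEMCG_BITS      16
#define NODE_BITS        6
#define WORKINGSET_BITS  1

static uint64_t pack_shadow(unsigned int memcgid, unsigned int node,
                            uint64_t eviction, bool workingset)
{
        uint64_t v = eviction;

        v = (v << MEMCG_BITS) | memcgid;
        v = (v << NODE_BITS) | node;
        v = (v << WORKINGSET_BITS) | workingset;
        return v;
}

static void unpack_shadow(uint64_t v, unsigned int *memcgid,
                          unsigned int *node, uint64_t *eviction,
                          bool *workingset)
{
        *workingset = v & 1;
        v >>= WORKINGSET_BITS;
        *node = v & ((1u << NODE_BITS) - 1);
        v >>= NODE_BITS;
        *memcgid = v & ((1u << MEMCG_BITS) - 1);
        *eviction = v >> MEMCG_BITS;
}

int main(void)
{
        unsigned int memcgid, node;
        uint64_t eviction;
        bool ws;
        uint64_t shadow = pack_shadow(42, 1, 123456, true);

        unpack_shadow(shadow, &memcgid, &node, &eviction, &ws);
        printf("memcg=%u node=%u eviction=%llu workingset=%d\n",
               memcgid, node, (unsigned long long)eviction, ws);
        return 0;
}
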
  
  /**
@@@ -429,12 -430,10 +430,12 @@@ out
   * point where they would still be useful.
   */
  
 -static struct list_lru shadow_nodes;
 +struct list_lru shadow_nodes;
  
  void workingset_update_node(struct xa_node *node)
  {
 +      struct address_space *mapping;
 +
        /*
         * Track non-empty nodes that contain only shadow entries;
         * unlink those that contain pages or are being freed.
         * already where they should be. The list_empty() test is safe
         * as node->private_list is protected by the i_pages lock.
         */
 -      VM_WARN_ON_ONCE(!irqs_disabled());  /* For __inc_lruvec_page_state */
 +      mapping = container_of(node->array, struct address_space, i_pages);
 +      lockdep_assert_held(&mapping->i_pages.xa_lock);
  
        if (node->count && node->count == node->nr_values) {
                if (list_empty(&node->private_list)) {