Git Repo - linux.git/commitdiff
Merge tag 'mm-nonmm-stable-2023-11-02-14-08' of git://git.kernel.org/pub/scm/linux...
author Linus Torvalds <[email protected]>
Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
committer Linus Torvalds <[email protected]>
Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
Pull non-MM updates from Andrew Morton:
 "As usual, lots of singleton and doubleton patches all over the tree
  and there's little I can say which isn't in the individual changelogs.

  The lengthier patch series are

   - 'kdump: use generic functions to simplify crashkernel reservation
     in arch', from Baoquan He. This is mainly cleanups and
     consolidation of the 'crashkernel=' kernel parameter handling

   - After much discussion, David Laight's 'minmax: Relax type checks in
     min() and max()' is here. Hopefully reduces some typecasting and
     the use of min_t() and max_t()

   - A group of patches from Oleg Nesterov which clean up and slightly
     fix our handling of reads from /proc/PID/task/... and which remove
     task_struct.thread_group"
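
The crashkernel consolidation is visible in the arch hunks below: each architecture's open-coded reserve_crashkernel() collapses into a thin wrapper around the extended parse_crashkernel() and the new reserve_crashkernel_generic() helper, while architectures that keep their own reservation logic (arm and loongarch in this diff) simply pass NULL for the new low_size/high outputs. A condensed sketch of the resulting pattern, lifted from the x86 hunk below (not a standalone translation unit):

static void __init arch_reserve_crashkernel(void)
{
	unsigned long long crash_base, crash_size, low_size = 0;
	char *cmdline = boot_command_line;
	bool high = false;
	int ret;

	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
		return;

	/* One call now parses crashkernel=X[@offset], =X,high and =Y,low. */
	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
				&crash_size, &crash_base, &low_size, &high);
	if (ret)
		return;

	/* The common helper does the memblock allocation, low/high fallback
	 * and resource insertion that each arch previously open-coded. */
	reserve_crashkernel_generic(cmdline, crash_size, crash_base,
				    low_size, high);
}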

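For the minmax series, the practical effect is fewer forced casts at call sites. A hedged illustration with a hypothetical caller (which type combinations the relaxed min()/max() accept is defined by the series itself, not by this sketch):

/* Hypothetical caller: both operands are unsigned and differ only in
 * width, the kind of comparison the relaxed checks are meant to accept
 * without a cast. */
static size_t bytes_to_copy(size_t requested, unsigned int buf_len)
{
	/* Before the series the differing types needed an explicit cast:
	 *	return min_t(size_t, requested, buf_len);
	 * with the relaxed type checks, plain min() is expected to do. */
	return min(requested, buf_len);
}
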
* tag 'mm-nonmm-stable-2023-11-02-14-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (64 commits)
  scripts/gdb/vmalloc: disable on no-MMU
  scripts/gdb: fix usage of MOD_TEXT not defined when CONFIG_MODULES=n
  .mailmap: add address mapping for Tomeu Vizoso
  mailmap: update email address for Claudiu Beznea
  tools/testing/selftests/mm/run_vmtests.sh: lower the ptrace permissions
  .mailmap: map Benjamin Poirier's address
  scripts/gdb: add lx_current support for riscv
  ocfs2: fix a spelling typo in comment
  proc: test ProtectionKey in proc-empty-vm test
  proc: fix proc-empty-vm test with vsyscall
  fs/proc/base.c: remove unneeded semicolon
  do_io_accounting: use sig->stats_lock
  do_io_accounting: use __for_each_thread()
  ocfs2: replace BUG_ON() at ocfs2_num_free_extents() with ocfs2_error()
  ocfs2: fix a typo in a comment
  scripts/show_delta: add __main__ judgement before main code
  treewide: mark stuff as __ro_after_init
  fs: ocfs2: check status values
  proc: test /proc/${pid}/statm
  compiler.h: move __is_constexpr() to compiler.h
  ...
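
Several of the commits above ("do_io_accounting: use sig->stats_lock", "do_io_accounting: use __for_each_thread()") belong to Oleg Nesterov's series that drops task_struct.thread_group in favour of walking the thread list hanging off signal_struct. A hedged sketch of that iteration pattern (hypothetical counter walk; the real callers and their locking live in the fs/proc hunks of the full diff):

/* Hypothetical: sum a per-thread counter across a process's threads.
 * __for_each_thread() walks the signal_struct thread list, so nothing
 * here relies on the removed task_struct.thread_group linkage. */
static unsigned long sum_voluntary_switches(struct task_struct *task)
{
	struct task_struct *t;
	unsigned long sum = 0;

	rcu_read_lock();
	__for_each_thread(task->signal, t)
		sum += t->nvcsw;
	rcu_read_unlock();

	return sum;
}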

51 files changed:
.mailmap
arch/arm/kernel/setup.c
arch/arm64/Kconfig
arch/arm64/mm/init.c
arch/loongarch/kernel/setup.c
arch/riscv/Kconfig
arch/x86/Kconfig
arch/x86/kernel/setup.c
block/bdev.c
drivers/accel/ivpu/ivpu_job.c
drivers/gpu/drm/i915/gt/selftest_migrate.c
drivers/net/xen-netback/interface.c
drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
fs/buffer.c
fs/char_dev.c
fs/dcache.c
fs/file_table.c
fs/inode.c
fs/kernfs/mount.c
fs/locks.c
fs/namespace.c
fs/notify/dnotify/dnotify.c
fs/notify/fanotify/fanotify_user.c
fs/ocfs2/alloc.c
fs/ocfs2/dlmfs/dlmfs.c
fs/ocfs2/namei.c
fs/pipe.c
fs/proc/base.c
fs/proc/inode.c
fs/proc/task_mmu.c
fs/userfaultfd.c
include/linux/crash_core.h
include/linux/fortify-string.h
include/linux/sched.h
include/linux/sched/signal.h
init/init_task.c
kernel/audit_tree.c
kernel/exit.c
kernel/fork.c
kernel/kthread.c
kernel/sched/core.c
kernel/signal.c
kernel/sys.c
kernel/user_namespace.c
kernel/workqueue.c
mm/damon/core.c
mm/khugepaged.c
mm/shmem.c
net/core/pktgen.c
security/integrity/iint.c
tools/testing/selftests/mm/run_vmtests.sh

diff --combined .mailmap
index 2643b7203a745283d7f0e655f22966939e5ba3a4,27beb64673b8d20b8c38de14a56752979f7ba9dd..43031441b2d922b3126b26ba754ea748a3f63540
+++ b/.mailmap
@@@ -87,7 -87,6 +87,7 @@@ Baolin Wang <[email protected].
  Baolin Wang <[email protected]> <[email protected]>
  Bart Van Assche <[email protected]> <[email protected]>
  Bart Van Assche <[email protected]> <[email protected]>
 +Bartosz Golaszewski <[email protected]> <[email protected]>
  Ben Gardner <[email protected]>
@@@ -95,6 -94,7 +95,7 @@@ Ben M Cahill <[email protected]
  Ben Widawsky <[email protected]> <[email protected]>
  Ben Widawsky <[email protected]> <[email protected]>
  Ben Widawsky <[email protected]> <[email protected]>
+ Benjamin Poirier <[email protected]> <[email protected]>
  Bjorn Andersson <[email protected]> <[email protected]>
  Bjorn Andersson <[email protected]> <[email protected]>
  Bjorn Andersson <[email protected]> <[email protected]>
@@@ -128,6 -128,7 +129,7 @@@ Christian Brauner <[email protected]
  Christian Marangi <[email protected]>
  Christophe Ricard <[email protected]>
  Christoph Hellwig <[email protected]>
+ Claudiu Beznea <[email protected]> <[email protected]>
  Colin Ian King <[email protected]> <[email protected]>
  Corey Minyard <[email protected]>
  Damian Hobson-Garcia <[email protected]>
@@@ -378,7 -379,6 +380,7 @@@ Matthew Wilcox <[email protected]> <w
  Matthew Wilcox <[email protected]> <[email protected]>
  Matthew Wilcox <[email protected]> <[email protected]>
  Matthias Fuchs <[email protected]> <[email protected]>
 +Matthieu Baerts <[email protected]> <[email protected]>
  Matthieu CASTET <[email protected]>
  Matti Vaittinen <[email protected]> <[email protected]>
  Matt Ranostay <[email protected]> <[email protected]>
@@@ -451,10 -451,9 +453,10 @@@ Oleksandr Natalenko <oleksandr@natalenk
  Oleksij Rempel <[email protected]> <[email protected]>
  Oleksij Rempel <[email protected]> <[email protected]>
  Oleksij Rempel <[email protected]> <[email protected]>
 -Oleksij Rempel <[email protected]> <[email protected]>
 -Oleksij Rempel <linux@rempel-privat.de> <[email protected]>
 +Oleksij Rempel <[email protected]>
 +Oleksij Rempel <o.rempel@pengutronix.de> <[email protected]>
  Oliver Upton <[email protected]> <[email protected]>
 +Ondřej Jirman <[email protected]> <[email protected]>
  Oza Pawandeep <[email protected]> <[email protected]>
  Pali Rohár <[email protected]> <[email protected]>
  Paolo 'Blaisorblade' Giarrusso <[email protected]>
@@@ -568,6 -567,7 +570,7 @@@ Takashi YOSHII <takashi.yoshii.zj@renes
  Tamizh Chelvam Raja <[email protected]> <[email protected]>
  Tejun Heo <[email protected]>
+ Tomeu Vizoso <[email protected]> <[email protected]>
  Thomas Graf <[email protected]>
  Thomas Körper <[email protected]> <[email protected]>
  Thomas Pedersen <[email protected]>
diff --combined arch/arm/kernel/setup.c
index 15eca804239edc8d46ead8f21e77990352f5d3d3,e2bb7afd06839b41f74b9c0c0297e6cea7f88f26..b01cac05bd4ccde251c3dcccea50f737f0d57028
  #include <linux/console.h>
  #include <linux/seq_file.h>
  #include <linux/screen_info.h>
 -#include <linux/of_platform.h>
  #include <linux/init.h>
  #include <linux/kexec.h>
  #include <linux/libfdt.h>
 +#include <linux/of.h>
  #include <linux/of_fdt.h>
  #include <linux/cpu.h>
  #include <linux/interrupt.h>
@@@ -1010,7 -1010,8 +1010,8 @@@ static void __init reserve_crashkernel(
  
        total_mem = get_total_mem();
        ret = parse_crashkernel(boot_command_line, total_mem,
-                               &crash_size, &crash_base);
+                               &crash_size, &crash_base,
+                               NULL, NULL);
        /* invalid value specified or crashkernel=0 */
        if (ret || !crash_size)
                return;
diff --combined arch/arm64/Kconfig
index 6062a52a084ffff5b3d94b290b7fdd8846b44293,e7d374d994adaa5fe40f2965e57d64675b92581f..7b071a00425d2b833bc79dd0413bafaaeb94c19a
@@@ -1037,19 -1037,6 +1037,19 @@@ config ARM64_ERRATUM_264519
  
          If unsure, say Y.
  
 +config ARM64_ERRATUM_2966298
 +      bool "Cortex-A520: 2966298: workaround for speculatively executed unprivileged load"
 +      default y
 +      help
 +        This option adds the workaround for ARM Cortex-A520 erratum 2966298.
 +
 +        On an affected Cortex-A520 core, a speculatively executed unprivileged
 +        load might leak data from a privileged level via a cache side channel.
 +
 +        Work around this problem by executing a TLBI before returning to EL0.
 +
 +        If unsure, say Y.
 +
  config CAVIUM_ERRATUM_22375
        bool "Cavium erratum 22375, 24313"
        default y
@@@ -1368,8 -1355,6 +1368,8 @@@ choic
  config CPU_BIG_ENDIAN
        bool "Build big-endian kernel"
        depends on !LD_IS_LLD || LLD_VERSION >= 130000
 +      # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
 +      depends on AS_IS_GNU || AS_VERSION >= 150000
        help
          Say Y if you plan on running a kernel with a big-endian userspace.
  
@@@ -1498,6 -1483,9 +1498,9 @@@ config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_
  config ARCH_SUPPORTS_CRASH_DUMP
        def_bool y
  
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+       def_bool CRASH_CORE
  config TRANS_TABLE
        def_bool y
        depends on HIBERNATION || KEXEC_CORE
diff --combined arch/arm64/mm/init.c
index 8deec68028ac7ecddd3677a9e517565847540853,f2bf32e1937150fec61d1973590e36d97f3e1f19..74c1db8ce271d8c5b0ea19dda0dbf7d3fa5f5230
@@@ -16,7 -16,6 +16,7 @@@
  #include <linux/nodemask.h>
  #include <linux/initrd.h>
  #include <linux/gfp.h>
 +#include <linux/math.h>
  #include <linux/memblock.h>
  #include <linux/sort.h>
  #include <linux/of.h>
@@@ -65,15 -64,6 +65,6 @@@ EXPORT_SYMBOL(memstart_addr)
   */
  phys_addr_t __ro_after_init arm64_dma_phys_limit;
  
- /* Current arm64 boot protocol requires 2MB alignment */
- #define CRASH_ALIGN                   SZ_2M
- #define CRASH_ADDR_LOW_MAX            arm64_dma_phys_limit
- #define CRASH_ADDR_HIGH_MAX           (PHYS_MASK + 1)
- #define CRASH_HIGH_SEARCH_BASE                SZ_4G
- #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
  /*
   * To make optimal use of block mappings when laying out the linear
   * mapping, round down the base of physical memory to a size that can
  #define ARM64_MEMSTART_ALIGN  (1UL << ARM64_MEMSTART_SHIFT)
  #endif
  
- static int __init reserve_crashkernel_low(unsigned long long low_size)
- {
-       unsigned long long low_base;
-       low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
-       if (!low_base) {
-               pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
-               return -ENOMEM;
-       }
-       pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
-               low_base, low_base + low_size, low_size >> 20);
-       crashk_low_res.start = low_base;
-       crashk_low_res.end   = low_base + low_size - 1;
-       insert_resource(&iomem_resource, &crashk_low_res);
-       return 0;
- }
- /*
-  * reserve_crashkernel() - reserves memory for crash kernel
-  *
-  * This function reserves memory area given in "crashkernel=" kernel command
-  * line parameter. The memory reserved is used by dump capture kernel when
-  * primary kernel is crashing.
-  */
- static void __init reserve_crashkernel(void)
+ static void __init arch_reserve_crashkernel(void)
  {
-       unsigned long long crash_low_size = 0, search_base = 0;
-       unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
+       unsigned long long low_size = 0;
        unsigned long long crash_base, crash_size;
        char *cmdline = boot_command_line;
-       bool fixed_base = false;
        bool high = false;
        int ret;
  
        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;
  
-       /* crashkernel=X[@offset] */
        ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
-                               &crash_size, &crash_base);
-       if (ret == -ENOENT) {
-               ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
-               if (ret || !crash_size)
-                       return;
-               /*
-                * crashkernel=Y,low can be specified or not, but invalid value
-                * is not allowed.
-                */
-               ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
-               if (ret == -ENOENT)
-                       crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
-               else if (ret)
-                       return;
-               search_base = CRASH_HIGH_SEARCH_BASE;
-               crash_max = CRASH_ADDR_HIGH_MAX;
-               high = true;
-       } else if (ret || !crash_size) {
-               /* The specified value is invalid */
+                               &crash_size, &crash_base,
+                               &low_size, &high);
+       if (ret)
                return;
-       }
-       crash_size = PAGE_ALIGN(crash_size);
-       /* User specifies base address explicitly. */
-       if (crash_base) {
-               fixed_base = true;
-               search_base = crash_base;
-               crash_max = crash_base + crash_size;
-       }
- retry:
-       crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
-                                              search_base, crash_max);
-       if (!crash_base) {
-               /*
-                * For crashkernel=size[KMG]@offset[KMG], print out failure
-                * message if can't reserve the specified region.
-                */
-               if (fixed_base) {
-                       pr_warn("crashkernel reservation failed - memory is in use.\n");
-                       return;
-               }
-               /*
-                * For crashkernel=size[KMG], if the first attempt was for
-                * low memory, fall back to high memory, the minimum required
-                * low memory will be reserved later.
-                */
-               if (!high && crash_max == CRASH_ADDR_LOW_MAX) {
-                       crash_max = CRASH_ADDR_HIGH_MAX;
-                       search_base = CRASH_ADDR_LOW_MAX;
-                       crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
-                       goto retry;
-               }
-               /*
-                * For crashkernel=size[KMG],high, if the first attempt was
-                * for high memory, fall back to low memory.
-                */
-               if (high && crash_max == CRASH_ADDR_HIGH_MAX) {
-                       crash_max = CRASH_ADDR_LOW_MAX;
-                       search_base = 0;
-                       goto retry;
-               }
-               pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
-                       crash_size);
-               return;
-       }
-       if ((crash_base >= CRASH_ADDR_LOW_MAX) && crash_low_size &&
-            reserve_crashkernel_low(crash_low_size)) {
-               memblock_phys_free(crash_base, crash_size);
-               return;
-       }
-       pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
-               crash_base, crash_base + crash_size, crash_size >> 20);
-       /*
-        * The crashkernel memory will be removed from the kernel linear
-        * map. Inform kmemleak so that it won't try to access it.
-        */
-       kmemleak_ignore_phys(crash_base);
-       if (crashk_low_res.end)
-               kmemleak_ignore_phys(crashk_low_res.start);
  
-       crashk_res.start = crash_base;
-       crashk_res.end = crash_base + crash_size - 1;
-       insert_resource(&iomem_resource, &crashk_res);
+       reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+                                   low_size, high);
  }
  
  /*
@@@ -480,7 -355,7 +356,7 @@@ void __init bootmem_init(void
         * request_standard_resources() depends on crashkernel's memory being
         * reserved, so do it here.
         */
-       reserve_crashkernel();
+       arch_reserve_crashkernel();
  
        memblock_dump_all();
  }
@@@ -494,16 -369,8 +370,16 @@@ void __init mem_init(void
  {
        bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
  
 -      if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC))
 +      if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
 +              /*
 +               * If no bouncing needed for ZONE_DMA, reduce the swiotlb
 +               * buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
 +               */
 +              unsigned long size =
 +                      DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
 +              swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
                swiotlb = true;
 +      }
  
        swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
  
diff --combined arch/loongarch/kernel/setup.c
index aed65915e932e2963913ac2b4ada44102eed9e32,4de32b07c0dcdb917c473cc96eaf3aa88182647d..b35186f7b2547afd85a9e4224b966fb6ed00fdc8
@@@ -161,19 -161,19 +161,19 @@@ static void __init smbios_parse(void
  }
  
  #ifdef CONFIG_ARCH_WRITECOMBINE
 -pgprot_t pgprot_wc = PAGE_KERNEL_WUC;
 +bool wc_enabled = true;
  #else
 -pgprot_t pgprot_wc = PAGE_KERNEL_SUC;
 +bool wc_enabled = false;
  #endif
  
 -EXPORT_SYMBOL(pgprot_wc);
 +EXPORT_SYMBOL(wc_enabled);
  
  static int __init setup_writecombine(char *p)
  {
        if (!strcmp(p, "on"))
 -              pgprot_wc = PAGE_KERNEL_WUC;
 +              wc_enabled = true;
        else if (!strcmp(p, "off"))
 -              pgprot_wc = PAGE_KERNEL_SUC;
 +              wc_enabled = false;
        else
                pr_warn("Unknown writecombine setting \"%s\".\n", p);
  
@@@ -267,7 -267,9 +267,9 @@@ static void __init arch_parse_crashkern
        unsigned long long crash_base, crash_size;
  
        total_mem = memblock_phys_mem_size();
-       ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+       ret = parse_crashkernel(boot_command_line, total_mem,
+                               &crash_size, &crash_base,
+                               NULL, NULL);
        if (ret < 0 || crash_size <= 0)
                return;
  
diff --combined arch/riscv/Kconfig
index 9c48fecc671918ed7c77eda92f333e03dbf9f4e3,25474f8c12b79b70a00769f622a266da754b3208..eaa15a20e6ae1537d14efceeafc3b62bd4273fc5
@@@ -273,9 -273,11 +273,9 @@@ config RISCV_DMA_NONCOHEREN
        select ARCH_HAS_SYNC_DMA_FOR_CPU
        select ARCH_HAS_SYNC_DMA_FOR_DEVICE
        select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB
 -      select DMA_DIRECT_REMAP if MMU
  
  config RISCV_NONSTANDARD_CACHE_OPS
        bool
 -      depends on RISCV_DMA_NONCOHERENT
        help
          This enables function pointer support for non-standard noncoherent
          systems to handle cache management.
@@@ -548,7 -550,6 +548,7 @@@ config RISCV_ISA_ZICBO
        depends on RISCV_ALTERNATIVE
        default y
        select RISCV_DMA_NONCOHERENT
 +      select DMA_DIRECT_REMAP
        help
           Adds support to dynamically detect the presence of the ZICBOM
           extension (Cache Block Management Operations) and enable its
@@@ -693,6 -694,9 +693,9 @@@ config ARCH_SUPPORTS_KEXEC_PURGATOR
  config ARCH_SUPPORTS_CRASH_DUMP
        def_bool y
  
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+       def_bool CRASH_CORE
  config COMPAT
        bool "Kernel support for 32-bit U-mode"
        default 64BIT
diff --combined arch/x86/Kconfig
index 433f5e1906d1a155d341d12718b9fa97c5964897,36b2f12f31c3fdc6ae4e779a6c4f5240d6cb5306..6a917f62eff2068e83900577df2cde1fadbb6e92
@@@ -28,6 -28,7 +28,6 @@@ config X86_6
        select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
        select ARCH_SUPPORTS_PER_VMA_LOCK
 -      select ARCH_USE_CMPXCHG_LOCKREF
        select HAVE_ARCH_SOFT_DIRTY
        select MODULES_USE_ELF_RELA
        select NEED_DMA_MAP_STATE
@@@ -117,7 -118,6 +117,7 @@@ config X8
        select ARCH_SUPPORTS_LTO_CLANG
        select ARCH_SUPPORTS_LTO_CLANG_THIN
        select ARCH_USE_BUILTIN_BSWAP
 +      select ARCH_USE_CMPXCHG_LOCKREF         if X86_CMPXCHG64
        select ARCH_USE_MEMTEST
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
@@@ -1534,7 -1534,6 +1534,7 @@@ config NUM
        depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
        default y if X86_BIGSMP
        select USE_PERCPU_NUMA_NODE_ID
 +      select OF_NUMA if OF
        help
          Enable NUMA (Non-Uniform Memory Access) support.
  
@@@ -1940,18 -1939,6 +1940,18 @@@ config X86_USER_SHADOW_STAC
  
          If unsure, say N.
  
 +config INTEL_TDX_HOST
 +      bool "Intel Trust Domain Extensions (TDX) host support"
 +      depends on CPU_SUP_INTEL
 +      depends on X86_64
 +      depends on KVM_INTEL
 +      help
 +        Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
 +        host and certain physical attacks.  This option enables necessary TDX
 +        support in the host kernel to run confidential VMs.
 +
 +        If unsure, say N.
 +
  config EFI
        bool "EFI runtime service support"
        depends on ACPI
@@@ -2075,6 -2062,9 +2075,9 @@@ config ARCH_SUPPORTS_CRASH_DUM
  config ARCH_SUPPORTS_CRASH_HOTPLUG
        def_bool y
  
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+       def_bool CRASH_CORE
  config PHYSICAL_START
        hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
        default "0x1000000"
@@@ -2967,15 -2957,6 +2970,15 @@@ config IA32_EMULATIO
          64-bit kernel. You should likely turn this on, unless you're
          100% sure that you don't have any 32-bit programs left.
  
 +config IA32_EMULATION_DEFAULT_DISABLED
 +      bool "IA32 emulation disabled by default"
 +      default n
 +      depends on IA32_EMULATION
 +      help
 +        Make IA32 emulation disabled by default. This prevents loading 32-bit
 +        processes and access to 32-bit syscalls. If unsure, leave it to its
 +        default value.
 +
  config X86_X32_ABI
        bool "x32 ABI for 64-bit mode"
        depends on X86_64
diff --combined arch/x86/kernel/setup.c
index ccd3ad29a1dcfa1bb28fb54bbb6a6acdce726eb6,25a3f9a100f6815f59417e1b3635d9415a0f4238..163c35db3d04393ac904dee4eec1b789b8ecac3a
@@@ -466,154 -466,29 +466,29 @@@ static void __init memblock_x86_reserve
        }
  }
  
- /*
-  * --------- Crashkernel reservation ------------------------------
-  */
- /* 16M alignment for crash kernel regions */
- #define CRASH_ALIGN           SZ_16M
- /*
-  * Keep the crash kernel below this limit.
-  *
-  * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
-  * due to mapping restrictions.
-  *
-  * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
-  * the upper limit of system RAM in 4-level paging mode. Since the kdump
-  * jump could be from 5-level paging to 4-level paging, the jump will fail if
-  * the kernel is put above 64 TB, and during the 1st kernel bootup there's
-  * no good way to detect the paging mode of the target kernel which will be
-  * loaded for dumping.
-  */
- #ifdef CONFIG_X86_32
- # define CRASH_ADDR_LOW_MAX   SZ_512M
- # define CRASH_ADDR_HIGH_MAX  SZ_512M
- #else
- # define CRASH_ADDR_LOW_MAX   SZ_4G
- # define CRASH_ADDR_HIGH_MAX  SZ_64T
- #endif
- static int __init reserve_crashkernel_low(void)
+ static void __init arch_reserve_crashkernel(void)
  {
- #ifdef CONFIG_X86_64
-       unsigned long long base, low_base = 0, low_size = 0;
-       unsigned long low_mem_limit;
-       int ret;
-       low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
-       /* crashkernel=Y,low */
-       ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
-       if (ret) {
-               /*
-                * two parts from kernel/dma/swiotlb.c:
-                * -swiotlb size: user-specified with swiotlb= or default.
-                *
-                * -swiotlb overflow buffer: now hardcoded to 32k. We round it
-                * to 8M for other buffers that may need to stay low too. Also
-                * make sure we allocate enough extra low memory so that we
-                * don't run out of DMA buffers for 32-bit devices.
-                */
-               low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
-       } else {
-               /* passed with crashkernel=0,low ? */
-               if (!low_size)
-                       return 0;
-       }
-       low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
-       if (!low_base) {
-               pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
-                      (unsigned long)(low_size >> 20));
-               return -ENOMEM;
-       }
-       pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
-               (unsigned long)(low_size >> 20),
-               (unsigned long)(low_base >> 20),
-               (unsigned long)(low_mem_limit >> 20));
-       crashk_low_res.start = low_base;
-       crashk_low_res.end   = low_base + low_size - 1;
-       insert_resource(&iomem_resource, &crashk_low_res);
- #endif
-       return 0;
- }
- static void __init reserve_crashkernel(void)
- {
-       unsigned long long crash_size, crash_base, total_mem;
+       unsigned long long crash_base, crash_size, low_size = 0;
+       char *cmdline = boot_command_line;
        bool high = false;
        int ret;
  
        if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                return;
  
-       total_mem = memblock_phys_mem_size();
-       /* crashkernel=XM */
-       ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
-       if (ret != 0 || crash_size <= 0) {
-               /* crashkernel=X,high */
-               ret = parse_crashkernel_high(boot_command_line, total_mem,
-                                            &crash_size, &crash_base);
-               if (ret != 0 || crash_size <= 0)
-                       return;
-               high = true;
-       }
+       ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+                               &crash_size, &crash_base,
+                               &low_size, &high);
+       if (ret)
+               return;
  
        if (xen_pv_domain()) {
                pr_info("Ignoring crashkernel for a Xen PV domain\n");
                return;
        }
  
-       /* 0 means: find the address automatically */
-       if (!crash_base) {
-               /*
-                * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
-                * crashkernel=x,high reserves memory over 4G, also allocates
-                * 256M extra low memory for DMA buffers and swiotlb.
-                * But the extra memory is not required for all machines.
-                * So try low memory first and fall back to high memory
-                * unless "crashkernel=size[KMG],high" is specified.
-                */
-               if (!high)
-                       crash_base = memblock_phys_alloc_range(crash_size,
-                                               CRASH_ALIGN, CRASH_ALIGN,
-                                               CRASH_ADDR_LOW_MAX);
-               if (!crash_base)
-                       crash_base = memblock_phys_alloc_range(crash_size,
-                                               CRASH_ALIGN, CRASH_ALIGN,
-                                               CRASH_ADDR_HIGH_MAX);
-               if (!crash_base) {
-                       pr_info("crashkernel reservation failed - No suitable area found.\n");
-                       return;
-               }
-       } else {
-               unsigned long long start;
-               start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
-                                                 crash_base + crash_size);
-               if (start != crash_base) {
-                       pr_info("crashkernel reservation failed - memory is in use.\n");
-                       return;
-               }
-       }
-       if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
-               memblock_phys_free(crash_base, crash_size);
-               return;
-       }
-       pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
-               (unsigned long)(crash_size >> 20),
-               (unsigned long)(crash_base >> 20),
-               (unsigned long)(total_mem >> 20));
-       crashk_res.start = crash_base;
-       crashk_res.end   = crash_base + crash_size - 1;
-       insert_resource(&iomem_resource, &crashk_res);
+       reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+                                   low_size, high);
  }
  
  static struct resource standard_io_resources[] = {
@@@ -1120,7 -995,7 +995,7 @@@ void __init setup_arch(char **cmdline_p
         * Needs to run after memblock setup because it needs the physical
         * memory size.
         */
 -      sev_setup_arch();
 +      mem_encrypt_setup_arch();
  
        efi_fake_memmap();
        efi_find_mirror();
  
        early_acpi_boot_init();
  
 +      x86_flattree_get_config();
 +
        initmem_init();
        dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
  
         * Reserve memory for crash kernel after SRAT is parsed so that it
         * won't consume hotpluggable memory.
         */
-       reserve_crashkernel();
+       arch_reserve_crashkernel();
  
        memblock_find_dma_reserve();
  
diff --combined block/bdev.c
index 2018d250e1310913952b272481e299f188ea34f3,aea9143d890889e2878c29c08108aaa6caf6d4f1..e4cfb7adb64581d0630d61dc0f5a287245da7d38
@@@ -292,7 -292,7 +292,7 @@@ EXPORT_SYMBOL(thaw_bdev)
   */
  
  static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
- static struct kmem_cache * bdev_cachep __read_mostly;
+ static struct kmem_cache *bdev_cachep __ro_after_init;
  
  static struct inode *bdev_alloc_inode(struct super_block *sb)
  {
@@@ -361,13 -361,13 +361,13 @@@ static struct file_system_type bd_type 
        .kill_sb        = kill_anon_super,
  };
  
- struct super_block *blockdev_superblock __read_mostly;
+ struct super_block *blockdev_superblock __ro_after_init;
  EXPORT_SYMBOL_GPL(blockdev_superblock);
  
  void __init bdev_cache_init(void)
  {
        int err;
-       static struct vfsmount *bd_mnt;
+       static struct vfsmount *bd_mnt __ro_after_init;
  
        bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
                        0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@@ -829,28 -829,6 +829,28 @@@ put_blkdev
  }
  EXPORT_SYMBOL(blkdev_get_by_dev);
  
 +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 +                                   const struct blk_holder_ops *hops)
 +{
 +      struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
 +      struct block_device *bdev;
 +
 +      if (!handle)
 +              return ERR_PTR(-ENOMEM);
 +      bdev = blkdev_get_by_dev(dev, mode, holder, hops);
 +      if (IS_ERR(bdev)) {
 +              kfree(handle);
 +              return ERR_CAST(bdev);
 +      }
 +      handle->bdev = bdev;
 +      handle->holder = holder;
 +      if (holder)
 +              mode |= BLK_OPEN_EXCL;
 +      handle->mode = mode;
 +      return handle;
 +}
 +EXPORT_SYMBOL(bdev_open_by_dev);
 +
  /**
   * blkdev_get_by_path - open a block device by name
   * @path: path to the block device to open
@@@ -889,28 -867,6 +889,28 @@@ struct block_device *blkdev_get_by_path
  }
  EXPORT_SYMBOL(blkdev_get_by_path);
  
 +struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
 +              void *holder, const struct blk_holder_ops *hops)
 +{
 +      struct bdev_handle *handle;
 +      dev_t dev;
 +      int error;
 +
 +      error = lookup_bdev(path, &dev);
 +      if (error)
 +              return ERR_PTR(error);
 +
 +      handle = bdev_open_by_dev(dev, mode, holder, hops);
 +      if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
 +          bdev_read_only(handle->bdev)) {
 +              bdev_release(handle);
 +              return ERR_PTR(-EACCES);
 +      }
 +
 +      return handle;
 +}
 +EXPORT_SYMBOL(bdev_open_by_path);
 +
  void blkdev_put(struct block_device *bdev, void *holder)
  {
        struct gendisk *disk = bdev->bd_disk;
  }
  EXPORT_SYMBOL(blkdev_put);
  
 +void bdev_release(struct bdev_handle *handle)
 +{
 +      blkdev_put(handle->bdev, handle->holder);
 +      kfree(handle);
 +}
 +EXPORT_SYMBOL(bdev_release);
 +
  /**
   * lookup_bdev() - Look up a struct block_device by name.
   * @pathname: Name of the block device in the filesystem.
@@@ -1012,20 -961,20 +1012,20 @@@ void bdev_mark_dead(struct block_devic
        mutex_lock(&bdev->bd_holder_lock);
        if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
                bdev->bd_holder_ops->mark_dead(bdev, surprise);
 -      else
 +      else {
 +              mutex_unlock(&bdev->bd_holder_lock);
                sync_blockdev(bdev);
 -      mutex_unlock(&bdev->bd_holder_lock);
 +      }
  
        invalidate_bdev(bdev);
  }
 -#ifdef CONFIG_DASD_MODULE
  /*
 - * Drivers should not use this directly, but the DASD driver has historically
 - * had a shutdown to offline mode that doesn't actually remove the gendisk
 - * that otherwise looks a lot like a safe device removal.
 + * New drivers should not use this directly.  There are some drivers however
 + * that needs this for historical reasons. For example, the DASD driver has
 + * historically had a shutdown to offline mode that doesn't actually remove the
 + * gendisk that otherwise looks a lot like a safe device removal.
   */
  EXPORT_SYMBOL_GPL(bdev_mark_dead);
 -#endif
  
  void sync_bdevs(bool wait)
  {
diff --combined drivers/accel/ivpu/ivpu_job.c
index 689dc0d13b8fadece4cbc0e56cbc21c4e2cc3509,76f468c9f761bfedfcb5cc5f17825e7b5ce4d0db..8983e3a4fdf91a73893adfd0d1d526852dbe02f2
@@@ -48,10 -48,10 +48,10 @@@ static struct ivpu_cmdq *ivpu_cmdq_allo
                goto cmdq_free;
  
        cmdq->db_id = file_priv->ctx.id + engine * ivpu_get_context_count(vdev);
 -      cmdq->entry_count = (u32)((cmdq->mem->base.size - sizeof(struct vpu_job_queue_header)) /
 +      cmdq->entry_count = (u32)((ivpu_bo_size(cmdq->mem) - sizeof(struct vpu_job_queue_header)) /
                                  sizeof(struct vpu_job_queue_entry));
  
 -      cmdq->jobq = (struct vpu_job_queue *)cmdq->mem->kvaddr;
 +      cmdq->jobq = (struct vpu_job_queue *)ivpu_bo_vaddr(cmdq->mem);
        jobq_header = &cmdq->jobq->header;
        jobq_header->engine_idx = engine;
        jobq_header->head = 0;
@@@ -93,7 -93,7 +93,7 @@@ static struct ivpu_cmdq *ivpu_cmdq_acqu
                return cmdq;
  
        ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id,
 -                                 cmdq->mem->vpu_addr, cmdq->mem->base.size);
 +                                 cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem));
        if (ret)
                return NULL;
  
@@@ -453,7 -453,7 +453,7 @@@ ivpu_job_prepare_bos_for_submit(struct 
                return -EBUSY;
        }
  
 -      if (commands_offset >= bo->base.size) {
 +      if (commands_offset >= ivpu_bo_size(bo)) {
                ivpu_warn(vdev, "Invalid command buffer offset %u\n", commands_offset);
                return -EINVAL;
        }
@@@ -618,6 -618,5 +618,5 @@@ int ivpu_job_done_thread_init(struct iv
  
  void ivpu_job_done_thread_fini(struct ivpu_device *vdev)
  {
-       kthread_stop(vdev->job_done_thread);
-       put_task_struct(vdev->job_done_thread);
+       kthread_stop_put(vdev->job_done_thread);
  }
diff --combined drivers/gpu/drm/i915/gt/selftest_migrate.c
index 1a34cbe04fb64692832a3fe0474f1dc19ff311de,0fb07f073baa61f0fb68138b3cf429b4ff4b0ee4..3eff364ccf3ac7a1a0ea9afd54c744298b8c5b89
@@@ -710,7 -710,7 +710,7 @@@ static int threaded_migrate(struct inte
                thread[i].tsk = tsk;
        }
  
 -      msleep(10); /* start all threads before we kthread_stop() */
 +      msleep(10 * n_cpus); /* start all threads before we kthread_stop() */
  
        for (i = 0; i < n_cpus; ++i) {
                struct task_struct *tsk = thread[i].tsk;
                if (IS_ERR_OR_NULL(tsk))
                        continue;
  
-               status = kthread_stop(tsk);
+               status = kthread_stop_put(tsk);
                if (status && !err)
                        err = status;
-               put_task_struct(tsk);
        }
  
        kfree(thread);
diff --combined drivers/net/xen-netback/interface.c
index db304f178136cdcf8e63ec35cd124388da045255,33c8143619f002ae9a42c63cd150fde3471f529e..7cff90aa8d24c280cdd36afad282fb08877f8d86
@@@ -41,6 -41,8 +41,6 @@@
  #include <asm/xen/hypercall.h>
  #include <xen/balloon.h>
  
 -#define XENVIF_QUEUE_LENGTH 32
 -
  /* Number of bytes allowed on the internal guest Rx queue. */
  #define XENVIF_RX_QUEUE_BYTES (XEN_NETIF_RX_RING_SIZE/2 * PAGE_SIZE)
  
@@@ -252,9 -254,6 +252,9 @@@ xenvif_start_xmit(struct sk_buff *skb, 
        if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
                skb_clear_hash(skb);
  
 +      /* timestamp packet in software */
 +      skb_tx_timestamp(skb);
 +
        if (!xenvif_rx_queue_tail(queue, skb))
                goto drop;
  
@@@ -461,7 -460,7 +461,7 @@@ static void xenvif_get_strings(struct n
  
  static const struct ethtool_ops xenvif_ethtool_ops = {
        .get_link       = ethtool_op_get_link,
 -
 +      .get_ts_info    = ethtool_op_get_ts_info,
        .get_sset_count = xenvif_get_sset_count,
        .get_ethtool_stats = xenvif_get_ethtool_stats,
        .get_strings = xenvif_get_strings,
@@@ -531,6 -530,8 +531,6 @@@ struct xenvif *xenvif_alloc(struct devi
        dev->features = dev->hw_features | NETIF_F_RXCSUM;
        dev->ethtool_ops = &xenvif_ethtool_ops;
  
 -      dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
 -
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = ETH_MAX_MTU - VLAN_ETH_HLEN;
  
@@@ -671,8 -672,7 +671,7 @@@ err
  static void xenvif_disconnect_queue(struct xenvif_queue *queue)
  {
        if (queue->task) {
-               kthread_stop(queue->task);
-               put_task_struct(queue->task);
+               kthread_stop_put(queue->task);
                queue->task = NULL;
        }
  
diff --combined drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index ccc5acb39f5a6af954bd452913e4a4dcdb49df12,5bb35c3ea4e521ce7b7799cef6ec4f677be40c2d..d8437a98037b96874c0d7a5544ba457a5384e83f
@@@ -558,7 -558,8 +558,7 @@@ static int experimental_iopoll_q_cnt
  module_param(experimental_iopoll_q_cnt, int, 0444);
  MODULE_PARM_DESC(experimental_iopoll_q_cnt, "number of queues to be used as poll mode, def=0");
  
 -static void debugfs_work_handler_v3_hw(struct work_struct *work);
 -static void debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba);
 +static int debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba);
  
  static u32 hisi_sas_read32(struct hisi_hba *hisi_hba, u32 off)
  {
@@@ -3387,6 -3388,7 +3387,6 @@@ hisi_sas_shost_alloc_pci(struct pci_de
        hisi_hba = shost_priv(shost);
  
        INIT_WORK(&hisi_hba->rst_work, hisi_sas_rst_work_handler);
 -      INIT_WORK(&hisi_hba->debugfs_work, debugfs_work_handler_v3_hw);
        hisi_hba->hw = &hisi_sas_v3_hw;
        hisi_hba->pci_dev = pdev;
        hisi_hba->dev = dev;
@@@ -3858,6 -3860,37 +3858,6 @@@ static void debugfs_create_files_v3_hw(
                            &debugfs_ras_v3_hw_fops);
  }
  
 -static void debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba)
 -{
 -      int debugfs_dump_index = hisi_hba->debugfs_dump_index;
 -      struct device *dev = hisi_hba->dev;
 -      u64 timestamp = local_clock();
 -
 -      if (debugfs_dump_index >= hisi_sas_debugfs_dump_count) {
 -              dev_warn(dev, "dump count exceeded!\n");
 -              return;
 -      }
 -
 -      do_div(timestamp, NSEC_PER_MSEC);
 -      hisi_hba->debugfs_timestamp[debugfs_dump_index] = timestamp;
 -
 -      debugfs_snapshot_prepare_v3_hw(hisi_hba);
 -
 -      debugfs_snapshot_global_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_port_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_axi_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_ras_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_cq_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_dq_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_itct_reg_v3_hw(hisi_hba);
 -      debugfs_snapshot_iost_reg_v3_hw(hisi_hba);
 -
 -      debugfs_create_files_v3_hw(hisi_hba);
 -
 -      debugfs_snapshot_restore_v3_hw(hisi_hba);
 -      hisi_hba->debugfs_dump_index++;
 -}
 -
  static ssize_t debugfs_trigger_dump_v3_hw_write(struct file *file,
                                                const char __user *user_buf,
                                                size_t count, loff_t *ppos)
        struct hisi_hba *hisi_hba = file->f_inode->i_private;
        char buf[8];
  
 -      if (hisi_hba->debugfs_dump_index >= hisi_sas_debugfs_dump_count)
 -              return -EFAULT;
 -
        if (count > 8)
                return -EFAULT;
  
        if (buf[0] != '1')
                return -EFAULT;
  
 -      queue_work(hisi_hba->wq, &hisi_hba->debugfs_work);
 +      down(&hisi_hba->sem);
 +      if (debugfs_snapshot_regs_v3_hw(hisi_hba)) {
 +              up(&hisi_hba->sem);
 +              return -EFAULT;
 +      }
 +      up(&hisi_hba->sem);
  
        return count;
  }
@@@ -3959,22 -3990,7 +3959,7 @@@ static ssize_t debugfs_bist_linkrate_v3
  
        return count;
  }
- static int debugfs_bist_linkrate_v3_hw_open(struct inode *inode,
-                                           struct file *filp)
- {
-       return single_open(filp, debugfs_bist_linkrate_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_bist_linkrate_v3_hw_fops = {
-       .open = debugfs_bist_linkrate_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_linkrate_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_linkrate_v3_hw);
  
  static const struct {
        int             value;
@@@ -4049,22 -4065,7 +4034,7 @@@ static ssize_t debugfs_bist_code_mode_v
  
        return count;
  }
- static int debugfs_bist_code_mode_v3_hw_open(struct inode *inode,
-                                            struct file *filp)
- {
-       return single_open(filp, debugfs_bist_code_mode_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_bist_code_mode_v3_hw_fops = {
-       .open = debugfs_bist_code_mode_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_code_mode_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_code_mode_v3_hw);
  
  static ssize_t debugfs_bist_phy_v3_hw_write(struct file *filp,
                                            const char __user *buf,
@@@ -4098,22 -4099,7 +4068,7 @@@ static int debugfs_bist_phy_v3_hw_show(
  
        return 0;
  }
- static int debugfs_bist_phy_v3_hw_open(struct inode *inode,
-                                      struct file *filp)
- {
-       return single_open(filp, debugfs_bist_phy_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_bist_phy_v3_hw_fops = {
-       .open = debugfs_bist_phy_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_phy_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_phy_v3_hw);
  
  static ssize_t debugfs_bist_cnt_v3_hw_write(struct file *filp,
                                        const char __user *buf,
@@@ -4146,22 -4132,7 +4101,7 @@@ static int debugfs_bist_cnt_v3_hw_show(
  
        return 0;
  }
- static int debugfs_bist_cnt_v3_hw_open(struct inode *inode,
-                                         struct file *filp)
- {
-       return single_open(filp, debugfs_bist_cnt_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_bist_cnt_v3_hw_ops = {
-       .open = debugfs_bist_cnt_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_cnt_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_cnt_v3_hw);
  
  static const struct {
        int             value;
@@@ -4225,22 -4196,7 +4165,7 @@@ static ssize_t debugfs_bist_mode_v3_hw_
  
        return count;
  }
- static int debugfs_bist_mode_v3_hw_open(struct inode *inode,
-                                       struct file *filp)
- {
-       return single_open(filp, debugfs_bist_mode_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_bist_mode_v3_hw_fops = {
-       .open = debugfs_bist_mode_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_mode_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_mode_v3_hw);
  
  static ssize_t debugfs_bist_enable_v3_hw_write(struct file *filp,
                                               const char __user *buf,
@@@ -4278,22 -4234,7 +4203,7 @@@ static int debugfs_bist_enable_v3_hw_sh
  
        return 0;
  }
- static int debugfs_bist_enable_v3_hw_open(struct inode *inode,
-                                         struct file *filp)
- {
-       return single_open(filp, debugfs_bist_enable_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_bist_enable_v3_hw_fops = {
-       .open = debugfs_bist_enable_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_enable_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_enable_v3_hw);
  
  static const struct {
        char *name;
@@@ -4331,21 -4272,7 +4241,7 @@@ static int debugfs_v3_hw_show(struct se
  
        return 0;
  }
- static int debugfs_v3_hw_open(struct inode *inode, struct file *filp)
- {
-       return single_open(filp, debugfs_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_v3_hw_fops = {
-       .open = debugfs_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_v3_hw);
  
  static ssize_t debugfs_phy_down_cnt_v3_hw_write(struct file *filp,
                                                const char __user *buf,
@@@ -4376,22 -4303,7 +4272,7 @@@ static int debugfs_phy_down_cnt_v3_hw_s
  
        return 0;
  }
- static int debugfs_phy_down_cnt_v3_hw_open(struct inode *inode,
-                                          struct file *filp)
- {
-       return single_open(filp, debugfs_phy_down_cnt_v3_hw_show,
-                          inode->i_private);
- }
- static const struct file_operations debugfs_phy_down_cnt_v3_hw_fops = {
-       .open = debugfs_phy_down_cnt_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_phy_down_cnt_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_phy_down_cnt_v3_hw);
  
  enum fifo_dump_mode_v3_hw {
        FIFO_DUMP_FORVER =              (1U << 0),
@@@ -4630,6 -4542,14 +4511,6 @@@ static void debugfs_fifo_init_v3_hw(str
        }
  }
  
 -static void debugfs_work_handler_v3_hw(struct work_struct *work)
 -{
 -      struct hisi_hba *hisi_hba =
 -              container_of(work, struct hisi_hba, debugfs_work);
 -
 -      debugfs_snapshot_regs_v3_hw(hisi_hba);
 -}
 -
  static void debugfs_release_v3_hw(struct hisi_hba *hisi_hba, int dump_index)
  {
        struct device *dev = hisi_hba->dev;
@@@ -4664,7 -4584,7 +4545,7 @@@ static int debugfs_alloc_v3_hw(struct h
  {
        const struct hisi_sas_hw *hw = hisi_hba->hw;
        struct device *dev = hisi_hba->dev;
 -      int p, c, d, r, i;
 +      int p, c, d, r;
        size_t sz;
  
        for (r = 0; r < DEBUGFS_REGS_NUM; r++) {
  
        return 0;
  fail:
 -      for (i = 0; i < hisi_sas_debugfs_dump_count; i++)
 -              debugfs_release_v3_hw(hisi_hba, i);
 +      debugfs_release_v3_hw(hisi_hba, dump_index);
        return -ENOMEM;
  }
  
 +static int debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba)
 +{
 +      int debugfs_dump_index = hisi_hba->debugfs_dump_index;
 +      struct device *dev = hisi_hba->dev;
 +      u64 timestamp = local_clock();
 +
 +      if (debugfs_dump_index >= hisi_sas_debugfs_dump_count) {
 +              dev_warn(dev, "dump count exceeded!\n");
 +              return -EINVAL;
 +      }
 +
 +      if (debugfs_alloc_v3_hw(hisi_hba, debugfs_dump_index)) {
 +              dev_warn(dev, "failed to alloc memory\n");
 +              return -ENOMEM;
 +      }
 +
 +      do_div(timestamp, NSEC_PER_MSEC);
 +      hisi_hba->debugfs_timestamp[debugfs_dump_index] = timestamp;
 +
 +      debugfs_snapshot_prepare_v3_hw(hisi_hba);
 +
 +      debugfs_snapshot_global_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_port_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_axi_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_ras_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_cq_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_dq_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_itct_reg_v3_hw(hisi_hba);
 +      debugfs_snapshot_iost_reg_v3_hw(hisi_hba);
 +
 +      debugfs_create_files_v3_hw(hisi_hba);
 +
 +      debugfs_snapshot_restore_v3_hw(hisi_hba);
 +      hisi_hba->debugfs_dump_index++;
 +
 +      return 0;
 +}
 +
  static void debugfs_phy_down_cnt_init_v3_hw(struct hisi_hba *hisi_hba)
  {
        struct dentry *dir = debugfs_create_dir("phy_down_cnt",
@@@ -4830,7 -4713,7 +4711,7 @@@ static void debugfs_bist_init_v3_hw(str
                            hisi_hba, &debugfs_bist_phy_v3_hw_fops);
  
        debugfs_create_file("cnt", 0600, hisi_hba->debugfs_bist_dentry,
-                           hisi_hba, &debugfs_bist_cnt_v3_hw_ops);
+                           hisi_hba, &debugfs_bist_cnt_v3_hw_fops);
  
        debugfs_create_file("loopback_mode", 0600,
                            hisi_hba->debugfs_bist_dentry,
        hisi_hba->debugfs_bist_linkrate = SAS_LINK_RATE_1_5_GBPS;
  }
  
 +static void debugfs_exit_v3_hw(struct hisi_hba *hisi_hba)
 +{
 +      debugfs_remove_recursive(hisi_hba->debugfs_dir);
 +      hisi_hba->debugfs_dir = NULL;
 +}
 +
  static void debugfs_init_v3_hw(struct hisi_hba *hisi_hba)
  {
        struct device *dev = hisi_hba->dev;
 -      int i;
  
        hisi_hba->debugfs_dir = debugfs_create_dir(dev_name(dev),
                                                   hisi_sas_debugfs_dir);
  
        debugfs_phy_down_cnt_init_v3_hw(hisi_hba);
        debugfs_fifo_init_v3_hw(hisi_hba);
 -
 -      for (i = 0; i < hisi_sas_debugfs_dump_count; i++) {
 -              if (debugfs_alloc_v3_hw(hisi_hba, i)) {
 -                      debugfs_remove_recursive(hisi_hba->debugfs_dir);
 -                      dev_dbg(dev, "failed to init debugfs!\n");
 -                      break;
 -              }
 -      }
 -}
 -
 -static void debugfs_exit_v3_hw(struct hisi_hba *hisi_hba)
 -{
 -      debugfs_remove_recursive(hisi_hba->debugfs_dir);
  }
  
  static int
diff --combined fs/buffer.c
index 657a62bab73d753016d235b08f8b6b5e8397ff11,a19fef583116d1fb3a2de262bc53474ee8ccd64e..967f34b70aa8f73aa56beec71a93a08b20546698
@@@ -282,7 -282,13 +282,7 @@@ static void end_buffer_async_read(struc
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
  
 -      /*
 -       * If all of the buffers are uptodate then we can set the page
 -       * uptodate.
 -       */
 -      if (folio_uptodate)
 -              folio_mark_uptodate(folio);
 -      folio_unlock(folio);
 +      folio_end_read(folio, folio_uptodate);
        return;
  
  still_busy:
@@@ -909,12 -915,16 +909,12 @@@ int remove_inode_buffers(struct inode *
   * which may not fail from ordinary buffer allocations.
   */
  struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 -                                      bool retry)
 +                                      gfp_t gfp)
  {
        struct buffer_head *bh, *head;
 -      gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
        long offset;
        struct mem_cgroup *memcg, *old_memcg;
  
 -      if (retry)
 -              gfp |= __GFP_NOFAIL;
 -
        /* The folio lock pins the memcg */
        memcg = folio_memcg(folio);
        old_memcg = set_active_memcg(memcg);
@@@ -957,11 -967,7 +957,11 @@@ EXPORT_SYMBOL_GPL(folio_alloc_buffers)
  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                                       bool retry)
  {
 -      return folio_alloc_buffers(page_folio(page), size, retry);
 +      gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
 +      if (retry)
 +              gfp |= __GFP_NOFAIL;
 +
 +      return folio_alloc_buffers(page_folio(page), size, gfp);
  }
  EXPORT_SYMBOL_GPL(alloc_page_buffers);
  
@@@ -1037,11 -1043,20 +1037,11 @@@ grow_dev_page(struct block_device *bdev
        struct buffer_head *bh;
        sector_t end_block;
        int ret = 0;
 -      gfp_t gfp_mask;
 -
 -      gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
 -
 -      /*
 -       * XXX: __getblk_slow() can not really deal with failure and
 -       * will endlessly loop on improvised global reclaim.  Prefer
 -       * looping in the allocator rather than here, at least that
 -       * code knows what it's doing.
 -       */
 -      gfp_mask |= __GFP_NOFAIL;
  
        folio = __filemap_get_folio(inode->i_mapping, index,
 -                      FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
 +                      FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
 +      if (IS_ERR(folio))
 +              return PTR_ERR(folio);
  
        bh = folio_buffers(folio);
        if (bh) {
                        goto failed;
        }
  
 -      bh = folio_alloc_buffers(folio, size, true);
 +      ret = -ENOMEM;
 +      bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
 +      if (!bh)
 +              goto failed;
  
        /*
         * Link the folio to the buffers and initialise them.  Take the
@@@ -1408,36 -1420,33 +1408,36 @@@ __find_get_block(struct block_device *b
  }
  EXPORT_SYMBOL(__find_get_block);
  
 -/*
 - * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
 - * which corresponds to the passed block_device, block and size. The
 - * returned buffer has its reference count incremented.
 +/**
 + * bdev_getblk - Get a buffer_head in a block device's buffer cache.
 + * @bdev: The block device.
 + * @block: The block number.
 + * @size: The size of buffer_heads for this @bdev.
 + * @gfp: The memory allocation flags to use.
   *
 - * __getblk_gfp() will lock up the machine if grow_dev_page's
 - * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
 + * Return: The buffer head, or NULL if memory could not be allocated.
   */
 -struct buffer_head *
 -__getblk_gfp(struct block_device *bdev, sector_t block,
 -           unsigned size, gfp_t gfp)
 +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 +              unsigned size, gfp_t gfp)
  {
        struct buffer_head *bh = __find_get_block(bdev, block, size);
  
 -      might_sleep();
 -      if (bh == NULL)
 -              bh = __getblk_slow(bdev, block, size, gfp);
 -      return bh;
 +      might_alloc(gfp);
 +      if (bh)
 +              return bh;
 +
 +      return __getblk_slow(bdev, block, size, gfp);
  }
 -EXPORT_SYMBOL(__getblk_gfp);
 +EXPORT_SYMBOL(bdev_getblk);
  
  /*
   * Do async read-ahead on a buffer..
   */
  void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
  {
 -      struct buffer_head *bh = __getblk(bdev, block, size);
 +      struct buffer_head *bh = bdev_getblk(bdev, block, size,
 +                      GFP_NOWAIT | __GFP_MOVABLE);
 +
        if (likely(bh)) {
                bh_readahead(bh, REQ_RAHEAD);
                brelse(bh);
@@@ -1461,17 -1470,7 +1461,17 @@@ struct buffer_head 
  __bread_gfp(struct block_device *bdev, sector_t block,
                   unsigned size, gfp_t gfp)
  {
 -      struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
 +      struct buffer_head *bh;
 +
 +      gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
 +
 +      /*
 +       * Prefer looping in the allocator rather than here, at least that
 +       * code knows what it's doing.
 +       */
 +      gfp |= __GFP_NOFAIL;
 +
 +      bh = bdev_getblk(bdev, block, size, gfp);
  
        if (likely(bh) && !buffer_uptodate(bh))
                bh = __bread_slow(bh);
@@@ -1641,13 -1640,12 +1641,13 @@@ EXPORT_SYMBOL(block_invalidate_folio)
   * block_dirty_folio() via private_lock.  try_to_free_buffers
   * is already excluded via the folio lock.
   */
 -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
 -                              unsigned long b_state)
 +struct buffer_head *create_empty_buffers(struct folio *folio,
 +              unsigned long blocksize, unsigned long b_state)
  {
        struct buffer_head *bh, *head, *tail;
 +      gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
  
 -      head = folio_alloc_buffers(folio, blocksize, true);
 +      head = folio_alloc_buffers(folio, blocksize, gfp);
        bh = head;
        do {
                bh->b_state |= b_state;
        }
        folio_attach_private(folio, head);
        spin_unlock(&folio->mapping->private_lock);
 -}
 -EXPORT_SYMBOL(folio_create_empty_buffers);
  
 -void create_empty_buffers(struct page *page,
 -                      unsigned long blocksize, unsigned long b_state)
 -{
 -      folio_create_empty_buffers(page_folio(page), blocksize, b_state);
 +      return head;
  }
  EXPORT_SYMBOL(create_empty_buffers);
  
@@@ -1765,15 -1768,13 +1765,15 @@@ static struct buffer_head *folio_create
                                                struct inode *inode,
                                                unsigned int b_state)
  {
 +      struct buffer_head *bh;
 +
        BUG_ON(!folio_test_locked(folio));
  
 -      if (!folio_buffers(folio))
 -              folio_create_empty_buffers(folio,
 -                                         1 << READ_ONCE(inode->i_blkbits),
 -                                         b_state);
 -      return folio_buffers(folio);
 +      bh = folio_buffers(folio);
 +      if (!bh)
 +              bh = create_empty_buffers(folio,
 +                              1 << READ_ONCE(inode->i_blkbits), b_state);
 +      return bh;
  }
  
  /*
@@@ -2424,10 -2425,12 +2424,10 @@@ int block_read_full_folio(struct folio 
  
        if (!nr) {
                /*
 -               * All buffers are uptodate - we can set the folio uptodate
 -               * as well. But not if get_block() returned an error.
 +               * All buffers are uptodate or get_block() returned an
 +               * error when trying to map them - we can finish the read.
                 */
 -              if (!page_error)
 -                      folio_mark_uptodate(folio);
 -              folio_unlock(folio);
 +              folio_end_read(folio, !page_error);
                return 0;
        }
  
@@@ -2673,8 -2676,10 +2673,8 @@@ int block_truncate_page(struct address_
                return PTR_ERR(folio);
  
        bh = folio_buffers(folio);
 -      if (!bh) {
 -              folio_create_empty_buffers(folio, blocksize, 0);
 -              bh = folio_buffers(folio);
 -      }
 +      if (!bh)
 +              bh = create_empty_buffers(folio, blocksize, 0);
  
        /* Find the buffer that contains "offset" */
        offset = offset_in_folio(folio, from);
@@@ -2983,13 -2988,13 +2983,13 @@@ EXPORT_SYMBOL(try_to_free_buffers)
  /*
   * Buffer-head allocation
   */
- static struct kmem_cache *bh_cachep __read_mostly;
+ static struct kmem_cache *bh_cachep __ro_after_init;
  
  /*
   * Once the number of bh's in the machine exceeds this level, we start
   * stripping them in writeback.
   */
- static unsigned long max_buffer_heads;
+ static unsigned long max_buffer_heads __ro_after_init;
  
  int buffer_heads_over_limit;
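
The fs/buffer.c hunks above change two calling conventions: bdev_getblk() takes explicit gfp flags in place of __getblk_gfp(), and create_empty_buffers() now returns the head buffer_head so callers no longer need a second folio_buffers() lookup. A minimal sketch of the resulting pattern (the wrapper name and locals are illustrative, not part of this diff):

static struct buffer_head *example_folio_buffers(struct inode *inode,
						 struct folio *folio)
{
	struct buffer_head *bh = folio_buffers(folio);

	/* create_empty_buffers() now hands back the head directly */
	if (!bh)
		bh = create_empty_buffers(folio, 1 << inode->i_blkbits, 0);
	return bh;
}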
  
diff --combined fs/char_dev.c
index 6ba032442b39d37e0d3901b6f7219237e8758c40,3d52f3d3ae77548b8aa908e02bd619403d4c8cf9..57cc096c498a29fc3bc2278a142a494ffbe5f3fd
@@@ -25,7 -25,7 +25,7 @@@
  
  #include "internal.h"
  
- static struct kobj_map *cdev_map;
+ static struct kobj_map *cdev_map __ro_after_init;
  
  static DEFINE_MUTEX(chrdevs_lock);
  
@@@ -350,7 -350,7 +350,7 @@@ static struct kobject *cdev_get(struct 
        struct module *owner = p->owner;
        struct kobject *kobj;
  
 -      if (owner && !try_module_get(owner))
 +      if (!try_module_get(owner))
                return NULL;
        kobj = kobject_get_unless_zero(&p->kobj);
        if (!kobj)
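
The cdev_get() change above works because try_module_get() already treats a NULL module as success (a NULL owner means built-in code, which can never be unloaded), so the dropped NULL check was redundant rather than load-bearing. A one-line sketch of that assumption (helper name is hypothetical):

static bool example_pin_owner(struct module *owner)
{
	/* returns true when owner == NULL (built-in), so no separate check */
	return try_module_get(owner);
}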
diff --combined fs/dcache.c
index 796e23761ba0505748faf160011f49736ac26522,0650ccdaa3357f21ff468ca87af3bc46a86e5f7c..c82ae731df9af780e58db42b45ba198375be2819
@@@ -78,7 -78,7 +78,7 @@@ __cacheline_aligned_in_smp DEFINE_SEQLO
  
  EXPORT_SYMBOL(rename_lock);
  
- static struct kmem_cache *dentry_cache __read_mostly;
+ static struct kmem_cache *dentry_cache __ro_after_init;
  
  const struct qstr empty_name = QSTR_INIT("", 0);
  EXPORT_SYMBOL(empty_name);
@@@ -96,9 -96,9 +96,9 @@@ EXPORT_SYMBOL(dotdot_name)
   * information, yet avoid using a prime hash-size or similar.
   */
  
- static unsigned int d_hash_shift __read_mostly;
+ static unsigned int d_hash_shift __ro_after_init;
  
- static struct hlist_bl_head *dentry_hashtable __read_mostly;
+ static struct hlist_bl_head *dentry_hashtable __ro_after_init;
  
  static inline struct hlist_bl_head *d_hash(unsigned int hash)
  {
@@@ -3246,10 -3246,11 +3246,10 @@@ void d_genocide(struct dentry *parent
        d_walk(parent, parent, d_genocide_kill);
  }
  
 -void d_tmpfile(struct file *file, struct inode *inode)
 +void d_mark_tmpfile(struct file *file, struct inode *inode)
  {
        struct dentry *dentry = file->f_path.dentry;
  
 -      inode_dec_link_count(inode);
        BUG_ON(dentry->d_name.name != dentry->d_iname ||
                !hlist_unhashed(&dentry->d_u.d_alias) ||
                !d_unlinked(dentry));
                                (unsigned long long)inode->i_ino);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dentry->d_parent->d_lock);
 +}
 +EXPORT_SYMBOL(d_mark_tmpfile);
 +
 +void d_tmpfile(struct file *file, struct inode *inode)
 +{
 +      struct dentry *dentry = file->f_path.dentry;
 +
 +      inode_dec_link_count(inode);
 +      d_mark_tmpfile(file, inode);
        d_instantiate(dentry, inode);
  }
  EXPORT_SYMBOL(d_tmpfile);
@@@ -3332,7 -3324,7 +3332,7 @@@ static void __init dcache_init(void
  }
  
  /* SLAB cache for __getname() consumers */
- struct kmem_cache *names_cachep __read_mostly;
+ struct kmem_cache *names_cachep __ro_after_init;
  EXPORT_SYMBOL(names_cachep);
  
  void __init vfs_caches_init_early(void)
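
The d_tmpfile() change above splits the dentry bookkeeping out into an exported d_mark_tmpfile(), while d_tmpfile() keeps the link-count drop and instantiation. A sketch of the assumed use case, a filesystem that wants the tmpfile marking but drives nlink and instantiation itself (the function name is hypothetical):

static void example_finish_tmpfile(struct file *file, struct inode *inode)
{
	d_mark_tmpfile(file, inode);		/* just the dentry marking */
	d_instantiate(file->f_path.dentry, inode);
}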
diff --combined fs/file_table.c
index fa92743ba6a91691f3621e0d8d050460c24649ef,687d33865035ffbc39491d7bb1e23b0506c612a0..de4a2915bfd4941281915be92ce7dbdd8f67ac3b
@@@ -40,14 -40,14 +40,14 @@@ static struct files_stat_struct files_s
  };
  
  /* SLAB cache for file structures */
- static struct kmem_cache *filp_cachep __read_mostly;
+ static struct kmem_cache *filp_cachep __ro_after_init;
  
  static struct percpu_counter nr_files __cacheline_aligned_in_smp;
  
 -/* Container for backing file with optional real path */
 +/* Container for backing file with optional user path */
  struct backing_file {
        struct file file;
 -      struct path real_path;
 +      struct path user_path;
  };
  
  static inline struct backing_file *backing_file(struct file *f)
        return container_of(f, struct backing_file, file);
  }
  
 -struct path *backing_file_real_path(struct file *f)
 +struct path *backing_file_user_path(struct file *f)
  {
 -      return &backing_file(f)->real_path;
 +      return &backing_file(f)->user_path;
  }
 -EXPORT_SYMBOL_GPL(backing_file_real_path);
 +EXPORT_SYMBOL_GPL(backing_file_user_path);
  
 -static void file_free_rcu(struct rcu_head *head)
 +static inline void file_free(struct file *f)
  {
 -      struct file *f = container_of(head, struct file, f_rcuhead);
 -
 +      security_file_free(f);
 +      if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 +              percpu_counter_dec(&nr_files);
        put_cred(f->f_cred);
 -      if (unlikely(f->f_mode & FMODE_BACKING))
 +      if (unlikely(f->f_mode & FMODE_BACKING)) {
 +              path_put(backing_file_user_path(f));
                kfree(backing_file(f));
 -      else
 +      } else {
                kmem_cache_free(filp_cachep, f);
 +      }
  }
  
 -static inline void file_free(struct file *f)
 +void release_empty_file(struct file *f)
  {
 -      security_file_free(f);
 -      if (unlikely(f->f_mode & FMODE_BACKING))
 -              path_put(backing_file_real_path(f));
 -      if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 -              percpu_counter_dec(&nr_files);
 -      call_rcu(&f->f_rcuhead, file_free_rcu);
 +      WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
 +      if (atomic_long_dec_and_test(&f->f_count)) {
 +              security_file_free(f);
 +              put_cred(f->f_cred);
 +              if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 +                      percpu_counter_dec(&nr_files);
 +              kmem_cache_free(filp_cachep, f);
 +      }
  }
  
  /*
@@@ -169,6 -164,7 +169,6 @@@ static int init_file(struct file *f, in
                return error;
        }
  
 -      atomic_long_set(&f->f_count, 1);
        rwlock_init(&f->f_owner.lock);
        spin_lock_init(&f->f_lock);
        mutex_init(&f->f_pos_lock);
        f->f_mode = OPEN_FMODE(flags);
        /* f->f_version: 0 */
  
 +      /*
 +       * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
 +       * fget-rcu pattern users need to be able to handle spurious
 +       * refcount bumps we should reinitialize the reused file first.
 +       */
 +      atomic_long_set(&f->f_count, 1);
        return 0;
  }
  
@@@ -481,8 -471,7 +481,8 @@@ EXPORT_SYMBOL(__fput_sync)
  void __init files_init(void)
  {
        filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 -                      SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
 +                              SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
 +                              SLAB_PANIC | SLAB_ACCOUNT, NULL);
        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
  }
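
With SLAB_TYPESAFE_BY_RCU, as switched on above, a struct file can be freed and handed out again while an RCU walker still holds a pointer to the old object; that is why init_file() now publishes f_count last and why lockless lookups must take the reference with an inc-not-zero and then revalidate the slot. A heavily simplified sketch of that lookup shape (the real code in fs/file.c is more involved; the helper below is illustrative):

static struct file *example_fget_rcu(struct file __rcu **slot)
{
	struct file *file;

	rcu_read_lock();
	file = rcu_dereference(*slot);
	if (file && !atomic_long_inc_not_zero(&file->f_count)) {
		/* lost the race with the final fput(); object may be reused */
		file = NULL;
	} else if (file && rcu_dereference(*slot) != file) {
		/* the slot was recycled under us: drop the reference */
		fput(file);
		file = NULL;
	}
	rcu_read_unlock();
	return file;
}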
  
diff --combined fs/inode.c
index 4f8984b97df082fff002e27d795b78dacce882e2,2b1d473a363154fe3ba39fa104af6911242c2ca7..edcd8a61975f34c7a4cf467589848870430b3b8a
@@@ -54,9 -54,9 +54,9 @@@
   *   inode_hash_lock
   */
  
- static unsigned int i_hash_mask __read_mostly;
- static unsigned int i_hash_shift __read_mostly;
- static struct hlist_head *inode_hashtable __read_mostly;
+ static unsigned int i_hash_mask __ro_after_init;
+ static unsigned int i_hash_shift __ro_after_init;
+ static struct hlist_head *inode_hashtable __ro_after_init;
  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
  
  /*
@@@ -70,7 -70,7 +70,7 @@@ EXPORT_SYMBOL(empty_aops)
  static DEFINE_PER_CPU(unsigned long, nr_inodes);
  static DEFINE_PER_CPU(unsigned long, nr_unused);
  
- static struct kmem_cache *inode_cachep __read_mostly;
+ static struct kmem_cache *inode_cachep __ro_after_init;
  
  static long get_nr_inodes(void)
  {
@@@ -1837,29 -1837,27 +1837,29 @@@ EXPORT_SYMBOL(bmap)
  static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                             struct timespec64 now)
  {
 -      struct timespec64 ctime;
 +      struct timespec64 atime, mtime, ctime;
  
        if (!(mnt->mnt_flags & MNT_RELATIME))
                return 1;
        /*
         * Is mtime younger than or equal to atime? If yes, update atime:
         */
 -      if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
 +      atime = inode_get_atime(inode);
 +      mtime = inode_get_mtime(inode);
 +      if (timespec64_compare(&mtime, &atime) >= 0)
                return 1;
        /*
         * Is ctime younger than or equal to atime? If yes, update atime:
         */
        ctime = inode_get_ctime(inode);
 -      if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
 +      if (timespec64_compare(&ctime, &atime) >= 0)
                return 1;
  
        /*
         * Is the previous atime value older than a day? If yes,
         * update atime:
         */
 -      if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
 +      if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
                return 1;
        /*
         * Good, we can skip the atime update:
@@@ -1890,13 -1888,12 +1890,13 @@@ int inode_update_timestamps(struct inod
  
        if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
                struct timespec64 ctime = inode_get_ctime(inode);
 +              struct timespec64 mtime = inode_get_mtime(inode);
  
                now = inode_set_ctime_current(inode);
                if (!timespec64_equal(&now, &ctime))
                        updated |= S_CTIME;
 -              if (!timespec64_equal(&now, &inode->i_mtime)) {
 -                      inode->i_mtime = now;
 +              if (!timespec64_equal(&now, &mtime)) {
 +                      inode_set_mtime_to_ts(inode, now);
                        updated |= S_MTIME;
                }
                if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
        }
  
        if (flags & S_ATIME) {
 -              if (!timespec64_equal(&now, &inode->i_atime)) {
 -                      inode->i_atime = now;
 +              struct timespec64 atime = inode_get_atime(inode);
 +
 +              if (!timespec64_equal(&now, &atime)) {
 +                      inode_set_atime_to_ts(inode, now);
                        updated |= S_ATIME;
                }
        }
@@@ -1968,7 -1963,7 +1968,7 @@@ EXPORT_SYMBOL(inode_update_time)
  bool atime_needs_update(const struct path *path, struct inode *inode)
  {
        struct vfsmount *mnt = path->mnt;
 -      struct timespec64 now;
 +      struct timespec64 now, atime;
  
        if (inode->i_flags & S_NOATIME)
                return false;
        if (!relatime_need_update(mnt, inode, now))
                return false;
  
 -      if (timespec64_equal(&inode->i_atime, &now))
 +      atime = inode_get_atime(inode);
 +      if (timespec64_equal(&atime, &now))
                return false;
  
        return true;
@@@ -2012,7 -2006,7 +2012,7 @@@ void touch_atime(const struct path *pat
        if (!sb_start_write_trylock(inode->i_sb))
                return;
  
 -      if (__mnt_want_write(mnt) != 0)
 +      if (mnt_get_write_access(mnt) != 0)
                goto skip_update;
        /*
         * File systems can error out when updating inodes if they need to
         * of the fs read only, e.g. subvolumes in Btrfs.
         */
        inode_update_time(inode, S_ATIME);
 -      __mnt_drop_write(mnt);
 +      mnt_put_write_access(mnt);
  skip_update:
        sb_end_write(inode->i_sb);
  }
@@@ -2112,18 -2106,17 +2112,18 @@@ static int inode_needs_update_time(stru
  {
        int sync_it = 0;
        struct timespec64 now = current_time(inode);
 -      struct timespec64 ctime;
 +      struct timespec64 ts;
  
        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;
  
 -      if (!timespec64_equal(&inode->i_mtime, &now))
 +      ts = inode_get_mtime(inode);
 +      if (!timespec64_equal(&ts, &now))
                sync_it = S_MTIME;
  
 -      ctime = inode_get_ctime(inode);
 -      if (!timespec64_equal(&ctime, &now))
 +      ts = inode_get_ctime(inode);
 +      if (!timespec64_equal(&ts, &now))
                sync_it |= S_CTIME;
  
        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@@ -2138,9 -2131,9 +2138,9 @@@ static int __file_update_time(struct fi
        struct inode *inode = file_inode(file);
  
        /* try to update time settings */
 -      if (!__mnt_want_write_file(file)) {
 +      if (!mnt_get_write_access_file(file)) {
                ret = inode_update_time(inode, sync_mode);
 -              __mnt_drop_write_file(file);
 +              mnt_put_write_access_file(file);
        }
  
        return ret;
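
The fs/inode.c hunks above replace direct i_atime/i_mtime access with accessor helpers; the same conversion shows up again in the ocfs2 hunks later in this diff. The pattern, lifted from those hunks into two illustrative helpers:

static bool example_mtime_is_newer(struct inode *inode)
{
	struct timespec64 atime = inode_get_atime(inode);
	struct timespec64 mtime = inode_get_mtime(inode);

	/* was: timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0 */
	return timespec64_compare(&mtime, &atime) >= 0;
}

static void example_touch_mtime(struct inode *inode)
{
	/* was: inode->i_mtime = inode_set_ctime_current(inode); */
	inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
}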
diff --combined fs/kernfs/mount.c
index 79b96e74a8a09d3ce3e73e326ca897d309ae00d4,43aea0ad95c8493ff839dbc97bcbe94bba3bb42d..4628edde2e7e1ad7a4e59b7f1117331a83d29b38
@@@ -21,8 -21,9 +21,9 @@@
  
  #include "kernfs-internal.h"
  
- struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
- struct kernfs_global_locks *kernfs_locks;
+ struct kmem_cache *kernfs_node_cache __ro_after_init;
+ struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+ struct kernfs_global_locks *kernfs_locks __ro_after_init;
  
  static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
  {
@@@ -265,7 -266,7 +266,7 @@@ static int kernfs_fill_super(struct sup
        sb->s_time_gran = 1;
  
        /* sysfs dentries and inodes don't require IO to create */
 -      sb->s_shrink.seeks = 0;
 +      sb->s_shrink->seeks = 0;
  
        /* get root inode, initialize and unlock it */
        down_read(&kf_root->kernfs_rwsem);
diff --combined fs/locks.c
index d4e49a990a8daaffebfd1f3dd22b5e2973812684,dbd2fb1f74949ac27577b0d7830155e5f2c23871..46d88b9e222cf2c50897e120248f7231821f579f
@@@ -167,8 -167,8 +167,8 @@@ static DEFINE_HASHTABLE(blocked_hash, B
   */
  static DEFINE_SPINLOCK(blocked_lock_lock);
  
- static struct kmem_cache *flctx_cache __read_mostly;
- static struct kmem_cache *filelock_cache __read_mostly;
+ static struct kmem_cache *flctx_cache __ro_after_init;
+ static struct kmem_cache *filelock_cache __ro_after_init;
  
  static struct file_lock_context *
  locks_get_lock_context(struct inode *inode, int type)
   * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
   * locks, the ->lock() interface may return asynchronously, before the lock has
   * been granted or denied by the underlying filesystem, if (and only if)
 - * lm_grant is set. Callers expecting ->lock() to return asynchronously
 - * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
 - * the request is for a blocking lock. When ->lock() does return asynchronously,
 - * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
 - * request completes.
 +      * lm_grant is set. Additionally, the EXPORT_OP_ASYNC_LOCK flag needs to
 +      * be set in export_operations.
 + *
 + * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
 + * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
 + * blocking lock. When ->lock() does return asynchronously, it must return
 + * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
   * If the request is for non-blocking lock the file system should return
   * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
   * with the result. If the request timed out the callback routine will return a
diff --combined fs/namespace.c
index 6bde71735efa79e03fdb07fe5938efe21d70eb79,69df848fbc177bff81e6227ad96dc7e0022acbdf..fbf0e596fcd30c9bae8d8cc1fbe09cf309f02334
  /* Maximum number of mounts in a mount namespace */
  static unsigned int sysctl_mount_max __read_mostly = 100000;
  
- static unsigned int m_hash_mask __read_mostly;
- static unsigned int m_hash_shift __read_mostly;
- static unsigned int mp_hash_mask __read_mostly;
- static unsigned int mp_hash_shift __read_mostly;
+ static unsigned int m_hash_mask __ro_after_init;
+ static unsigned int m_hash_shift __ro_after_init;
+ static unsigned int mp_hash_mask __ro_after_init;
+ static unsigned int mp_hash_shift __ro_after_init;
  
  static __initdata unsigned long mhash_entries;
  static int __init set_mhash_entries(char *str)
@@@ -68,9 -68,9 +68,9 @@@ static u64 event
  static DEFINE_IDA(mnt_id_ida);
  static DEFINE_IDA(mnt_group_ida);
  
- static struct hlist_head *mount_hashtable __read_mostly;
- static struct hlist_head *mountpoint_hashtable __read_mostly;
- static struct kmem_cache *mnt_cache __read_mostly;
+ static struct hlist_head *mount_hashtable __ro_after_init;
+ static struct hlist_head *mountpoint_hashtable __ro_after_init;
+ static struct kmem_cache *mnt_cache __ro_after_init;
  static DECLARE_RWSEM(namespace_sem);
  static HLIST_HEAD(unmounted); /* protected by namespace_sem */
  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
@@@ -86,7 -86,7 +86,7 @@@ struct mount_kattr 
  };
  
  /* /sys/fs */
- struct kobject *fs_kobj;
+ struct kobject *fs_kobj __ro_after_init;
  EXPORT_SYMBOL_GPL(fs_kobj);
  
  /*
@@@ -330,16 -330,16 +330,16 @@@ static int mnt_is_readonly(struct vfsmo
   * can determine when writes are able to occur to a filesystem.
   */
  /**
 - * __mnt_want_write - get write access to a mount without freeze protection
 + * mnt_get_write_access - get write access to a mount without freeze protection
   * @m: the mount on which to take a write
   *
   * This tells the low-level filesystem that a write is about to be performed to
   * it, and makes sure that writes are allowed (mnt is read-write) before
   * returning success. This operation does not protect against filesystem being
 - * frozen. When the write operation is finished, __mnt_drop_write() must be
 + * frozen. When the write operation is finished, mnt_put_write_access() must be
   * called. This is effectively a refcount.
   */
 -int __mnt_want_write(struct vfsmount *m)
 +int mnt_get_write_access(struct vfsmount *m)
  {
        struct mount *mnt = real_mount(m);
        int ret = 0;
  
        return ret;
  }
 +EXPORT_SYMBOL_GPL(mnt_get_write_access);
  
  /**
   * mnt_want_write - get write access to a mount
@@@ -402,7 -401,7 +402,7 @@@ int mnt_want_write(struct vfsmount *m
        int ret;
  
        sb_start_write(m->mnt_sb);
 -      ret = __mnt_want_write(m);
 +      ret = mnt_get_write_access(m);
        if (ret)
                sb_end_write(m->mnt_sb);
        return ret;
  EXPORT_SYMBOL_GPL(mnt_want_write);
  
  /**
 - * __mnt_want_write_file - get write access to a file's mount
 + * mnt_get_write_access_file - get write access to a file's mount
   * @file: the file whose mount to take a write on
   *
 - * This is like __mnt_want_write, but if the file is already open for writing it
 + * This is like mnt_get_write_access, but if @file is already open for write it
   * skips incrementing mnt_writers (since the open file already has a reference)
   * and instead only does the check for emergency r/o remounts.  This must be
 - * paired with __mnt_drop_write_file.
 + * paired with mnt_put_write_access_file.
   */
 -int __mnt_want_write_file(struct file *file)
 +int mnt_get_write_access_file(struct file *file)
  {
        if (file->f_mode & FMODE_WRITER) {
                /*
                        return -EROFS;
                return 0;
        }
 -      return __mnt_want_write(file->f_path.mnt);
 +      return mnt_get_write_access(file->f_path.mnt);
  }
  
  /**
@@@ -446,7 -445,7 +446,7 @@@ int mnt_want_write_file(struct file *fi
        int ret;
  
        sb_start_write(file_inode(file)->i_sb);
 -      ret = __mnt_want_write_file(file);
 +      ret = mnt_get_write_access_file(file);
        if (ret)
                sb_end_write(file_inode(file)->i_sb);
        return ret;
  EXPORT_SYMBOL_GPL(mnt_want_write_file);
  
  /**
 - * __mnt_drop_write - give up write access to a mount
 + * mnt_put_write_access - give up write access to a mount
   * @mnt: the mount on which to give up write access
   *
   * Tells the low-level filesystem that we are done
   * performing writes to it.  Must be matched with
 - * __mnt_want_write() call above.
 + * mnt_get_write_access() call above.
   */
 -void __mnt_drop_write(struct vfsmount *mnt)
 +void mnt_put_write_access(struct vfsmount *mnt)
  {
        preempt_disable();
        mnt_dec_writers(real_mount(mnt));
        preempt_enable();
  }
 +EXPORT_SYMBOL_GPL(mnt_put_write_access);
  
  /**
   * mnt_drop_write - give up write access to a mount
   */
  void mnt_drop_write(struct vfsmount *mnt)
  {
 -      __mnt_drop_write(mnt);
 +      mnt_put_write_access(mnt);
        sb_end_write(mnt->mnt_sb);
  }
  EXPORT_SYMBOL_GPL(mnt_drop_write);
  
 -void __mnt_drop_write_file(struct file *file)
 +void mnt_put_write_access_file(struct file *file)
  {
        if (!(file->f_mode & FMODE_WRITER))
 -              __mnt_drop_write(file->f_path.mnt);
 +              mnt_put_write_access(file->f_path.mnt);
  }
  
  void mnt_drop_write_file(struct file *file)
  {
 -      __mnt_drop_write_file(file);
 +      mnt_put_write_access_file(file);
        sb_end_write(file_inode(file)->i_sb);
  }
  EXPORT_SYMBOL(mnt_drop_write_file);
@@@ -1346,9 -1344,9 +1346,9 @@@ void mntput(struct vfsmount *mnt
  {
        if (mnt) {
                struct mount *m = real_mount(mnt);
 -              /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
 +              /* avoid cacheline pingpong */
                if (unlikely(m->mnt_expiry_mark))
 -                      m->mnt_expiry_mark = 0;
 +                      WRITE_ONCE(m->mnt_expiry_mark, 0);
                mntput_no_expire(m);
        }
  }
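
The fs/namespace.c hunks above rename (and export) the low-level write-access helpers without changing the pairing rules: every successful mnt_get_write_access*() must still be matched by mnt_put_write_access*(), and freeze protection stays with the caller. The typical shape, modelled on __file_update_time() earlier in this diff (the wrapper name is illustrative):

static int example_update_time(struct file *file)
{
	int ret = 0;

	if (!mnt_get_write_access_file(file)) {
		ret = inode_update_time(file_inode(file), S_MTIME | S_CTIME);
		mnt_put_write_access_file(file);
	}
	return ret;
}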
diff --combined fs/notify/dnotify/dnotify.c
index 869b016014d2c2bc3882feac344b0828ce26a50b,7914d223289a9c5e5b3d235e1dd45e20cc095a4d..1cb9ad7e884e1516320b7dd7f01ba686757db159
@@@ -39,9 -39,9 +39,9 @@@ static void __init dnotify_sysctl_init(
  #define dnotify_sysctl_init() do { } while (0)
  #endif
  
- static struct kmem_cache *dnotify_struct_cache __read_mostly;
- static struct kmem_cache *dnotify_mark_cache __read_mostly;
- static struct fsnotify_group *dnotify_group __read_mostly;
+ static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+ static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+ static struct fsnotify_group *dnotify_group __ro_after_init;
  
  /*
   * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@@ -265,7 -265,7 +265,7 @@@ int fcntl_dirnotify(int fd, struct fil
        struct dnotify_struct *dn;
        struct inode *inode;
        fl_owner_t id = current->files;
 -      struct file *f;
 +      struct file *f = NULL;
        int destroy = 0, error = 0;
        __u32 mask;
  
        }
  
        rcu_read_lock();
 -      f = lookup_fd_rcu(fd);
 +      f = lookup_fdget_rcu(fd);
        rcu_read_unlock();
  
        /* if (f != filp) means that we lost a race and another task/thread
@@@ -392,8 -392,6 +392,8 @@@ out_err
                fsnotify_put_mark(new_fsn_mark);
        if (dn)
                kmem_cache_free(dnotify_struct_cache, dn);
 +      if (f)
 +              fput(f);
        return error;
  }
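
The dnotify hunk above moves from lookup_fd_rcu(), which only borrows a pointer valid under RCU, to lookup_fdget_rcu(), which returns a counted reference; hence f starts out NULL and is dropped with fput() on the error path. A minimal sketch of the new ownership rule (the helper name is illustrative):

static bool example_fd_still_matches(unsigned int fd, struct file *filp)
{
	struct file *f;
	bool same;

	rcu_read_lock();
	f = lookup_fdget_rcu(fd);	/* takes a reference, or returns NULL */
	rcu_read_unlock();

	same = (f == filp);
	if (f)
		fput(f);		/* must be paired on every exit path */
	return same;
}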
  
diff --combined fs/notify/fanotify/fanotify_user.c
index 62fe0b679e586ccbe181000fe94ed5e2b142b203,614b435c4a8cfc09283c80811e9a9c0c83386038..45aecdc302f4d9a41fee3cc364a553f2aeb59a3f
@@@ -112,10 -112,10 +112,10 @@@ static void __init fanotify_sysctls_ini
  
  extern const struct fsnotify_ops fanotify_fsnotify_ops;
  
- struct kmem_cache *fanotify_mark_cache __read_mostly;
- struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
- struct kmem_cache *fanotify_path_event_cachep __read_mostly;
- struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+ struct kmem_cache *fanotify_mark_cache __ro_after_init;
+ struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+ struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
  
  #define FANOTIFY_EVENT_ALIGN 4
  #define FANOTIFY_FID_INFO_HDR_LEN \
@@@ -1585,25 -1585,16 +1585,25 @@@ static int fanotify_test_fsid(struct de
  }
  
  /* Check if filesystem can encode a unique fid */
 -static int fanotify_test_fid(struct dentry *dentry)
 +static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
  {
 +      unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
 +      const struct export_operations *nop = dentry->d_sb->s_export_op;
 +
 +      /*
 +       * We need to make sure that the filesystem supports encoding of
 +       * file handles so user can use name_to_handle_at() to compare fids
 +       * reported with events to the file handle of watched objects.
 +       */
 +      if (!nop)
 +              return -EOPNOTSUPP;
 +
        /*
 -       * We need to make sure that the file system supports at least
 -       * encoding a file handle so user can use name_to_handle_at() to
 -       * compare fid returned with event to the file handle of watched
 -       * objects. However, even the relaxed AT_HANDLE_FID flag requires
 -       * at least empty export_operations for ecoding unique file ids.
 +       * For sb/mount mark, we also need to make sure that the filesystem
 +       * supports decoding file handles, so user has a way to map back the
 +       * reported fids to filesystem objects.
         */
 -      if (!dentry->d_sb->s_export_op)
 +      if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
                return -EOPNOTSUPP;
  
        return 0;
@@@ -1821,7 -1812,7 +1821,7 @@@ static int do_fanotify_mark(int fanotif
                if (ret)
                        goto path_put_and_out;
  
 -              ret = fanotify_test_fid(path.dentry);
 +              ret = fanotify_test_fid(path.dentry, flags);
                if (ret)
                        goto path_put_and_out;
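
The fanotify_test_fid() change above is user visible: with FAN_REPORT_FID, an inode mark still only needs the filesystem to encode file handles, while sb and mount marks now also require decode support (export_operations->fh_to_dentry) and otherwise fail with EOPNOTSUPP. A small userspace sketch of the distinction (paths are placeholders, error handling elided):

#include <fcntl.h>
#include <sys/fanotify.h>

int main(void)
{
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, 0);

	/* Inode mark: encode support is enough. */
	fanotify_mark(fd, FAN_MARK_ADD, FAN_CREATE | FAN_ONDIR,
		      AT_FDCWD, "/some/dir");

	/* Filesystem-wide mark: now also needs fh_to_dentry, else EOPNOTSUPP. */
	fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_FILESYSTEM,
		      FAN_CREATE | FAN_ONDIR, AT_FDCWD, "/some/mountpoint");
	return 0;
}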
  
diff --combined fs/ocfs2/alloc.c
index f0937902f7b46e48d51b6dbcd0a46dc1ae5b569a,dea3de833b4781530a64b0c9eb5ff702aa5fecca..91b32b2377acc9336cbe03a5c7be2d4cfe4e9cda
@@@ -967,7 -967,14 +967,14 @@@ int ocfs2_num_free_extents(struct ocfs2
                el = &eb->h_list;
        }
  
-       BUG_ON(el->l_tree_depth != 0);
+       if (el->l_tree_depth != 0) {
+               retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+                               "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+                               (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+                               (unsigned long long)last_eb_blk,
+                               le16_to_cpu(el->l_tree_depth));
+               goto bail;
+       }
  
        retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
  bail:
@@@ -7436,10 -7443,10 +7443,10 @@@ int ocfs2_truncate_inline(struct inode 
        }
  
        inode->i_blocks = ocfs2_inode_sector_count(inode);
 -      inode->i_mtime = inode_set_ctime_current(inode);
 +      inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
  
 -      di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
 -      di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
 +      di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
 +      di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
  
        ocfs2_update_inode_fsync_trans(handle, inode, 1);
        ocfs2_journal_dirty(handle, di_bh);
@@@ -7642,7 -7649,7 +7649,7 @@@ out_mutex
                goto next_group;
        }
  out:
-       range->len = trimmed * sb->s_blocksize;
+       range->len = trimmed * osb->s_clustersize;
        return ret;
  }
  
diff --combined fs/ocfs2/dlmfs/dlmfs.c
index 9b57d012fd5cfe6c76a4d74694bbddcf2addfedf,b38776ba3306876938effbe1724a36bcad0ba55d..85215162c9dd59f25347049f2546c1a1cfd10409
@@@ -80,8 -80,7 +80,7 @@@ static int param_set_dlmfs_capabilities
  static int param_get_dlmfs_capabilities(char *buffer,
                                        const struct kernel_param *kp)
  {
-       return strlcpy(buffer, DLMFS_CAPABILITIES,
-                      strlen(DLMFS_CAPABILITIES) + 1);
+       return sysfs_emit(buffer, DLMFS_CAPABILITIES);
  }
  module_param_call(capabilities, param_set_dlmfs_capabilities,
                  param_get_dlmfs_capabilities, NULL, 0444);
@@@ -337,7 -336,7 +336,7 @@@ static struct inode *dlmfs_get_root_ino
        if (inode) {
                inode->i_ino = get_next_ino();
                inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 -              inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
 +              simple_inode_init_ts(inode);
                inc_nlink(inode);
  
                inode->i_fop = &simple_dir_operations;
@@@ -360,7 -359,7 +359,7 @@@ static struct inode *dlmfs_get_inode(st
  
        inode->i_ino = get_next_ino();
        inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
 -      inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
 +      simple_inode_init_ts(inode);
  
        ip = DLMFS_I(inode);
        ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --combined fs/ocfs2/namei.c
index 681e9501cdd35a8a08ce37aec13a6f7e27325cfa,836c4279a979b8e20625b2529b0bd4368c0f688f..814733ba2f4ba0fba52b985301725fb323efda13
@@@ -795,8 -795,8 +795,8 @@@ static int ocfs2_link(struct dentry *ol
        inc_nlink(inode);
        inode_set_ctime_current(inode);
        ocfs2_set_links_count(fe, inode->i_nlink);
 -      fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
 -      fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
 +      fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
 +      fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
        ocfs2_journal_dirty(handle, fe_bh);
  
        err = ocfs2_add_entry(handle, dentry, inode,
@@@ -995,7 -995,7 +995,7 @@@ static int ocfs2_unlink(struct inode *d
        ocfs2_set_links_count(fe, inode->i_nlink);
        ocfs2_journal_dirty(handle, fe_bh);
  
 -      dir->i_mtime = inode_set_ctime_current(dir);
 +      inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        if (S_ISDIR(inode->i_mode))
                drop_nlink(dir);
  
@@@ -1550,8 -1550,8 +1550,8 @@@ static int ocfs2_rename(struct mnt_idma
        if (status >= 0) {
                old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
  
 -              old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
 -              old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
 +              old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
 +              old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
                ocfs2_journal_dirty(handle, old_inode_bh);
        } else
                mlog_errno(status);
                drop_nlink(new_inode);
                inode_set_ctime_current(new_inode);
        }
 -      old_dir->i_mtime = inode_set_ctime_current(old_dir);
 +      inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
  
        if (update_dot_dot) {
                status = ocfs2_update_entry(old_inode, handle,
                                            &old_inode_dot_dot_res, new_dir);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
                drop_nlink(old_dir);
                if (new_inode) {
                        drop_nlink(new_inode);
  
        if (old_dir != new_dir) {
                /* Keep the same times on both directories.*/
 -              new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
 -                                                       inode_get_ctime(old_dir));
 +              inode_set_mtime_to_ts(new_dir,
 +                                    inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
  
                /*
                 * This will also pick up the i_nlink change from the
                                                         INODE_CACHE(old_dir),
                                                         old_dir_bh,
                                                         OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
                        fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
                        ocfs2_set_links_count(fe, old_dir->i_nlink);
                        ocfs2_journal_dirty(handle, old_dir_bh);
diff --combined fs/pipe.c
index 8916c455a469c18e0b56125dac5f553330938fe7,6b279abf01296525c2f7a177da2be3112ca913ee..804a7d78945217efd3b5394a8a8e6e7d605a937a
+++ b/fs/pipe.c
@@@ -227,36 -227,6 +227,36 @@@ static inline bool pipe_readable(const 
        return !pipe_empty(head, tail) || !writers;
  }
  
 +static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
 +                                          struct pipe_buffer *buf,
 +                                          unsigned int tail)
 +{
 +      pipe_buf_release(pipe, buf);
 +
 +      /*
 +       * If the pipe has a watch_queue, we need additional protection
 +       * by the spinlock because notifications get posted with only
 +       * this spinlock, no mutex
 +       */
 +      if (pipe_has_watch_queue(pipe)) {
 +              spin_lock_irq(&pipe->rd_wait.lock);
 +#ifdef CONFIG_WATCH_QUEUE
 +              if (buf->flags & PIPE_BUF_FLAG_LOSS)
 +                      pipe->note_loss = true;
 +#endif
 +              pipe->tail = ++tail;
 +              spin_unlock_irq(&pipe->rd_wait.lock);
 +              return tail;
 +      }
 +
 +      /*
 +       * Without a watch_queue, we can simply increment the tail
 +       * without the spinlock - the mutex is enough.
 +       */
 +      pipe->tail = ++tail;
 +      return tail;
 +}
 +
  static ssize_t
  pipe_read(struct kiocb *iocb, struct iov_iter *to)
  {
                                buf->len = 0;
                        }
  
 -                      if (!buf->len) {
 -                              pipe_buf_release(pipe, buf);
 -                              spin_lock_irq(&pipe->rd_wait.lock);
 -#ifdef CONFIG_WATCH_QUEUE
 -                              if (buf->flags & PIPE_BUF_FLAG_LOSS)
 -                                      pipe->note_loss = true;
 -#endif
 -                              tail++;
 -                              pipe->tail = tail;
 -                              spin_unlock_irq(&pipe->rd_wait.lock);
 -                      }
 +                      if (!buf->len)
 +                              tail = pipe_update_tail(pipe, buf, tail);
                        total_len -= chars;
                        if (!total_len)
                                break;  /* common path: read succeeded */
@@@ -458,10 -437,12 +458,10 @@@ pipe_write(struct kiocb *iocb, struct i
                goto out;
        }
  
 -#ifdef CONFIG_WATCH_QUEUE
 -      if (pipe->watch_queue) {
 +      if (pipe_has_watch_queue(pipe)) {
                ret = -EXDEV;
                goto out;
        }
 -#endif
  
        /*
         * If it wasn't empty we try to merge new data into
                         * it, either the reader will consume it or it'll still
                         * be there for the next write.
                         */
 -                      spin_lock_irq(&pipe->rd_wait.lock);
 -
 -                      head = pipe->head;
 -                      if (pipe_full(head, pipe->tail, pipe->max_usage)) {
 -                              spin_unlock_irq(&pipe->rd_wait.lock);
 -                              continue;
 -                      }
 -
                        pipe->head = head + 1;
 -                      spin_unlock_irq(&pipe->rd_wait.lock);
  
                        /* Insert it into the buffer array */
                        buf = &pipe->bufs[head & mask];
@@@ -864,7 -854,7 +864,7 @@@ void free_pipe_info(struct pipe_inode_i
        kfree(pipe);
  }
  
- static struct vfsmount *pipe_mnt __read_mostly;
+ static struct vfsmount *pipe_mnt __ro_after_init;
  
  /*
   * pipefs_dname() is called from d_path().
@@@ -908,7 -898,7 +908,7 @@@ static struct inode * get_pipe_inode(vo
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
 -      inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
 +      simple_inode_init_ts(inode);
  
        return inode;
  
@@@ -1334,8 -1324,10 +1334,8 @@@ static long pipe_set_size(struct pipe_i
        unsigned int nr_slots, size;
        long ret = 0;
  
 -#ifdef CONFIG_WATCH_QUEUE
 -      if (pipe->watch_queue)
 +      if (pipe_has_watch_queue(pipe))
                return -EBUSY;
 -#endif
  
        size = round_pipe_size(arg);
        nr_slots = size >> PAGE_SHIFT;
@@@ -1387,8 -1379,10 +1387,8 @@@ struct pipe_inode_info *get_pipe_info(s
  
        if (file->f_op != &pipefifo_fops || !pipe)
                return NULL;
 -#ifdef CONFIG_WATCH_QUEUE
 -      if (for_splice && pipe->watch_queue)
 +      if (for_splice && pipe_has_watch_queue(pipe))
                return NULL;
 -#endif
        return pipe;
  }
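
pipe_has_watch_queue(), used throughout the fs/pipe.c hunks above, is not part of this combined diff; it is presumably a small inline wrapper alongside the other pipe helpers (include/linux/pipe_fs_i.h), roughly like the sketch below, which is what lets the open-coded #ifdef CONFIG_WATCH_QUEUE blocks disappear:

static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
	return pipe->watch_queue != NULL;
#else
	return false;
#endif
}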
  
diff --combined fs/proc/base.c
index 83396ab149985496eb3a20092a1f43720ab7670c,b13d3e804debaea4487e0cedba091cf9ad328625..dd31e3b6bf77cc84d6e05a223e989bc3ea3bb6f7
@@@ -1153,11 -1153,10 +1153,10 @@@ err_unlock
  static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
  {
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
        int oom_adj;
        int err;
  
-       memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count)) {
@@@ -1213,11 -1212,10 +1212,10 @@@ static ssize_t oom_score_adj_read(struc
  static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
  {
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
        int oom_score_adj;
        int err;
  
-       memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count)) {
@@@ -1358,13 -1356,13 +1356,13 @@@ static ssize_t proc_fault_inject_write(
                        const char __user * buf, size_t count, loff_t *ppos)
  {
        struct task_struct *task;
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
        int make_it_fail;
        int rv;
  
        if (!capable(CAP_SYS_RESOURCE))
                return -EPERM;
-       memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
@@@ -1509,11 -1507,10 +1507,10 @@@ sched_autogroup_write(struct file *file
  {
        struct inode *inode = file_inode(file);
        struct task_struct *p;
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
        int nice;
        int err;
  
-       memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
@@@ -1666,10 -1663,9 +1663,9 @@@ static ssize_t comm_write(struct file *
  {
        struct inode *inode = file_inode(file);
        struct task_struct *p;
-       char buffer[TASK_COMM_LEN];
+       char buffer[TASK_COMM_LEN] = {};
        const size_t maxlen = sizeof(buffer) - 1;
  
-       memset(buffer, 0, sizeof(buffer));
        if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
                return -EFAULT;
  
@@@ -1902,7 -1898,7 +1898,7 @@@ struct inode *proc_pid_make_inode(struc
        ei = PROC_I(inode);
        inode->i_mode = mode;
        inode->i_ino = get_next_ino();
 -      inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
 +      simple_inode_init_ts(inode);
        inode->i_op = &proc_def_inode_operations;
  
        /*
@@@ -2218,7 -2214,7 +2214,7 @@@ static int map_files_get_link(struct de
        rc = -ENOENT;
        vma = find_exact_vma(mm, vm_start, vm_end);
        if (vma && vma->vm_file) {
 -              *path = vma->vm_file->f_path;
 +              *path = *file_user_path(vma->vm_file);
                path_get(path);
                rc = 0;
        }
@@@ -2976,8 -2972,7 +2972,7 @@@ static const struct file_operations pro
  #ifdef CONFIG_TASK_IO_ACCOUNTING
  static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
  {
-       struct task_io_accounting acct = task->ioac;
-       unsigned long flags;
+       struct task_io_accounting acct;
        int result;
  
        result = down_read_killable(&task->signal->exec_update_lock);
                goto out_unlock;
        }
  
-       if (whole && lock_task_sighand(task, &flags)) {
-               struct task_struct *t = task;
+       if (whole) {
+               struct signal_struct *sig = task->signal;
+               struct task_struct *t;
+               unsigned int seq = 1;
+               unsigned long flags;
+               rcu_read_lock();
+               do {
+                       seq++; /* 2 on the 1st/lockless path, otherwise odd */
+                       flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
  
-               task_io_accounting_add(&acct, &task->signal->ioac);
-               while_each_thread(task, t)
-                       task_io_accounting_add(&acct, &t->ioac);
+                       acct = sig->ioac;
+                       __for_each_thread(sig, t)
+                               task_io_accounting_add(&acct, &t->ioac);
  
-               unlock_task_sighand(task, &flags);
+               } while (need_seqretry(&sig->stats_lock, seq));
+               done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+               rcu_read_unlock();
+       } else {
+               acct = task->ioac;
        }
        seq_printf(m,
                   "rchar: %llu\n"
                   "wchar: %llu\n"
@@@ -3818,7 -3826,7 +3826,7 @@@ static struct task_struct *first_tid(st
        for_each_thread(task, pos) {
                if (!nr--)
                        goto found;
-       };
+       }
  fail:
        pos = NULL;
        goto out;
@@@ -3840,10 -3848,8 +3848,8 @@@ static struct task_struct *next_tid(str
        struct task_struct *pos = NULL;
        rcu_read_lock();
        if (pid_alive(start)) {
-               pos = next_thread(start);
-               if (thread_group_leader(pos))
-                       pos = NULL;
-               else
+               pos = __next_thread(start);
+               if (pos)
                        get_task_struct(pos);
        }
        rcu_read_unlock();
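
The do_io_accounting() rework above uses the seqlock "read or lock" idiom on sig->stats_lock: the first pass (even sequence) reads locklessly and retries if a writer interfered, and the retry (odd sequence) takes the lock for an exact snapshot. The generic shape of that idiom, reduced to a single counter (function and parameter names are illustrative):

static u64 example_read_stat(struct signal_struct *sig, const u64 *counter)
{
	unsigned int seq = 1;
	unsigned long flags;
	u64 val;

	do {
		seq++;	/* 2 on the first, lockless pass; odd once locked */
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		val = READ_ONCE(*counter);	/* snapshot the protected state */
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);

	return val;
}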
diff --combined fs/proc/inode.c
index 592ed2516f47881458b275951412072ab68d3217,5933c78af6de19b67bb486fd4bdb7f2b8efe2b1c..b33e490e3fd9f88f569e3453d603041e665cf6bf
@@@ -110,18 -110,15 +110,15 @@@ void __init proc_init_kmemcache(void
  
  void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
  {
-       struct inode *inode;
-       struct proc_inode *ei;
        struct hlist_node *node;
        struct super_block *old_sb = NULL;
  
        rcu_read_lock();
-       for (;;) {
+       while ((node = hlist_first_rcu(inodes))) {
+               struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
                struct super_block *sb;
-               node = hlist_first_rcu(inodes);
-               if (!node)
-                       break;
-               ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+               struct inode *inode;
                spin_lock(lock);
                hlist_del_init_rcu(&ei->sibling_inodes);
                spin_unlock(lock);
@@@ -660,7 -657,7 +657,7 @@@ struct inode *proc_get_inode(struct sup
  
        inode->i_private = de->data;
        inode->i_ino = de->low_ino;
 -      inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
 +      simple_inode_init_ts(inode);
        PROC_I(inode)->pde = de;
        if (is_empty_pde(de)) {
                make_empty_dir_inode(inode);
diff --combined fs/proc/task_mmu.c
index 4abd51053f76d92d0998a4e07f5e021a263c1026,d4d55d5bae51b016883eb6e4ff0feceb462fd620..ef2eb12906da88c6fe3a227e82598020f0badc44
@@@ -20,8 -20,6 +20,8 @@@
  #include <linux/shmem_fs.h>
  #include <linux/uaccess.h>
  #include <linux/pkeys.h>
 +#include <linux/minmax.h>
 +#include <linux/overflow.h>
  
  #include <asm/elf.h>
  #include <asm/tlb.h>
@@@ -298,7 -296,7 +298,7 @@@ show_map_vma(struct seq_file *m, struc
                if (anon_name)
                        seq_printf(m, "[anon_shmem:%s]", anon_name->name);
                else
 -                      seq_file_path(m, file, "\n");
 +                      seq_path(m, file_user_path(file), "\n");
                goto done;
        }
  
@@@ -851,9 -849,7 +851,7 @@@ static void __show_smap(struct seq_fil
  static int show_smap(struct seq_file *m, void *v)
  {
        struct vm_area_struct *vma = v;
-       struct mem_size_stats mss;
-       memset(&mss, 0, sizeof(mss));
+       struct mem_size_stats mss = {};
  
        smap_gather_stats(vma, &mss, 0);
  
  static int show_smaps_rollup(struct seq_file *m, void *v)
  {
        struct proc_maps_private *priv = m->private;
-       struct mem_size_stats mss;
+       struct mem_size_stats mss = {};
        struct mm_struct *mm = priv->mm;
        struct vm_area_struct *vma;
        unsigned long vma_start = 0, last_vma_end = 0;
                goto out_put_task;
        }
  
-       memset(&mss, 0, sizeof(mss));
        ret = mmap_read_lock_killable(mm);
        if (ret)
                goto out_put_mm;
@@@ -1248,14 -1242,13 +1244,13 @@@ static ssize_t clear_refs_write(struct 
                                size_t count, loff_t *ppos)
  {
        struct task_struct *task;
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        enum clear_refs_types type;
        int itype;
        int rv;
  
-       memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
                count = sizeof(buffer) - 1;
        if (copy_from_user(buffer, buf, count))
@@@ -1763,737 -1756,11 +1758,737 @@@ static int pagemap_release(struct inod
        return 0;
  }
  
 +#define PM_SCAN_CATEGORIES    (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |  \
 +                               PAGE_IS_FILE | PAGE_IS_PRESENT |       \
 +                               PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |    \
 +                               PAGE_IS_HUGE)
 +#define PM_SCAN_FLAGS         (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
 +
 +struct pagemap_scan_private {
 +      struct pm_scan_arg arg;
 +      unsigned long masks_of_interest, cur_vma_category;
 +      struct page_region *vec_buf;
 +      unsigned long vec_buf_len, vec_buf_index, found_pages;
 +      struct page_region __user *vec_out;
 +};
 +
 +static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 +                                         struct vm_area_struct *vma,
 +                                         unsigned long addr, pte_t pte)
 +{
 +      unsigned long categories = 0;
 +
 +      if (pte_present(pte)) {
 +              struct page *page;
 +
 +              categories |= PAGE_IS_PRESENT;
 +              if (!pte_uffd_wp(pte))
 +                      categories |= PAGE_IS_WRITTEN;
 +
 +              if (p->masks_of_interest & PAGE_IS_FILE) {
 +                      page = vm_normal_page(vma, addr, pte);
 +                      if (page && !PageAnon(page))
 +                              categories |= PAGE_IS_FILE;
 +              }
 +
 +              if (is_zero_pfn(pte_pfn(pte)))
 +                      categories |= PAGE_IS_PFNZERO;
 +      } else if (is_swap_pte(pte)) {
 +              swp_entry_t swp;
 +
 +              categories |= PAGE_IS_SWAPPED;
 +              if (!pte_swp_uffd_wp_any(pte))
 +                      categories |= PAGE_IS_WRITTEN;
 +
 +              if (p->masks_of_interest & PAGE_IS_FILE) {
 +                      swp = pte_to_swp_entry(pte);
 +                      if (is_pfn_swap_entry(swp) &&
 +                          !PageAnon(pfn_swap_entry_to_page(swp)))
 +                              categories |= PAGE_IS_FILE;
 +              }
 +      }
 +
 +      return categories;
 +}
 +
 +static void make_uffd_wp_pte(struct vm_area_struct *vma,
 +                           unsigned long addr, pte_t *pte)
 +{
 +      pte_t ptent = ptep_get(pte);
 +
 +      if (pte_present(ptent)) {
 +              pte_t old_pte;
 +
 +              old_pte = ptep_modify_prot_start(vma, addr, pte);
 +              ptent = pte_mkuffd_wp(ptent);
 +              ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
 +      } else if (is_swap_pte(ptent)) {
 +              ptent = pte_swp_mkuffd_wp(ptent);
 +              set_pte_at(vma->vm_mm, addr, pte, ptent);
 +      } else {
 +              set_pte_at(vma->vm_mm, addr, pte,
 +                         make_pte_marker(PTE_MARKER_UFFD_WP));
 +      }
 +}
 +
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 +                                        struct vm_area_struct *vma,
 +                                        unsigned long addr, pmd_t pmd)
 +{
 +      unsigned long categories = PAGE_IS_HUGE;
 +
 +      if (pmd_present(pmd)) {
 +              struct page *page;
 +
 +              categories |= PAGE_IS_PRESENT;
 +              if (!pmd_uffd_wp(pmd))
 +                      categories |= PAGE_IS_WRITTEN;
 +
 +              if (p->masks_of_interest & PAGE_IS_FILE) {
 +                      page = vm_normal_page_pmd(vma, addr, pmd);
 +                      if (page && !PageAnon(page))
 +                              categories |= PAGE_IS_FILE;
 +              }
 +
 +              if (is_zero_pfn(pmd_pfn(pmd)))
 +                      categories |= PAGE_IS_PFNZERO;
 +      } else if (is_swap_pmd(pmd)) {
 +              swp_entry_t swp;
 +
 +              categories |= PAGE_IS_SWAPPED;
 +              if (!pmd_swp_uffd_wp(pmd))
 +                      categories |= PAGE_IS_WRITTEN;
 +
 +              if (p->masks_of_interest & PAGE_IS_FILE) {
 +                      swp = pmd_to_swp_entry(pmd);
 +                      if (is_pfn_swap_entry(swp) &&
 +                          !PageAnon(pfn_swap_entry_to_page(swp)))
 +                              categories |= PAGE_IS_FILE;
 +              }
 +      }
 +
 +      return categories;
 +}
 +
 +static void make_uffd_wp_pmd(struct vm_area_struct *vma,
 +                           unsigned long addr, pmd_t *pmdp)
 +{
 +      pmd_t old, pmd = *pmdp;
 +
 +      if (pmd_present(pmd)) {
 +              old = pmdp_invalidate_ad(vma, addr, pmdp);
 +              pmd = pmd_mkuffd_wp(old);
 +              set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 +      } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
 +              pmd = pmd_swp_mkuffd_wp(pmd);
 +              set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 +      }
 +}
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 +
 +#ifdef CONFIG_HUGETLB_PAGE
 +static unsigned long pagemap_hugetlb_category(pte_t pte)
 +{
 +      unsigned long categories = PAGE_IS_HUGE;
 +
 +      /*
 +       * According to pagemap_hugetlb_range(), file-backed HugeTLB
 +       * page cannot be swapped. So PAGE_IS_FILE is not checked for
 +       * swapped pages.
 +       */
 +      if (pte_present(pte)) {
 +              categories |= PAGE_IS_PRESENT;
 +              if (!huge_pte_uffd_wp(pte))
 +                      categories |= PAGE_IS_WRITTEN;
 +              if (!PageAnon(pte_page(pte)))
 +                      categories |= PAGE_IS_FILE;
 +              if (is_zero_pfn(pte_pfn(pte)))
 +                      categories |= PAGE_IS_PFNZERO;
 +      } else if (is_swap_pte(pte)) {
 +              categories |= PAGE_IS_SWAPPED;
 +              if (!pte_swp_uffd_wp_any(pte))
 +                      categories |= PAGE_IS_WRITTEN;
 +      }
 +
 +      return categories;
 +}
 +
 +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
 +                                unsigned long addr, pte_t *ptep,
 +                                pte_t ptent)
 +{
 +      unsigned long psize;
 +
 +      if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
 +              return;
 +
 +      psize = huge_page_size(hstate_vma(vma));
 +
 +      if (is_hugetlb_entry_migration(ptent))
 +              set_huge_pte_at(vma->vm_mm, addr, ptep,
 +                              pte_swp_mkuffd_wp(ptent), psize);
 +      else if (!huge_pte_none(ptent))
 +              huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
 +                                           huge_pte_mkuffd_wp(ptent));
 +      else
 +              set_huge_pte_at(vma->vm_mm, addr, ptep,
 +                              make_pte_marker(PTE_MARKER_UFFD_WP), psize);
 +}
 +#endif /* CONFIG_HUGETLB_PAGE */
 +
 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
 +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
 +                                     unsigned long addr, unsigned long end)
 +{
 +      struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
 +
 +      if (cur_buf->start != addr)
 +              cur_buf->end = addr;
 +      else
 +              cur_buf->start = cur_buf->end = 0;
 +
 +      p->found_pages -= (end - addr) / PAGE_SIZE;
 +}
 +#endif
 +
 +static bool pagemap_scan_is_interesting_page(unsigned long categories,
 +                                           const struct pagemap_scan_private *p)
 +{
 +      categories ^= p->arg.category_inverted;
 +      if ((categories & p->arg.category_mask) != p->arg.category_mask)
 +              return false;
 +      if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask))
 +              return false;
 +
 +      return true;
 +}
 +
 +static bool pagemap_scan_is_interesting_vma(unsigned long categories,
 +                                          const struct pagemap_scan_private *p)
 +{
 +      unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
 +
 +      categories ^= p->arg.category_inverted;
 +      if ((categories & required) != required)
 +              return false;
 +
 +      return true;
 +}
 +
 +static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
 +                                struct mm_walk *walk)
 +{
 +      struct pagemap_scan_private *p = walk->private;
 +      struct vm_area_struct *vma = walk->vma;
 +      unsigned long vma_category = 0;
 +
 +      if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma))
 +              vma_category |= PAGE_IS_WPALLOWED;
 +      else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
 +              return -EPERM;
 +
 +      if (vma->vm_flags & VM_PFNMAP)
 +              return 1;
 +
 +      if (!pagemap_scan_is_interesting_vma(vma_category, p))
 +              return 1;
 +
 +      p->cur_vma_category = vma_category;
 +
 +      return 0;
 +}
 +
 +static bool pagemap_scan_push_range(unsigned long categories,
 +                                  struct pagemap_scan_private *p,
 +                                  unsigned long addr, unsigned long end)
 +{
 +      struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
 +
 +      /*
 +       * When there is no output buffer provided at all, the sentinel values
 +       * won't match here: `cur_buf->end` can only be non-zero when the
 +       * entry is non-empty.
 +       */
 +      if (addr == cur_buf->end && categories == cur_buf->categories) {
 +              cur_buf->end = end;
 +              return true;
 +      }
 +
 +      if (cur_buf->end) {
 +              if (p->vec_buf_index >= p->vec_buf_len - 1)
 +                      return false;
 +
 +              cur_buf = &p->vec_buf[++p->vec_buf_index];
 +      }
 +
 +      cur_buf->start = addr;
 +      cur_buf->end = end;
 +      cur_buf->categories = categories;
 +
 +      return true;
 +}
 +
 +static int pagemap_scan_output(unsigned long categories,
 +                             struct pagemap_scan_private *p,
 +                             unsigned long addr, unsigned long *end)
 +{
 +      unsigned long n_pages, total_pages;
 +      int ret = 0;
 +
 +      if (!p->vec_buf)
 +              return 0;
 +
 +      categories &= p->arg.return_mask;
 +
 +      n_pages = (*end - addr) / PAGE_SIZE;
 +      if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
 +          total_pages > p->arg.max_pages) {
 +              size_t n_too_much = total_pages - p->arg.max_pages;
 +              *end -= n_too_much * PAGE_SIZE;
 +              n_pages -= n_too_much;
 +              ret = -ENOSPC;
 +      }
 +
 +      if (!pagemap_scan_push_range(categories, p, addr, *end)) {
 +              *end = addr;
 +              n_pages = 0;
 +              ret = -ENOSPC;
 +      }
 +
 +      p->found_pages += n_pages;
 +      if (ret)
 +              p->arg.walk_end = *end;
 +
 +      return ret;
 +}
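
pagemap_scan_output() trims the reported range so found_pages never exceeds max_pages, using check_add_overflow() so the comparison stays valid even if the addition wraps; the kernel helper wraps the compiler builtin. A small userspace restatement of the overflow-safe limit check (names here are illustrative):

#include <stdbool.h>
#include <stddef.h>

/* Returns true when "found + n_pages" fits under "max"; the builtin
 * reports wraparound, so an overflowed sum can never pass the limit. */
static bool fits_under_limit(size_t found, size_t n_pages, size_t max,
			     size_t *total)
{
	return !__builtin_add_overflow(found, n_pages, total) && *total <= max;
}
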
 +
 +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
 +                                unsigned long end, struct mm_walk *walk)
 +{
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +      struct pagemap_scan_private *p = walk->private;
 +      struct vm_area_struct *vma = walk->vma;
 +      unsigned long categories;
 +      spinlock_t *ptl;
 +      int ret = 0;
 +
 +      ptl = pmd_trans_huge_lock(pmd, vma);
 +      if (!ptl)
 +              return -ENOENT;
 +
 +      categories = p->cur_vma_category |
 +                   pagemap_thp_category(p, vma, start, *pmd);
 +
 +      if (!pagemap_scan_is_interesting_page(categories, p))
 +              goto out_unlock;
 +
 +      ret = pagemap_scan_output(categories, p, start, &end);
 +      if (start == end)
 +              goto out_unlock;
 +
 +      if (~p->arg.flags & PM_SCAN_WP_MATCHING)
 +              goto out_unlock;
 +      if (~categories & PAGE_IS_WRITTEN)
 +              goto out_unlock;
 +
 +      /*
 +       * Break huge page into small pages if the WP operation
 +       * needs to be performed on a portion of the huge page.
 +       */
 +      if (end != start + HPAGE_SIZE) {
 +              spin_unlock(ptl);
 +              split_huge_pmd(vma, pmd, start);
 +              pagemap_scan_backout_range(p, start, end);
 +              /* Report as if there was no THP */
 +              return -ENOENT;
 +      }
 +
 +      make_uffd_wp_pmd(vma, start, pmd);
 +      flush_tlb_range(vma, start, end);
 +out_unlock:
 +      spin_unlock(ptl);
 +      return ret;
 +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
 +      return -ENOENT;
 +#endif
 +}
 +
 +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
 +                                unsigned long end, struct mm_walk *walk)
 +{
 +      struct pagemap_scan_private *p = walk->private;
 +      struct vm_area_struct *vma = walk->vma;
 +      unsigned long addr, flush_end = 0;
 +      pte_t *pte, *start_pte;
 +      spinlock_t *ptl;
 +      int ret;
 +
 +      arch_enter_lazy_mmu_mode();
 +
 +      ret = pagemap_scan_thp_entry(pmd, start, end, walk);
 +      if (ret != -ENOENT) {
 +              arch_leave_lazy_mmu_mode();
 +              return ret;
 +      }
 +
 +      ret = 0;
 +      start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
 +      if (!pte) {
 +              arch_leave_lazy_mmu_mode();
 +              walk->action = ACTION_AGAIN;
 +              return 0;
 +      }
 +
 +      if (!p->vec_out) {
 +              /* Fast path for performing exclusive WP */
 +              for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
 +                      if (pte_uffd_wp(ptep_get(pte)))
 +                              continue;
 +                      make_uffd_wp_pte(vma, addr, pte);
 +                      if (!flush_end)
 +                              start = addr;
 +                      flush_end = addr + PAGE_SIZE;
 +              }
 +              goto flush_and_return;
 +      }
 +
 +      if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
 +          p->arg.category_mask == PAGE_IS_WRITTEN &&
 +          p->arg.return_mask == PAGE_IS_WRITTEN) {
 +              for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
 +                      unsigned long next = addr + PAGE_SIZE;
 +
 +                      if (pte_uffd_wp(ptep_get(pte)))
 +                              continue;
 +                      ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
 +                                                p, addr, &next);
 +                      if (next == addr)
 +                              break;
 +                      if (~p->arg.flags & PM_SCAN_WP_MATCHING)
 +                              continue;
 +                      make_uffd_wp_pte(vma, addr, pte);
 +                      if (!flush_end)
 +                              start = addr;
 +                      flush_end = next;
 +              }
 +              goto flush_and_return;
 +      }
 +
 +      for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
 +              unsigned long categories = p->cur_vma_category |
 +                                         pagemap_page_category(p, vma, addr, ptep_get(pte));
 +              unsigned long next = addr + PAGE_SIZE;
 +
 +              if (!pagemap_scan_is_interesting_page(categories, p))
 +                      continue;
 +
 +              ret = pagemap_scan_output(categories, p, addr, &next);
 +              if (next == addr)
 +                      break;
 +
 +              if (~p->arg.flags & PM_SCAN_WP_MATCHING)
 +                      continue;
 +              if (~categories & PAGE_IS_WRITTEN)
 +                      continue;
 +
 +              make_uffd_wp_pte(vma, addr, pte);
 +              if (!flush_end)
 +                      start = addr;
 +              flush_end = next;
 +      }
 +
 +flush_and_return:
 +      if (flush_end)
 +              flush_tlb_range(vma, start, addr);
 +
 +      pte_unmap_unlock(start_pte, ptl);
 +      arch_leave_lazy_mmu_mode();
 +
 +      cond_resched();
 +      return ret;
 +}
 +
 +#ifdef CONFIG_HUGETLB_PAGE
 +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
 +                                    unsigned long start, unsigned long end,
 +                                    struct mm_walk *walk)
 +{
 +      struct pagemap_scan_private *p = walk->private;
 +      struct vm_area_struct *vma = walk->vma;
 +      unsigned long categories;
 +      spinlock_t *ptl;
 +      int ret = 0;
 +      pte_t pte;
 +
 +      if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
 +              /* Go the short route when not write-protecting pages. */
 +
 +              pte = huge_ptep_get(ptep);
 +              categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
 +
 +              if (!pagemap_scan_is_interesting_page(categories, p))
 +                      return 0;
 +
 +              return pagemap_scan_output(categories, p, start, &end);
 +      }
 +
 +      i_mmap_lock_write(vma->vm_file->f_mapping);
 +      ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
 +
 +      pte = huge_ptep_get(ptep);
 +      categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
 +
 +      if (!pagemap_scan_is_interesting_page(categories, p))
 +              goto out_unlock;
 +
 +      ret = pagemap_scan_output(categories, p, start, &end);
 +      if (start == end)
 +              goto out_unlock;
 +
 +      if (~categories & PAGE_IS_WRITTEN)
 +              goto out_unlock;
 +
 +      if (end != start + HPAGE_SIZE) {
 +              /* Partial HugeTLB page WP isn't possible. */
 +              pagemap_scan_backout_range(p, start, end);
 +              p->arg.walk_end = start;
 +              ret = 0;
 +              goto out_unlock;
 +      }
 +
 +      make_uffd_wp_huge_pte(vma, start, ptep, pte);
 +      flush_hugetlb_tlb_range(vma, start, end);
 +
 +out_unlock:
 +      spin_unlock(ptl);
 +      i_mmap_unlock_write(vma->vm_file->f_mapping);
 +
 +      return ret;
 +}
 +#else
 +#define pagemap_scan_hugetlb_entry NULL
 +#endif
 +
 +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
 +                               int depth, struct mm_walk *walk)
 +{
 +      struct pagemap_scan_private *p = walk->private;
 +      struct vm_area_struct *vma = walk->vma;
 +      int ret, err;
 +
 +      if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p))
 +              return 0;
 +
 +      ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
 +      if (addr == end)
 +              return ret;
 +
 +      if (~p->arg.flags & PM_SCAN_WP_MATCHING)
 +              return ret;
 +
 +      err = uffd_wp_range(vma, addr, end - addr, true);
 +      if (err < 0)
 +              ret = err;
 +
 +      return ret;
 +}
 +
 +static const struct mm_walk_ops pagemap_scan_ops = {
 +      .test_walk = pagemap_scan_test_walk,
 +      .pmd_entry = pagemap_scan_pmd_entry,
 +      .pte_hole = pagemap_scan_pte_hole,
 +      .hugetlb_entry = pagemap_scan_hugetlb_entry,
 +};
 +
 +static int pagemap_scan_get_args(struct pm_scan_arg *arg,
 +                               unsigned long uarg)
 +{
 +      if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
 +              return -EFAULT;
 +
 +      if (arg->size != sizeof(struct pm_scan_arg))
 +              return -EINVAL;
 +
 +      /* Validate requested features */
 +      if (arg->flags & ~PM_SCAN_FLAGS)
 +              return -EINVAL;
 +      if ((arg->category_inverted | arg->category_mask |
 +           arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
 +              return -EINVAL;
 +
 +      arg->start = untagged_addr((unsigned long)arg->start);
 +      arg->end = untagged_addr((unsigned long)arg->end);
 +      arg->vec = untagged_addr((unsigned long)arg->vec);
 +
 +      /* Validate memory pointers */
 +      if (!IS_ALIGNED(arg->start, PAGE_SIZE))
 +              return -EINVAL;
 +      if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
 +              return -EFAULT;
 +      if (!arg->vec && arg->vec_len)
 +              return -EINVAL;
 +      if (arg->vec && !access_ok((void __user *)(long)arg->vec,
 +                            arg->vec_len * sizeof(struct page_region)))
 +              return -EFAULT;
 +
 +      /* Fixup default values */
 +      arg->end = ALIGN(arg->end, PAGE_SIZE);
 +      arg->walk_end = 0;
 +      if (!arg->max_pages)
 +              arg->max_pages = ULONG_MAX;
 +
 +      return 0;
 +}
 +
 +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
 +                                     unsigned long uargl)
 +{
 +      struct pm_scan_arg __user *uarg = (void __user *)uargl;
 +
 +      if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
 +              return -EFAULT;
 +
 +      return 0;
 +}
 +
 +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
 +{
 +      if (!p->arg.vec_len)
 +              return 0;
 +
 +      p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
 +                             p->arg.vec_len);
 +      p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
 +                                 GFP_KERNEL);
 +      if (!p->vec_buf)
 +              return -ENOMEM;
 +
 +      p->vec_buf->start = p->vec_buf->end = 0;
 +      p->vec_out = (struct page_region __user *)(long)p->arg.vec;
 +
 +      return 0;
 +}
 +
 +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
 +{
 +      const struct page_region *buf = p->vec_buf;
 +      long n = p->vec_buf_index;
 +
 +      if (!p->vec_buf)
 +              return 0;
 +
 +      if (buf[n].end != buf[n].start)
 +              n++;
 +
 +      if (!n)
 +              return 0;
 +
 +      if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
 +              return -EFAULT;
 +
 +      p->arg.vec_len -= n;
 +      p->vec_out += n;
 +
 +      p->vec_buf_index = 0;
 +      p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
 +      p->vec_buf->start = p->vec_buf->end = 0;
 +
 +      return n;
 +}
 +
 +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
 +{
 +      struct mmu_notifier_range range;
 +      struct pagemap_scan_private p = {0};
 +      unsigned long walk_start;
 +      size_t n_ranges_out = 0;
 +      int ret;
 +
 +      ret = pagemap_scan_get_args(&p.arg, uarg);
 +      if (ret)
 +              return ret;
 +
 +      p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
 +                            p.arg.return_mask;
 +      ret = pagemap_scan_init_bounce_buffer(&p);
 +      if (ret)
 +              return ret;
 +
 +      /* Protection change for the range is going to happen. */
 +      if (p.arg.flags & PM_SCAN_WP_MATCHING) {
 +              mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
 +                                      mm, p.arg.start, p.arg.end);
 +              mmu_notifier_invalidate_range_start(&range);
 +      }
 +
 +      for (walk_start = p.arg.start; walk_start < p.arg.end;
 +                      walk_start = p.arg.walk_end) {
 +              long n_out;
 +
 +              if (fatal_signal_pending(current)) {
 +                      ret = -EINTR;
 +                      break;
 +              }
 +
 +              ret = mmap_read_lock_killable(mm);
 +              if (ret)
 +                      break;
 +              ret = walk_page_range(mm, walk_start, p.arg.end,
 +                                    &pagemap_scan_ops, &p);
 +              mmap_read_unlock(mm);
 +
 +              n_out = pagemap_scan_flush_buffer(&p);
 +              if (n_out < 0)
 +                      ret = n_out;
 +              else
 +                      n_ranges_out += n_out;
 +
 +              if (ret != -ENOSPC)
 +                      break;
 +
 +              if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
 +                      break;
 +      }
 +
 +      /* ENOSPC signifies early stop (buffer full) from the walk. */
 +      if (!ret || ret == -ENOSPC)
 +              ret = n_ranges_out;
 +
 +      /* The walk_end isn't set when ret is zero */
 +      if (!p.arg.walk_end)
 +              p.arg.walk_end = p.arg.end;
 +      if (pagemap_scan_writeback_args(&p.arg, uarg))
 +              ret = -EFAULT;
 +
 +      if (p.arg.flags & PM_SCAN_WP_MATCHING)
 +              mmu_notifier_invalidate_range_end(&range);
 +
 +      kfree(p.vec_buf);
 +      return ret;
 +}
 +
 +static long do_pagemap_cmd(struct file *file, unsigned int cmd,
 +                         unsigned long arg)
 +{
 +      struct mm_struct *mm = file->private_data;
 +
 +      switch (cmd) {
 +      case PAGEMAP_SCAN:
 +              return do_pagemap_scan(mm, arg);
 +
 +      default:
 +              return -EINVAL;
 +      }
 +}
 +
  const struct file_operations proc_pagemap_operations = {
        .llseek         = mem_lseek, /* borrow this */
        .read           = pagemap_read,
        .open           = pagemap_open,
        .release        = pagemap_release,
 +      .unlocked_ioctl = do_pagemap_cmd,
 +      .compat_ioctl   = do_pagemap_cmd,
  };
  #endif /* CONFIG_PROC_PAGE_MONITOR */
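
With do_pagemap_cmd() wired into proc_pagemap_operations, the new PAGEMAP_SCAN ioctl is reachable from an ordinary /proc/<pid>/pagemap file descriptor. A hedged userspace sketch of a caller that only asks for written ranges; the struct and constant names come from the uapi added by this series, while the <linux/fs.h> header path and the error handling are assumptions of this illustration:

#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* assumed home of pm_scan_arg/PAGEMAP_SCAN */

/* pagemap_fd is an already-open /proc/<pid>/pagemap descriptor. */
static int dump_written_ranges(int pagemap_fd, uint64_t start, uint64_t end)
{
	struct page_region regions[64];
	struct pm_scan_arg arg = {
		.size = sizeof(arg),
		.start = start,
		.end = end,
		.vec = (uint64_t)(uintptr_t)regions,
		.vec_len = 64,
		.max_pages = 0,			/* 0 means "no limit" */
		.category_mask = PAGE_IS_WRITTEN,
		.return_mask = PAGE_IS_WRITTEN,
	};
	long n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);

	if (n < 0)
		return -1;
	for (long i = 0; i < n; i++)
		printf("written: %llx-%llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end);
	/* arg.walk_end reports where the walk stopped, e.g. when the
	 * 64-entry vector filled before reaching "end". */
	return 0;
}

The return value is the number of page_region entries filled in; a full vector is not an error, the caller simply resumes from arg.walk_end. Tracking of PAGE_IS_WRITTEN is only meaningful once the range is under asynchronous userfaultfd write-protection, set up as sketched in the fs/userfaultfd.c section below.
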
  
@@@ -2673,9 -1940,8 +2668,9 @@@ static int show_numa_map(struct seq_fil
        struct numa_maps *md = &numa_priv->md;
        struct file *file = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
 -      struct mempolicy *pol;
        char buffer[64];
 +      struct mempolicy *pol;
 +      pgoff_t ilx;
        int nid;
  
        if (!mm)
        /* Ensure we start with an empty set of numa_maps statistics. */
        memset(md, 0, sizeof(*md));
  
 -      pol = __get_vma_policy(vma, vma->vm_start);
 +      pol = __get_vma_policy(vma, vma->vm_start, &ilx);
        if (pol) {
                mpol_to_str(buffer, sizeof(buffer), pol);
                mpol_cond_put(pol);
  
        if (file) {
                seq_puts(m, " file=");
 -              seq_file_path(m, file, "\n\t= ");
 +              seq_path(m, file_user_path(file), "\n\t= ");
        } else if (vma_is_initial_heap(vma)) {
                seq_puts(m, " heap");
        } else if (vma_is_initial_stack(vma)) {
diff --combined fs/userfaultfd.c
index ac616cfbacf5a8ea64e948c8340de08bfe2462c1,ed09d70027a07c8463bd7d26393a51b0deda1382..e8af40b0554956e16e9bdd0af00c33c04c424ec6
@@@ -49,7 -49,7 +49,7 @@@ static struct ctl_table vm_userfaultfd_
  };
  #endif
  
- static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+ static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
  
  /*
   * Start with fault_pending_wqh and fault_wqh so they're more likely
@@@ -123,11 -123,6 +123,11 @@@ static bool userfaultfd_is_initialized(
        return ctx->features & UFFD_FEATURE_INITIALIZED;
  }
  
 +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
 +{
 +      return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
 +}
 +
  /*
   * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
   * meaningful when userfaultfd_wp()==true on the vma and when it's
@@@ -927,15 -922,20 +927,15 @@@ static int userfaultfd_release(struct i
                        continue;
                }
                new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 -              prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
 -                               new_flags, vma->anon_vma,
 -                               vma->vm_file, vma->vm_pgoff,
 -                               vma_policy(vma),
 -                               NULL_VM_UFFD_CTX, anon_vma_name(vma));
 -              if (prev) {
 -                      vma = prev;
 -              } else {
 -                      prev = vma;
 -              }
 +              vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
 +                                          vma->vm_end, new_flags,
 +                                          NULL_VM_UFFD_CTX);
  
                vma_start_write(vma);
                userfaultfd_set_vm_flags(vma, new_flags);
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 +
 +              prev = vma;
        }
        mmap_write_unlock(mm);
        mmput(mm);
@@@ -1325,7 -1325,7 +1325,7 @@@ static int userfaultfd_register(struct 
        bool basic_ioctls;
        unsigned long start, end, vma_end;
        struct vma_iterator vmi;
 -      pgoff_t pgoff;
 +      bool wp_async = userfaultfd_wp_async_ctx(ctx);
  
        user_uffdio_register = (struct uffdio_register __user *) arg;
  
  
                /* check not compatible vmas */
                ret = -EINVAL;
 -              if (!vma_can_userfault(cur, vm_flags))
 +              if (!vma_can_userfault(cur, vm_flags, wp_async))
                        goto out_unlock;
  
                /*
        for_each_vma_range(vmi, vma, end) {
                cond_resched();
  
 -              BUG_ON(!vma_can_userfault(vma, vm_flags));
 +              BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
                BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                       vma->vm_userfaultfd_ctx.ctx != ctx);
                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
                vma_end = min(end, vma->vm_end);
  
                new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
 -              pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 -              prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
 -                               vma->anon_vma, vma->vm_file, pgoff,
 -                               vma_policy(vma),
 -                               ((struct vm_userfaultfd_ctx){ ctx }),
 -                               anon_vma_name(vma));
 -              if (prev) {
 -                      /* vma_merge() invalidated the mas */
 -                      vma = prev;
 -                      goto next;
 -              }
 -              if (vma->vm_start < start) {
 -                      ret = split_vma(&vmi, vma, start, 1);
 -                      if (ret)
 -                              break;
 -              }
 -              if (vma->vm_end > end) {
 -                      ret = split_vma(&vmi, vma, end, 0);
 -                      if (ret)
 -                              break;
 +              vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
 +                                          new_flags,
 +                                          (struct vm_userfaultfd_ctx){ctx});
 +              if (IS_ERR(vma)) {
 +                      ret = PTR_ERR(vma);
 +                      break;
                }
 -      next:
 +
                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
@@@ -1547,7 -1561,7 +1547,7 @@@ static int userfaultfd_unregister(struc
        unsigned long start, end, vma_end;
        const void __user *buf = (void __user *)arg;
        struct vma_iterator vmi;
 -      pgoff_t pgoff;
 +      bool wp_async = userfaultfd_wp_async_ctx(ctx);
  
        ret = -EFAULT;
        if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
 -              if (!vma_can_userfault(cur, cur->vm_flags))
 +              if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
                        goto out_unlock;
  
                found = true;
        for_each_vma_range(vmi, vma, end) {
                cond_resched();
  
 -              BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
 +              BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
  
                /*
                 * Nothing to do: this vma is already registered into this
                        uffd_wp_range(vma, start, vma_end - start, false);
  
                new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
 -              pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 -              prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
 -                               vma->anon_vma, vma->vm_file, pgoff,
 -                               vma_policy(vma),
 -                               NULL_VM_UFFD_CTX, anon_vma_name(vma));
 -              if (prev) {
 -                      vma = prev;
 -                      goto next;
 -              }
 -              if (vma->vm_start < start) {
 -                      ret = split_vma(&vmi, vma, start, 1);
 -                      if (ret)
 -                              break;
 -              }
 -              if (vma->vm_end > end) {
 -                      ret = split_vma(&vmi, vma, end, 0);
 -                      if (ret)
 -                              break;
 +              vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
 +                                          new_flags, NULL_VM_UFFD_CTX);
 +              if (IS_ERR(vma)) {
 +                      ret = PTR_ERR(vma);
 +                      break;
                }
 -      next:
 +
                /*
                 * In the vma_merge() successful mprotect-like case 8:
                 * the next vma was merged into the current one and
        return ret;
  }
  
 +bool userfaultfd_wp_async(struct vm_area_struct *vma)
 +{
 +      return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
 +}
 +
  static inline unsigned int uffd_ctx_features(__u64 user_features)
  {
        /*
@@@ -2029,11 -2051,6 +2029,11 @@@ static int userfaultfd_api(struct userf
        ret = -EPERM;
        if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
                goto err_out;
 +
 +      /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
 +      if (features & UFFD_FEATURE_WP_ASYNC)
 +              features |= UFFD_FEATURE_WP_UNPOPULATED;
 +
        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
  #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
  #ifndef CONFIG_PTE_MARKER_UFFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
        uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
 +      uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
  #endif
        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
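
A hedged sketch of the userspace side of the new asynchronous write-protect mode: enable UFFD_FEATURE_WP_ASYNC on the context and register the range with UFFDIO_REGISTER_MODE_WP, after which PAGEMAP_SCAN with PM_SCAN_WP_MATCHING can read and re-arm the written bits without a fault-handling thread. UFFD_FEATURE_WP_ASYNC is introduced by this series; the remaining names come from the existing userfaultfd uapi, and the O_NONBLOCK choice is just this sketch's preference:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

static int setup_wp_async(void *area, size_t len)
{
	int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		/* WP_ASYNC makes write-protect faults resolve themselves;
		 * the kernel folds in WP_UNPOPULATED automatically. */
		.features = UFFD_FEATURE_WP_ASYNC,
	};
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};

	if (ufd < 0)
		return -1;
	if (ioctl(ufd, UFFDIO_API, &api) || ioctl(ufd, UFFDIO_REGISTER, &reg)) {
		close(ufd);
		return -1;
	}
	return ufd;
}

Real code would also check that the features echoed back by UFFDIO_API include UFFD_FEATURE_WP_ASYNC before relying on it, since older kernels silently lack the bit.
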
index 08704c29fdb4fc35114c5f8d280b4b73017c2a48,3426f6eef60b49dd00d2085ed83e90c9444137d2..5126a4fecb442846b589981b3a7b82f6d3846d36
@@@ -5,6 -5,14 +5,14 @@@
  #include <linux/linkage.h>
  #include <linux/elfcore.h>
  #include <linux/elf.h>
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #include <asm/crash_core.h>
+ #endif
+ /* Location of a reserved region to hold the crash kernel.
+  */
+ extern struct resource crashk_res;
+ extern struct resource crashk_low_res;
  
  #define CRASH_CORE_NOTE_NAME     "CORE"
  #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
@@@ -79,12 -87,43 +87,43 @@@ Elf_Word *append_elf_note(Elf_Word *buf
                          void *data, size_t data_len);
  void final_note(Elf_Word *buf);
  
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+ #define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
+ #endif
+ #endif
  int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-               unsigned long long *crash_size, unsigned long long *crash_base);
- int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
-               unsigned long long *crash_size, unsigned long long *crash_base);
- int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
-               unsigned long long *crash_size, unsigned long long *crash_base);
+               unsigned long long *crash_size, unsigned long long *crash_base,
+               unsigned long long *low_size, bool *high);
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+ #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+ #endif
+ #ifndef CRASH_ALIGN
+ #define CRASH_ALIGN                   SZ_2M
+ #endif
+ #ifndef CRASH_ADDR_LOW_MAX
+ #define CRASH_ADDR_LOW_MAX            SZ_4G
+ #endif
+ #ifndef CRASH_ADDR_HIGH_MAX
+ #define CRASH_ADDR_HIGH_MAX           memblock_end_of_DRAM()
+ #endif
+ void __init reserve_crashkernel_generic(char *cmdline,
+               unsigned long long crash_size,
+               unsigned long long crash_base,
+               unsigned long long crash_low_size,
+               bool high);
+ #else
+ static inline void __init reserve_crashkernel_generic(char *cmdline,
+               unsigned long long crash_size,
+               unsigned long long crash_base,
+               unsigned long long crash_low_size,
+               bool high)
+ {}
+ #endif
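
The declarations above are the whole arch-facing surface of the consolidated crashkernel handling: parse_crashkernel() now also returns the requested low-memory size and the ",high" flag, and reserve_crashkernel_generic() performs the memblock reservation. An illustrative arch_reserve_crashkernel() in the shape this series converts the architectures to (kernel-internal, not stand-alone, and assuming ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION is selected):

static void __init arch_reserve_crashkernel(void)
{
	unsigned long long crash_size, crash_base, low_size = 0;
	char *cmdline = boot_command_line;
	bool high = false;
	int ret;

	/* One parser call handles crashkernel=, ,high and ,low variants. */
	ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
				&crash_size, &crash_base,
				&low_size, &high);
	if (ret)
		return;

	reserve_crashkernel_generic(cmdline, crash_size, crash_base,
				    low_size, high);
}
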
  
  /* Alignment required for elf header segment */
  #define ELF_CORE_HEADER_ALIGN   4096
  struct crash_mem {
        unsigned int max_nr_ranges;
        unsigned int nr_ranges;
 -      struct range ranges[];
 +      struct range ranges[] __counted_by(max_nr_ranges);
  };
  
  extern int crash_exclude_mem_range(struct crash_mem *mem,
index 1e7711185ec694a596b0f221f528a27e88514aac,442ee9170259f2a4da0876d9c416e70f2b4593c7..79ef6ac4c02113e92454d94e80565b06073c4722
@@@ -93,9 -93,13 +93,9 @@@ extern char *__underlying_strncpy(char 
  #if __has_builtin(__builtin_dynamic_object_size)
  #define POS                   __pass_dynamic_object_size(1)
  #define POS0                  __pass_dynamic_object_size(0)
 -#define __struct_size(p)      __builtin_dynamic_object_size(p, 0)
 -#define __member_size(p)      __builtin_dynamic_object_size(p, 1)
  #else
  #define POS                   __pass_object_size(1)
  #define POS0                  __pass_object_size(0)
 -#define __struct_size(p)      __builtin_object_size(p, 0)
 -#define __member_size(p)      __builtin_object_size(p, 1)
  #endif
  
  #define __compiletime_lessthan(bounds, length)        (       \
@@@ -639,7 -643,7 +639,7 @@@ __FORTIFY_INLINE bool fortify_memcpy_ch
                                     __q_size_field, #op),              \
                  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
                  __fortify_size,                                       \
-                 "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
+                 "field \"" #p "\" at " FILE_LINE,                     \
                  __p_size_field);                                      \
        __underlying_##op(p, q, __fortify_size);                        \
  })
diff --combined include/linux/sched.h
index b49ca40f633550b191dd60e33d86fbf097f17a97,6d1341b1673f5c48c730a6133c56258d8bc7a20b..292c316972485dae579a2518b4a2b847d35b961c
@@@ -63,6 -63,7 +63,6 @@@ struct robust_list_head
  struct root_domain;
  struct rq;
  struct sched_attr;
 -struct sched_param;
  struct seq_file;
  struct sighand_struct;
  struct signal_struct;
@@@ -369,10 -370,6 +369,10 @@@ extern struct root_domain def_root_doma
  extern struct mutex sched_domains_mutex;
  #endif
  
 +struct sched_param {
 +      int sched_priority;
 +};
 +
  struct sched_info {
  #ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */
@@@ -753,8 -750,10 +753,8 @@@ struct task_struct 
  #endif
        unsigned int                    __state;
  
 -#ifdef CONFIG_PREEMPT_RT
        /* saved state for "spinlock sleepers" */
        unsigned int                    saved_state;
 -#endif
  
        /*
         * This begins the randomizable portion of task_struct. Only
  
        struct mm_struct                *mm;
        struct mm_struct                *active_mm;
 +      struct address_space            *faults_disabled_mapping;
  
        int                             exit_state;
        int                             exit_code;
         * ->sched_remote_wakeup gets used, so it can be in this word.
         */
        unsigned                        sched_remote_wakeup:1;
 +#ifdef CONFIG_RT_MUTEXES
 +      unsigned                        sched_rt_mutex:1;
 +#endif
  
        /* Bit to tell LSMs we're in execve(): */
        unsigned                        in_execve:1;
        /* PID/PID hash table linkage. */
        struct pid                      *thread_pid;
        struct hlist_node               pid_links[PIDTYPE_MAX];
-       struct list_head                thread_group;
        struct list_head                thread_node;
  
        struct completion               *vfork_done;
        struct mem_cgroup               *active_memcg;
  #endif
  
 +#ifdef CONFIG_MEMCG_KMEM
 +      struct obj_cgroup               *objcg;
 +#endif
 +
  #ifdef CONFIG_BLK_CGROUP
        struct gendisk                  *throttle_disk;
  #endif
index 3b28cff24cc12f708e338ae9a3ae45cd1fb5fb80,d7fa3ca2fa5326ebd6cf2322b49256ba35fbdc5e..3499c1a8b9295a0b3366f4261a30eb11c38d577c
@@@ -303,11 -303,20 +303,11 @@@ static inline void kernel_signal_stop(v
  
        schedule();
  }
 -#ifdef __ia64__
 -# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
 -#else
 -# define ___ARCH_SI_IA64(_a1, _a2, _a3)
 -#endif
  
 -int force_sig_fault_to_task(int sig, int code, void __user *addr
 -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 -      , struct task_struct *t);
 -int force_sig_fault(int sig, int code, void __user *addr
 -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
 -int send_sig_fault(int sig, int code, void __user *addr
 -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 -      , struct task_struct *t);
 +int force_sig_fault_to_task(int sig, int code, void __user *addr,
 +                          struct task_struct *t);
 +int force_sig_fault(int sig, int code, void __user *addr);
 +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
  
  int force_sig_mceerr(int code, void __user *, short);
  int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
@@@ -647,8 -656,7 +647,8 @@@ extern bool current_is_single_threaded(
        while ((t = next_thread(t)) != g)
  
  #define __for_each_thread(signal, t)  \
 -      list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
 +      list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
 +              lockdep_is_held(&tasklist_lock))
  
  #define for_each_thread(p, t)         \
        __for_each_thread((p)->signal, t)
@@@ -707,15 -715,26 +707,26 @@@ bool same_thread_group(struct task_stru
        return p1->signal == p2->signal;
  }
  
- static inline struct task_struct *next_thread(const struct task_struct *p)
+ /*
+  * returns NULL if p is the last thread in the thread group
+  */
+ static inline struct task_struct *__next_thread(struct task_struct *p)
+ {
+       return list_next_or_null_rcu(&p->signal->thread_head,
+                                       &p->thread_node,
+                                       struct task_struct,
+                                       thread_node);
+ }
+ static inline struct task_struct *next_thread(struct task_struct *p)
  {
-       return list_entry_rcu(p->thread_group.next,
-                             struct task_struct, thread_group);
+       return __next_thread(p) ?: p->group_leader;
  }
  
  static inline int thread_group_empty(struct task_struct *p)
  {
-       return list_empty(&p->thread_group);
+       return thread_group_leader(p) &&
+              list_is_last(&p->thread_node, &p->signal->thread_head);
  }
  
  #define delay_group_leader(p) \
diff --combined init/init_task.c
index f703116e052301b7171304eaf25fe692547f5e60,c0de0200fd56277ebc99fd7860b7827384d1b7b5..5727d42149c334a989a2e657b3f71e26ae2899fc
@@@ -85,7 -85,6 +85,7 @@@ struct task_struct init_tas
        .nr_cpus_allowed= NR_CPUS,
        .mm             = NULL,
        .active_mm      = &init_mm,
 +      .faults_disabled_mapping = NULL,
        .restart_block  = {
                .fn = do_no_restart_syscall,
        },
        .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
        .timer_slack_ns = 50000, /* 50 usec default slack */
        .thread_pid     = &init_struct_pid,
-       .thread_group   = LIST_HEAD_INIT(init_task.thread_group),
        .thread_node    = LIST_HEAD_INIT(init_signals.thread_head),
  #ifdef CONFIG_AUDIT
        .loginuid       = INVALID_UID,
diff --combined kernel/audit_tree.c
index 85a5b306733b08fba26357ee5472ab0ae47dae74,b21b9652c1a8661326dfc10bbf9f38a568bf90e6..1b07e6f12a07a85a3fc97df244d1120bc4e4c150
@@@ -34,7 -34,7 +34,7 @@@ struct audit_chunk 
                struct list_head list;
                struct audit_tree *owner;
                unsigned index;         /* index; upper bit indicates 'will prune' */
 -      } owners[];
 +      } owners[] __counted_by(count);
  };
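
The __counted_by(count) annotation added here (and to struct crash_mem earlier in this merge) ties the flexible array to the field holding its element count, so FORTIFY checks based on __builtin_dynamic_object_size() and UBSAN array-bounds checks know how long owners[] is. A self-contained sketch of the same pattern outside the kernel, with a fallback because only very recent compilers know the attribute:

#ifdef __has_attribute
# if __has_attribute(counted_by)
#  define counted_by(member) __attribute__((counted_by(member)))
# endif
#endif
#ifndef counted_by
# define counted_by(member)	/* older compilers: annotation elided */
#endif

struct range_set {
	unsigned int nr;			/* number of valid entries */
	struct { unsigned long start, end; } r[] counted_by(nr);
};
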
  
  struct audit_tree_mark {
@@@ -87,8 -87,8 +87,8 @@@ static struct task_struct *prune_thread
   * that makes a difference.  Some.
   */
  
- static struct fsnotify_group *audit_tree_group;
- static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+ static struct fsnotify_group *audit_tree_group __ro_after_init;
+ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
  
  static struct audit_tree *alloc_tree(const char *s)
  {
diff --combined kernel/exit.c
index 61ebba96909b98408d4bb802436385e4b25aad75,f3ba4b97a7d97cbcf7e45054c591f0c625d33972..ee9f43bed49a240ac60c9f4054c663374e36ccf0
@@@ -74,8 -74,6 +74,8 @@@
  #include <asm/unistd.h>
  #include <asm/mmu_context.h>
  
 +#include "exit.h"
 +
  /*
   * The default value should be high enough to not crash a system that randomly
   * crashes its kernel from time to time, but low enough to at least not permit
@@@ -135,7 -133,6 +135,6 @@@ static void __unhash_process(struct tas
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
-       list_del_rcu(&p->thread_group);
        list_del_rcu(&p->thread_node);
  }
  
@@@ -541,6 -538,7 +540,6 @@@ static void exit_mm(void
        exit_mm_release(current, mm);
        if (!mm)
                return;
 -      sync_mm_rss(mm);
        mmap_read_lock(mm);
        mmgrab_lazy_tlb(mm);
        BUG_ON(mm != current->active_mm);
@@@ -830,6 -828,9 +829,6 @@@ void __noreturn do_exit(long code
        io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */
  
 -      /* sync mm's RSS info before statistics gathering */
 -      if (tsk->mm)
 -              sync_mm_rss(tsk->mm);
        acct_update_integrals(tsk);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
@@@ -1035,6 -1036,26 +1034,6 @@@ SYSCALL_DEFINE1(exit_group, int, error_
        return 0;
  }
  
 -struct waitid_info {
 -      pid_t pid;
 -      uid_t uid;
 -      int status;
 -      int cause;
 -};
 -
 -struct wait_opts {
 -      enum pid_type           wo_type;
 -      int                     wo_flags;
 -      struct pid              *wo_pid;
 -
 -      struct waitid_info      *wo_info;
 -      int                     wo_stat;
 -      struct rusage           *wo_rusage;
 -
 -      wait_queue_entry_t              child_wait;
 -      int                     notask_error;
 -};
 -
  static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
  {
        return  wo->wo_type == PIDTYPE_MAX ||
@@@ -1498,17 -1519,6 +1497,17 @@@ static int ptrace_do_wait(struct wait_o
        return 0;
  }
  
 +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
 +{
 +      if (!eligible_pid(wo, p))
 +              return false;
 +
 +      if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
 +              return false;
 +
 +      return true;
 +}
 +
  static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
  {
                                                child_wait);
        struct task_struct *p = key;
  
 -      if (!eligible_pid(wo, p))
 -              return 0;
 +      if (pid_child_should_wake(wo, p))
 +              return default_wake_function(wait, mode, sync, key);
  
 -      if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
 -              return 0;
 -
 -      return default_wake_function(wait, mode, sync, key);
 +      return 0;
  }
  
  void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@@ -1568,10 -1581,16 +1567,10 @@@ static int do_wait_pid(struct wait_opt
        return 0;
  }
  
 -static long do_wait(struct wait_opts *wo)
 +long __do_wait(struct wait_opts *wo)
  {
 -      int retval;
 -
 -      trace_sched_process_wait(wo->wo_pid);
 +      long retval;
  
 -      init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
 -      wo->child_wait.private = current;
 -      add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 -repeat:
        /*
         * If there is nothing that can match our criteria, just get out.
         * We will clear ->notask_error to zero if we see any child that
           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
                goto notask;
  
 -      set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
  
        if (wo->wo_type == PIDTYPE_PID) {
                retval = do_wait_pid(wo);
                if (retval)
 -                      goto end;
 +                      return retval;
        } else {
                struct task_struct *tsk = current;
  
                do {
                        retval = do_wait_thread(wo, tsk);
                        if (retval)
 -                              goto end;
 +                              return retval;
  
                        retval = ptrace_do_wait(wo, tsk);
                        if (retval)
 -                              goto end;
 +                              return retval;
  
                        if (wo->wo_flags & __WNOTHREAD)
                                break;
  
  notask:
        retval = wo->notask_error;
 -      if (!retval && !(wo->wo_flags & WNOHANG)) {
 -              retval = -ERESTARTSYS;
 -              if (!signal_pending(current)) {
 -                      schedule();
 -                      goto repeat;
 -              }
 -      }
 -end:
 +      if (!retval && !(wo->wo_flags & WNOHANG))
 +              return -ERESTARTSYS;
 +
 +      return retval;
 +}
 +
 +static long do_wait(struct wait_opts *wo)
 +{
 +      int retval;
 +
 +      trace_sched_process_wait(wo->wo_pid);
 +
 +      init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
 +      wo->child_wait.private = current;
 +      add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 +
 +      do {
 +              set_current_state(TASK_INTERRUPTIBLE);
 +              retval = __do_wait(wo);
 +              if (retval != -ERESTARTSYS)
 +                      break;
 +              if (signal_pending(current))
 +                      break;
 +              schedule();
 +      } while (1);
 +
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
  }
  
 -static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 -                        int options, struct rusage *ru)
 +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
 +                        struct waitid_info *infop, int options,
 +                        struct rusage *ru)
  {
 -      struct wait_opts wo;
 +      unsigned int f_flags = 0;
        struct pid *pid = NULL;
        enum pid_type type;
 -      long ret;
 -      unsigned int f_flags = 0;
  
        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
                        __WNOTHREAD|__WCLONE|__WALL))
                return -EINVAL;
        }
  
 -      wo.wo_type      = type;
 -      wo.wo_pid       = pid;
 -      wo.wo_flags     = options;
 -      wo.wo_info      = infop;
 -      wo.wo_rusage    = ru;
 +      wo->wo_type     = type;
 +      wo->wo_pid      = pid;
 +      wo->wo_flags    = options;
 +      wo->wo_info     = infop;
 +      wo->wo_rusage   = ru;
        if (f_flags & O_NONBLOCK)
 -              wo.wo_flags |= WNOHANG;
 +              wo->wo_flags |= WNOHANG;
 +
 +      return 0;
 +}
 +
 +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 +                        int options, struct rusage *ru)
 +{
 +      struct wait_opts wo;
 +      long ret;
 +
 +      ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
 +      if (ret)
 +              return ret;
  
        ret = do_wait(&wo);
 -      if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
 +      if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
                ret = -EAGAIN;
  
 -      put_pid(pid);
 +      put_pid(wo.wo_pid);
        return ret;
  }
  
diff --combined kernel/fork.c
index 373fa2f739bc41ced8dc9074d84ec1ce5336483a,b9d3aa493bbd2dffae2dd716ae05effaaabbbb4e..10917c3e1f0366b5fbf60d98c2042c636cd74c87
@@@ -733,7 -733,7 +733,7 @@@ static __latent_entropy int dup_mmap(st
  
                        get_file(file);
                        i_mmap_lock_write(mapping);
 -                      if (tmp->vm_flags & VM_SHARED)
 +                      if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
@@@ -1288,7 -1288,7 +1288,7 @@@ static struct mm_struct *mm_init(struc
        hugetlb_count_init(mm);
  
        if (current->mm) {
 -              mm->flags = current->mm->flags & MMF_INIT_MASK;
 +              mm->flags = mmf_init_flags(current->mm->flags);
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                mm->flags = default_dump_filter;
@@@ -1393,8 -1393,6 +1393,8 @@@ EXPORT_SYMBOL_GPL(mmput_async)
  
  /**
   * set_mm_exe_file - change a reference to the mm's executable file
 + * @mm: The mm to change.
 + * @new_exe_file: The new file to use.
   *
   * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
   *
@@@ -1434,8 -1432,6 +1434,8 @@@ int set_mm_exe_file(struct mm_struct *m
  
  /**
   * replace_mm_exe_file - replace a reference to the mm's executable file
 + * @mm: The mm to change.
 + * @new_exe_file: The new file to use.
   *
   * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
   *
@@@ -1487,7 -1483,6 +1487,7 @@@ int replace_mm_exe_file(struct mm_struc
  
  /**
   * get_mm_exe_file - acquire a reference to the mm's executable file
 + * @mm: The mm of interest.
   *
   * Returns %NULL if mm has no associated executable file.
   * User must release file via fput().
@@@ -1497,14 -1492,15 +1497,14 @@@ struct file *get_mm_exe_file(struct mm_
        struct file *exe_file;
  
        rcu_read_lock();
 -      exe_file = rcu_dereference(mm->exe_file);
 -      if (exe_file && !get_file_rcu(exe_file))
 -              exe_file = NULL;
 +      exe_file = get_file_rcu(&mm->exe_file);
        rcu_read_unlock();
        return exe_file;
  }
  
  /**
   * get_task_exe_file - acquire a reference to the task's executable file
 + * @task: The task.
   *
   * Returns %NULL if task's mm (if any) has no associated executable file or
   * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@@ -1527,7 -1523,6 +1527,7 @@@ struct file *get_task_exe_file(struct t
  
  /**
   * get_task_mm - acquire a reference to the task's mm
 + * @task: The task.
   *
   * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
   * this kernel workthread has transiently adopted a user mm with use_mm,
@@@ -2107,11 -2102,11 +2107,11 @@@ const struct file_operations pidfd_fop
   * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
   * @pid:   the struct pid for which to create a pidfd
   * @flags: flags of the new @pidfd
 - * @pidfd: the pidfd to return
 + * @ret: Where to return the file for the pidfd.
   *
   * Allocate a new file that stashes @pid and reserve a new pidfd number in the
   * caller's file descriptor table. The pidfd is reserved but not installed yet.
 -
 + *
   * The helper doesn't perform checks on @pid which makes it useful for pidfds
   * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
   * pidfd file are prepared.
@@@ -2158,7 -2153,7 +2158,7 @@@ static int __pidfd_prepare(struct pid *
   * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
   * @pid:   the struct pid for which to create a pidfd
   * @flags: flags of the new @pidfd
 - * @pidfd: the pidfd to return
 + * @ret: Where to return the pidfd.
   *
   * Allocate a new file that stashes @pid and reserve a new pidfd number in the
   * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@@ -2411,6 -2406,10 +2411,6 @@@ __latent_entropy struct task_struct *co
        p->io_uring = NULL;
  #endif
  
 -#if defined(SPLIT_RSS_COUNTING)
 -      memset(&p->rss_stat, 0, sizeof(p->rss_stat));
 -#endif
 -
        p->default_timer_slack_ns = current->timer_slack_ns;
  
  #ifdef CONFIG_PSI
        p->dirty_paused_when = 0;
  
        p->pdeath_signal = 0;
-       INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
        clear_posix_cputimers_work(p);
  
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
-                       list_add_tail_rcu(&p->thread_group,
-                                         &p->group_leader->thread_group);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
@@@ -3145,7 -3141,7 +3142,7 @@@ static inline bool clone3_stack_valid(s
                if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                        return false;
  
 -#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
 +#if !defined(CONFIG_STACK_GROWSUP)
                kargs->stack += kargs->stack_size;
  #endif
        }
@@@ -3182,7 -3178,7 +3179,7 @@@ static bool clone3_args_valid(struct ke
  }
  
  /**
 - * clone3 - create a new process with specific properties
 + * sys_clone3 - create a new process with specific properties
   * @uargs: argument structure
   * @size:  size of @uargs
   *
diff --combined kernel/kthread.c
index c46128ec0c0a2aa9790897af2a32e9cfeaf3641f,290cbc845225e41d39994399e50ae3e82852a685..c5e40830c1f2d5e91dea786c24fd70f8b76ee488
@@@ -715,6 -715,24 +715,24 @@@ int kthread_stop(struct task_struct *k
  }
  EXPORT_SYMBOL(kthread_stop);
  
+ /**
+  * kthread_stop_put - stop a thread and put its task struct
+  * @k: thread created by kthread_create().
+  *
+  * Stops a thread created by kthread_create() and puts its task_struct.
+  * Only use when holding an extra task struct reference obtained by
+  * calling get_task_struct().
+  */
+ int kthread_stop_put(struct task_struct *k)
+ {
+       int ret;
+       ret = kthread_stop(k);
+       put_task_struct(k);
+       return ret;
+ }
+ EXPORT_SYMBOL(kthread_stop_put);
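
kthread_stop_put() packages the common pattern where the caller pinned the worker with get_task_struct() and now wants to stop it and drop that reference in one step. An illustrative before/after fragment (kernel-internal, not a stand-alone program):

	/* before: stop the thread, then drop our extra reference */
	ret = kthread_stop(worker);
	put_task_struct(worker);

	/* after: same semantics in one call, for a task pinned earlier
	 * with get_task_struct(worker) */
	ret = kthread_stop_put(worker);
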
  int kthreadd(void *unused)
  {
        struct task_struct *tsk = current;
@@@ -1469,6 -1487,7 +1487,6 @@@ void kthread_unuse_mm(struct mm_struct 
         * clearing tsk->mm.
         */
        smp_mb__after_spinlock();
 -      sync_mm_rss(mm);
        local_irq_disable();
        tsk->mm = NULL;
        membarrier_update_current_mm(NULL);
diff --combined kernel/sched/core.c
index 3d7e2d7026998af5cb85fe12fb0b5442e90f6f63,7d4cf741e0879b89df3107188d5d466f48625f4c..a708d225c28e861922997ed27a25d1b21f072d2a
@@@ -85,6 -85,7 +85,6 @@@
  
  #include "sched.h"
  #include "stats.h"
 -#include "autogroup.h"
  
  #include "autogroup.h"
  #include "pelt.h"
@@@ -113,7 -114,6 +113,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_over
  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
  EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
  
  DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
@@@ -919,13 -919,14 +919,13 @@@ static bool set_nr_if_polling(struct ta
        struct thread_info *ti = task_thread_info(p);
        typeof(ti->flags) val = READ_ONCE(ti->flags);
  
 -      for (;;) {
 +      do {
                if (!(val & _TIF_POLLING_NRFLAG))
                        return false;
                if (val & _TIF_NEED_RESCHED)
                        return true;
 -              if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
 -                      break;
 -      }
 +      } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
 +
        return true;
  }
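
The rewritten loop above relies on try_cmpxchg() updating its "old" argument with the freshly observed value when the exchange fails, so the body never needs an explicit re-read. A compilable userspace analogue of the same idiom using C11 atomics (names here are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

/* Set "bit" unless it is already set, retrying on concurrent updates.
 * atomic_compare_exchange_weak() refreshes "val" on failure, mirroring
 * the kernel's try_cmpxchg(). */
static bool set_bit_if_clear(atomic_ulong *flags, unsigned long bit)
{
	unsigned long val = atomic_load(flags);

	do {
		if (val & bit)
			return false;
	} while (!atomic_compare_exchange_weak(flags, &val, val | bit));

	return true;
}
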
  
@@@ -1479,12 -1480,16 +1479,12 @@@ static void __uclamp_update_util_min_rt
  
  static void uclamp_update_util_min_rt_default(struct task_struct *p)
  {
 -      struct rq_flags rf;
 -      struct rq *rq;
 -
        if (!rt_task(p))
                return;
  
        /* Protect updates to p->uclamp_* */
 -      rq = task_rq_lock(p, &rf);
 +      guard(task_rq_lock)(p);
        __uclamp_update_util_min_rt_default(p);
 -      task_rq_unlock(rq, p, &rf);
  }
  
  static inline struct uclamp_se
@@@ -1780,8 -1785,9 +1780,8 @@@ static void uclamp_update_root_tg(void
        uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
                      sysctl_sched_uclamp_util_max, false);
  
 -      rcu_read_lock();
 +      guard(rcu)();
        cpu_util_update_eff(&root_task_group.css);
 -      rcu_read_unlock();
  }
  #else
  static void uclamp_update_root_tg(void) { }
@@@ -1808,9 -1814,10 +1808,9 @@@ static void uclamp_sync_util_min_rt_def
        smp_mb__after_spinlock();
        read_unlock(&tasklist_lock);
  
 -      rcu_read_lock();
 +      guard(rcu)();
        for_each_process_thread(g, p)
                uclamp_update_util_min_rt_default(p);
 -      rcu_read_unlock();
  }
  
  static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@@ -2211,10 -2218,10 +2211,10 @@@ static inline void check_class_changed(
                p->sched_class->prio_changed(rq, p, oldprio);
  }
  
 -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
  {
        if (p->sched_class == rq->curr->sched_class)
 -              rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 +              rq->curr->sched_class->wakeup_preempt(rq, p, flags);
        else if (sched_class_above(p->sched_class, rq->curr->sched_class))
                resched_curr(rq);
  
@@@ -2232,21 -2239,31 +2232,21 @@@ int __task_state_match(struct task_stru
        if (READ_ONCE(p->__state) & state)
                return 1;
  
 -#ifdef CONFIG_PREEMPT_RT
        if (READ_ONCE(p->saved_state) & state)
                return -1;
 -#endif
 +
        return 0;
  }
  
  static __always_inline
  int task_state_match(struct task_struct *p, unsigned int state)
  {
 -#ifdef CONFIG_PREEMPT_RT
 -      int match;
 -
        /*
 -       * Serialize against current_save_and_set_rtlock_wait_state() and
 -       * current_restore_rtlock_saved_state().
 +       * Serialize against current_save_and_set_rtlock_wait_state(),
 +       * current_restore_rtlock_saved_state(), and __refrigerator().
         */
 -      raw_spin_lock_irq(&p->pi_lock);
 -      match = __task_state_match(p, state);
 -      raw_spin_unlock_irq(&p->pi_lock);
 -
 -      return match;
 -#else
 +      guard(raw_spinlock_irq)(&p->pi_lock);
        return __task_state_match(p, state);
 -#endif
  }
  
  /*
@@@ -2400,9 -2417,10 +2400,9 @@@ void migrate_disable(void
                return;
        }
  
 -      preempt_disable();
 +      guard(preempt)();
        this_rq()->nr_pinned++;
        p->migration_disabled = 1;
 -      preempt_enable();
  }
  EXPORT_SYMBOL_GPL(migrate_disable);
  
@@@ -2426,7 -2444,7 +2426,7 @@@ void migrate_enable(void
         * Ensure stop_task runs either before or after this, and that
         * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
         */
 -      preempt_disable();
 +      guard(preempt)();
        if (p->cpus_ptr != &p->cpus_mask)
                __set_cpus_allowed_ptr(p, &ac);
        /*
        barrier();
        p->migration_disabled = 0;
        this_rq()->nr_pinned--;
 -      preempt_enable();
  }
  EXPORT_SYMBOL_GPL(migrate_enable);
  
@@@ -2508,7 -2527,7 +2508,7 @@@ static struct rq *move_queued_task(stru
        rq_lock(rq, rf);
        WARN_ON_ONCE(task_cpu(p) != new_cpu);
        activate_task(rq, p, 0);
 -      check_preempt_curr(rq, p, 0);
 +      wakeup_preempt(rq, p, 0);
  
        return rq;
  }
@@@ -2645,11 -2664,9 +2645,11 @@@ static int migration_cpu_stop(void *dat
                 * it.
                 */
                WARN_ON_ONCE(!pending->stop_pending);
 +              preempt_disable();
                task_rq_unlock(rq, p, &rf);
                stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
                                    &pending->arg, &pending->stop_work);
 +              preempt_enable();
                return 0;
        }
  out:
@@@ -2969,13 -2986,12 +2969,13 @@@ static int affine_move_task(struct rq *
                        complete = true;
                }
  
 +              preempt_disable();
                task_rq_unlock(rq, p, rf);
 -
                if (push_task) {
                        stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
                                            p, &rq->push_work);
                }
 +              preempt_enable();
  
                if (complete)
                        complete_all(&pending->done);
                if (flags & SCA_MIGRATE_ENABLE)
                        p->migration_flags &= ~MDF_PUSH;
  
 +              preempt_disable();
                task_rq_unlock(rq, p, rf);
 -
                if (!stop_pending) {
                        stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
                                            &pending->arg, &pending->stop_work);
                }
 +              preempt_enable();
  
                if (flags & SCA_MIGRATE_ENABLE)
                        return 0;
@@@ -3394,7 -3409,7 +3394,7 @@@ static void __migrate_swap_task(struct 
                deactivate_task(src_rq, p, 0);
                set_task_cpu(p, cpu);
                activate_task(dst_rq, p, 0);
 -              check_preempt_curr(dst_rq, p, 0);
 +              wakeup_preempt(dst_rq, p, 0);
  
                rq_unpin_lock(dst_rq, &drf);
                rq_unpin_lock(src_rq, &srf);
   */
  void kick_process(struct task_struct *p)
  {
 -      int cpu;
 +      guard(preempt)();
 +      int cpu = task_cpu(p);
  
 -      preempt_disable();
 -      cpu = task_cpu(p);
        if ((cpu != smp_processor_id()) && task_curr(p))
                smp_send_reschedule(cpu);
 -      preempt_enable();
  }
  EXPORT_SYMBOL_GPL(kick_process);
  
@@@ -3768,7 -3785,7 +3768,7 @@@ ttwu_do_activate(struct rq *rq, struct 
        }
  
        activate_task(rq, p, en_flags);
 -      check_preempt_curr(rq, p, wake_flags);
 +      wakeup_preempt(rq, p, wake_flags);
  
        ttwu_do_wakeup(p);
  
                if (rq->avg_idle > max)
                        rq->avg_idle = max;
  
 -              rq->wake_stamp = jiffies;
 -              rq->wake_avg_idle = rq->avg_idle / 2;
 -
                rq->idle_stamp = 0;
        }
  #endif
@@@ -3836,7 -3856,7 +3836,7 @@@ static int ttwu_runnable(struct task_st
                         * it should preempt the task that is current now.
                         */
                        update_rq_clock(rq);
 -                      check_preempt_curr(rq, p, wake_flags);
 +                      wakeup_preempt(rq, p, wake_flags);
                }
                ttwu_do_wakeup(p);
                ret = 1;
@@@ -3936,18 -3956,6 +3936,18 @@@ bool cpus_share_cache(int this_cpu, in
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
  }
  
 +/*
 + * Whether CPUs share cache resources, which means LLC on non-cluster
 + * machines and LLC tag or L2 on machines with clusters.
 + */
 +bool cpus_share_resources(int this_cpu, int that_cpu)
 +{
 +      if (this_cpu == that_cpu)
 +              return true;
 +
 +      return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
 +}
 +
  static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
  {
        /*
@@@ -4028,17 -4036,13 +4028,17 @@@ static void ttwu_queue(struct task_stru
   * The caller holds p::pi_lock if p != current or has preemption
   * disabled when p == current.
   *
 - * The rules of PREEMPT_RT saved_state:
 + * The rules of saved_state:
   *
   *   The related locking code always holds p::pi_lock when updating
   *   p::saved_state, which means the code is fully serialized in both cases.
   *
 - *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
 - *   bits set. This allows to distinguish all wakeup scenarios.
 + *   For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
 + *   No other bits set. This allows us to distinguish all wakeup scenarios.
 + *
 + *   For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
 + *   allows us to prevent early wakeup of tasks before they can be run on
 + *   asymmetric ISA architectures (e.g. ARMv9).
   */
  static __always_inline
  bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
  
        *success = !!(match = __task_state_match(p, state));
  
 -#ifdef CONFIG_PREEMPT_RT
        /*
         * Saved state preserves the task state across blocking on
 -       * an RT lock.  If the state matches, set p::saved_state to
 -       * TASK_RUNNING, but do not wake the task because it waits
 -       * for a lock wakeup. Also indicate success because from
 -       * the regular waker's point of view this has succeeded.
 +       * an RT lock or TASK_FREEZABLE tasks.  If the state matches,
 +       * set p::saved_state to TASK_RUNNING, but do not wake the task
 +       * because it waits for a lock wakeup or __thaw_task(). Also
 +       * indicate success because from the regular waker's point of
 +       * view this has succeeded.
         *
         * After acquiring the lock the task will restore p::__state
         * from p::saved_state which ensures that the regular
         */
        if (match < 0)
                p->saved_state = TASK_RUNNING;
 -#endif
 +
        return match > 0;
  }
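
The comments above describe a tri-state contract: __task_state_match() returns 1 when p->__state matches the wakeup state, -1 when only p->saved_state matches (a task blocked on an RT lock, or a frozen TASK_FREEZABLE task), and 0 otherwise; for -1, ttwu_state_match() reports success to the waker while leaving the task asleep with saved_state reset to TASK_RUNNING. A standalone sketch of just that decision logic, with simplified stand-in state bits rather than the real definitions from include/linux/sched.h:

/* Standalone illustration of the tri-state state matching above.
 * The bit values are stand-ins, not the kernel's definitions. */
#include <stdio.h>

#define TASK_INTERRUPTIBLE      0x0001
#define TASK_FROZEN             0x8000

struct task {
        unsigned int state;             /* stands in for p->__state */
        unsigned int saved_state;       /* stands in for p->saved_state */
};

/* 1: __state matches, -1: only saved_state matches, 0: no match */
static int task_state_match(const struct task *p, unsigned int state)
{
        if (p->state & state)
                return 1;
        if (p->saved_state & state)
                return -1;
        return 0;
}

int main(void)
{
        /* A frozen freezable task: parked in TASK_FROZEN, the original
         * sleeping state preserved in saved_state. */
        struct task frozen = {
                .state       = TASK_FROZEN,
                .saved_state = TASK_INTERRUPTIBLE,
        };

        /* An ordinary wakeup only matches saved_state: the waker is told
         * it succeeded, but the task is not actually woken (-1). */
        printf("regular wakeup: %d\n",
               task_state_match(&frozen, TASK_INTERRUPTIBLE));

        /* The thaw path wakes with TASK_FROZEN and matches __state (1). */
        printf("thaw wakeup:    %d\n",
               task_state_match(&frozen, TASK_FROZEN));
        return 0;
}
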
  
@@@ -4250,7 -4254,7 +4250,7 @@@ int try_to_wake_up(struct task_struct *
                 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
                 * __schedule().  See the comment for smp_mb__after_spinlock().
                 *
 -               * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
 +               * A similar smp_rmb() lives in __task_needs_rq_lock().
                 */
                smp_rmb();
                if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
@@@ -4867,7 -4871,7 +4867,7 @@@ void wake_up_new_task(struct task_struc
  
        activate_task(rq, p, ENQUEUE_NOCLOCK);
        trace_sched_wakeup_new(p);
 -      check_preempt_curr(rq, p, WF_FORK);
 +      wakeup_preempt(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_woken) {
                /*
@@@ -5370,6 -5374,8 +5370,6 @@@ context_switch(struct rq *rq, struct ta
        /* switch_mm_cid() requires the memory barriers above. */
        switch_mm_cid(rq, prev, next);
  
 -      rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 -
        prepare_lock_switch(rq, next, rf);
  
        /* Here we just switch the register state and the stack. */
@@@ -5910,7 -5916,8 +5910,7 @@@ static noinline void __schedule_bug(str
        print_modules();
        if (irqs_disabled())
                print_irqtrace_events(prev);
 -      if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
 -          && in_atomic_preempt_off()) {
 +      if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
                pr_err("Preemption disabled at:");
                print_ip_sym(KERN_ERR, preempt_disable_ip);
        }
@@@ -6361,9 -6368,8 +6361,9 @@@ static void sched_core_balance(struct r
        struct sched_domain *sd;
        int cpu = cpu_of(rq);
  
 -      preempt_disable();
 -      rcu_read_lock();
 +      guard(preempt)();
 +      guard(rcu)();
 +
        raw_spin_rq_unlock_irq(rq);
        for_each_domain(cpu, sd) {
                if (need_resched())
                        break;
        }
        raw_spin_rq_lock_irq(rq);
 -      rcu_read_unlock();
 -      preempt_enable();
  }
  
  static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@@ -6607,7 -6615,6 +6607,7 @@@ static void __sched notrace __schedule(
        /* Promote REQ to ACT */
        rq->clock_update_flags <<= 1;
        update_rq_clock(rq);
 +      rq->clock_update_flags = RQCF_UPDATED;
  
        switch_count = &prev->nivcsw;
  
                /* Also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);
        } else {
 -              rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 -
                rq_unpin_lock(rq, &rf);
                __balance_callbacks(rq);
                raw_spin_rq_unlock_irq(rq);
@@@ -6711,24 -6720,22 +6711,24 @@@ void __noreturn do_task_dead(void
  
  static inline void sched_submit_work(struct task_struct *tsk)
  {
 +      static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
        unsigned int task_flags;
  
 -      if (task_is_running(tsk))
 -              return;
 +      /*
 +       * Establish LD_WAIT_CONFIG context to ensure none of the code called
 +       * will use a blocking primitive -- which would lead to recursion.
 +       */
 +      lock_map_acquire_try(&sched_map);
  
        task_flags = tsk->flags;
        /*
         * If a worker goes to sleep, notify and ask workqueue whether it
         * wants to wake up a task to maintain concurrency.
         */
 -      if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 -              if (task_flags & PF_WQ_WORKER)
 -                      wq_worker_sleeping(tsk);
 -              else
 -                      io_wq_worker_sleeping(tsk);
 -      }
 +      if (task_flags & PF_WQ_WORKER)
 +              wq_worker_sleeping(tsk);
 +      else if (task_flags & PF_IO_WORKER)
 +              io_wq_worker_sleeping(tsk);
  
        /*
         * spinlock and rwlock must not flush block requests.  This will
         * make sure to submit it to avoid deadlocks.
         */
        blk_flush_plug(tsk->plug, true);
 +
 +      lock_map_release(&sched_map);
  }
  
  static void sched_update_worker(struct task_struct *tsk)
        }
  }
  
 -asmlinkage __visible void __sched schedule(void)
 +static __always_inline void __schedule_loop(unsigned int sched_mode)
  {
 -      struct task_struct *tsk = current;
 -
 -      sched_submit_work(tsk);
        do {
                preempt_disable();
 -              __schedule(SM_NONE);
 +              __schedule(sched_mode);
                sched_preempt_enable_no_resched();
        } while (need_resched());
 +}
 +
 +asmlinkage __visible void __sched schedule(void)
 +{
 +      struct task_struct *tsk = current;
 +
 +#ifdef CONFIG_RT_MUTEXES
 +      lockdep_assert(!tsk->sched_rt_mutex);
 +#endif
 +
 +      if (!task_is_running(tsk))
 +              sched_submit_work(tsk);
 +      __schedule_loop(SM_NONE);
        sched_update_worker(tsk);
  }
  EXPORT_SYMBOL(schedule);
@@@ -6839,7 -6834,11 +6839,7 @@@ void __sched schedule_preempt_disabled(
  #ifdef CONFIG_PREEMPT_RT
  void __sched notrace schedule_rtlock(void)
  {
 -      do {
 -              preempt_disable();
 -              __schedule(SM_RTLOCK_WAIT);
 -              sched_preempt_enable_no_resched();
 -      } while (need_resched());
 +      __schedule_loop(SM_RTLOCK_WAIT);
  }
  NOKPROBE_SYMBOL(schedule_rtlock);
  #endif
@@@ -7035,32 -7034,6 +7035,32 @@@ static void __setscheduler_prio(struct 
  
  #ifdef CONFIG_RT_MUTEXES
  
 +/*
 + * Would be more useful with typeof()/auto_type but they don't mix with
 + * bit-fields. Since it's a local thing, use int. Keep the generic sounding
 + * name such that if someone were to implement this function we get to compare
 + * notes.
 + */
 +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
 +
 +void rt_mutex_pre_schedule(void)
 +{
 +      lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
 +      sched_submit_work(current);
 +}
 +
 +void rt_mutex_schedule(void)
 +{
 +      lockdep_assert(current->sched_rt_mutex);
 +      __schedule_loop(SM_NONE);
 +}
 +
 +void rt_mutex_post_schedule(void)
 +{
 +      sched_update_worker(current);
 +      lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
 +}
 +
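
fetch_and_set() above returns the previous value while storing the new one, so the two lockdep_assert() calls both flip current->sched_rt_mutex and check what it was beforehand. A trivially runnable userspace copy of the macro (it relies on GNU statement expressions, so GCC or Clang is assumed):

/* Userspace copy of the fetch_and_set() macro above, for illustration. */
#include <stdio.h>

#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })

int main(void)
{
        int in_rt_mutex = 0;
        int old;

        /* pre_schedule: must have been clear, and becomes set */
        old = fetch_and_set(in_rt_mutex, 1);
        printf("pre:  old=%d new=%d\n", old, in_rt_mutex);    /* old=0 new=1 */

        /* post_schedule: must have been set, and becomes clear */
        old = fetch_and_set(in_rt_mutex, 0);
        printf("post: old=%d new=%d\n", old, in_rt_mutex);    /* old=1 new=0 */

        return 0;
}
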
  static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
  {
        if (pi_task)
@@@ -7214,8 -7187,9 +7214,8 @@@ static inline int rt_effective_prio(str
  void set_user_nice(struct task_struct *p, long nice)
  {
        bool queued, running;
 -      int old_prio;
 -      struct rq_flags rf;
        struct rq *rq;
 +      int old_prio;
  
        if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
                return;
         * We have to be careful, if called from sys_setpriority(),
         * the task might be in the middle of scheduling on another CPU.
         */
 -      rq = task_rq_lock(p, &rf);
 +      CLASS(task_rq_lock, rq_guard)(p);
 +      rq = rq_guard.rq;
 +
        update_rq_clock(rq);
  
        /*
         */
        if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
                p->static_prio = NICE_TO_PRIO(nice);
 -              goto out_unlock;
 +              return;
        }
 +
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
         * lowered its priority, then reschedule its CPU:
         */
        p->sched_class->prio_changed(rq, p, old_prio);
 -
 -out_unlock:
 -      task_rq_unlock(rq, p, &rf);
  }
  EXPORT_SYMBOL(set_user_nice);
  
@@@ -7533,21 -7507,6 +7533,21 @@@ static struct task_struct *find_process
        return pid ? find_task_by_vpid(pid) : current;
  }
  
 +static struct task_struct *find_get_task(pid_t pid)
 +{
 +      struct task_struct *p;
 +      guard(rcu)();
 +
 +      p = find_process_by_pid(pid);
 +      if (likely(p))
 +              get_task_struct(p);
 +
 +      return p;
 +}
 +
 +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
 +           find_get_task(pid), pid_t pid)
 +
  /*
   * sched_setparam() passes in -1 for its policy, to let the functions
   * it calls know not to change it.
@@@ -7585,11 -7544,14 +7585,11 @@@ static void __setscheduler_params(struc
  static bool check_same_owner(struct task_struct *p)
  {
        const struct cred *cred = current_cred(), *pcred;
 -      bool match;
 +      guard(rcu)();
  
 -      rcu_read_lock();
        pcred = __task_cred(p);
 -      match = (uid_eq(cred->euid, pcred->euid) ||
 -               uid_eq(cred->euid, pcred->uid));
 -      rcu_read_unlock();
 -      return match;
 +      return (uid_eq(cred->euid, pcred->euid) ||
 +              uid_eq(cred->euid, pcred->uid));
  }
  
  /*
@@@ -8001,17 -7963,27 +8001,17 @@@ static in
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  {
        struct sched_param lparam;
 -      struct task_struct *p;
 -      int retval;
  
        if (!param || pid < 0)
                return -EINVAL;
        if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
                return -EFAULT;
  
 -      rcu_read_lock();
 -      retval = -ESRCH;
 -      p = find_process_by_pid(pid);
 -      if (likely(p))
 -              get_task_struct(p);
 -      rcu_read_unlock();
 -
 -      if (likely(p)) {
 -              retval = sched_setscheduler(p, policy, &lparam);
 -              put_task_struct(p);
 -      }
 +      CLASS(find_get_task, p)(pid);
 +      if (!p)
 +              return -ESRCH;
  
 -      return retval;
 +      return sched_setscheduler(p, policy, &lparam);
  }
  
  /*
@@@ -8107,6 -8079,7 +8107,6 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
                               unsigned int, flags)
  {
        struct sched_attr attr;
 -      struct task_struct *p;
        int retval;
  
        if (!uattr || pid < 0 || flags)
        if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
                attr.sched_policy = SETPARAM_POLICY;
  
 -      rcu_read_lock();
 -      retval = -ESRCH;
 -      p = find_process_by_pid(pid);
 -      if (likely(p))
 -              get_task_struct(p);
 -      rcu_read_unlock();
 +      CLASS(find_get_task, p)(pid);
 +      if (!p)
 +              return -ESRCH;
  
 -      if (likely(p)) {
 -              if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
 -                      get_params(p, &attr);
 -              retval = sched_setattr(p, &attr);
 -              put_task_struct(p);
 -      }
 +      if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
 +              get_params(p, &attr);
  
 -      return retval;
 +      return sched_setattr(p, &attr);
  }
  
  /**
@@@ -8146,17 -8126,16 +8146,17 @@@ SYSCALL_DEFINE1(sched_getscheduler, pid
        if (pid < 0)
                return -EINVAL;
  
 -      retval = -ESRCH;
 -      rcu_read_lock();
 +      guard(rcu)();
        p = find_process_by_pid(pid);
 -      if (p) {
 -              retval = security_task_getscheduler(p);
 -              if (!retval)
 -                      retval = p->policy
 -                              | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
 +      if (!p)
 +              return -ESRCH;
 +
 +      retval = security_task_getscheduler(p);
 +      if (!retval) {
 +              retval = p->policy;
 +              if (p->sched_reset_on_fork)
 +                      retval |= SCHED_RESET_ON_FORK;
        }
 -      rcu_read_unlock();
        return retval;
  }
  
@@@ -8177,23 -8156,30 +8177,23 @@@ SYSCALL_DEFINE2(sched_getparam, pid_t, 
        if (!param || pid < 0)
                return -EINVAL;
  
 -      rcu_read_lock();
 -      p = find_process_by_pid(pid);
 -      retval = -ESRCH;
 -      if (!p)
 -              goto out_unlock;
 +      scoped_guard (rcu) {
 +              p = find_process_by_pid(pid);
 +              if (!p)
 +                      return -ESRCH;
  
 -      retval = security_task_getscheduler(p);
 -      if (retval)
 -              goto out_unlock;
 +              retval = security_task_getscheduler(p);
 +              if (retval)
 +                      return retval;
  
 -      if (task_has_rt_policy(p))
 -              lp.sched_priority = p->rt_priority;
 -      rcu_read_unlock();
 +              if (task_has_rt_policy(p))
 +                      lp.sched_priority = p->rt_priority;
 +      }
  
        /*
         * This one might sleep, we cannot do it with a spinlock held ...
         */
 -      retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
 -
 -      return retval;
 -
 -out_unlock:
 -      rcu_read_unlock();
 -      return retval;
 +      return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
  }
  
  /*
@@@ -8253,38 -8239,46 +8253,38 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
            usize < SCHED_ATTR_SIZE_VER0 || flags)
                return -EINVAL;
  
 -      rcu_read_lock();
 -      p = find_process_by_pid(pid);
 -      retval = -ESRCH;
 -      if (!p)
 -              goto out_unlock;
 +      scoped_guard (rcu) {
 +              p = find_process_by_pid(pid);
 +              if (!p)
 +                      return -ESRCH;
  
 -      retval = security_task_getscheduler(p);
 -      if (retval)
 -              goto out_unlock;
 +              retval = security_task_getscheduler(p);
 +              if (retval)
 +                      return retval;
  
 -      kattr.sched_policy = p->policy;
 -      if (p->sched_reset_on_fork)
 -              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 -      get_params(p, &kattr);
 -      kattr.sched_flags &= SCHED_FLAG_ALL;
 +              kattr.sched_policy = p->policy;
 +              if (p->sched_reset_on_fork)
 +                      kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 +              get_params(p, &kattr);
 +              kattr.sched_flags &= SCHED_FLAG_ALL;
  
  #ifdef CONFIG_UCLAMP_TASK
 -      /*
 -       * This could race with another potential updater, but this is fine
 -       * because it'll correctly read the old or the new value. We don't need
 -       * to guarantee who wins the race as long as it doesn't return garbage.
 -       */
 -      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 -      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
 +              /*
 +               * This could race with another potential updater, but this is fine
 +               * because it'll correctly read the old or the new value. We don't need
 +               * to guarantee who wins the race as long as it doesn't return garbage.
 +               */
 +              kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
 +              kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
  #endif
 -
 -      rcu_read_unlock();
 +      }
  
        return sched_attr_copy_to_user(uattr, &kattr, usize);
 -
 -out_unlock:
 -      rcu_read_unlock();
 -      return retval;
  }
  
  #ifdef CONFIG_SMP
  int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  {
 -      int ret = 0;
 -
        /*
         * If the task isn't a deadline task or admission control is
         * disabled then we don't care about affinity changes.
         * tasks allowed to run on all the CPUs in the task's
         * root_domain.
         */
 -      rcu_read_lock();
 +      guard(rcu)();
        if (!cpumask_subset(task_rq(p)->rd->span, mask))
 -              ret = -EBUSY;
 -      rcu_read_unlock();
 -      return ret;
 +              return -EBUSY;
 +
 +      return 0;
  }
  #endif
  
@@@ -8372,24 -8366,39 +8372,24 @@@ long sched_setaffinity(pid_t pid, cons
  {
        struct affinity_context ac;
        struct cpumask *user_mask;
 -      struct task_struct *p;
        int retval;
  
 -      rcu_read_lock();
 -
 -      p = find_process_by_pid(pid);
 -      if (!p) {
 -              rcu_read_unlock();
 +      CLASS(find_get_task, p)(pid);
 +      if (!p)
                return -ESRCH;
 -      }
 -
 -      /* Prevent p going away */
 -      get_task_struct(p);
 -      rcu_read_unlock();
  
 -      if (p->flags & PF_NO_SETAFFINITY) {
 -              retval = -EINVAL;
 -              goto out_put_task;
 -      }
 +      if (p->flags & PF_NO_SETAFFINITY)
 +              return -EINVAL;
  
        if (!check_same_owner(p)) {
 -              rcu_read_lock();
 -              if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
 -                      rcu_read_unlock();
 -                      retval = -EPERM;
 -                      goto out_put_task;
 -              }
 -              rcu_read_unlock();
 +              guard(rcu)();
 +              if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
 +                      return -EPERM;
        }
  
        retval = security_task_setscheduler(p);
        if (retval)
 -              goto out_put_task;
 +              return retval;
  
        /*
         * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
        if (user_mask) {
                cpumask_copy(user_mask, in_mask);
        } else if (IS_ENABLED(CONFIG_SMP)) {
 -              retval = -ENOMEM;
 -              goto out_put_task;
 +              return -ENOMEM;
        }
  
        ac = (struct affinity_context){
        retval = __sched_setaffinity(p, &ac);
        kfree(ac.user_mask);
  
 -out_put_task:
 -      put_task_struct(p);
        return retval;
  }
  
@@@ -8452,21 -8464,28 +8452,21 @@@ SYSCALL_DEFINE3(sched_setaffinity, pid_
  long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
        struct task_struct *p;
 -      unsigned long flags;
        int retval;
  
 -      rcu_read_lock();
 -
 -      retval = -ESRCH;
 +      guard(rcu)();
        p = find_process_by_pid(pid);
        if (!p)
 -              goto out_unlock;
 +              return -ESRCH;
  
        retval = security_task_getscheduler(p);
        if (retval)
 -              goto out_unlock;
 +              return retval;
  
 -      raw_spin_lock_irqsave(&p->pi_lock, flags);
 +      guard(raw_spinlock_irqsave)(&p->pi_lock);
        cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
 -      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
 -out_unlock:
 -      rcu_read_unlock();
 -
 -      return retval;
 +      return 0;
  }
  
  /**
@@@ -8913,46 -8932,55 +8913,46 @@@ int __sched yield_to(struct task_struc
  {
        struct task_struct *curr = current;
        struct rq *rq, *p_rq;
 -      unsigned long flags;
        int yielded = 0;
  
 -      local_irq_save(flags);
 -      rq = this_rq();
 +      scoped_guard (irqsave) {
 +              rq = this_rq();
  
  again:
 -      p_rq = task_rq(p);
 -      /*
 -       * If we're the only runnable task on the rq and target rq also
 -       * has only one task, there's absolutely no point in yielding.
 -       */
 -      if (rq->nr_running == 1 && p_rq->nr_running == 1) {
 -              yielded = -ESRCH;
 -              goto out_irq;
 -      }
 +              p_rq = task_rq(p);
 +              /*
 +               * If we're the only runnable task on the rq and target rq also
 +               * has only one task, there's absolutely no point in yielding.
 +               */
 +              if (rq->nr_running == 1 && p_rq->nr_running == 1)
 +                      return -ESRCH;
  
 -      double_rq_lock(rq, p_rq);
 -      if (task_rq(p) != p_rq) {
 -              double_rq_unlock(rq, p_rq);
 -              goto again;
 -      }
 +              guard(double_rq_lock)(rq, p_rq);
 +              if (task_rq(p) != p_rq)
 +                      goto again;
  
 -      if (!curr->sched_class->yield_to_task)
 -              goto out_unlock;
 +              if (!curr->sched_class->yield_to_task)
 +                      return 0;
  
 -      if (curr->sched_class != p->sched_class)
 -              goto out_unlock;
 +              if (curr->sched_class != p->sched_class)
 +                      return 0;
  
 -      if (task_on_cpu(p_rq, p) || !task_is_running(p))
 -              goto out_unlock;
 +              if (task_on_cpu(p_rq, p) || !task_is_running(p))
 +                      return 0;
  
 -      yielded = curr->sched_class->yield_to_task(rq, p);
 -      if (yielded) {
 -              schedstat_inc(rq->yld_count);
 -              /*
 -               * Make p's CPU reschedule; pick_next_entity takes care of
 -               * fairness.
 -               */
 -              if (preempt && rq != p_rq)
 -                      resched_curr(p_rq);
 +              yielded = curr->sched_class->yield_to_task(rq, p);
 +              if (yielded) {
 +                      schedstat_inc(rq->yld_count);
 +                      /*
 +                       * Make p's CPU reschedule; pick_next_entity
 +                       * takes care of fairness.
 +                       */
 +                      if (preempt && rq != p_rq)
 +                              resched_curr(p_rq);
 +              }
        }
  
 -out_unlock:
 -      double_rq_unlock(rq, p_rq);
 -out_irq:
 -      local_irq_restore(flags);
 -
 -      if (yielded > 0)
 +      if (yielded)
                schedule();
  
        return yielded;
@@@ -9055,30 -9083,38 +9055,30 @@@ SYSCALL_DEFINE1(sched_get_priority_min
  
  static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
  {
 -      struct task_struct *p;
 -      unsigned int time_slice;
 -      struct rq_flags rf;
 -      struct rq *rq;
 +      unsigned int time_slice = 0;
        int retval;
  
        if (pid < 0)
                return -EINVAL;
  
 -      retval = -ESRCH;
 -      rcu_read_lock();
 -      p = find_process_by_pid(pid);
 -      if (!p)
 -              goto out_unlock;
 +      scoped_guard (rcu) {
 +              struct task_struct *p = find_process_by_pid(pid);
 +              if (!p)
 +                      return -ESRCH;
  
 -      retval = security_task_getscheduler(p);
 -      if (retval)
 -              goto out_unlock;
 +              retval = security_task_getscheduler(p);
 +              if (retval)
 +                      return retval;
  
 -      rq = task_rq_lock(p, &rf);
 -      time_slice = 0;
 -      if (p->sched_class->get_rr_interval)
 -              time_slice = p->sched_class->get_rr_interval(rq, p);
 -      task_rq_unlock(rq, p, &rf);
 +              scoped_guard (task_rq_lock, p) {
 +                      struct rq *rq = scope.rq;
 +                      if (p->sched_class->get_rr_interval)
 +                              time_slice = p->sched_class->get_rr_interval(rq, p);
 +              }
 +      }
  
 -      rcu_read_unlock();
        jiffies_to_timespec64(time_slice, t);
        return 0;
 -
 -out_unlock:
 -      rcu_read_unlock();
 -      return retval;
  }
  
  /**
@@@ -9137,9 -9173,9 +9137,9 @@@ void sched_show_task(struct task_struc
        if (pid_alive(p))
                ppid = task_pid_nr(rcu_dereference(p->real_parent));
        rcu_read_unlock();
 -      pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
 -              free, task_pid_nr(p), ppid,
 -              read_task_thread_flags(p));
 +      pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
 +              free, task_pid_nr(p), task_tgid_nr(p),
 +              ppid, read_task_thread_flags(p));
  
        print_worker_info(KERN_INFO, p);
        print_stop_info(KERN_INFO, p);
@@@ -9469,11 -9505,9 +9469,11 @@@ static void balance_push(struct rq *rq
         * Temporarily drop rq->lock such that we can wake-up the stop task.
         * Both preemption and IRQs are still disabled.
         */
 +      preempt_disable();
        raw_spin_rq_unlock(rq);
        stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
                            this_cpu_ptr(&push_work));
 +      preempt_enable();
        /*
         * At this point need_resched() is true and we'll take the loop in
         * schedule(). The next pick is obviously going to be the stop task
@@@ -9869,7 -9903,7 +9869,7 @@@ struct task_group root_task_group
  LIST_HEAD(task_groups);
  
  /* Cacheline aligned slab cache for task_group */
- static struct kmem_cache *task_group_cache __read_mostly;
+ static struct kmem_cache *task_group_cache __ro_after_init;
  #endif
  
  void __init sched_init(void)
  #ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
 -              rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
 +              rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                rq->balance_callback = &balance_push_callback;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
 -              rq->wake_stamp = jiffies;
 -              rq->wake_avg_idle = rq->avg_idle;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
  
                INIT_LIST_HEAD(&rq->cfs_tasks);
@@@ -10253,9 -10289,9 +10253,9 @@@ void normalize_rt_tasks(void
  
  #endif /* CONFIG_MAGIC_SYSRQ */
  
 -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
 +#if defined(CONFIG_KGDB_KDB)
  /*
 - * These functions are only useful for the IA64 MCA handling, or kdb.
 + * These functions are only useful for kdb.
   *
   * They can only be called when the whole system has been
   * stopped - every CPU needs to be quiescent, and no scheduling
@@@ -10277,7 -10313,30 +10277,7 @@@ struct task_struct *curr_task(int cpu
        return cpu_curr(cpu);
  }
  
 -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
 -
 -#ifdef CONFIG_IA64
 -/**
 - * ia64_set_curr_task - set the current task for a given CPU.
 - * @cpu: the processor in question.
 - * @p: the task pointer to set.
 - *
 - * Description: This function must only be used when non-maskable interrupts
 - * are serviced on a separate stack. It allows the architecture to switch the
 - * notion of the current task on a CPU in a non-blocking manner. This function
 - * must be called with all CPU's synchronized, and interrupts disabled, the
 - * and caller must save the original value of the current task (see
 - * curr_task() above) and restore that value before reenabling interrupts and
 - * re-starting the system.
 - *
 - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 - */
 -void ia64_set_curr_task(int cpu, struct task_struct *p)
 -{
 -      cpu_curr(cpu) = p;
 -}
 -
 -#endif
 +#endif /* defined(CONFIG_KGDB_KDB) */
  
  #ifdef CONFIG_CGROUP_SCHED
  /* task_group_lock serializes the addition/removal of task groups */
@@@ -10439,18 -10498,17 +10439,18 @@@ void sched_move_task(struct task_struc
        int queued, running, queue_flags =
                DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct task_group *group;
 -      struct rq_flags rf;
        struct rq *rq;
  
 -      rq = task_rq_lock(tsk, &rf);
 +      CLASS(task_rq_lock, rq_guard)(tsk);
 +      rq = rq_guard.rq;
 +
        /*
         * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
         * group changes.
         */
        group = sched_get_task_group(tsk);
        if (group == tsk->sched_task_group)
 -              goto unlock;
 +              return;
  
        update_rq_clock(rq);
  
                 */
                resched_curr(rq);
        }
 -
 -unlock:
 -      task_rq_unlock(rq, tsk, &rf);
  }
  
  static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@@ -10511,9 -10572,11 +10511,9 @@@ static int cpu_cgroup_css_online(struc
  
  #ifdef CONFIG_UCLAMP_TASK_GROUP
        /* Propagate the effective uclamp value for the new group */
 -      mutex_lock(&uclamp_mutex);
 -      rcu_read_lock();
 +      guard(mutex)(&uclamp_mutex);
 +      guard(rcu)();
        cpu_util_update_eff(css);
 -      rcu_read_unlock();
 -      mutex_unlock(&uclamp_mutex);
  #endif
  
        return 0;
@@@ -10664,8 -10727,8 +10664,8 @@@ static ssize_t cpu_uclamp_write(struct 
  
        static_branch_enable(&sched_uclamp_used);
  
 -      mutex_lock(&uclamp_mutex);
 -      rcu_read_lock();
 +      guard(mutex)(&uclamp_mutex);
 +      guard(rcu)();
  
        tg = css_tg(of_css(of));
        if (tg->uclamp_req[clamp_id].value != req.util)
        /* Update effective clamps to track the most restrictive value */
        cpu_util_update_eff(of_css(of));
  
 -      rcu_read_unlock();
 -      mutex_unlock(&uclamp_mutex);
 -
        return nbytes;
  }
  
@@@ -10705,10 -10771,10 +10705,10 @@@ static inline void cpu_uclamp_print(str
        u64 percent;
        u32 rem;
  
 -      rcu_read_lock();
 -      tg = css_tg(seq_css(sf));
 -      util_clamp = tg->uclamp_req[clamp_id].value;
 -      rcu_read_unlock();
 +      scoped_guard (rcu) {
 +              tg = css_tg(seq_css(sf));
 +              util_clamp = tg->uclamp_req[clamp_id].value;
 +      }
  
        if (util_clamp == SCHED_CAPACITY_SCALE) {
                seq_puts(sf, "max\n");
@@@ -10799,12 -10865,11 +10799,12 @@@ static int tg_set_cfs_bandwidth(struct 
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
 -      cpus_read_lock();
 -      mutex_lock(&cfs_constraints_mutex);
 +      guard(cpus_read_lock)();
 +      guard(mutex)(&cfs_constraints_mutex);
 +
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
 -              goto out_unlock;
 +              return ret;
  
        runtime_enabled = quota != RUNTIME_INF;
        runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
         */
        if (runtime_enabled && !runtime_was_enabled)
                cfs_bandwidth_usage_inc();
 -      raw_spin_lock_irq(&cfs_b->lock);
 -      cfs_b->period = ns_to_ktime(period);
 -      cfs_b->quota = quota;
 -      cfs_b->burst = burst;
  
 -      __refill_cfs_bandwidth_runtime(cfs_b);
 +      scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
 +              cfs_b->period = ns_to_ktime(period);
 +              cfs_b->quota = quota;
 +              cfs_b->burst = burst;
  
 -      /* Restart the period timer (if active) to handle new period expiry: */
 -      if (runtime_enabled)
 -              start_cfs_bandwidth(cfs_b);
 +              __refill_cfs_bandwidth_runtime(cfs_b);
  
 -      raw_spin_unlock_irq(&cfs_b->lock);
 +              /*
 +               * Restart the period timer (if active) to handle new
 +               * period expiry:
 +               */
 +              if (runtime_enabled)
 +                      start_cfs_bandwidth(cfs_b);
 +      }
  
        for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;
 -              struct rq_flags rf;
  
 -              rq_lock_irq(rq, &rf);
 +              guard(rq_lock_irq)(rq);
                cfs_rq->runtime_enabled = runtime_enabled;
                cfs_rq->runtime_remaining = 0;
  
                if (cfs_rq->throttled)
                        unthrottle_cfs_rq(cfs_rq);
 -              rq_unlock_irq(rq, &rf);
        }
 +
        if (runtime_was_enabled && !runtime_enabled)
                cfs_bandwidth_usage_dec();
 -out_unlock:
 -      mutex_unlock(&cfs_constraints_mutex);
 -      cpus_read_unlock();
  
 -      return ret;
 +      return 0;
  }
  
  static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@@ -11030,6 -11096,7 +11030,6 @@@ static int tg_cfs_schedulable_down(stru
  
  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
  {
 -      int ret;
        struct cfs_schedulable_data data = {
                .tg = tg,
                .period = period,
                do_div(data.quota, NSEC_PER_USEC);
        }
  
 -      rcu_read_lock();
 -      ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
 -      rcu_read_unlock();
 -
 -      return ret;
 +      guard(rcu)();
 +      return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
  }
  
  static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@@ -11647,12 -11717,14 +11647,12 @@@ int __sched_mm_cid_migrate_from_fetch_c
         * are not the last task to be migrated from this cpu for this mm, so
         * there is no need to move src_cid to the destination cpu.
         */
 -      rcu_read_lock();
 +      guard(rcu)();
        src_task = rcu_dereference(src_rq->curr);
        if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
 -              rcu_read_unlock();
                t->last_mm_cid = -1;
                return -1;
        }
 -      rcu_read_unlock();
  
        return src_cid;
  }
@@@ -11696,17 -11768,18 +11696,17 @@@ int __sched_mm_cid_migrate_from_try_ste
         * the lazy-put flag, this task will be responsible for transitioning
         * from lazy-put flag set to MM_CID_UNSET.
         */
 -      rcu_read_lock();
 -      src_task = rcu_dereference(src_rq->curr);
 -      if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
 -              rcu_read_unlock();
 -              /*
 -               * We observed an active task for this mm, there is therefore
 -               * no point in moving this cid to the destination cpu.
 -               */
 -              t->last_mm_cid = -1;
 -              return -1;
 +      scoped_guard (rcu) {
 +              src_task = rcu_dereference(src_rq->curr);
 +              if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
 +                      /*
 +                       * We observed an active task for this mm, there is therefore
 +                       * no point in moving this cid to the destination cpu.
 +                       */
 +                      t->last_mm_cid = -1;
 +                      return -1;
 +              }
        }
 -      rcu_read_unlock();
  
        /*
         * The src_cid is unused, so it can be unset.
@@@ -11779,6 -11852,7 +11779,6 @@@ static void sched_mm_cid_remote_clear(s
  {
        struct rq *rq = cpu_rq(cpu);
        struct task_struct *t;
 -      unsigned long flags;
        int cid, lazy_cid;
  
        cid = READ_ONCE(pcpu_cid->cid);
         * the lazy-put flag, that task will be responsible for transitioning
         * from lazy-put flag set to MM_CID_UNSET.
         */
 -      rcu_read_lock();
 -      t = rcu_dereference(rq->curr);
 -      if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
 -              rcu_read_unlock();
 -              return;
 +      scoped_guard (rcu) {
 +              t = rcu_dereference(rq->curr);
 +              if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
 +                      return;
        }
 -      rcu_read_unlock();
  
        /*
         * The cid is unused, so it can be unset.
         * Disable interrupts to keep the window of cid ownership without rq
         * lock small.
         */
 -      local_irq_save(flags);
 -      if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
 -              __mm_cid_put(mm, cid);
 -      local_irq_restore(flags);
 +      scoped_guard (irqsave) {
 +              if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
 +                      __mm_cid_put(mm, cid);
 +      }
  }
  
  static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
         * snapshot associated with this cid if an active task using the mm is
         * observed on this rq.
         */
 -      rcu_read_lock();
 -      curr = rcu_dereference(rq->curr);
 -      if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
 -              WRITE_ONCE(pcpu_cid->time, rq_clock);
 -              rcu_read_unlock();
 -              return;
 +      scoped_guard (rcu) {
 +              curr = rcu_dereference(rq->curr);
 +              if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
 +                      WRITE_ONCE(pcpu_cid->time, rq_clock);
 +                      return;
 +              }
        }
 -      rcu_read_unlock();
  
        if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
                return;
@@@ -11949,6 -12026,7 +11949,6 @@@ void task_tick_mm_cid(struct rq *rq, st
  void sched_mm_cid_exit_signals(struct task_struct *t)
  {
        struct mm_struct *mm = t->mm;
 -      struct rq_flags rf;
        struct rq *rq;
  
        if (!mm)
  
        preempt_disable();
        rq = this_rq();
 -      rq_lock_irqsave(rq, &rf);
 +      guard(rq_lock_irqsave)(rq);
        preempt_enable_no_resched();    /* holding spinlock */
        WRITE_ONCE(t->mm_cid_active, 0);
        /*
        smp_mb();
        mm_cid_put(mm);
        t->last_mm_cid = t->mm_cid = -1;
 -      rq_unlock_irqrestore(rq, &rf);
  }
  
  void sched_mm_cid_before_execve(struct task_struct *t)
  {
        struct mm_struct *mm = t->mm;
 -      struct rq_flags rf;
        struct rq *rq;
  
        if (!mm)
  
        preempt_disable();
        rq = this_rq();
 -      rq_lock_irqsave(rq, &rf);
 +      guard(rq_lock_irqsave)(rq);
        preempt_enable_no_resched();    /* holding spinlock */
        WRITE_ONCE(t->mm_cid_active, 0);
        /*
        smp_mb();
        mm_cid_put(mm);
        t->last_mm_cid = t->mm_cid = -1;
 -      rq_unlock_irqrestore(rq, &rf);
  }
  
  void sched_mm_cid_after_execve(struct task_struct *t)
  {
        struct mm_struct *mm = t->mm;
 -      struct rq_flags rf;
        struct rq *rq;
  
        if (!mm)
  
        preempt_disable();
        rq = this_rq();
 -      rq_lock_irqsave(rq, &rf);
 -      preempt_enable_no_resched();    /* holding spinlock */
 -      WRITE_ONCE(t->mm_cid_active, 1);
 -      /*
 -       * Store t->mm_cid_active before loading per-mm/cpu cid.
 -       * Matches barrier in sched_mm_cid_remote_clear_old().
 -       */
 -      smp_mb();
 -      t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
 -      rq_unlock_irqrestore(rq, &rf);
 +      scoped_guard (rq_lock_irqsave, rq) {
 +              preempt_enable_no_resched();    /* holding spinlock */
 +              WRITE_ONCE(t->mm_cid_active, 1);
 +              /*
 +               * Store t->mm_cid_active before loading per-mm/cpu cid.
 +               * Matches barrier in sched_mm_cid_remote_clear_old().
 +               */
 +              smp_mb();
 +              t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
 +      }
        rseq_set_notify_resume(t);
  }
  
diff --combined kernel/signal.c
index 83fcbaf0e82de0be624c64bcebf6bbdaeda855c6,edaf39382d21d8d851002a4962e1bc8df0c5d040..47a7602dfe8df43e786b7b33119b71599372b151
@@@ -415,7 -415,7 +415,7 @@@ __sigqueue_alloc(int sig, struct task_s
                 int override_rlimit, const unsigned int sigqueue_flags)
  {
        struct sigqueue *q = NULL;
-       struct ucounts *ucounts = NULL;
+       struct ucounts *ucounts;
        long sigpending;
  
        /*
@@@ -1058,12 -1058,11 +1058,11 @@@ static void complete_signal(int sig, st
                        signal->flags = SIGNAL_GROUP_EXIT;
                        signal->group_exit_code = sig;
                        signal->group_stop_count = 0;
-                       t = p;
-                       do {
+                       __for_each_thread(signal, t) {
                                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                                sigaddset(&t->pending.signal, SIGKILL);
                                signal_wake_up(t, 1);
-                       } while_each_thread(p, t);
+                       }
                        return;
                }
        }
@@@ -1471,16 -1470,21 +1470,21 @@@ int group_send_sig_info(int sig, struc
  int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
  {
        struct task_struct *p = NULL;
-       int retval, success;
+       int ret = -ESRCH;
  
-       success = 0;
-       retval = -ESRCH;
        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
-               success |= !err;
-               retval = err;
+               /*
+                * If group_send_sig_info() succeeds at least once, ret
+                * becomes 0 and the code below has no further effect.
+                * Otherwise we return the last err, or -ESRCH if this
+                * process group is empty.
+                */
+               if (ret)
+                       ret = err;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
-       return success ? 0 : retval;
+       return ret;
  }
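
The rewritten __kill_pgrp_info() collapses the old success/retval pair into a single ret with the semantics spelled out in the new comment: start at -ESRCH for an empty group, keep overwriting with the latest error while every delivery fails, and stick at 0 as soon as one group_send_sig_info() call succeeds. A tiny standalone demonstration of that aggregation pattern, with an array of stub results standing in for real signal delivery:

/* Error aggregation as in __kill_pgrp_info(): 0 once anything succeeds,
 * otherwise the last error, -ESRCH if the group was empty. */
#include <errno.h>
#include <stdio.h>

static int aggregate(const int *errs, int n)
{
        int ret = -ESRCH;

        for (int i = 0; i < n; i++) {
                int err = errs[i];      /* stub for group_send_sig_info() */

                if (ret)
                        ret = err;
        }
        return ret;
}

int main(void)
{
        int all_fail[] = { -EPERM, -EAGAIN };
        int one_ok[]   = { -EPERM, 0, -EAGAIN };

        printf("empty group: %d\n", aggregate(NULL, 0));      /* -ESRCH */
        printf("all fail:    %d\n", aggregate(all_fail, 2));  /* -EAGAIN */
        printf("one success: %d\n", aggregate(one_ok, 3));    /* 0 */
        return 0;
}
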
  
  int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
@@@ -1718,8 -1722,9 +1722,8 @@@ void force_sigsegv(int sig
                force_sig(SIGSEGV);
  }
  
 -int force_sig_fault_to_task(int sig, int code, void __user *addr
 -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 -      , struct task_struct *t)
 +int force_sig_fault_to_task(int sig, int code, void __user *addr,
 +                          struct task_struct *t)
  {
        struct kernel_siginfo info;
  
        info.si_errno = 0;
        info.si_code  = code;
        info.si_addr  = addr;
 -#ifdef __ia64__
 -      info.si_imm = imm;
 -      info.si_flags = flags;
 -      info.si_isr = isr;
 -#endif
        return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
  }
  
 -int force_sig_fault(int sig, int code, void __user *addr
 -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
 +int force_sig_fault(int sig, int code, void __user *addr)
  {
 -      return force_sig_fault_to_task(sig, code, addr
 -                                     ___ARCH_SI_IA64(imm, flags, isr), current);
 +      return force_sig_fault_to_task(sig, code, addr, current);
  }
  
 -int send_sig_fault(int sig, int code, void __user *addr
 -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
 -      , struct task_struct *t)
 +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
  {
        struct kernel_siginfo info;
  
        info.si_errno = 0;
        info.si_code  = code;
        info.si_addr  = addr;
 -#ifdef __ia64__
 -      info.si_imm = imm;
 -      info.si_flags = flags;
 -      info.si_isr = isr;
 -#endif
        return send_sig_info(info.si_signo, &info, t);
  }
  
@@@ -2314,38 -2333,15 +2318,38 @@@ static int ptrace_stop(int exit_code, i
                do_notify_parent_cldstop(current, false, why);
  
        /*
 -       * Don't want to allow preemption here, because
 -       * sys_ptrace() needs this task to be inactive.
 +       * The previous do_notify_parent_cldstop() invocation woke ptracer.
 +       * On a PREEMPTION kernel this can result in a preemption requirement
 +       * which will be fulfilled after read_unlock() and the ptracer will be
 +       * put on the CPU.
 +       * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
 +       * this task to wait in schedule(). If this task gets preempted then it
 +       * remains enqueued on the runqueue. The ptracer will observe this and
 +       * then sleep for a delay of one HZ tick. In the meantime this task
 +       * gets scheduled, enters schedule() and will wait for the ptracer.
 +       *
 +       * This preemption point is not bad from a correctness point of
 +       * view but extends the runtime by one HZ tick time due to the
 +       * ptracer's sleep.  The preempt-disable section ensures that there
 +       * will be no preemption between unlock and schedule(), which
 +       * improves performance since the ptracer will observe that
 +       * the tracee is scheduled out once it gets on the CPU.
         *
 -       * XXX: implement read_unlock_no_resched().
 +       * On PREEMPT_RT locking tasklist_lock does not disable preemption.
 +       * Therefore the task can be preempted after do_notify_parent_cldstop()
 +       * before unlocking tasklist_lock so there is no benefit in doing this.
 +       *
 +       * In fact disabling preemption is harmful on PREEMPT_RT because
 +       * the spinlock_t in cgroup_enter_frozen() must not be acquired
 +       * with preemption disabled due to the 'sleeping' spinlock
 +       * substitution of RT.
         */
 -      preempt_disable();
 +      if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 +              preempt_disable();
        read_unlock(&tasklist_lock);
        cgroup_enter_frozen();
 -      preempt_enable_no_resched();
 +      if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 +              preempt_enable_no_resched();
        schedule();
        cgroup_leave_frozen(true);
  
diff --combined kernel/sys.c
index 4a8073c1b2558e6d5f050316b21c3da7e15c4331,67436d465be45160772e6977c3b97dd20194fd43..420d9cb9cc8e203f50014bb2ec564f6598d9869c
@@@ -1786,6 -1786,7 +1786,7 @@@ void getrusage(struct task_struct *p, i
        unsigned long flags;
        u64 tgutime, tgstime, utime, stime;
        unsigned long maxrss = 0;
+       struct signal_struct *sig = p->signal;
  
        memset((char *)r, 0, sizeof (*r));
        utime = stime = 0;
        if (who == RUSAGE_THREAD) {
                task_cputime_adjusted(current, &utime, &stime);
                accumulate_thread_rusage(p, r);
-               maxrss = p->signal->maxrss;
+               maxrss = sig->maxrss;
                goto out;
        }
  
        switch (who) {
        case RUSAGE_BOTH:
        case RUSAGE_CHILDREN:
-               utime = p->signal->cutime;
-               stime = p->signal->cstime;
-               r->ru_nvcsw = p->signal->cnvcsw;
-               r->ru_nivcsw = p->signal->cnivcsw;
-               r->ru_minflt = p->signal->cmin_flt;
-               r->ru_majflt = p->signal->cmaj_flt;
-               r->ru_inblock = p->signal->cinblock;
-               r->ru_oublock = p->signal->coublock;
-               maxrss = p->signal->cmaxrss;
+               utime = sig->cutime;
+               stime = sig->cstime;
+               r->ru_nvcsw = sig->cnvcsw;
+               r->ru_nivcsw = sig->cnivcsw;
+               r->ru_minflt = sig->cmin_flt;
+               r->ru_majflt = sig->cmaj_flt;
+               r->ru_inblock = sig->cinblock;
+               r->ru_oublock = sig->coublock;
+               maxrss = sig->cmaxrss;
  
                if (who == RUSAGE_CHILDREN)
                        break;
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                utime += tgutime;
                stime += tgstime;
-               r->ru_nvcsw += p->signal->nvcsw;
-               r->ru_nivcsw += p->signal->nivcsw;
-               r->ru_minflt += p->signal->min_flt;
-               r->ru_majflt += p->signal->maj_flt;
-               r->ru_inblock += p->signal->inblock;
-               r->ru_oublock += p->signal->oublock;
-               if (maxrss < p->signal->maxrss)
-                       maxrss = p->signal->maxrss;
-               t = p;
-               do {
+               r->ru_nvcsw += sig->nvcsw;
+               r->ru_nivcsw += sig->nivcsw;
+               r->ru_minflt += sig->min_flt;
+               r->ru_majflt += sig->maj_flt;
+               r->ru_inblock += sig->inblock;
+               r->ru_oublock += sig->oublock;
+               if (maxrss < sig->maxrss)
+                       maxrss = sig->maxrss;
+               __for_each_thread(sig, t)
                        accumulate_thread_rusage(t, r);
-               } while_each_thread(p, t);
                break;
  
        default:
@@@ -2368,41 -2367,19 +2367,41 @@@ static int prctl_set_vma(unsigned long 
  }
  #endif /* CONFIG_ANON_VMA_NAME */
  
 +static inline unsigned long get_current_mdwe(void)
 +{
 +      unsigned long ret = 0;
 +
 +      if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
 +              ret |= PR_MDWE_REFUSE_EXEC_GAIN;
 +      if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
 +              ret |= PR_MDWE_NO_INHERIT;
 +
 +      return ret;
 +}
 +
  static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
                                 unsigned long arg4, unsigned long arg5)
  {
 +      unsigned long current_bits;
 +
        if (arg3 || arg4 || arg5)
                return -EINVAL;
  
 -      if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
 +      if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
 +              return -EINVAL;
 +
 +      /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
 +      if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
                return -EINVAL;
  
 +      current_bits = get_current_mdwe();
 +      if (current_bits && current_bits != bits)
 +              return -EPERM; /* Cannot unset the flags */
 +
 +      if (bits & PR_MDWE_NO_INHERIT)
 +              set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
        if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
                set_bit(MMF_HAS_MDWE, &current->mm->flags);
 -      else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
 -              return -EPERM; /* Cannot unset the flag */
  
        return 0;
  }
@@@ -2412,7 -2389,9 +2411,7 @@@ static inline int prctl_get_mdwe(unsign
  {
        if (arg2 || arg3 || arg4 || arg5)
                return -EINVAL;
 -
 -      return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
 -              PR_MDWE_REFUSE_EXEC_GAIN : 0;
 +      return get_current_mdwe();
  }
  
  static int prctl_get_auxv(void __user *addr, unsigned long len)
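
On the userspace side, prctl_set_mdwe() above means PR_MDWE_NO_INHERIT is only accepted together with PR_MDWE_REFUSE_EXEC_GAIN, and once any MDWE bits are set they can only be re-applied unchanged, never relaxed. A small usage sketch; the fallback constants mirror include/uapi/linux/prctl.h as of this series and are only there in case the installed headers are older:

/* Enable MDWE for this process only, without children spawned later
 * inheriting it.  Needs a kernel that understands PR_MDWE_NO_INHERIT. */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MDWE
#define PR_SET_MDWE                     65
#define PR_GET_MDWE                     66
#endif
#ifndef PR_MDWE_REFUSE_EXEC_GAIN
#define PR_MDWE_REFUSE_EXEC_GAIN        (1UL << 0)
#endif
#ifndef PR_MDWE_NO_INHERIT
#define PR_MDWE_NO_INHERIT              (1UL << 1)
#endif

int main(void)
{
        unsigned long bits = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT;

        if (prctl(PR_SET_MDWE, bits, 0L, 0L, 0L)) {
                perror("PR_SET_MDWE");
                return 1;
        }

        /* Re-applying the same bits is allowed; dropping either bit now
         * would fail with EPERM. */
        printf("mdwe bits: 0x%x\n",
               (unsigned int)prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L));
        return 0;
}
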
diff --combined kernel/user_namespace.c
index d52a894ecf57d0abfb33e196d8fae879c6c6f884,bf2cb8c115711a10d6a07b2aa6a948eeb2f82290..eabe8bcc70426519bdfef4b08e53a86451ba76c2
@@@ -22,7 -22,7 +22,7 @@@
  #include <linux/bsearch.h>
  #include <linux/sort.h>
  
- static struct kmem_cache *user_ns_cachep __read_mostly;
+ static struct kmem_cache *user_ns_cachep __ro_after_init;
  static DEFINE_MUTEX(userns_state_mutex);
  
  static bool new_idmap_permitted(const struct file *file,
@@@ -213,9 -213,6 +213,9 @@@ static void free_user_ns(struct work_st
                        kfree(ns->projid_map.forward);
                        kfree(ns->projid_map.reverse);
                }
 +#if IS_ENABLED(CONFIG_BINFMT_MISC)
 +              kfree(ns->binfmt_misc);
 +#endif
                retire_userns_sysctls(ns);
                key_free_user_ns(ns);
                ns_free_inum(&ns->ns);
diff --combined kernel/workqueue.c
index 0f682da96e1c52ea42aa8a23590cf8448fa27334,96b89f0edbe3ae56aab0e152fa45d4f0af59b737..6e578f576a6f2b73b98817b0f5489a79c9d85524
@@@ -418,21 -418,21 +418,21 @@@ static struct workqueue_attrs *ordered_
   * process context while holding a pool lock. Bounce to a dedicated kthread
   * worker to avoid A-A deadlocks.
   */
- static struct kthread_worker *pwq_release_worker;
+ static struct kthread_worker *pwq_release_worker __ro_after_init;
  
- struct workqueue_struct *system_wq __read_mostly;
+ struct workqueue_struct *system_wq __ro_after_init;
  EXPORT_SYMBOL(system_wq);
- struct workqueue_struct *system_highpri_wq __read_mostly;
+ struct workqueue_struct *system_highpri_wq __ro_after_init;
  EXPORT_SYMBOL_GPL(system_highpri_wq);
- struct workqueue_struct *system_long_wq __read_mostly;
+ struct workqueue_struct *system_long_wq __ro_after_init;
  EXPORT_SYMBOL_GPL(system_long_wq);
- struct workqueue_struct *system_unbound_wq __read_mostly;
+ struct workqueue_struct *system_unbound_wq __ro_after_init;
  EXPORT_SYMBOL_GPL(system_unbound_wq);
- struct workqueue_struct *system_freezable_wq __read_mostly;
+ struct workqueue_struct *system_freezable_wq __ro_after_init;
  EXPORT_SYMBOL_GPL(system_freezable_wq);
- struct workqueue_struct *system_power_efficient_wq __read_mostly;
+ struct workqueue_struct *system_power_efficient_wq __ro_after_init;
  EXPORT_SYMBOL_GPL(system_power_efficient_wq);
- struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+ struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
  
  static int worker_thread(void *__worker);
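
These annotation changes come from the treewide "mark stuff as __ro_after_init" series in this pull: each of these workqueue pointers (and user_ns_cachep and task_group_cache above) is assigned during init and never written again, so it can move from __read_mostly, which is only a cache-placement hint, to __ro_after_init, which additionally lets the kernel write-protect the data once init has finished (roughly, the .data..ro_after_init section is made read-only by mark_rodata_ro()). As a loose userspace analogy only, not the kernel mechanism, the "writable during setup, read-only afterwards" idea looks like this with mmap()/mprotect():

/* Loose analogy for __ro_after_init: fill in an object during "init",
 * then seal the page so later writes fault instead of corrupting it. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

struct config {
        long magic;
        char name[32];
};

int main(void)
{
        size_t pagesz = (size_t)sysconf(_SC_PAGESIZE);
        struct config *cfg;

        cfg = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (cfg == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* "init time": the only window in which writes are allowed */
        cfg->magic = 0x5eed;
        strcpy(cfg->name, "boot-time value");

        /* the "end of init" moment: seal it */
        if (mprotect(cfg, pagesz, PROT_READ)) {
                perror("mprotect");
                return 1;
        }

        printf("%s (0x%lx)\n", cfg->name, cfg->magic);
        /* cfg->magic = 0; would now die with SIGSEGV */
        return 0;
}
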
@@@ -2166,7 -2166,7 +2166,7 @@@ static struct worker *create_worker(str
  {
        struct worker *worker;
        int id;
 -      char id_buf[16];
 +      char id_buf[23];
  
        /* ID is needed to determine kthread name */
        id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@@ -4600,22 -4600,12 +4600,22 @@@ static int alloc_and_link_pwqs(struct w
        }
        cpus_read_unlock();
  
 +      /* For unbound pwqs, flushing pwq_release_worker ensures that
 +       * pwq_release_workfn() completes before kfree(wq) is called.
 +       */
 +      if (ret)
 +              kthread_flush_worker(pwq_release_worker);
 +
        return ret;
  
  enomem:
        if (wq->cpu_pwq) {
 -              for_each_possible_cpu(cpu)
 -                      kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
 +              for_each_possible_cpu(cpu) {
 +                      struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
 +
 +                      if (pwq)
 +                              kmem_cache_free(pwq_cache, pwq);
 +              }
                free_percpu(wq->cpu_pwq);
                wq->cpu_pwq = NULL;
        }
@@@ -5622,54 -5612,50 +5622,54 @@@ static void work_for_cpu_fn(struct work
  }
  
  /**
 - * work_on_cpu - run a function in thread context on a particular cpu
 + * work_on_cpu_key - run a function in thread context on a particular cpu
   * @cpu: the cpu to run on
   * @fn: the function to run
   * @arg: the function arg
 + * @key: The lock class key for lock debugging purposes
   *
   * It is up to the caller to ensure that the cpu doesn't go offline.
   * The caller must not hold any locks which would prevent @fn from completing.
   *
   * Return: The value @fn returns.
   */
 -long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 +long work_on_cpu_key(int cpu, long (*fn)(void *),
 +                   void *arg, struct lock_class_key *key)
  {
        struct work_for_cpu wfc = { .fn = fn, .arg = arg };
  
 -      INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
 +      INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
        schedule_work_on(cpu, &wfc.work);
        flush_work(&wfc.work);
        destroy_work_on_stack(&wfc.work);
        return wfc.ret;
  }
 -EXPORT_SYMBOL_GPL(work_on_cpu);
 +EXPORT_SYMBOL_GPL(work_on_cpu_key);
  
  /**
 - * work_on_cpu_safe - run a function in thread context on a particular cpu
 + * work_on_cpu_safe_key - run a function in thread context on a particular cpu
   * @cpu: the cpu to run on
   * @fn:  the function to run
   * @arg: the function argument
 + * @key: The lock class key for lock debugging purposes
   *
   * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
   * any locks which would prevent @fn from completing.
   *
   * Return: The value @fn returns.
   */
 -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
 +long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
 +                        void *arg, struct lock_class_key *key)
  {
        long ret = -ENODEV;
  
        cpus_read_lock();
        if (cpu_online(cpu))
 -              ret = work_on_cpu(cpu, fn, arg);
 +              ret = work_on_cpu_key(cpu, fn, arg, key);
        cpus_read_unlock();
        return ret;
  }
 -EXPORT_SYMBOL_GPL(work_on_cpu_safe);
 +EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
  #endif /* CONFIG_SMP */
  
  #ifdef CONFIG_FREEZER
@@@ -5796,13 -5782,9 +5796,13 @@@ static int workqueue_apply_unbound_cpum
        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_UNBOUND))
                        continue;
 +
                /* creating multiple pwqs breaks ordering guarantee */
 -              if (wq->flags & __WQ_ORDERED)
 -                      continue;
 +              if (!list_empty(&wq->pwqs)) {
 +                      if (wq->flags & __WQ_ORDERED_EXPLICIT)
 +                              continue;
 +                      wq->flags &= ~__WQ_ORDERED;
 +              }
  
                ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
                if (IS_ERR(ctx)) {
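
The work_on_cpu() to work_on_cpu_key() conversion above lets each call site carry its own lockdep class. A rough kernel-side caller sketch under that assumption; probe_fn() and run_probe_on() are made-up names for illustration, not functions from this tree.

#include <linux/lockdep.h>
#include <linux/workqueue.h>

static long probe_fn(void *arg)
{
        /* hypothetical work: runs in thread context on the chosen CPU */
        return *(int *)arg * 2;
}

static long run_probe_on(int cpu, int *val)
{
        static struct lock_class_key probe_key; /* distinct lockdep class per call site */

        /* Runs probe_fn(val) on @cpu and returns its result. */
        return work_on_cpu_key(cpu, probe_fn, val, &probe_key);
}
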
diff --combined mm/damon/core.c
index aa2dc7087cd93192bc21d5d8d6977823ff43c949,2f54f153d7f53228fa183ee4f025e3975d72d42f..630077d95dc60721015ea4b195c85e1c73f484ce
@@@ -128,7 -128,6 +128,7 @@@ struct damon_region *damon_new_region(u
        region->ar.start = start;
        region->ar.end = end;
        region->nr_accesses = 0;
 +      region->nr_accesses_bp = 0;
        INIT_LIST_HEAD(&region->list);
  
        region->age = 0;
@@@ -313,9 -312,7 +313,9 @@@ static struct damos_quota *damos_quota_
  }
  
  struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 -                      enum damos_action action, struct damos_quota *quota,
 +                      enum damos_action action,
 +                      unsigned long apply_interval_us,
 +                      struct damos_quota *quota,
                        struct damos_watermarks *wmarks)
  {
        struct damos *scheme;
                return NULL;
        scheme->pattern = *pattern;
        scheme->action = action;
 +      scheme->apply_interval_us = apply_interval_us;
 +      /*
 +       * next_apply_sis will be set when kdamond starts.  While kdamond is
 +       * running, it will also be updated when it is added to the DAMON context,
 +       * or damon_attrs are updated.
 +       */
 +      scheme->next_apply_sis = 0;
        INIT_LIST_HEAD(&scheme->filters);
        scheme->stat = (struct damos_stat){};
        INIT_LIST_HEAD(&scheme->list);
        return scheme;
  }
  
 +static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx)
 +{
 +      unsigned long sample_interval = ctx->attrs.sample_interval ?
 +              ctx->attrs.sample_interval : 1;
 +      unsigned long apply_interval = s->apply_interval_us ?
 +              s->apply_interval_us : ctx->attrs.aggr_interval;
 +
 +      s->next_apply_sis = ctx->passed_sample_intervals +
 +              apply_interval / sample_interval;
 +}
 +
  void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
  {
        list_add_tail(&s->list, &ctx->schemes);
 +      damos_set_next_apply_sis(s, ctx);
  }
  
  static void damon_del_scheme(struct damos *s)
@@@ -449,10 -427,8 +449,10 @@@ struct damon_ctx *damon_new_ctx(void
        ctx->attrs.aggr_interval = 100 * 1000;
        ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
  
 -      ktime_get_coarse_ts64(&ctx->last_aggregation);
 -      ctx->last_ops_update = ctx->last_aggregation;
 +      ctx->passed_sample_intervals = 0;
 +      /* These will be set from kdamond_init_intervals_sis() */
 +      ctx->next_aggregation_sis = 0;
 +      ctx->next_ops_update_sis = 0;
  
        mutex_init(&ctx->kdamond_lock);
  
@@@ -500,14 -476,20 +500,14 @@@ static unsigned int damon_age_for_new_a
  static unsigned int damon_accesses_bp_to_nr_accesses(
                unsigned int accesses_bp, struct damon_attrs *attrs)
  {
 -      unsigned int max_nr_accesses =
 -              attrs->aggr_interval / attrs->sample_interval;
 -
 -      return accesses_bp * max_nr_accesses / 10000;
 +      return accesses_bp * damon_max_nr_accesses(attrs) / 10000;
  }
  
  /* convert nr_accesses to access ratio in bp (per 10,000) */
  static unsigned int damon_nr_accesses_to_accesses_bp(
                unsigned int nr_accesses, struct damon_attrs *attrs)
  {
 -      unsigned int max_nr_accesses =
 -              attrs->aggr_interval / attrs->sample_interval;
 -
 -      return nr_accesses * 10000 / max_nr_accesses;
 +      return nr_accesses * 10000 / damon_max_nr_accesses(attrs);
  }
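
Both conversions above reduce to scaling against max_nr_accesses = aggr_interval / sample_interval and 10,000 basis points. A standalone sketch of the arithmetic in plain C, with illustrative interval values that are not taken from this diff:

#include <stdio.h>

/* Mirrors the conversion above: bp is an access ratio per 10,000. */
static unsigned int bp_to_nr_accesses(unsigned int bp, unsigned long aggr_us,
                                      unsigned long sample_us)
{
        unsigned int max_nr_accesses = aggr_us / sample_us;

        return bp * max_nr_accesses / 10000;
}

int main(void)
{
        /* e.g. 100ms aggregation, 5ms sampling => max_nr_accesses == 20 */
        printf("%u\n", bp_to_nr_accesses(5000, 100000, 5000));  /* prints 10 */
        return 0;
}
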
  
  static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
@@@ -524,7 -506,6 +524,7 @@@ static void damon_update_monitoring_res
  {
        r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
                        old_attrs, new_attrs);
 +      r->nr_accesses_bp = r->nr_accesses * 10000;
        r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
  }
  
@@@ -560,21 -541,13 +560,21 @@@ static void damon_update_monitoring_res
   * @ctx:              monitoring context
   * @attrs:            monitoring attributes
   *
 - * This function should not be called while the kdamond is running.
 + * This function should be called while the kdamond is not running, or while
 + * no aggregation of access check results is ongoing (e.g., from
 + * &struct damon_callback->after_aggregation or
 + * &struct damon_callback->after_wmarks_check callbacks).
 + *
   * Every time interval is in micro-seconds.
   *
   * Return: 0 on success, negative error code otherwise.
   */
  int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
  {
 +      unsigned long sample_interval = attrs->sample_interval ?
 +              attrs->sample_interval : 1;
 +      struct damos *s;
 +
        if (attrs->min_nr_regions < 3)
                return -EINVAL;
        if (attrs->min_nr_regions > attrs->max_nr_regions)
        if (attrs->sample_interval > attrs->aggr_interval)
                return -EINVAL;
  
 +      ctx->next_aggregation_sis = ctx->passed_sample_intervals +
 +              attrs->aggr_interval / sample_interval;
 +      ctx->next_ops_update_sis = ctx->passed_sample_intervals +
 +              attrs->ops_update_interval / sample_interval;
 +
        damon_update_monitoring_results(ctx, attrs);
        ctx->attrs = *attrs;
 +
 +      damon_for_each_scheme(s, ctx)
 +              damos_set_next_apply_sis(s, ctx);
 +
        return 0;
  }
  
@@@ -735,8 -699,7 +735,7 @@@ static int __damon_stop(struct damon_ct
        if (tsk) {
                get_task_struct(tsk);
                mutex_unlock(&ctx->kdamond_lock);
-               kthread_stop(tsk);
-               put_task_struct(tsk);
+               kthread_stop_put(tsk);
                return 0;
        }
        mutex_unlock(&ctx->kdamond_lock);
@@@ -764,6 -727,38 +763,6 @@@ int damon_stop(struct damon_ctx **ctxs
        return err;
  }
  
 -/*
 - * damon_check_reset_time_interval() - Check if a time interval is elapsed.
 - * @baseline: the time to check whether the interval has elapsed since
 - * @interval: the time interval (microseconds)
 - *
 - * See whether the given time interval has passed since the given baseline
 - * time.  If so, it also updates the baseline to current time for next check.
 - *
 - * Return:    true if the time interval has passed, or false otherwise.
 - */
 -static bool damon_check_reset_time_interval(struct timespec64 *baseline,
 -              unsigned long interval)
 -{
 -      struct timespec64 now;
 -
 -      ktime_get_coarse_ts64(&now);
 -      if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
 -                      interval * 1000)
 -              return false;
 -      *baseline = now;
 -      return true;
 -}
 -
 -/*
 - * Check whether it is time to flush the aggregated information
 - */
 -static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
 -{
 -      return damon_check_reset_time_interval(&ctx->last_aggregation,
 -                      ctx->attrs.aggr_interval);
 -}
 -
  /*
   * Reset the aggregated monitoring results ('nr_accesses' of each region).
   */
@@@ -776,7 -771,7 +775,7 @@@ static void kdamond_reset_aggregated(st
                struct damon_region *r;
  
                damon_for_each_region(r, t) {
 -                      trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
 +                      trace_damon_aggregated(ti, r, damon_nr_regions(t));
                        r->last_nr_accesses = r->nr_accesses;
                        r->nr_accesses = 0;
                }
@@@ -790,13 -785,12 +789,13 @@@ static void damon_split_region_at(struc
  static bool __damos_valid_target(struct damon_region *r, struct damos *s)
  {
        unsigned long sz;
 +      unsigned int nr_accesses = r->nr_accesses_bp / 10000;
  
        sz = damon_sz_region(r);
        return s->pattern.min_sz_region <= sz &&
                sz <= s->pattern.max_sz_region &&
 -              s->pattern.min_nr_accesses <= r->nr_accesses &&
 -              r->nr_accesses <= s->pattern.max_nr_accesses &&
 +              s->pattern.min_nr_accesses <= nr_accesses &&
 +              nr_accesses <= s->pattern.max_nr_accesses &&
                s->pattern.min_age_region <= r->age &&
                r->age <= s->pattern.max_age_region;
  }
@@@ -951,33 -945,6 +950,33 @@@ static void damos_apply_scheme(struct d
        struct timespec64 begin, end;
        unsigned long sz_applied = 0;
        int err = 0;
 +      /*
 +       * We plan to support multiple contexts per kdamond, as the DAMON sysfs
 +       * 'nr_contexts' file implies.  Nevertheless, only a single context per
 +       * kdamond is supported for now, so we can simply use context index '0'
 +       * here.
 +       */
 +      unsigned int cidx = 0;
 +      struct damos *siter;            /* schemes iterator */
 +      unsigned int sidx = 0;
 +      struct damon_target *titer;     /* targets iterator */
 +      unsigned int tidx = 0;
 +      bool do_trace = false;
 +
 +      /* get indices for trace_damos_before_apply() */
 +      if (trace_damos_before_apply_enabled()) {
 +              damon_for_each_scheme(siter, c) {
 +                      if (siter == s)
 +                              break;
 +                      sidx++;
 +              }
 +              damon_for_each_target(titer, c) {
 +                      if (titer == t)
 +                              break;
 +                      tidx++;
 +              }
 +              do_trace = true;
 +      }
  
        if (c->ops.apply_scheme) {
                if (quota->esz && quota->charged_sz + sz > quota->esz) {
                ktime_get_coarse_ts64(&begin);
                if (c->callback.before_damos_apply)
                        err = c->callback.before_damos_apply(c, t, r, s);
 -              if (!err)
 +              if (!err) {
 +                      trace_damos_before_apply(cidx, sidx, tidx, r,
 +                                      damon_nr_regions(t), do_trace);
                        sz_applied = c->ops.apply_scheme(c, t, r, s);
 +              }
                ktime_get_coarse_ts64(&end);
                quota->total_charged_ns += timespec64_to_ns(&end) -
                        timespec64_to_ns(&begin);
@@@ -1114,29 -1078,14 +1113,29 @@@ static void kdamond_apply_schemes(struc
        struct damon_target *t;
        struct damon_region *r, *next_r;
        struct damos *s;
 +      unsigned long sample_interval = c->attrs.sample_interval ?
 +              c->attrs.sample_interval : 1;
 +      bool has_schemes_to_apply = false;
  
        damon_for_each_scheme(s, c) {
 +              if (c->passed_sample_intervals != s->next_apply_sis)
 +                      continue;
 +
 +              s->next_apply_sis +=
 +                      (s->apply_interval_us ? s->apply_interval_us :
 +                       c->attrs.aggr_interval) / sample_interval;
 +
                if (!s->wmarks.activated)
                        continue;
  
 +              has_schemes_to_apply = true;
 +
                damos_adjust_quota(c, s);
        }
  
 +      if (!has_schemes_to_apply)
 +              return;
 +
        damon_for_each_target(t, c) {
                damon_for_each_region_safe(r, next_r, t)
                        damon_do_apply_schemes(c, t, r);
@@@ -1153,7 -1102,6 +1152,7 @@@ static void damon_merge_two_regions(str
  
        l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
                        (sz_l + sz_r);
 +      l->nr_accesses_bp = l->nr_accesses * 10000;
        l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
        l->ar.end = r->ar.end;
        damon_destroy_region(r, t);
@@@ -1225,7 -1173,6 +1224,7 @@@ static void damon_split_region_at(struc
  
        new->age = r->age;
        new->last_nr_accesses = r->last_nr_accesses;
 +      new->nr_accesses_bp = r->nr_accesses_bp;
  
        damon_insert_region(new, r, damon_next_region(r), t);
  }
@@@ -1292,6 -1239,18 +1291,6 @@@ static void kdamond_split_regions(struc
        last_nr_regions = nr_regions;
  }
  
 -/*
 - * Check whether it is time to check and apply the operations-related data
 - * structures.
 - *
 - * Returns true if it is.
 - */
 -static bool kdamond_need_update_operations(struct damon_ctx *ctx)
 -{
 -      return damon_check_reset_time_interval(&ctx->last_ops_update,
 -                      ctx->attrs.ops_update_interval);
 -}
 -
  /*
   * Check whether current monitoring should be stopped
   *
@@@ -1320,10 -1279,12 +1319,10 @@@ static bool kdamond_need_stop(struct da
  
  static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
  {
 -      struct sysinfo i;
 -
        switch (metric) {
        case DAMOS_WMARK_FREE_MEM_RATE:
 -              si_meminfo(&i);
 -              return i.freeram * 1000 / i.totalram;
 +              return global_zone_page_state(NR_FREE_PAGES) * 1000 /
 +                     totalram_pages();
        default:
                break;
        }
@@@ -1401,25 -1362,6 +1400,25 @@@ static int kdamond_wait_activation(stru
        return -EBUSY;
  }
  
 +static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
 +{
 +      unsigned long sample_interval = ctx->attrs.sample_interval ?
 +              ctx->attrs.sample_interval : 1;
 +      unsigned long apply_interval;
 +      struct damos *scheme;
 +
 +      ctx->passed_sample_intervals = 0;
 +      ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
 +      ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
 +              sample_interval;
 +
 +      damon_for_each_scheme(scheme, ctx) {
 +              apply_interval = scheme->apply_interval_us ?
 +                      scheme->apply_interval_us : ctx->attrs.aggr_interval;
 +              scheme->next_apply_sis = apply_interval / sample_interval;
 +      }
 +}
 +
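
The timing rework above replaces wall-clock interval checks with counting passed sample intervals: each deadline becomes next_*_sis = passed_sample_intervals + interval / sample_interval, with a zero sample interval treated as 1. A small standalone sketch of that bookkeeping; the struct and field names are simplified stand-ins, not the kernel types:

#include <stdio.h>

struct sis_timer {                       /* simplified stand-in for the ctx fields */
        unsigned long passed;            /* passed_sample_intervals */
        unsigned long next_aggr;         /* next_aggregation_sis */
};

static void arm_aggr(struct sis_timer *t, unsigned long aggr_us,
                     unsigned long sample_us)
{
        unsigned long sample = sample_us ? sample_us : 1;

        t->next_aggr = t->passed + aggr_us / sample;
}

int main(void)
{
        struct sis_timer t = { .passed = 7 };

        arm_aggr(&t, 100000, 5000);      /* aggregate every 20 samples */
        printf("aggregate at sample %lu\n", t.next_aggr);   /* prints 27 */
        return 0;
}
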
  /*
   * The monitoring daemon that runs as a kernel thread
   */
@@@ -1433,8 -1375,6 +1432,8 @@@ static int kdamond_fn(void *data
  
        pr_debug("kdamond (%d) starts\n", current->pid);
  
 +      kdamond_init_intervals_sis(ctx);
 +
        if (ctx->ops.init)
                ctx->ops.init(ctx);
        if (ctx->callback.before_start && ctx->callback.before_start(ctx))
        sz_limit = damon_region_sz_limit(ctx);
  
        while (!kdamond_need_stop(ctx)) {
 +              /*
 +               * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
 +               * be changed from after_wmarks_check() or after_aggregation()
 +               * callbacks.  Read the values here, and use those for this
 +               * iteration.  That is, new values set via damon_set_attrs()
 +               * are respected from the next iteration.
 +               */
 +              unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
 +              unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
 +              unsigned long sample_interval = ctx->attrs.sample_interval;
 +
                if (kdamond_wait_activation(ctx))
                        break;
  
                                ctx->callback.after_sampling(ctx))
                        break;
  
 -              kdamond_usleep(ctx->attrs.sample_interval);
 +              kdamond_usleep(sample_interval);
 +              ctx->passed_sample_intervals++;
  
                if (ctx->ops.check_accesses)
                        max_nr_accesses = ctx->ops.check_accesses(ctx);
  
 -              if (kdamond_aggregate_interval_passed(ctx)) {
 +              if (ctx->passed_sample_intervals == next_aggregation_sis) {
                        kdamond_merge_regions(ctx,
                                        max_nr_accesses / 10,
                                        sz_limit);
                        if (ctx->callback.after_aggregation &&
                                        ctx->callback.after_aggregation(ctx))
                                break;
 -                      if (!list_empty(&ctx->schemes))
 -                              kdamond_apply_schemes(ctx);
 +              }
 +
 +              /*
 +               * do kdamond_apply_schemes() after kdamond_merge_regions() if
 +               * possible, to reduce overhead
 +               */
 +              if (!list_empty(&ctx->schemes))
 +                      kdamond_apply_schemes(ctx);
 +
 +              sample_interval = ctx->attrs.sample_interval ?
 +                      ctx->attrs.sample_interval : 1;
 +              if (ctx->passed_sample_intervals == next_aggregation_sis) {
 +                      ctx->next_aggregation_sis = next_aggregation_sis +
 +                              ctx->attrs.aggr_interval / sample_interval;
 +
                        kdamond_reset_aggregated(ctx);
                        kdamond_split_regions(ctx);
                        if (ctx->ops.reset_aggregated)
                                ctx->ops.reset_aggregated(ctx);
                }
  
 -              if (kdamond_need_update_operations(ctx)) {
 +              if (ctx->passed_sample_intervals == next_ops_update_sis) {
 +                      ctx->next_ops_update_sis = next_ops_update_sis +
 +                              ctx->attrs.ops_update_interval /
 +                              sample_interval;
                        if (ctx->ops.update)
                                ctx->ops.update(ctx);
                        sz_limit = damon_region_sz_limit(ctx);
@@@ -1604,76 -1516,6 +1603,76 @@@ int damon_set_region_biggest_system_ram
        return damon_set_regions(t, &addr_range, 1);
  }
  
 +/*
 + * damon_moving_sum() - Calculate an inferred moving sum value.
 + * @mvsum:    Inferred sum of the last @len_window values.
 + * @nomvsum:  Non-moving sum of the last discrete @len_window window values.
 + * @len_window:       The number of last values to take care of.
 + * @new_value:        New value that will be added to the pseudo moving sum.
 + *
 + * Moving sum (moving average * window size) is good for handling noise, but
 + * the cost of keeping past values can be high for arbitrary window size.  This
 + * function implements a lightweight pseudo moving sum function that doesn't
 + * keep the past window values.
 + *
 + * It simply assumes there was no noise in the past, and derives the assumed
 + * no-noise past value to drop from @nomvsum and @len_window.  @nomvsum is a
 + * non-moving sum of the last window.  For example, if @len_window is 10 and we
 + * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25
 + * values.  Hence, this function simply drops @nomvsum / @len_window from the
 + * given @mvsum and adds @new_value.
 + *
 + * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values of
 + * the last window could vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20.  For
 + * calculating the next moving sum with a new value, we should drop 0 from 50
 + * and add the new value.  However, this function assumes it got the value 5
 + * for each of the last ten times.  Based on that assumption, when the next
 + * value is measured, it drops the assumed past value, 5, from the current sum
 + * and adds the new value to get the updated pseudo-moving sum.
 + *
 + * This means the value could have errors, but the errors disappear on every
 + * @len_window-aligned call.  For example, if @len_window is 10, the pseudo
 + * moving sum with the 11th to 19th values would have an error, but the sum
 + * with the 20th value will not.
 + *
 + * Return: Pseudo-moving sum after getting the @new_value.
 + */
 +static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
 +              unsigned int len_window, unsigned int new_value)
 +{
 +      return mvsum - nomvsum / len_window + new_value;
 +}
 +
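
A standalone restatement of the pseudo moving sum formula documented above, runnable as plain C; the window length and sample values are illustrative only:

#include <stdio.h>

/* Same formula as damon_moving_sum(): drop the assumed average past value,
 * nomvsum / len_window, and add the newly observed value.
 */
static unsigned int moving_sum(unsigned int mvsum, unsigned int nomvsum,
                               unsigned int len_window, unsigned int new_value)
{
        return mvsum - nomvsum / len_window + new_value;
}

int main(void)
{
        /* len_window = 10, last discrete window summed to 50 (average 5) */
        unsigned int mvsum = 50, nomvsum = 50;

        mvsum = moving_sum(mvsum, nomvsum, 10, 20);  /* 50 - 5 + 20 = 65 */
        printf("%u\n", mvsum);
        return 0;
}
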
 +/**
 + * damon_update_region_access_rate() - Update the access rate of a region.
 + * @r:                The DAMON region to update for its access check result.
 + * @accessed: Whether the region was accessed during the last sampling interval.
 + * @attrs:    The damon_attrs of the DAMON context.
 + *
 + * Update the access rate of a region with the region's last sampling interval
 + * access check result.
 + *
 + * Usually this will be called by &damon_operations->check_accesses callback.
 + */
 +void damon_update_region_access_rate(struct damon_region *r, bool accessed,
 +              struct damon_attrs *attrs)
 +{
 +      unsigned int len_window = 1;
 +
 +      /*
 +       * sample_interval can be zero, but cannot be larger than
 +       * aggr_interval, owing to validation of damon_set_attrs().
 +       */
 +      if (attrs->sample_interval)
 +              len_window = damon_max_nr_accesses(attrs);
 +      r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp,
 +                      r->last_nr_accesses * 10000, len_window,
 +                      accessed ? 10000 : 0);
 +
 +      if (accessed)
 +              r->nr_accesses++;
 +}
 +
  static int __init damon_init(void)
  {
        damon_region_cache = KMEM_CACHE(damon_region, 0);
diff --combined mm/khugepaged.c
index bc2d8ff269c7340283d7a316efe1466a0e10ffa0,cb3f1d738810ed4a1a00b18b2a33c5e14bd8c101..064654717843ea4bff84bef4a5bd9e8f174a44d9
@@@ -91,7 -91,7 +91,7 @@@ static unsigned int khugepaged_max_ptes
  #define MM_SLOTS_HASH_BITS 10
  static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
  
- static struct kmem_cache *mm_slot_cache __read_mostly;
+ static struct kmem_cache *mm_slot_cache __ro_after_init;
  
  struct collapse_control {
        bool is_khugepaged;
@@@ -524,15 -524,15 +524,15 @@@ static void release_pte_pages(pte_t *pt
        }
  }
  
 -static bool is_refcount_suitable(struct page *page)
 +static bool is_refcount_suitable(struct folio *folio)
  {
        int expected_refcount;
  
 -      expected_refcount = total_mapcount(page);
 -      if (PageSwapCache(page))
 -              expected_refcount += compound_nr(page);
 +      expected_refcount = folio_mapcount(folio);
 +      if (folio_test_swapcache(folio))
 +              expected_refcount += folio_nr_pages(folio);
  
 -      return page_count(page) == expected_refcount;
 +      return folio_ref_count(folio) == expected_refcount;
  }
  
  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        struct list_head *compound_pagelist)
  {
        struct page *page = NULL;
 +      struct folio *folio = NULL;
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
        bool writable = false;
                        goto out;
                }
  
 -              VM_BUG_ON_PAGE(!PageAnon(page), page);
 +              folio = page_folio(page);
 +              VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
  
                if (page_mapcount(page) > 1) {
                        ++shared;
                        }
                }
  
 -              if (PageCompound(page)) {
 -                      struct page *p;
 -                      page = compound_head(page);
 +              if (folio_test_large(folio)) {
 +                      struct folio *f;
  
                        /*
                         * Check if we have dealt with the compound page
                         * already
                         */
 -                      list_for_each_entry(p, compound_pagelist, lru) {
 -                              if (page == p)
 +                      list_for_each_entry(f, compound_pagelist, lru) {
 +                              if (folio == f)
                                        goto next;
                        }
                }
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
 -              if (!trylock_page(page)) {
 +              if (!folio_trylock(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }
                 * but not from this process. The other process cannot write to
                 * the page, only trigger CoW.
                 */
 -              if (!is_refcount_suitable(page)) {
 -                      unlock_page(page);
 +              if (!is_refcount_suitable(folio)) {
 +                      folio_unlock(folio);
                        result = SCAN_PAGE_COUNT;
                        goto out;
                }
                 * Isolate the page to avoid collapsing an hugepage
                 * currently in use by the VM.
                 */
 -              if (!isolate_lru_page(page)) {
 -                      unlock_page(page);
 +              if (!folio_isolate_lru(folio)) {
 +                      folio_unlock(folio);
                        result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
 -              mod_node_page_state(page_pgdat(page),
 -                              NR_ISOLATED_ANON + page_is_file_lru(page),
 -                              compound_nr(page));
 -              VM_BUG_ON_PAGE(!PageLocked(page), page);
 -              VM_BUG_ON_PAGE(PageLRU(page), page);
 -
 -              if (PageCompound(page))
 -                      list_add_tail(&page->lru, compound_pagelist);
 +              node_stat_mod_folio(folio,
 +                              NR_ISOLATED_ANON + folio_is_file_lru(folio),
 +                              folio_nr_pages(folio));
 +              VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 +              VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 +
 +              if (folio_test_large(folio))
 +                      list_add_tail(&folio->lru, compound_pagelist);
  next:
                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page
                 */
                if (cc->is_khugepaged &&
 -                  (pte_young(pteval) || page_is_young(page) ||
 -                   PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
 +                  (pte_young(pteval) || folio_test_young(folio) ||
 +                   folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                     address)))
                        referenced++;
  
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
 -              trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 +              trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                                    referenced, writable, result);
                return result;
        }
  out:
        release_pte_pages(pte, _pte, compound_pagelist);
 -      trace_mm_collapse_huge_page_isolate(page, none_or_zero,
 +      trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                            referenced, writable, result);
        return result;
  }
@@@ -888,16 -887,16 +888,16 @@@ static int hpage_collapse_find_target_n
  }
  #endif
  
 -static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
 +static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
                                      nodemask_t *nmask)
  {
 -      *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
 -      if (unlikely(!*hpage)) {
 +      *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
 +
 +      if (unlikely(!*folio)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                return false;
        }
  
 -      folio_prep_large_rmappable((struct folio *)*hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return true;
  }
@@@ -1064,20 -1063,17 +1064,20 @@@ static int alloc_charge_hpage(struct pa
        int node = hpage_collapse_find_target_node(cc);
        struct folio *folio;
  
 -      if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
 +      if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
 +              *hpage = NULL;
                return SCAN_ALLOC_HUGE_PAGE_FAIL;
 +      }
  
 -      folio = page_folio(*hpage);
        if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
                folio_put(folio);
                *hpage = NULL;
                return SCAN_CGROUP_CHARGE_FAIL;
        }
 -      count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
  
 +      count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
 +
 +      *hpage = folio_page(folio, 0);
        return SCAN_SUCCEED;
  }
  
@@@ -1251,7 -1247,6 +1251,7 @@@ static int hpage_collapse_scan_pmd(stru
        int result = SCAN_FAIL, referenced = 0;
        int none_or_zero = 0, shared = 0;
        struct page *page = NULL;
 +      struct folio *folio = NULL;
        unsigned long _address;
        spinlock_t *ptl;
        int node = NUMA_NO_NODE, unmapped = 0;
                        }
                }
  
 -              page = compound_head(page);
 -
 +              folio = page_folio(page);
                /*
                 * Record which node the original page is from and save this
                 * information to cc->node_load[].
                 * Khugepaged will allocate hugepage from the node that has the max
                 * hit record.
                 */
 -              node = page_to_nid(page);
 +              node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        goto out_unmap;
                }
                cc->node_load[node]++;
 -              if (!PageLRU(page)) {
 +              if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        goto out_unmap;
                }
 -              if (PageLocked(page)) {
 +              if (folio_test_locked(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out_unmap;
                }
 -              if (!PageAnon(page)) {
 +              if (!folio_test_anon(folio)) {
                        result = SCAN_PAGE_ANON;
                        goto out_unmap;
                }
                 * has excessive GUP pins (i.e. 512).  Anyway the same check
                 * will be done again later, so the risk seems low.
                 */
 -              if (!is_refcount_suitable(page)) {
 +              if (!is_refcount_suitable(folio)) {
                        result = SCAN_PAGE_COUNT;
                        goto out_unmap;
                }
                 * enough young pte to justify collapsing the page
                 */
                if (cc->is_khugepaged &&
 -                  (pte_young(pteval) || page_is_young(page) ||
 -                   PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
 +                  (pte_young(pteval) || folio_test_young(folio) ||
 +                   folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                     address)))
                        referenced++;
        }
@@@ -1407,7 -1403,7 +1407,7 @@@ out_unmap
                *mmap_locked = false;
        }
  out:
 -      trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
 +      trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
                                     none_or_zero, result, unmapped);
        return result;
  }
@@@ -1477,7 -1473,7 +1477,7 @@@ int collapse_pte_mapped_thp(struct mm_s
        bool notified = false;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vma_lookup(mm, haddr);
 -      struct page *hpage;
 +      struct folio *folio;
        pte_t *start_pte, *pte;
        pmd_t *pmd, pgt_pmd;
        spinlock_t *pml = NULL, *ptl;
        if (userfaultfd_wp(vma))
                return SCAN_PTE_UFFD_WP;
  
 -      hpage = find_lock_page(vma->vm_file->f_mapping,
 +      folio = filemap_lock_folio(vma->vm_file->f_mapping,
                               linear_page_index(vma, haddr));
 -      if (!hpage)
 +      if (IS_ERR(folio))
                return SCAN_PAGE_NULL;
  
 -      if (!PageHead(hpage)) {
 -              result = SCAN_FAIL;
 -              goto drop_hpage;
 -      }
 -
 -      if (compound_order(hpage) != HPAGE_PMD_ORDER) {
 +      if (folio_order(folio) != HPAGE_PMD_ORDER) {
                result = SCAN_PAGE_COMPOUND;
 -              goto drop_hpage;
 +              goto drop_folio;
        }
  
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
                 */
                goto maybe_install_pmd;
        default:
 -              goto drop_hpage;
 +              goto drop_folio;
        }
  
        result = SCAN_FAIL;
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        if (!start_pte)         /* mmap_lock + page lock should prevent this */
 -              goto drop_hpage;
 +              goto drop_folio;
  
        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
                 * Note that uprobe, debugger, or MAP_PRIVATE may change the
                 * page table, but the new page will not be a subpage of hpage.
                 */
 -              if (hpage + i != page)
 +              if (folio_page(folio, i) != page)
                        goto abort;
        }
  
         * page_table_lock) ptl nests inside pml. The less time we hold pml,
         * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
         * inserts a valid as-if-COWed PTE without even looking up page cache.
 -       * So page lock of hpage does not protect from it, so we must not drop
 +       * So page lock of folio does not protect from it, so we must not drop
         * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
         */
        if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
                        continue;
                /*
                 * We dropped ptl after the first scan, to do the mmu_notifier:
 -               * page lock stops more PTEs of the hpage being faulted in, but
 +               * page lock stops more PTEs of the folio being faulted in, but
                 * does not stop write faults COWing anon copies from existing
                 * PTEs; and does not stop those being swapped out or migrated.
                 */
                        goto abort;
                }
                page = vm_normal_page(vma, addr, ptent);
 -              if (hpage + i != page)
 +              if (folio_page(folio, i) != page)
                        goto abort;
  
                /*
  
        /* step 3: set proper refcount and mm_counters. */
        if (nr_ptes) {
 -              page_ref_sub(hpage, nr_ptes);
 -              add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
 +              folio_ref_sub(folio, nr_ptes);
 +              add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
        }
  
        /* step 4: remove empty page table */
  maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
 -                      ? set_huge_pmd(vma, haddr, pmd, hpage)
 +                      ? set_huge_pmd(vma, haddr, pmd, &folio->page)
                        : SCAN_SUCCEED;
 -      goto drop_hpage;
 +      goto drop_folio;
  abort:
        if (nr_ptes) {
                flush_tlb_mm(mm);
 -              page_ref_sub(hpage, nr_ptes);
 -              add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
 +              folio_ref_sub(folio, nr_ptes);
 +              add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
        }
        if (start_pte)
                pte_unmap_unlock(start_pte, ptl);
                spin_unlock(pml);
        if (notified)
                mmu_notifier_invalidate_range_end(&range);
 -drop_hpage:
 -      unlock_page(hpage);
 -      put_page(hpage);
 +drop_folio:
 +      folio_unlock(folio);
 +      folio_put(folio);
        return result;
  }
  
diff --combined mm/shmem.c
index 71b8d957b63bec8384feb8c369289afcd95d65b4,389212972e726755a9c5a2641e31ef71ae5fed83..91e2620148b2f6d789420e6736daef7a53e2cc5c
@@@ -42,7 -42,7 +42,7 @@@
  #include <linux/iversion.h>
  #include "swap.h"
  
- static struct vfsmount *shm_mnt;
+ static struct vfsmount *shm_mnt __ro_after_init;
  
  #ifdef CONFIG_SHMEM
  /*
@@@ -146,8 -146,9 +146,8 @@@ static unsigned long shmem_default_max_
  #endif
  
  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 -                           struct folio **foliop, enum sgp_type sgp,
 -                           gfp_t gfp, struct vm_area_struct *vma,
 -                           vm_fault_t *fault_type);
 +                      struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
 +                      struct mm_struct *fault_mm, vm_fault_t *fault_type);
  
  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  {
@@@ -188,10 -189,10 +188,10 @@@ static inline int shmem_reacct_size(uns
  /*
   * ... whereas tmpfs objects are accounted incrementally as
   * pages are allocated, in order to allow large sparse files.
 - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
 + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
   * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
   */
 -static inline int shmem_acct_block(unsigned long flags, long pages)
 +static inline int shmem_acct_blocks(unsigned long flags, long pages)
  {
        if (!(flags & VM_NORESERVE))
                return 0;
@@@ -206,26 -207,26 +206,26 @@@ static inline void shmem_unacct_blocks(
                vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
  }
  
 -static int shmem_inode_acct_block(struct inode *inode, long pages)
 +static int shmem_inode_acct_blocks(struct inode *inode, long pages)
  {
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        int err = -ENOSPC;
  
 -      if (shmem_acct_block(info->flags, pages))
 +      if (shmem_acct_blocks(info->flags, pages))
                return err;
  
        might_sleep();  /* when quotas */
        if (sbinfo->max_blocks) {
 -              if (percpu_counter_compare(&sbinfo->used_blocks,
 -                                         sbinfo->max_blocks - pages) > 0)
 +              if (!percpu_counter_limited_add(&sbinfo->used_blocks,
 +                                              sbinfo->max_blocks, pages))
                        goto unacct;
  
                err = dquot_alloc_block_nodirty(inode, pages);
 -              if (err)
 +              if (err) {
 +                      percpu_counter_sub(&sbinfo->used_blocks, pages);
                        goto unacct;
 -
 -              percpu_counter_add(&sbinfo->used_blocks, pages);
 +              }
        } else {
                err = dquot_alloc_block_nodirty(inode, pages);
                if (err)
@@@ -446,7 -447,7 +446,7 @@@ bool shmem_charge(struct inode *inode, 
  {
        struct address_space *mapping = inode->i_mapping;
  
 -      if (shmem_inode_acct_block(inode, pages))
 +      if (shmem_inode_acct_blocks(inode, pages))
                return false;
  
        /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
@@@ -755,14 -756,16 +755,14 @@@ static unsigned long shmem_unused_huge_
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
  /*
 - * Like filemap_add_folio, but error if expected item has gone.
 + * Somewhat like filemap_add_folio, but error if expected item has gone.
   */
  static int shmem_add_to_page_cache(struct folio *folio,
                                   struct address_space *mapping,
 -                                 pgoff_t index, void *expected, gfp_t gfp,
 -                                 struct mm_struct *charge_mm)
 +                                 pgoff_t index, void *expected, gfp_t gfp)
  {
        XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
        long nr = folio_nr_pages(folio);
 -      int error;
  
        VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        folio->mapping = mapping;
        folio->index = index;
  
 -      if (!folio_test_swapcache(folio)) {
 -              error = mem_cgroup_charge(folio, charge_mm, gfp);
 -              if (error) {
 -                      if (folio_test_pmd_mappable(folio)) {
 -                              count_vm_event(THP_FILE_FALLBACK);
 -                              count_vm_event(THP_FILE_FALLBACK_CHARGE);
 -                      }
 -                      goto error;
 -              }
 -      }
 +      gfp &= GFP_RECLAIM_MASK;
        folio_throttle_swaprate(folio, gfp);
  
        do {
                xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;
 -              if (folio_test_pmd_mappable(folio)) {
 -                      count_vm_event(THP_FILE_ALLOC);
 +              if (folio_test_pmd_mappable(folio))
                        __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
 -              }
 -              mapping->nrpages += nr;
                __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
 +              mapping->nrpages += nr;
  unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));
  
        if (xas_error(&xas)) {
 -              error = xas_error(&xas);
 -              goto error;
 +              folio->mapping = NULL;
 +              folio_ref_sub(folio, nr);
 +              return xas_error(&xas);
        }
  
        return 0;
 -error:
 -      folio->mapping = NULL;
 -      folio_ref_sub(folio, nr);
 -      return error;
  }
  
  /*
 - * Like delete_from_page_cache, but substitutes swap for @folio.
 + * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
   */
  static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
  {
@@@ -870,6 -887,7 +870,6 @@@ unsigned long shmem_partial_swap_usage(
                        cond_resched_rcu();
                }
        }
 -
        rcu_read_unlock();
  
        return swapped << PAGE_SHIFT;
@@@ -1094,7 -1112,7 +1094,7 @@@ whole_folios
  void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
  {
        shmem_undo_range(inode, lstart, lend, false);
 -      inode->i_mtime = inode_set_ctime_current(inode);
 +      inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        inode_inc_iversion(inode);
  }
  EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@@ -1195,6 -1213,7 +1195,6 @@@ static int shmem_setattr(struct mnt_idm
        if (i_uid_needs_update(idmap, attr, inode) ||
            i_gid_needs_update(idmap, attr, inode)) {
                error = dquot_transfer(idmap, inode, attr);
 -
                if (error)
                        return error;
        }
        if (!error && update_ctime) {
                inode_set_ctime_current(inode);
                if (update_mtime)
 -                      inode->i_mtime = inode_get_ctime(inode);
 +                      inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
                inode_inc_iversion(inode);
        }
        return error;
@@@ -1307,8 -1326,10 +1307,8 @@@ static int shmem_unuse_swap_entries(str
  
                if (!xa_is_value(folio))
                        continue;
 -              error = shmem_swapin_folio(inode, indices[i],
 -                                        &folio, SGP_CACHE,
 -                                        mapping_gfp_mask(mapping),
 -                                        NULL, NULL);
 +              error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
 +                                      mapping_gfp_mask(mapping), NULL, NULL);
                if (error == 0) {
                        folio_unlock(folio);
                        folio_put(folio);
@@@ -1544,20 -1565,38 +1544,20 @@@ static inline struct mempolicy *shmem_g
        return NULL;
  }
  #endif /* CONFIG_NUMA && CONFIG_TMPFS */
 -#ifndef CONFIG_NUMA
 -#define vm_policy vm_private_data
 -#endif
  
 -static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
 -              struct shmem_inode_info *info, pgoff_t index)
 -{
 -      /* Create a pseudo vma that just contains the policy */
 -      vma_init(vma, NULL);
 -      /* Bias interleave by inode number to distribute better across nodes */
 -      vma->vm_pgoff = index + info->vfs_inode.i_ino;
 -      vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 -}
 -
 -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
 -{
 -      /* Drop reference taken by mpol_shared_policy_lookup() */
 -      mpol_cond_put(vma->vm_policy);
 -}
 +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 +                      pgoff_t index, unsigned int order, pgoff_t *ilx);
  
 -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
                        struct shmem_inode_info *info, pgoff_t index)
  {
 -      struct vm_area_struct pvma;
 +      struct mempolicy *mpol;
 +      pgoff_t ilx;
        struct page *page;
 -      struct vm_fault vmf = {
 -              .vma = &pvma,
 -      };
  
 -      shmem_pseudo_vma_init(&pvma, info, index);
 -      page = swap_cluster_readahead(swap, gfp, &vmf);
 -      shmem_pseudo_vma_destroy(&pvma);
 +      mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
 +      page = swap_cluster_readahead(swap, gfp, mpol, ilx);
 +      mpol_cond_put(mpol);
  
        if (!page)
                return NULL;
@@@ -1591,126 -1630,67 +1591,126 @@@ static gfp_t limit_gfp_mask(gfp_t huge_
  static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
                struct shmem_inode_info *info, pgoff_t index)
  {
 -      struct vm_area_struct pvma;
 -      struct address_space *mapping = info->vfs_inode.i_mapping;
 -      pgoff_t hindex;
 -      struct folio *folio;
 +      struct mempolicy *mpol;
 +      pgoff_t ilx;
 +      struct page *page;
  
 -      hindex = round_down(index, HPAGE_PMD_NR);
 -      if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
 -                                                              XA_PRESENT))
 -              return NULL;
 +      mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
 +      page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
 +      mpol_cond_put(mpol);
  
 -      shmem_pseudo_vma_init(&pvma, info, hindex);
 -      folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
 -      shmem_pseudo_vma_destroy(&pvma);
 -      if (!folio)
 -              count_vm_event(THP_FILE_FALLBACK);
 -      return folio;
 +      return page_rmappable_folio(page);
  }
  
  static struct folio *shmem_alloc_folio(gfp_t gfp,
 -                      struct shmem_inode_info *info, pgoff_t index)
 +              struct shmem_inode_info *info, pgoff_t index)
  {
 -      struct vm_area_struct pvma;
 -      struct folio *folio;
 +      struct mempolicy *mpol;
 +      pgoff_t ilx;
 +      struct page *page;
  
 -      shmem_pseudo_vma_init(&pvma, info, index);
 -      folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
 -      shmem_pseudo_vma_destroy(&pvma);
 +      mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
 +      page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
 +      mpol_cond_put(mpol);
  
 -      return folio;
 +      return (struct folio *)page;
  }
  
 -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
 -              pgoff_t index, bool huge)
 +static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
 +              struct inode *inode, pgoff_t index,
 +              struct mm_struct *fault_mm, bool huge)
  {
 +      struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct folio *folio;
 -      int nr;
 -      int err;
 +      long pages;
 +      int error;
  
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                huge = false;
 -      nr = huge ? HPAGE_PMD_NR : 1;
  
 -      err = shmem_inode_acct_block(inode, nr);
 -      if (err)
 -              goto failed;
 +      if (huge) {
 +              pages = HPAGE_PMD_NR;
 +              index = round_down(index, HPAGE_PMD_NR);
 +
 +              /*
 +               * Check for conflict before waiting on a huge allocation.
 +               * Conflict might be that a huge page has just been allocated
 +               * and added to page cache by a racing thread, or that there
 +               * is already at least one small page in the huge extent.
 +               * Be careful to retry when appropriate, but not forever!
 +               * Elsewhere -EEXIST would be the right code, but not here.
 +               */
 +              if (xa_find(&mapping->i_pages, &index,
 +                              index + HPAGE_PMD_NR - 1, XA_PRESENT))
 +                      return ERR_PTR(-E2BIG);
  
 -      if (huge)
                folio = shmem_alloc_hugefolio(gfp, info, index);
 -      else
 +              if (!folio)
 +                      count_vm_event(THP_FILE_FALLBACK);
 +      } else {
 +              pages = 1;
                folio = shmem_alloc_folio(gfp, info, index);
 -      if (folio) {
 -              __folio_set_locked(folio);
 -              __folio_set_swapbacked(folio);
 -              return folio;
        }
 +      if (!folio)
 +              return ERR_PTR(-ENOMEM);
  
 -      err = -ENOMEM;
 -      shmem_inode_unacct_blocks(inode, nr);
 -failed:
 -      return ERR_PTR(err);
 +      __folio_set_locked(folio);
 +      __folio_set_swapbacked(folio);
 +
 +      gfp &= GFP_RECLAIM_MASK;
 +      error = mem_cgroup_charge(folio, fault_mm, gfp);
 +      if (error) {
 +              if (xa_find(&mapping->i_pages, &index,
 +                              index + pages - 1, XA_PRESENT)) {
 +                      error = -EEXIST;
 +              } else if (huge) {
 +                      count_vm_event(THP_FILE_FALLBACK);
 +                      count_vm_event(THP_FILE_FALLBACK_CHARGE);
 +              }
 +              goto unlock;
 +      }
 +
 +      error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
 +      if (error)
 +              goto unlock;
 +
 +      error = shmem_inode_acct_blocks(inode, pages);
 +      if (error) {
 +              struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 +              long freed;
 +              /*
 +               * Try to reclaim some space by splitting a few
 +               * large folios beyond i_size on the filesystem.
 +               */
 +              shmem_unused_huge_shrink(sbinfo, NULL, 2);
 +              /*
 +               * And do a shmem_recalc_inode() to account for freed pages:
 +               * except our folio is there in cache, so not quite balanced.
 +               */
 +              spin_lock(&info->lock);
 +              freed = pages + info->alloced - info->swapped -
 +                      READ_ONCE(mapping->nrpages);
 +              if (freed > 0)
 +                      info->alloced -= freed;
 +              spin_unlock(&info->lock);
 +              if (freed > 0)
 +                      shmem_inode_unacct_blocks(inode, freed);
 +              error = shmem_inode_acct_blocks(inode, pages);
 +              if (error) {
 +                      filemap_remove_folio(folio);
 +                      goto unlock;
 +              }
 +      }
 +
 +      shmem_recalc_inode(inode, pages, 0);
 +      folio_add_lru(folio);
 +      return folio;
 +
 +unlock:
 +      folio_unlock(folio);
 +      folio_put(folio);
 +      return ERR_PTR(error);
  }
  
  /*
@@@ -1832,11 -1812,12 +1832,11 @@@ static void shmem_set_folio_swapin_erro
   */
  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                             struct folio **foliop, enum sgp_type sgp,
 -                           gfp_t gfp, struct vm_area_struct *vma,
 +                           gfp_t gfp, struct mm_struct *fault_mm,
                             vm_fault_t *fault_type)
  {
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
 -      struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
        struct swap_info_struct *si;
        struct folio *folio = NULL;
        swp_entry_t swap;
                if (fault_type) {
                        *fault_type |= VM_FAULT_MAJOR;
                        count_vm_event(PGMAJFAULT);
 -                      count_memcg_event_mm(charge_mm, PGMAJFAULT);
 +                      count_memcg_event_mm(fault_mm, PGMAJFAULT);
                }
                /* Here we actually start the io */
 -              folio = shmem_swapin(swap, gfp, info, index);
 +              folio = shmem_swapin_cluster(swap, gfp, info, index);
                if (!folio) {
                        error = -ENOMEM;
                        goto failed;
        }
  
        error = shmem_add_to_page_cache(folio, mapping, index,
 -                                      swp_to_radix_entry(swap), gfp,
 -                                      charge_mm);
 +                                      swp_to_radix_entry(swap), gfp);
        if (error)
                goto failed;
  
@@@ -1939,29 -1921,37 +1939,29 @@@ unlock
   * vm. If we swap it in we mark it dirty since we also free the swap
   * entry since a page cannot live in both the swap and page cache.
   *
 - * vma, vmf, and fault_type are only supplied by shmem_fault:
 - * otherwise they are NULL.
 + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
   */
  static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
 -              struct vm_area_struct *vma, struct vm_fault *vmf,
 -              vm_fault_t *fault_type)
 +              struct vm_fault *vmf, vm_fault_t *fault_type)
  {
 -      struct address_space *mapping = inode->i_mapping;
 -      struct shmem_inode_info *info = SHMEM_I(inode);
 -      struct shmem_sb_info *sbinfo;
 -      struct mm_struct *charge_mm;
 +      struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
 +      struct mm_struct *fault_mm;
        struct folio *folio;
 -      pgoff_t hindex;
 -      gfp_t huge_gfp;
        int error;
 -      int once = 0;
 -      int alloced = 0;
 +      bool alloced;
  
        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return -EFBIG;
  repeat:
        if (sgp <= SGP_CACHE &&
 -          ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 +          ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
                return -EINVAL;
 -      }
  
 -      sbinfo = SHMEM_SB(inode->i_sb);
 -      charge_mm = vma ? vma->vm_mm : NULL;
 +      alloced = false;
 +      fault_mm = vma ? vma->vm_mm : NULL;
  
 -      folio = filemap_get_entry(mapping, index);
 +      folio = filemap_get_entry(inode->i_mapping, index);
        if (folio && vma && userfaultfd_minor(vma)) {
                if (!xa_is_value(folio))
                        folio_put(folio);
  
        if (xa_is_value(folio)) {
                error = shmem_swapin_folio(inode, index, &folio,
 -                                        sgp, gfp, vma, fault_type);
 +                                         sgp, gfp, fault_mm, fault_type);
                if (error == -EEXIST)
                        goto repeat;
  
                folio_lock(folio);
  
                /* Has the folio been truncated or swapped out? */
 -              if (unlikely(folio->mapping != mapping)) {
 +              if (unlikely(folio->mapping != inode->i_mapping)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto repeat;
                return 0;
        }
  
 -      if (!shmem_is_huge(inode, index, false,
 -                         vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
 -              goto alloc_nohuge;
 +      if (shmem_is_huge(inode, index, false, fault_mm,
 +                        vma ? vma->vm_flags : 0)) {
 +              gfp_t huge_gfp;
  
 -      huge_gfp = vma_thp_gfp_mask(vma);
 -      huge_gfp = limit_gfp_mask(huge_gfp, gfp);
 -      folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
 -      if (IS_ERR(folio)) {
 -alloc_nohuge:
 -              folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
 +              huge_gfp = vma_thp_gfp_mask(vma);
 +              huge_gfp = limit_gfp_mask(huge_gfp, gfp);
 +              folio = shmem_alloc_and_add_folio(huge_gfp,
 +                              inode, index, fault_mm, true);
 +              if (!IS_ERR(folio)) {
 +                      count_vm_event(THP_FILE_ALLOC);
 +                      goto alloced;
 +              }
 +              if (PTR_ERR(folio) == -EEXIST)
 +                      goto repeat;
        }
 -      if (IS_ERR(folio)) {
 -              int retry = 5;
  
 +      folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
 +      if (IS_ERR(folio)) {
                error = PTR_ERR(folio);
 +              if (error == -EEXIST)
 +                      goto repeat;
                folio = NULL;
 -              if (error != -ENOSPC)
 -                      goto unlock;
 -              /*
 -               * Try to reclaim some space by splitting a large folio
 -               * beyond i_size on the filesystem.
 -               */
 -              while (retry--) {
 -                      int ret;
 -
 -                      ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
 -                      if (ret == SHRINK_STOP)
 -                              break;
 -                      if (ret)
 -                              goto alloc_nohuge;
 -              }
                goto unlock;
        }
  
 -      hindex = round_down(index, folio_nr_pages(folio));
 -
 -      if (sgp == SGP_WRITE)
 -              __folio_set_referenced(folio);
 -
 -      error = shmem_add_to_page_cache(folio, mapping, hindex,
 -                                      NULL, gfp & GFP_RECLAIM_MASK,
 -                                      charge_mm);
 -      if (error)
 -              goto unacct;
 -
 -      folio_add_lru(folio);
 -      shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
 +alloced:
        alloced = true;
 -
        if (folio_test_pmd_mappable(folio) &&
            DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                                        folio_next_index(folio) - 1) {
 +              struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 +              struct shmem_inode_info *info = SHMEM_I(inode);
                /*
                 * Part of the large folio is beyond i_size: subject
                 * to shrink under memory pressure.
                spin_unlock(&sbinfo->shrinklist_lock);
        }
  
 +      if (sgp == SGP_WRITE)
 +              folio_set_referenced(folio);
        /*
         * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
         */
@@@ -2092,6 -2100,11 +2092,6 @@@ clear
        /* Perhaps the file has been truncated since we checked */
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 -              if (alloced) {
 -                      folio_clear_dirty(folio);
 -                      filemap_remove_folio(folio);
 -                      shmem_recalc_inode(inode, 0, 0);
 -              }
                error = -EINVAL;
                goto unlock;
        }
        /*
         * Error recovery.
         */
 -unacct:
 -      shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
 -
 -      if (folio_test_large(folio)) {
 -              folio_unlock(folio);
 -              folio_put(folio);
 -              goto alloc_nohuge;
 -      }
  unlock:
 +      if (alloced)
 +              filemap_remove_folio(folio);
 +      shmem_recalc_inode(inode, 0, 0);
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
 -      if (error == -ENOSPC && !once++) {
 -              shmem_recalc_inode(inode, 0, 0);
 -              goto repeat;
 -      }
 -      if (error == -EEXIST)
 -              goto repeat;
        return error;
  }
  
@@@ -2117,7 -2141,7 +2117,7 @@@ int shmem_get_folio(struct inode *inode
                enum sgp_type sgp)
  {
        return shmem_get_folio_gfp(inode, index, foliop, sgp,
 -                      mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
 +                      mapping_gfp_mask(inode->i_mapping), NULL, NULL);
  }
  
  /*
   * entry unconditionally - even if something else had already woken the
   * target.
   */
 -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 +static int synchronous_wake_function(wait_queue_entry_t *wait,
 +                      unsigned int mode, int sync, void *key)
  {
        int ret = default_wake_function(wait, mode, sync, key);
        list_del_init(&wait->entry);
        return ret;
  }
  
 +/*
 + * Trinity finds that probing a hole which tmpfs is punching can
 + * prevent the hole-punch from ever completing: which in turn
 + * locks writers out with its hold on i_rwsem.  So refrain from
 + * faulting pages into the hole while it's being punched.  Although
 + * shmem_undo_range() does remove the additions, it may be unable to
 + * keep up, as each new page needs its own unmap_mapping_range() call,
 + * and the i_mmap tree grows ever slower to scan if new vmas are added.
 + *
 + * It does not matter if we sometimes reach this check just before the
 + * hole-punch begins, so that one fault then races with the punch:
 + * we just need to make racing faults a rare case.
 + *
 + * The implementation below would be much simpler if we just used a
 + * standard mutex or completion: but we cannot take i_rwsem in fault,
 + * and bloating every shmem inode for this unlikely case would be sad.
 + */
 +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
 +{
 +      struct shmem_falloc *shmem_falloc;
 +      struct file *fpin = NULL;
 +      vm_fault_t ret = 0;
 +
 +      spin_lock(&inode->i_lock);
 +      shmem_falloc = inode->i_private;
 +      if (shmem_falloc &&
 +          shmem_falloc->waitq &&
 +          vmf->pgoff >= shmem_falloc->start &&
 +          vmf->pgoff < shmem_falloc->next) {
 +              wait_queue_head_t *shmem_falloc_waitq;
 +              DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 +
 +              ret = VM_FAULT_NOPAGE;
 +              fpin = maybe_unlock_mmap_for_io(vmf, NULL);
 +              shmem_falloc_waitq = shmem_falloc->waitq;
 +              prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
 +                              TASK_UNINTERRUPTIBLE);
 +              spin_unlock(&inode->i_lock);
 +              schedule();
 +
 +              /*
 +               * shmem_falloc_waitq points into the shmem_fallocate()
 +               * stack of the hole-punching task: shmem_falloc_waitq
 +               * is usually invalid by the time we reach here, but
 +               * finish_wait() does not dereference it in that case;
 +               * though i_lock needed lest racing with wake_up_all().
 +               */
 +              spin_lock(&inode->i_lock);
 +              finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 +      }
 +      spin_unlock(&inode->i_lock);
 +      if (fpin) {
 +              fput(fpin);
 +              ret = VM_FAULT_RETRY;
 +      }
 +      return ret;
 +}
 +
  static vm_fault_t shmem_fault(struct vm_fault *vmf)
  {
 -      struct vm_area_struct *vma = vmf->vma;
 -      struct inode *inode = file_inode(vma->vm_file);
 +      struct inode *inode = file_inode(vmf->vma->vm_file);
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
        struct folio *folio = NULL;
 +      vm_fault_t ret = 0;
        int err;
 -      vm_fault_t ret = VM_FAULT_LOCKED;
  
        /*
         * Trinity finds that probing a hole which tmpfs is punching can
 -       * prevent the hole-punch from ever completing: which in turn
 -       * locks writers out with its hold on i_rwsem.  So refrain from
 -       * faulting pages into the hole while it's being punched.  Although
 -       * shmem_undo_range() does remove the additions, it may be unable to
 -       * keep up, as each new page needs its own unmap_mapping_range() call,
 -       * and the i_mmap tree grows ever slower to scan if new vmas are added.
 -       *
 -       * It does not matter if we sometimes reach this check just before the
 -       * hole-punch begins, so that one fault then races with the punch:
 -       * we just need to make racing faults a rare case.
 -       *
 -       * The implementation below would be much simpler if we just used a
 -       * standard mutex or completion: but we cannot take i_rwsem in fault,
 -       * and bloating every shmem inode for this unlikely case would be sad.
 +       * prevent the hole-punch from ever completing: noted in i_private.
         */
        if (unlikely(inode->i_private)) {
 -              struct shmem_falloc *shmem_falloc;
 -
 -              spin_lock(&inode->i_lock);
 -              shmem_falloc = inode->i_private;
 -              if (shmem_falloc &&
 -                  shmem_falloc->waitq &&
 -                  vmf->pgoff >= shmem_falloc->start &&
 -                  vmf->pgoff < shmem_falloc->next) {
 -                      struct file *fpin;
 -                      wait_queue_head_t *shmem_falloc_waitq;
 -                      DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 -
 -                      ret = VM_FAULT_NOPAGE;
 -                      fpin = maybe_unlock_mmap_for_io(vmf, NULL);
 -                      if (fpin)
 -                              ret = VM_FAULT_RETRY;
 -
 -                      shmem_falloc_waitq = shmem_falloc->waitq;
 -                      prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
 -                                      TASK_UNINTERRUPTIBLE);
 -                      spin_unlock(&inode->i_lock);
 -                      schedule();
 -
 -                      /*
 -                       * shmem_falloc_waitq points into the shmem_fallocate()
 -                       * stack of the hole-punching task: shmem_falloc_waitq
 -                       * is usually invalid by the time we reach here, but
 -                       * finish_wait() does not dereference it in that case;
 -                       * though i_lock needed lest racing with wake_up_all().
 -                       */
 -                      spin_lock(&inode->i_lock);
 -                      finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
 -                      spin_unlock(&inode->i_lock);
 -
 -                      if (fpin)
 -                              fput(fpin);
 +              ret = shmem_falloc_wait(vmf, inode);
 +              if (ret)
                        return ret;
 -              }
 -              spin_unlock(&inode->i_lock);
        }
  
 +      WARN_ON_ONCE(vmf->page != NULL);
        err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
 -                                gfp, vma, vmf, &ret);
 +                                gfp, vmf, &ret);
        if (err)
                return vmf_error(err);
 -      if (folio)
 +      if (folio) {
                vmf->page = folio_file_page(folio, vmf->pgoff);
 +              ret |= VM_FAULT_LOCKED;
 +      }
        return ret;
  }
  
@@@ -2318,41 -2330,15 +2318,41 @@@ static int shmem_set_policy(struct vm_a
  }
  
  static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 -                                        unsigned long addr)
 +                                        unsigned long addr, pgoff_t *ilx)
  {
        struct inode *inode = file_inode(vma->vm_file);
        pgoff_t index;
  
 +      /*
 +       * Bias interleave by inode number to distribute better across nodes;
 +       * but this interface is independent of which page order is used, so
 +       * supplies only that bias, letting caller apply the offset (adjusted
 +       * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
 +       */
 +      *ilx = inode->i_ino;
        index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
        return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
  }
 -#endif
 +
 +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 +                      pgoff_t index, unsigned int order, pgoff_t *ilx)
 +{
 +      struct mempolicy *mpol;
 +
 +      /* Bias interleave by inode number to distribute better across nodes */
 +      *ilx = info->vfs_inode.i_ino + (index >> order);
 +
 +      mpol = mpol_shared_policy_lookup(&info->policy, index);
 +      return mpol ? mpol : get_task_policy(current);
 +}
 +#else
 +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
 +                      pgoff_t index, unsigned int order, pgoff_t *ilx)
 +{
 +      *ilx = 0;
 +      return NULL;
 +}
 +#endif /* CONFIG_NUMA */
  
  int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
  {
@@@ -2388,7 -2374,7 +2388,7 @@@ static int shmem_mmap(struct file *file
        struct shmem_inode_info *info = SHMEM_I(inode);
        int ret;
  
 -      ret = seal_check_future_write(info->seals, vma);
 +      ret = seal_check_write(info->seals, vma);
        if (ret)
                return ret;
  
@@@ -2459,6 -2445,7 +2459,6 @@@ static struct inode *__shmem_get_inode(
        if (err)
                return ERR_PTR(err);
  
 -
        inode = new_inode(sb);
        if (!inode) {
                shmem_free_inode(sb, 0);
        inode->i_ino = ino;
        inode_init_owner(idmap, inode, dir, mode);
        inode->i_blocks = 0;
 -      inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
 +      simple_inode_init_ts(inode);
        inode->i_generation = get_random_u32();
        info = SHMEM_I(inode);
        memset(info, 0, (char *)inode - (char *)info);
        atomic_set(&info->stop_eviction, 0);
        info->seals = F_SEAL_SEAL;
        info->flags = flags & VM_NORESERVE;
 -      info->i_crtime = inode->i_mtime;
 +      info->i_crtime = inode_get_mtime(inode);
        info->fsflags = (dir == NULL) ? 0 :
                SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
        if (info->fsflags)
                shmem_set_inode_flags(inode, info->fsflags);
        INIT_LIST_HEAD(&info->shrinklist);
        INIT_LIST_HEAD(&info->swaplist);
 -      INIT_LIST_HEAD(&info->swaplist);
 -      if (sbinfo->noswap)
 -              mapping_set_unevictable(inode->i_mapping);
        simple_xattrs_init(&info->xattrs);
        cache_no_acl(inode);
 +      if (sbinfo->noswap)
 +              mapping_set_unevictable(inode->i_mapping);
        mapping_set_large_folios(inode->i_mapping);
  
        switch (mode & S_IFMT) {
@@@ -2577,7 -2565,7 +2577,7 @@@ int shmem_mfill_atomic_pte(pmd_t *dst_p
        int ret;
        pgoff_t max_off;
  
 -      if (shmem_inode_acct_block(inode, 1)) {
 +      if (shmem_inode_acct_blocks(inode, 1)) {
                /*
                 * We may have got a page, returned -ENOENT triggering a retry,
                 * and now we find ourselves with -ENOMEM. Release the page, to
        if (unlikely(pgoff >= max_off))
                goto out_release;
  
 -      ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
 -                                    gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
 +      ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
 +      if (ret)
 +              goto out_release;
 +      ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
        if (ret)
                goto out_release;
  
@@@ -2700,6 -2686,7 +2700,6 @@@ shmem_write_begin(struct file *file, st
        }
  
        ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
 -
        if (ret)
                return ret;
  
@@@ -3231,7 -3218,8 +3231,7 @@@ shmem_mknod(struct mnt_idmap *idmap, st
        error = simple_acl_create(dir, inode);
        if (error)
                goto out_iput;
 -      error = security_inode_init_security(inode, dir,
 -                                           &dentry->d_name,
 +      error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             shmem_initxattrs, NULL);
        if (error && error != -EOPNOTSUPP)
                goto out_iput;
                goto out_iput;
  
        dir->i_size += BOGO_DIRENT_SIZE;
 -      dir->i_mtime = inode_set_ctime_current(dir);
 +      inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_inc_iversion(dir);
        d_instantiate(dentry, inode);
        dget(dentry); /* Extra count - pin the dentry in core */
@@@ -3260,11 -3248,14 +3260,11 @@@ shmem_tmpfile(struct mnt_idmap *idmap, 
        int error;
  
        inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
 -
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                goto err_out;
        }
 -
 -      error = security_inode_init_security(inode, dir,
 -                                           NULL,
 +      error = security_inode_init_security(inode, dir, NULL,
                                             shmem_initxattrs, NULL);
        if (error && error != -EOPNOTSUPP)
                goto out_iput;
@@@ -3301,8 -3292,7 +3301,8 @@@ static int shmem_create(struct mnt_idma
  /*
   * Link a file..
   */
 -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 +static int shmem_link(struct dentry *old_dentry, struct inode *dir,
 +                    struct dentry *dentry)
  {
        struct inode *inode = d_inode(old_dentry);
        int ret = 0;
        }
  
        dir->i_size += BOGO_DIRENT_SIZE;
 -      dir->i_mtime = inode_set_ctime_to_ts(dir,
 -                                           inode_set_ctime_current(inode));
 +      inode_set_mtime_to_ts(dir,
 +                            inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);
        inc_nlink(inode);
        ihold(inode);   /* New dentry reference */
 -      dget(dentry);           /* Extra pinning count for the created dentry */
 +      dget(dentry);   /* Extra pinning count for the created dentry */
        d_instantiate(dentry, inode);
  out:
        return ret;
@@@ -3349,11 -3339,11 +3349,11 @@@ static int shmem_unlink(struct inode *d
        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
  
        dir->i_size -= BOGO_DIRENT_SIZE;
 -      dir->i_mtime = inode_set_ctime_to_ts(dir,
 -                                           inode_set_ctime_current(inode));
 +      inode_set_mtime_to_ts(dir,
 +                            inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);
        drop_nlink(inode);
 -      dput(dentry);   /* Undo the count from "create" - this does all the work */
 +      dput(dentry);   /* Undo the count from "create" - does all the work */
        return 0;
  }
  
@@@ -3463,6 -3453,7 +3463,6 @@@ static int shmem_symlink(struct mnt_idm
  
        inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
                                VM_NORESERVE);
 -
        if (IS_ERR(inode))
                return PTR_ERR(inode);
  
                folio_put(folio);
        }
        dir->i_size += BOGO_DIRENT_SIZE;
 -      dir->i_mtime = inode_set_ctime_current(dir);
 +      inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_inc_iversion(dir);
        d_instantiate(dentry, inode);
        dget(dentry);
@@@ -3516,7 -3507,8 +3516,7 @@@ static void shmem_put_link(void *arg
        folio_put(arg);
  }
  
 -static const char *shmem_get_link(struct dentry *dentry,
 -                                struct inode *inode,
 +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
                                  struct delayed_call *done)
  {
        struct folio *folio = NULL;
@@@ -3590,7 -3582,8 +3590,7 @@@ static int shmem_fileattr_set(struct mn
   * Callback for security_inode_init_security() for acquiring xattrs.
   */
  static int shmem_initxattrs(struct inode *inode,
 -                          const struct xattr *xattr_array,
 -                          void *fs_info)
 +                          const struct xattr *xattr_array, void *fs_info)
  {
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@@ -3721,7 -3714,7 +3721,7 @@@ static const struct xattr_handler shmem
        .set = shmem_xattr_handler_set,
  };
  
 -static const struct xattr_handler *shmem_xattr_handlers[] = {
 +static const struct xattr_handler * const shmem_xattr_handlers[] = {
        &shmem_security_xattr_handler,
        &shmem_trusted_xattr_handler,
        &shmem_user_xattr_handler,
@@@ -3774,6 -3767,7 +3774,6 @@@ static struct dentry *shmem_find_alias(
        return alias ?: d_find_any_alias(inode);
  }
  
 -
  static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
                struct fid *fid, int fh_len, int fh_type)
  {
@@@ -4357,8 -4351,8 +4357,8 @@@ static int shmem_fill_super(struct supe
        }
  #endif /* CONFIG_TMPFS_QUOTA */
  
 -      inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
 -                              VM_NORESERVE);
 +      inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
 +                              S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                goto failed;
@@@ -4400,7 -4394,7 +4400,7 @@@ static const struct fs_context_operatio
  #endif
  };
  
- static struct kmem_cache *shmem_inode_cachep;
+ static struct kmem_cache *shmem_inode_cachep __ro_after_init;
  
  static struct inode *shmem_alloc_inode(struct super_block *sb)
  {
@@@ -4432,14 -4426,14 +4432,14 @@@ static void shmem_init_inode(void *foo
        inode_init_once(&info->vfs_inode);
  }
  
- static void shmem_init_inodecache(void)
+ static void __init shmem_init_inodecache(void)
  {
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
                                0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
  }
  
- static void shmem_destroy_inodecache(void)
+ static void __init shmem_destroy_inodecache(void)
  {
        kmem_cache_destroy(shmem_inode_cachep);
  }
@@@ -4591,7 -4585,11 +4591,7 @@@ static struct file_system_type shmem_fs
        .parameters     = shmem_fs_parameters,
  #endif
        .kill_sb        = kill_litter_super,
 -#ifdef CONFIG_SHMEM
        .fs_flags       = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
 -#else
 -      .fs_flags       = FS_USERNS_MOUNT,
 -#endif
  };
  
  void __init shmem_init(void)
@@@ -4657,9 -4655,11 +4657,9 @@@ static ssize_t shmem_enabled_show(struc
  
        for (i = 0; i < ARRAY_SIZE(values); i++) {
                len += sysfs_emit_at(buf, len,
 -                                   shmem_huge == values[i] ? "%s[%s]" : "%s%s",
 -                                   i ? " " : "",
 -                                   shmem_format_huge(values[i]));
 +                              shmem_huge == values[i] ? "%s[%s]" : "%s%s",
 +                              i ? " " : "", shmem_format_huge(values[i]));
        }
 -
        len += sysfs_emit_at(buf, len, "\n");
  
        return len;
@@@ -4756,9 -4756,8 +4756,9 @@@ EXPORT_SYMBOL_GPL(shmem_truncate_range)
  #define shmem_acct_size(flags, size)          0
  #define shmem_unacct_size(flags, size)                do {} while (0)
  
 -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
 -                                          umode_t mode, dev_t dev, unsigned long flags)
 +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
 +                              struct super_block *sb, struct inode *dir,
 +                              umode_t mode, dev_t dev, unsigned long flags)
  {
        struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
        return inode ? inode : ERR_PTR(-ENOSPC);
  
  /* common code */
  
 -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
 -                                     unsigned long flags, unsigned int i_flags)
 +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
 +                      loff_t size, unsigned long flags, unsigned int i_flags)
  {
        struct inode *inode;
        struct file *res;
  
        inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
                                S_IFREG | S_IRWXUGO, 0, flags);
 -
        if (IS_ERR(inode)) {
                shmem_unacct_size(flags, size);
                return ERR_CAST(inode);
@@@ -4897,7 -4897,7 +4897,7 @@@ struct folio *shmem_read_folio_gfp(stru
  
        BUG_ON(!shmem_mapping(mapping));
        error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
 -                                gfp, NULL, NULL, NULL);
 +                                  gfp, NULL, NULL);
        if (error)
                return ERR_PTR(error);
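
The reworked shmem_alloc_and_add_folio() above reports failure through the
kernel's error-pointer convention (ERR_PTR()/IS_ERR()/PTR_ERR()), which is why
shmem_get_folio_gfp() can simply retry when PTR_ERR(folio) == -EEXIST instead
of threading a separate status out-parameter around.  A minimal userspace
sketch of that encoding, assuming (as the kernel does) that the top MAX_ERRNO
addresses are never valid pointers; alloc_object() is a hypothetical stand-in
for the real allocator:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;	/* small negative errno stored in the pointer */
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator standing in for shmem_alloc_and_add_folio(). */
static void *alloc_object(int fail_errno)
{
	if (fail_errno)
		return ERR_PTR(-(long)fail_errno);	/* e.g. EEXIST, ENOMEM */
	return malloc(64);
}

int main(void)
{
	void *obj = alloc_object(EEXIST);

	if (IS_ERR(obj)) {
		if (PTR_ERR(obj) == -EEXIST)
			printf("raced with another inserter, would retry\n");
		return 1;
	}
	free(obj);
	return 0;
}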
  
diff --combined net/core/pktgen.c
index 8afcfadf8d5a28d2e195484f9dd75194c03cf45f,826250a0f5b16e9fea1432faef56967a360f53a5..57cea67b75624696dd521904fb6eb36864c1e7eb
        pf(VID_RND)             /* Random VLAN ID */                    \
        pf(SVID_RND)            /* Random SVLAN ID */                   \
        pf(NODE)                /* Node memory alloc*/                  \
 +      pf(SHARED)              /* Shared SKB */                        \
  
  #define pf(flag)              flag##_SHIFT,
  enum pkt_flags {
@@@ -670,19 -669,19 +670,19 @@@ static int pktgen_if_show(struct seq_fi
        seq_puts(seq, "     Flags: ");
  
        for (i = 0; i < NR_PKT_FLAGS; i++) {
 -              if (i == F_FLOW_SEQ)
 +              if (i == FLOW_SEQ_SHIFT)
                        if (!pkt_dev->cflows)
                                continue;
  
 -              if (pkt_dev->flags & (1 << i))
 +              if (pkt_dev->flags & (1 << i)) {
                        seq_printf(seq, "%s  ", pkt_flag_names[i]);
 -              else if (i == F_FLOW_SEQ)
 -                      seq_puts(seq, "FLOW_RND  ");
 -
  #ifdef CONFIG_XFRM
 -              if (i == F_IPSEC && pkt_dev->spi)
 -                      seq_printf(seq, "spi:%u", pkt_dev->spi);
 +                      if (i == IPSEC_SHIFT && pkt_dev->spi)
 +                              seq_printf(seq, "spi:%u  ", pkt_dev->spi);
  #endif
 +              } else if (i == FLOW_SEQ_SHIFT) {
 +                      seq_puts(seq, "FLOW_RND  ");
 +              }
        }
  
        seq_puts(seq, "\n");
@@@ -1199,8 -1198,7 +1199,8 @@@ static ssize_t pktgen_if_write(struct f
                    ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
                     !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
                        return -ENOTSUPP;
 -              if (value > 0 && pkt_dev->n_imix_entries > 0)
 +              if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
 +                                !(pkt_dev->flags & F_SHARED)))
                        return -EINVAL;
  
                i += len;
                     ((pkt_dev->xmit_mode == M_START_XMIT) &&
                     (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
                        return -ENOTSUPP;
 +
 +              if (value > 1 && !(pkt_dev->flags & F_SHARED))
 +                      return -EINVAL;
 +
                pkt_dev->burst = value < 1 ? 1 : value;
                sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
                return count;
                return count;
        }
        if (!strcmp(name, "flag")) {
 +              bool disable = false;
                __u32 flag;
                char f[32];
 -              bool disable = false;
 +              char *end;
  
                memset(f, 0, 32);
                len = strn_len(&user_buffer[i], sizeof(f) - 1);
                i += len;
  
                flag = pktgen_read_flag(f, &disable);
 -
                if (flag) {
 -                      if (disable)
 +                      if (disable) {
 +                              /* If "clone_skb", or "burst" parameters are
 +                               * configured, it means that the skb still
 +                               * needs to be referenced by the pktgen, so
 +                               * the skb must be shared.
 +                               */
 +                              if (flag == F_SHARED && (pkt_dev->clone_skb ||
 +                                                       pkt_dev->burst > 1))
 +                                      return -EINVAL;
                                pkt_dev->flags &= ~flag;
 -                      else
 +                      } else {
                                pkt_dev->flags |= flag;
 -              } else {
 -                      sprintf(pg_result,
 -                              "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
 -                              f,
 -                              "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
 -                              "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
 -                              "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
 -                              "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
 -                              "NO_TIMESTAMP, "
 -#ifdef CONFIG_XFRM
 -                              "IPSEC, "
 -#endif
 -                              "NODE_ALLOC\n");
 +                      }
 +
 +                      sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
                        return count;
                }
 -              sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
 +
 +              /* Unknown flag */
 +              end = pkt_dev->result + sizeof(pkt_dev->result);
 +              pg_result += sprintf(pg_result,
 +                      "Flag -:%s:- unknown\n"
 +                      "Available flags, (prepend ! to un-set flag):\n", f);
 +
 +              for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
 +                      if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
 +                              continue;
 +                      pg_result += snprintf(pg_result, end - pg_result,
 +                                            "%s, ", pkt_flag_names[n]);
 +              }
 +              if (!WARN_ON_ONCE(pg_result >= end)) {
 +                      /* Remove the comma and whitespace at the end */
 +                      *(pg_result - 2) = '\0';
 +              }
 +
                return count;
        }
        if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
@@@ -3461,24 -3440,12 +3461,24 @@@ static void pktgen_wait_for_skb(struct 
  
  static void pktgen_xmit(struct pktgen_dev *pkt_dev)
  {
 -      unsigned int burst = READ_ONCE(pkt_dev->burst);
 +      bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
        struct net_device *odev = pkt_dev->odev;
        struct netdev_queue *txq;
 +      unsigned int burst = 1;
        struct sk_buff *skb;
 +      int clone_skb = 0;
        int ret;
  
 +      /* If 'skb_shared' is false, the read of possible
 +       * new values (if any) for 'burst' and 'clone_skb' will be skipped to
 +       * prevent some concurrent changes from slipping in. And the stabilized
 +       * config will be read in during the next run of pktgen_xmit.
 +       */
 +      if (skb_shared) {
 +              burst = READ_ONCE(pkt_dev->burst);
 +              clone_skb = READ_ONCE(pkt_dev->clone_skb);
 +      }
 +
        /* If device is offline, then don't send */
        if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
                pktgen_stop_device(pkt_dev);
  
        /* If no skb or clone count exhausted then get new one */
        if (!pkt_dev->skb || (pkt_dev->last_ok &&
 -                            ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
 +                            ++pkt_dev->clone_count >= clone_skb)) {
                /* build a new pkt */
                kfree_skb(pkt_dev->skb);
  
        if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
                skb = pkt_dev->skb;
                skb->protocol = eth_type_trans(skb, skb->dev);
 -              refcount_add(burst, &skb->users);
 +              if (skb_shared)
 +                      refcount_add(burst, &skb->users);
                local_bh_disable();
                do {
                        ret = netif_receive_skb(skb);
                                pkt_dev->errors++;
                        pkt_dev->sofar++;
                        pkt_dev->seq_num++;
 +                      if (unlikely(!skb_shared)) {
 +                              pkt_dev->skb = NULL;
 +                              break;
 +                      }
                        if (refcount_read(&skb->users) != burst) {
                                /* skb was queued by rps/rfs or taps,
                                 * so cannot reuse this skb
                goto out; /* Skips xmit_mode M_START_XMIT */
        } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
                local_bh_disable();
 -              refcount_inc(&pkt_dev->skb->users);
 +              if (skb_shared)
 +                      refcount_inc(&pkt_dev->skb->users);
  
                ret = dev_queue_xmit(pkt_dev->skb);
 +
 +              if (!skb_shared && dev_xmit_complete(ret))
 +                      pkt_dev->skb = NULL;
 +
                switch (ret) {
                case NET_XMIT_SUCCESS:
                        pkt_dev->sofar++;
                pkt_dev->last_ok = 0;
                goto unlock;
        }
 -      refcount_add(burst, &pkt_dev->skb->users);
 +      if (skb_shared)
 +              refcount_add(burst, &pkt_dev->skb->users);
  
  xmit_more:
        ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
  
 +      if (!skb_shared && dev_xmit_complete(ret))
 +              pkt_dev->skb = NULL;
 +
        switch (ret) {
        case NETDEV_TX_OK:
                pkt_dev->last_ok = 1;
                fallthrough;
        case NETDEV_TX_BUSY:
                /* Retry it next time */
 -              refcount_dec(&(pkt_dev->skb->users));
 +              if (skb_shared)
 +                      refcount_dec(&pkt_dev->skb->users);
                pkt_dev->last_ok = 0;
        }
        if (unlikely(burst))
@@@ -3636,8 -3588,7 +3636,8 @@@ out
  
        /* If pkt_dev->count is zero, then run forever */
        if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
 -              pktgen_wait_for_skb(pkt_dev);
 +              if (pkt_dev->skb)
 +                      pktgen_wait_for_skb(pkt_dev);
  
                /* Done with this */
                pktgen_stop_device(pkt_dev);
@@@ -3820,7 -3771,6 +3820,7 @@@ static int pktgen_add_device(struct pkt
        pkt_dev->svlan_id = 0xffff;
        pkt_dev->burst = 1;
        pkt_dev->node = NUMA_NO_NODE;
 +      pkt_dev->flags = F_SHARED;      /* SKB shared by default */
  
        err = pktgen_setup_dev(t->net, pkt_dev, ifname);
        if (err)
@@@ -4032,8 -3982,7 +4032,7 @@@ static void __net_exit pg_net_exit(stru
        list_for_each_safe(q, n, &list) {
                t = list_entry(q, struct pktgen_thread, th_list);
                list_del(&t->th_list);
-               kthread_stop(t->tsk);
-               put_task_struct(t->tsk);
+               kthread_stop_put(t->tsk);
                kfree(t);
        }
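
The pktgen_if_show() and pktgen_if_write() hunks above walk the flag bits as
FLOW_SEQ_SHIFT, IPSEC_SHIFT, ... up to NR_PKT_FLAGS and print them through
pkt_flag_names[]; those identifiers come from a single pf() X-macro list in
pktgen.c, the same list the first hunk extends with pf(SHARED).  A standalone
sketch of that pattern, using an invented three-flag subset rather than the
real flag list:

#include <stdio.h>

#define PKT_FLAGS	\
	pf(IPV6)	\
	pf(FLOW_SEQ)	\
	pf(SHARED)

/* First expansion: bit positions, plus a trailing count. */
#define pf(flag)	flag##_SHIFT,
enum pkt_flags { PKT_FLAGS NR_PKT_FLAGS };
#undef pf

/* Second expansion: the printable names, kept in the same order. */
#define pf(flag)	#flag,
static const char * const pkt_flag_names[] = { PKT_FLAGS };
#undef pf

int main(void)
{
	unsigned int flags = 1u << SHARED_SHIFT;	/* like F_SHARED */

	for (int i = 0; i < NR_PKT_FLAGS; i++)
		if (flags & (1u << i))
			printf("%s  ", pkt_flag_names[i]);
	printf("\n");
	return 0;
}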
  
diff --combined security/integrity/iint.c
index 27ea19fb1f54c7b0ec384ab66a6dea10977ddbd5,5b1c2de6cc54617bf82b6b4b8c9eabfd97e02b53..d4419a2a1e24be840d71b5ca8927c3d52f9704f7
@@@ -23,7 -23,7 +23,7 @@@
  
  static struct rb_root integrity_iint_tree = RB_ROOT;
  static DEFINE_RWLOCK(integrity_iint_lock);
- static struct kmem_cache *iint_cache __read_mostly;
+ static struct kmem_cache *iint_cache __ro_after_init;
  
  struct dentry *integrity_dir;
  
@@@ -66,32 -66,9 +66,32 @@@ struct integrity_iint_cache *integrity_
        return iint;
  }
  
 -static void iint_free(struct integrity_iint_cache *iint)
 +#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH+1)
 +
 +/*
 + * It is not clear that IMA should be nested at all, but as long as it measures
 + * files both on overlayfs and on underlying fs, we need to annotate the iint
 + * mutex to avoid lockdep false positives related to IMA + overlayfs.
 + * See ovl_lockdep_annotate_inode_mutex_key() for more details.
 + */
 +static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint,
 +                                       struct inode *inode)
 +{
 +#ifdef CONFIG_LOCKDEP
 +      static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING];
 +
 +      int depth = inode->i_sb->s_stack_depth;
 +
 +      if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING))
 +              depth = 0;
 +
 +      lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]);
 +#endif
 +}
 +
 +static void iint_init_always(struct integrity_iint_cache *iint,
 +                           struct inode *inode)
  {
 -      kfree(iint->ima_hash);
        iint->ima_hash = NULL;
        iint->version = 0;
        iint->flags = 0UL;
        iint->ima_creds_status = INTEGRITY_UNKNOWN;
        iint->evm_status = INTEGRITY_UNKNOWN;
        iint->measured_pcrs = 0;
 +      mutex_init(&iint->mutex);
 +      iint_lockdep_annotate(iint, inode);
 +}
 +
 +static void iint_free(struct integrity_iint_cache *iint)
 +{
 +      kfree(iint->ima_hash);
 +      mutex_destroy(&iint->mutex);
        kmem_cache_free(iint_cache, iint);
  }
  
@@@ -135,8 -104,6 +135,8 @@@ struct integrity_iint_cache *integrity_
        if (!iint)
                return NULL;
  
 +      iint_init_always(iint, inode);
 +
        write_lock(&integrity_iint_lock);
  
        p = &integrity_iint_tree.rb_node;
@@@ -186,18 -153,25 +186,18 @@@ void integrity_inode_free(struct inode 
        iint_free(iint);
  }
  
 -static void init_once(void *foo)
 +static void iint_init_once(void *foo)
  {
        struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo;
  
        memset(iint, 0, sizeof(*iint));
 -      iint->ima_file_status = INTEGRITY_UNKNOWN;
 -      iint->ima_mmap_status = INTEGRITY_UNKNOWN;
 -      iint->ima_bprm_status = INTEGRITY_UNKNOWN;
 -      iint->ima_read_status = INTEGRITY_UNKNOWN;
 -      iint->ima_creds_status = INTEGRITY_UNKNOWN;
 -      iint->evm_status = INTEGRITY_UNKNOWN;
 -      mutex_init(&iint->mutex);
  }
  
  static int __init integrity_iintcache_init(void)
  {
        iint_cache =
            kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache),
 -                            0, SLAB_PANIC, init_once);
 +                            0, SLAB_PANIC, iint_init_once);
        return 0;
  }
  DEFINE_LSM(integrity) = {
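
The iint.c changes above split initialization between iint_init_once(), the
kmem_cache constructor that runs only when a slab object is first created, and
iint_init_always(), which runs on every allocation because recycled objects
keep whatever the previous user left behind.  A toy illustration of that split
(the fixed-size pool, pool_alloc()/pool_free() and the field names are invented
stand-ins for kmem_cache, not real API):

#include <stdio.h>
#include <string.h>

struct obj {
	int status;		/* must be reset on every allocation */
	int ctor_ran;		/* set up once, survives reuse */
};

#define POOL_SIZE 2
static struct obj pool[POOL_SIZE];
static int constructed[POOL_SIZE];
static int in_use[POOL_SIZE];

static void init_once(struct obj *o)		/* like iint_init_once() */
{
	memset(o, 0, sizeof(*o));
	o->ctor_ran = 1;
}

static void init_always(struct obj *o)		/* like iint_init_always() */
{
	o->status = -1;				/* e.g. INTEGRITY_UNKNOWN */
}

static struct obj *pool_alloc(void)
{
	for (int i = 0; i < POOL_SIZE; i++) {
		if (in_use[i])
			continue;
		if (!constructed[i]) {		/* constructor runs at most once */
			init_once(&pool[i]);
			constructed[i] = 1;
		}
		init_always(&pool[i]);		/* per-allocation state, every time */
		in_use[i] = 1;
		return &pool[i];
	}
	return NULL;
}

static void pool_free(struct obj *o)
{
	o->status = 12345;			/* dirty the reusable field */
	in_use[o - pool] = 0;			/* slot stays constructed */
}

int main(void)
{
	struct obj *a = pool_alloc();

	pool_free(a);
	a = pool_alloc();	/* reused slot: init_once skipped, init_always ran */
	printf("ctor_ran=%d status=%d\n", a->ctor_ran, a->status);
	return 0;
}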
diff --combined tools/testing/selftests/mm/run_vmtests.sh
index bf4c4cd46600e3a709f3993373be19d3a2e2e8c8,7d31718ce8343db04ad51351cc862020b7698d28..cc16f6ca85333225004f06d5e7083700dda1d8c9
@@@ -56,8 -56,6 +56,8 @@@ separated by spaces
        memory protection key tests
  - soft_dirty
        test soft dirty page bit semantics
 +- pagemap
 +      test pagemap_scan IOCTL
  - cow
        test copy-on-write semantics
  - thp
@@@ -223,10 -221,6 +223,10 @@@ CATEGORY="hugetlb" run_test ./hugepage-
  CATEGORY="hugetlb" run_test ./hugepage-vmemmap
  CATEGORY="hugetlb" run_test ./hugetlb-madvise
  
 +# For this test, we need one and just one huge page
 +echo 1 > /proc/sys/vm/nr_hugepages
 +CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
 +
  if test_selected "hugetlb"; then
        echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
        echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
@@@ -309,6 -303,7 +309,7 @@@ CATEGORY="hmm" run_test bash ./test_hmm
  # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
  CATEGORY="madv_populate" run_test ./madv_populate
  
+ echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
  CATEGORY="memfd_secret" run_test ./memfd_secret
  
  # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
@@@ -348,8 -343,6 +349,8 @@@ the
        CATEGORY="soft_dirty" run_test ./soft-dirty
  fi
  
 +CATEGORY="pagemap" run_test ./pagemap_ioctl
 +
  # COW tests
  CATEGORY="cow" run_test ./cow
  