Merge tag 'mm-nonmm-stable-2023-11-02-14-08' of git://git.kernel.org/pub/scm/linux...

author Linus Torvalds <[email protected]>

Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)

committer Linus Torvalds <[email protected]>

Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
author Linus Torvalds <[email protected]>
Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
committer Linus Torvalds <[email protected]>
Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
diff --combined .mailmap

index 2643b7203a745283d7f0e655f22966939e5ba3a4,27beb64673b8d20b8c38de14a56752979f7ba9dd..43031441b2d922b3126b26ba754ea748a3f63540
--- 1/.mailmap
--- 2/.mailmap
+++ b/.mailmap
@@@ -87,7 -87,6 +87,7 @@@ Baolin Wang <[email protected].
   Baolin Wang <[email protected]> <[email protected]>
   Bart Van Assche <[email protected]> <[email protected]>
   Bart Van Assche <[email protected]> <[email protected]>
+ +Bartosz Golaszewski <[email protected]> <[email protected]>
   Ben Dooks <[email protected]> <[email protected]>
   Ben Dooks <[email protected]> <[email protected]>
   Ben Gardner <[email protected]>
@@@ -95,6 -94,7 +95,7 @@@ Ben M Cahill <[email protected]
   Ben Widawsky <[email protected]> <[email protected]>
   Ben Widawsky <[email protected]> <[email protected]>
   Ben Widawsky <[email protected]> <[email protected]>
+ Benjamin Poirier <[email protected]> <[email protected]>
   Bjorn Andersson <[email protected]> <[email protected]>
   Bjorn Andersson <[email protected]> <[email protected]>
   Bjorn Andersson <[email protected]> <[email protected]>
@@@ -128,6 -128,7 +129,7 @@@ Christian Brauner <[email protected]> 
   Christian Marangi <[email protected]>
   Christophe Ricard <[email protected]>
   Christoph Hellwig <[email protected]>
+ Claudiu Beznea <[email protected]> <[email protected]>
   Colin Ian King <[email protected]> <[email protected]>
   Corey Minyard <[email protected]>
   Damian Hobson-Garcia <[email protected]>
@@@ -378,7 -379,6 +380,7 @@@ Matthew Wilcox <[email protected]> <w
   Matthew Wilcox <[email protected]> <[email protected]>
   Matthew Wilcox <[email protected]> <[email protected]>
   Matthias Fuchs <[email protected]> <[email protected]>
+ +Matthieu Baerts <[email protected]> <[email protected]>
   Matthieu CASTET <[email protected]>
   Matti Vaittinen <[email protected]> <[email protected]>
   Matt Ranostay <[email protected]> <[email protected]>
@@@ -451,10 -451,9 +453,10 @@@ Oleksandr Natalenko <oleksandr@natalenk
   Oleksij Rempel <[email protected]> <[email protected]>
   Oleksij Rempel <[email protected]> <[email protected]>
   Oleksij Rempel <[email protected]> <[email protected]>
- -Oleksij Rempel <[email protected]> <[email protected]>
- -Oleksij Rempel <linux@rempel-privat.de> <[email protected]>
+ +Oleksij Rempel <[email protected]>
+ +Oleksij Rempel <o.rempel@pengutronix.de> <[email protected]>
   Oliver Upton <[email protected]> <[email protected]>
+ +Ondřej Jirman <[email protected]> <[email protected]>
   Oza Pawandeep <[email protected]> <[email protected]>
   Pali Rohár <[email protected]> <[email protected]>
   Paolo 'Blaisorblade' Giarrusso <[email protected]>
@@@ -568,6 -567,7 +570,7 @@@ Takashi YOSHII <takashi.yoshii.zj@renes
   Tamizh Chelvam Raja <[email protected]> <[email protected]>
   Taniya Das <[email protected]> <[email protected]>
   Tejun Heo <[email protected]>
+ Tomeu Vizoso <[email protected]> <[email protected]>
   Thomas Graf <[email protected]>
   Thomas Körper <[email protected]> <[email protected]>
   Thomas Pedersen <[email protected]>
diff --combined arch/arm/kernel/setup.c

index 15eca804239edc8d46ead8f21e77990352f5d3d3,e2bb7afd06839b41f74b9c0c0297e6cea7f88f26..b01cac05bd4ccde251c3dcccea50f737f0d57028
--- 1/arch/arm/kernel/setup.c
--- 2/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@@ -15,10 -15,10 +15,10 @@@
   #include <linux/console.h>
   #include <linux/seq_file.h>
   #include <linux/screen_info.h>
- -#include <linux/of_platform.h>
   #include <linux/init.h>
   #include <linux/kexec.h>
   #include <linux/libfdt.h>
+ +#include <linux/of.h>
   #include <linux/of_fdt.h>
   #include <linux/cpu.h>
   #include <linux/interrupt.h>
@@@ -1010,7 -1010,8 +1010,8 @@@ static void __init reserve_crashkernel(
   
         total_mem = get_total_mem();
         ret = parse_crashkernel(boot_command_line, total_mem,
-                               &crash_size, &crash_base);
+                               &crash_size, &crash_base,
+                               NULL, NULL);
         /* invalid value specified or crashkernel=0 */
         if (ret || !crash_size)
                 return;
diff --combined arch/arm64/Kconfig

index 6062a52a084ffff5b3d94b290b7fdd8846b44293,e7d374d994adaa5fe40f2965e57d64675b92581f..7b071a00425d2b833bc79dd0413bafaaeb94c19a
--- 1/arch/arm64/Kconfig
--- 2/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@@ -1037,19 -1037,6 +1037,19 @@@ config ARM64_ERRATUM_264519
   
           If unsure, say Y.
   
+ +config ARM64_ERRATUM_2966298
+ +      bool "Cortex-A520: 2966298: workaround for speculatively executed unprivileged load"
+ +      default y
+ +      help
+ +        This option adds the workaround for ARM Cortex-A520 erratum 2966298.
+ +
+ +        On an affected Cortex-A520 core, a speculatively executed unprivileged
+ +        load might leak data from a privileged level via a cache side channel.
+ +
+ +        Work around this problem by executing a TLBI before returning to EL0.
+ +
+ +        If unsure, say Y.
+ +
   config CAVIUM_ERRATUM_22375
         bool "Cavium erratum 22375, 24313"
         default y
@@@ -1368,8 -1355,6 +1368,8 @@@ choic
   config CPU_BIG_ENDIAN
         bool "Build big-endian kernel"
         depends on !LD_IS_LLD || LLD_VERSION >= 130000
+ +      # https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
+ +      depends on AS_IS_GNU || AS_VERSION >= 150000
         help
           Say Y if you plan on running a kernel with a big-endian userspace.
   
@@@ -1498,6 -1483,9 +1498,9 @@@ config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_
   config ARCH_SUPPORTS_CRASH_DUMP
         def_bool y
   
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+       def_bool CRASH_CORE
+ 
   config TRANS_TABLE
         def_bool y
         depends on HIBERNATION || KEXEC_CORE
diff --combined arch/arm64/mm/init.c

index 8deec68028ac7ecddd3677a9e517565847540853,f2bf32e1937150fec61d1973590e36d97f3e1f19..74c1db8ce271d8c5b0ea19dda0dbf7d3fa5f5230
--- 1/arch/arm64/mm/init.c
--- 2/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@@ -16,7 -16,6 +16,7 @@@
   #include <linux/nodemask.h>
   #include <linux/initrd.h>
   #include <linux/gfp.h>
+ +#include <linux/math.h>
   #include <linux/memblock.h>
   #include <linux/sort.h>
   #include <linux/of.h>
@@@ -65,15 -64,6 +65,6 @@@ EXPORT_SYMBOL(memstart_addr)
    */
   phys_addr_t __ro_after_init arm64_dma_phys_limit;
   
- /* Current arm64 boot protocol requires 2MB alignment */
- #define CRASH_ALIGN                   SZ_2M
- 
- #define CRASH_ADDR_LOW_MAX            arm64_dma_phys_limit
- #define CRASH_ADDR_HIGH_MAX           (PHYS_MASK + 1)
- #define CRASH_HIGH_SEARCH_BASE                SZ_4G
- 
- #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
- 
   /*
    * To make optimal use of block mappings when laying out the linear
    * mapping, round down the base of physical memory to a size that can
@@@ -101,140 -91,25 +92,25 @@@
   #define ARM64_MEMSTART_ALIGN  (1UL << ARM64_MEMSTART_SHIFT)
   #endif
   
- static int __init reserve_crashkernel_low(unsigned long long low_size)
- {
-       unsigned long long low_base;
- 
-       low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
-       if (!low_base) {
-               pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size);
-               return -ENOMEM;
-       }
- 
-       pr_info("crashkernel low memory reserved: 0x%08llx - 0x%08llx (%lld MB)\n",
-               low_base, low_base + low_size, low_size >> 20);
- 
-       crashk_low_res.start = low_base;
-       crashk_low_res.end   = low_base + low_size - 1;
-       insert_resource(&iomem_resource, &crashk_low_res);
- 
-       return 0;
- }
- 
- /*
-  * reserve_crashkernel() - reserves memory for crash kernel
-  *
-  * This function reserves memory area given in "crashkernel=" kernel command
-  * line parameter. The memory reserved is used by dump capture kernel when
-  * primary kernel is crashing.
-  */
- static void __init reserve_crashkernel(void)
+ static void __init arch_reserve_crashkernel(void)
   {
-       unsigned long long crash_low_size = 0, search_base = 0;
-       unsigned long long crash_max = CRASH_ADDR_LOW_MAX;
+       unsigned long long low_size = 0;
         unsigned long long crash_base, crash_size;
         char *cmdline = boot_command_line;
-       bool fixed_base = false;
         bool high = false;
         int ret;
   
         if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                 return;
   
-       /* crashkernel=X[@offset] */
         ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
-                               &crash_size, &crash_base);
-       if (ret == -ENOENT) {
-               ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base);
-               if (ret || !crash_size)
-                       return;
- 
-               /*
-                * crashkernel=Y,low can be specified or not, but invalid value
-                * is not allowed.
-                */
-               ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base);
-               if (ret == -ENOENT)
-                       crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
-               else if (ret)
-                       return;
- 
-               search_base = CRASH_HIGH_SEARCH_BASE;
-               crash_max = CRASH_ADDR_HIGH_MAX;
-               high = true;
-       } else if (ret || !crash_size) {
-               /* The specified value is invalid */
+                               &crash_size, &crash_base,
+                               &low_size, &high);
+       if (ret)
                 return;
-       }
- 
-       crash_size = PAGE_ALIGN(crash_size);
- 
-       /* User specifies base address explicitly. */
-       if (crash_base) {
-               fixed_base = true;
-               search_base = crash_base;
-               crash_max = crash_base + crash_size;
-       }
- 
- retry:
-       crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
-                                              search_base, crash_max);
-       if (!crash_base) {
-               /*
-                * For crashkernel=size[KMG]@offset[KMG], print out failure
-                * message if can't reserve the specified region.
-                */
-               if (fixed_base) {
-                       pr_warn("crashkernel reservation failed - memory is in use.\n");
-                       return;
-               }
- 
-               /*
-                * For crashkernel=size[KMG], if the first attempt was for
-                * low memory, fall back to high memory, the minimum required
-                * low memory will be reserved later.
-                */
-               if (!high && crash_max == CRASH_ADDR_LOW_MAX) {
-                       crash_max = CRASH_ADDR_HIGH_MAX;
-                       search_base = CRASH_ADDR_LOW_MAX;
-                       crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE;
-                       goto retry;
-               }
- 
-               /*
-                * For crashkernel=size[KMG],high, if the first attempt was
-                * for high memory, fall back to low memory.
-                */
-               if (high && crash_max == CRASH_ADDR_HIGH_MAX) {
-                       crash_max = CRASH_ADDR_LOW_MAX;
-                       search_base = 0;
-                       goto retry;
-               }
-               pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
-                       crash_size);
-               return;
-       }
- 
-       if ((crash_base >= CRASH_ADDR_LOW_MAX) && crash_low_size &&
-            reserve_crashkernel_low(crash_low_size)) {
-               memblock_phys_free(crash_base, crash_size);
-               return;
-       }
- 
-       pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
-               crash_base, crash_base + crash_size, crash_size >> 20);
- 
-       /*
-        * The crashkernel memory will be removed from the kernel linear
-        * map. Inform kmemleak so that it won't try to access it.
-        */
-       kmemleak_ignore_phys(crash_base);
-       if (crashk_low_res.end)
-               kmemleak_ignore_phys(crashk_low_res.start);
   
-       crashk_res.start = crash_base;
-       crashk_res.end = crash_base + crash_size - 1;
-       insert_resource(&iomem_resource, &crashk_res);
+       reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+                                   low_size, high);
   }
   
   /*
@@@ -480,7 -355,7 +356,7 @@@ void __init bootmem_init(void
          * request_standard_resources() depends on crashkernel's memory being
          * reserved, so do it here.
          */
-       reserve_crashkernel();
+       arch_reserve_crashkernel();
   
         memblock_dump_all();
   }
@@@ -494,16 -369,8 +370,16 @@@ void __init mem_init(void
   {
         bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
   
- -      if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC))
+ +      if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
+ +              /*
+ +               * If no bouncing needed for ZONE_DMA, reduce the swiotlb
+ +               * buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
+ +               */
+ +              unsigned long size =
+ +                      DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
+ +              swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
                 swiotlb = true;
+ +      }
   
         swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
   
diff --combined arch/loongarch/kernel/setup.c

index aed65915e932e2963913ac2b4ada44102eed9e32,4de32b07c0dcdb917c473cc96eaf3aa88182647d..b35186f7b2547afd85a9e4224b966fb6ed00fdc8
--- 1/arch/loongarch/kernel/setup.c
--- 2/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@@ -161,19 -161,19 +161,19 @@@ static void __init smbios_parse(void
   }
   
   #ifdef CONFIG_ARCH_WRITECOMBINE
- -pgprot_t pgprot_wc = PAGE_KERNEL_WUC;
+ +bool wc_enabled = true;
   #else
- -pgprot_t pgprot_wc = PAGE_KERNEL_SUC;
+ +bool wc_enabled = false;
   #endif
   
- -EXPORT_SYMBOL(pgprot_wc);
+ +EXPORT_SYMBOL(wc_enabled);
   
   static int __init setup_writecombine(char *p)
   {
         if (!strcmp(p, "on"))
- -              pgprot_wc = PAGE_KERNEL_WUC;
+ +              wc_enabled = true;
         else if (!strcmp(p, "off"))
- -              pgprot_wc = PAGE_KERNEL_SUC;
+ +              wc_enabled = false;
         else
                 pr_warn("Unknown writecombine setting \"%s\".\n", p);
   
@@@ -267,7 -267,9 +267,9 @@@ static void __init arch_parse_crashkern
         unsigned long long crash_base, crash_size;
   
         total_mem = memblock_phys_mem_size();
-       ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+       ret = parse_crashkernel(boot_command_line, total_mem,
+                               &crash_size, &crash_base,
+                               NULL, NULL);
         if (ret < 0 || crash_size <= 0)
                 return;
   
diff --combined arch/riscv/Kconfig

index 9c48fecc671918ed7c77eda92f333e03dbf9f4e3,25474f8c12b79b70a00769f622a266da754b3208..eaa15a20e6ae1537d14efceeafc3b62bd4273fc5
--- 1/arch/riscv/Kconfig
--- 2/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@@ -273,9 -273,11 +273,9 @@@ config RISCV_DMA_NONCOHEREN
         select ARCH_HAS_SYNC_DMA_FOR_CPU
         select ARCH_HAS_SYNC_DMA_FOR_DEVICE
         select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB
- -      select DMA_DIRECT_REMAP if MMU
   
   config RISCV_NONSTANDARD_CACHE_OPS
         bool
- -      depends on RISCV_DMA_NONCOHERENT
         help
           This enables function pointer support for non-standard noncoherent
           systems to handle cache management.
@@@ -548,7 -550,6 +548,7 @@@ config RISCV_ISA_ZICBO
         depends on RISCV_ALTERNATIVE
         default y
         select RISCV_DMA_NONCOHERENT
+ +      select DMA_DIRECT_REMAP
         help
            Adds support to dynamically detect the presence of the ZICBOM
            extension (Cache Block Management Operations) and enable its
@@@ -693,6 -694,9 +693,9 @@@ config ARCH_SUPPORTS_KEXEC_PURGATOR
   config ARCH_SUPPORTS_CRASH_DUMP
         def_bool y
   
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+       def_bool CRASH_CORE
+ 
   config COMPAT
         bool "Kernel support for 32-bit U-mode"
         default 64BIT
diff --combined arch/x86/Kconfig

index 433f5e1906d1a155d341d12718b9fa97c5964897,36b2f12f31c3fdc6ae4e779a6c4f5240d6cb5306..6a917f62eff2068e83900577df2cde1fadbb6e92
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -28,6 -28,7 +28,6 @@@ config X86_6
         select ARCH_HAS_GIGANTIC_PAGE
         select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
         select ARCH_SUPPORTS_PER_VMA_LOCK
- -      select ARCH_USE_CMPXCHG_LOCKREF
         select HAVE_ARCH_SOFT_DIRTY
         select MODULES_USE_ELF_RELA
         select NEED_DMA_MAP_STATE
@@@ -117,7 -118,6 +117,7 @@@ config X8
         select ARCH_SUPPORTS_LTO_CLANG
         select ARCH_SUPPORTS_LTO_CLANG_THIN
         select ARCH_USE_BUILTIN_BSWAP
+ +      select ARCH_USE_CMPXCHG_LOCKREF         if X86_CMPXCHG64
         select ARCH_USE_MEMTEST
         select ARCH_USE_QUEUED_RWLOCKS
         select ARCH_USE_QUEUED_SPINLOCKS
@@@ -1534,7 -1534,6 +1534,7 @@@ config NUM
         depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
         default y if X86_BIGSMP
         select USE_PERCPU_NUMA_NODE_ID
+ +      select OF_NUMA if OF
         help
           Enable NUMA (Non-Uniform Memory Access) support.
   
@@@ -1940,18 -1939,6 +1940,18 @@@ config X86_USER_SHADOW_STAC
   
           If unsure, say N.
   
+ +config INTEL_TDX_HOST
+ +      bool "Intel Trust Domain Extensions (TDX) host support"
+ +      depends on CPU_SUP_INTEL
+ +      depends on X86_64
+ +      depends on KVM_INTEL
+ +      help
+ +        Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
+ +        host and certain physical attacks.  This option enables necessary TDX
+ +        support in the host kernel to run confidential VMs.
+ +
+ +        If unsure, say N.
+ +
   config EFI
         bool "EFI runtime service support"
         depends on ACPI
@@@ -2075,6 -2062,9 +2075,9 @@@ config ARCH_SUPPORTS_CRASH_DUM
   config ARCH_SUPPORTS_CRASH_HOTPLUG
         def_bool y
   
+ config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+       def_bool CRASH_CORE
+ 
   config PHYSICAL_START
         hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
         default "0x1000000"
@@@ -2967,15 -2957,6 +2970,15 @@@ config IA32_EMULATIO
           64-bit kernel. You should likely turn this on, unless you're
           100% sure that you don't have any 32-bit programs left.
   
+ +config IA32_EMULATION_DEFAULT_DISABLED
+ +      bool "IA32 emulation disabled by default"
+ +      default n
+ +      depends on IA32_EMULATION
+ +      help
+ +        Make IA32 emulation disabled by default. This prevents loading 32-bit
+ +        processes and access to 32-bit syscalls. If unsure, leave it to its
+ +        default value.
+ +
   config X86_X32_ABI
         bool "x32 ABI for 64-bit mode"
         depends on X86_64
diff --combined arch/x86/kernel/setup.c

index ccd3ad29a1dcfa1bb28fb54bbb6a6acdce726eb6,25a3f9a100f6815f59417e1b3635d9415a0f4238..163c35db3d04393ac904dee4eec1b789b8ecac3a
--- 1/arch/x86/kernel/setup.c
--- 2/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@@ -466,154 -466,29 +466,29 @@@ static void __init memblock_x86_reserve
         }
   }
   
- /*
-  * --------- Crashkernel reservation ------------------------------
-  */
- 
- /* 16M alignment for crash kernel regions */
- #define CRASH_ALIGN           SZ_16M
- 
- /*
-  * Keep the crash kernel below this limit.
-  *
-  * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
-  * due to mapping restrictions.
-  *
-  * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
-  * the upper limit of system RAM in 4-level paging mode. Since the kdump
-  * jump could be from 5-level paging to 4-level paging, the jump will fail if
-  * the kernel is put above 64 TB, and during the 1st kernel bootup there's
-  * no good way to detect the paging mode of the target kernel which will be
-  * loaded for dumping.
-  */
- #ifdef CONFIG_X86_32
- # define CRASH_ADDR_LOW_MAX   SZ_512M
- # define CRASH_ADDR_HIGH_MAX  SZ_512M
- #else
- # define CRASH_ADDR_LOW_MAX   SZ_4G
- # define CRASH_ADDR_HIGH_MAX  SZ_64T
- #endif
- 
- static int __init reserve_crashkernel_low(void)
+ static void __init arch_reserve_crashkernel(void)
   {
- #ifdef CONFIG_X86_64
-       unsigned long long base, low_base = 0, low_size = 0;
-       unsigned long low_mem_limit;
-       int ret;
- 
-       low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
- 
-       /* crashkernel=Y,low */
-       ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base);
-       if (ret) {
-               /*
-                * two parts from kernel/dma/swiotlb.c:
-                * -swiotlb size: user-specified with swiotlb= or default.
-                *
-                * -swiotlb overflow buffer: now hardcoded to 32k. We round it
-                * to 8M for other buffers that may need to stay low too. Also
-                * make sure we allocate enough extra low memory so that we
-                * don't run out of DMA buffers for 32-bit devices.
-                */
-               low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
-       } else {
-               /* passed with crashkernel=0,low ? */
-               if (!low_size)
-                       return 0;
-       }
- 
-       low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX);
-       if (!low_base) {
-               pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
-                      (unsigned long)(low_size >> 20));
-               return -ENOMEM;
-       }
- 
-       pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n",
-               (unsigned long)(low_size >> 20),
-               (unsigned long)(low_base >> 20),
-               (unsigned long)(low_mem_limit >> 20));
- 
-       crashk_low_res.start = low_base;
-       crashk_low_res.end   = low_base + low_size - 1;
-       insert_resource(&iomem_resource, &crashk_low_res);
- #endif
-       return 0;
- }
- 
- static void __init reserve_crashkernel(void)
- {
-       unsigned long long crash_size, crash_base, total_mem;
+       unsigned long long crash_base, crash_size, low_size = 0;
+       char *cmdline = boot_command_line;
         bool high = false;
         int ret;
   
         if (!IS_ENABLED(CONFIG_KEXEC_CORE))
                 return;
   
-       total_mem = memblock_phys_mem_size();
- 
-       /* crashkernel=XM */
-       ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
-       if (ret != 0 || crash_size <= 0) {
-               /* crashkernel=X,high */
-               ret = parse_crashkernel_high(boot_command_line, total_mem,
-                                            &crash_size, &crash_base);
-               if (ret != 0 || crash_size <= 0)
-                       return;
-               high = true;
-       }
+       ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+                               &crash_size, &crash_base,
+                               &low_size, &high);
+       if (ret)
+               return;
   
         if (xen_pv_domain()) {
                 pr_info("Ignoring crashkernel for a Xen PV domain\n");
                 return;
         }
   
-       /* 0 means: find the address automatically */
-       if (!crash_base) {
-               /*
-                * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
-                * crashkernel=x,high reserves memory over 4G, also allocates
-                * 256M extra low memory for DMA buffers and swiotlb.
-                * But the extra memory is not required for all machines.
-                * So try low memory first and fall back to high memory
-                * unless "crashkernel=size[KMG],high" is specified.
-                */
-               if (!high)
-                       crash_base = memblock_phys_alloc_range(crash_size,
-                                               CRASH_ALIGN, CRASH_ALIGN,
-                                               CRASH_ADDR_LOW_MAX);
-               if (!crash_base)
-                       crash_base = memblock_phys_alloc_range(crash_size,
-                                               CRASH_ALIGN, CRASH_ALIGN,
-                                               CRASH_ADDR_HIGH_MAX);
-               if (!crash_base) {
-                       pr_info("crashkernel reservation failed - No suitable area found.\n");
-                       return;
-               }
-       } else {
-               unsigned long long start;
- 
-               start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
-                                                 crash_base + crash_size);
-               if (start != crash_base) {
-                       pr_info("crashkernel reservation failed - memory is in use.\n");
-                       return;
-               }
-       }
- 
-       if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
-               memblock_phys_free(crash_base, crash_size);
-               return;
-       }
- 
-       pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
-               (unsigned long)(crash_size >> 20),
-               (unsigned long)(crash_base >> 20),
-               (unsigned long)(total_mem >> 20));
- 
-       crashk_res.start = crash_base;
-       crashk_res.end   = crash_base + crash_size - 1;
-       insert_resource(&iomem_resource, &crashk_res);
+       reserve_crashkernel_generic(cmdline, crash_size, crash_base,
+                                   low_size, high);
   }
   
   static struct resource standard_io_resources[] = {
@@@ -1120,7 -995,7 +995,7 @@@ void __init setup_arch(char **cmdline_p
          * Needs to run after memblock setup because it needs the physical
          * memory size.
          */
- -      sev_setup_arch();
+ +      mem_encrypt_setup_arch();
   
         efi_fake_memmap();
         efi_find_mirror();
@@@ -1217,8 -1092,6 +1092,8 @@@
   
         early_acpi_boot_init();
   
+ +      x86_flattree_get_config();
+ +
         initmem_init();
         dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
   
@@@ -1229,7 -1102,7 +1104,7 @@@
          * Reserve memory for crash kernel after SRAT is parsed so that it
          * won't consume hotpluggable memory.
          */
-       reserve_crashkernel();
+       arch_reserve_crashkernel();
   
         memblock_find_dma_reserve();
   
diff --combined block/bdev.c

index 2018d250e1310913952b272481e299f188ea34f3,aea9143d890889e2878c29c08108aaa6caf6d4f1..e4cfb7adb64581d0630d61dc0f5a287245da7d38
--- 1/block/bdev.c
--- 2/block/bdev.c
+++ b/block/bdev.c
@@@ -292,7 -292,7 +292,7 @@@ EXPORT_SYMBOL(thaw_bdev)
    */
   
   static  __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
- static struct kmem_cache * bdev_cachep __read_mostly;
+ static struct kmem_cache *bdev_cachep __ro_after_init;
   
   static struct inode *bdev_alloc_inode(struct super_block *sb)
   {
@@@ -361,13 -361,13 +361,13 @@@ static struct file_system_type bd_type 
         .kill_sb        = kill_anon_super,
   };
   
- struct super_block *blockdev_superblock __read_mostly;
+ struct super_block *blockdev_superblock __ro_after_init;
   EXPORT_SYMBOL_GPL(blockdev_superblock);
   
   void __init bdev_cache_init(void)
   {
         int err;
-       static struct vfsmount *bd_mnt;
+       static struct vfsmount *bd_mnt __ro_after_init;
   
         bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
                         0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@@ -829,28 -829,6 +829,28 @@@ put_blkdev
   }
   EXPORT_SYMBOL(blkdev_get_by_dev);
   
+ +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ +                                   const struct blk_holder_ops *hops)
+ +{
+ +      struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
+ +      struct block_device *bdev;
+ +
+ +      if (!handle)
+ +              return ERR_PTR(-ENOMEM);
+ +      bdev = blkdev_get_by_dev(dev, mode, holder, hops);
+ +      if (IS_ERR(bdev)) {
+ +              kfree(handle);
+ +              return ERR_CAST(bdev);
+ +      }
+ +      handle->bdev = bdev;
+ +      handle->holder = holder;
+ +      if (holder)
+ +              mode |= BLK_OPEN_EXCL;
+ +      handle->mode = mode;
+ +      return handle;
+ +}
+ +EXPORT_SYMBOL(bdev_open_by_dev);
+ +
   /**
    * blkdev_get_by_path - open a block device by name
    * @path: path to the block device to open
@@@ -889,28 -867,6 +889,28 @@@ struct block_device *blkdev_get_by_path
   }
   EXPORT_SYMBOL(blkdev_get_by_path);
   
+ +struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+ +              void *holder, const struct blk_holder_ops *hops)
+ +{
+ +      struct bdev_handle *handle;
+ +      dev_t dev;
+ +      int error;
+ +
+ +      error = lookup_bdev(path, &dev);
+ +      if (error)
+ +              return ERR_PTR(error);
+ +
+ +      handle = bdev_open_by_dev(dev, mode, holder, hops);
+ +      if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+ +          bdev_read_only(handle->bdev)) {
+ +              bdev_release(handle);
+ +              return ERR_PTR(-EACCES);
+ +      }
+ +
+ +      return handle;
+ +}
+ +EXPORT_SYMBOL(bdev_open_by_path);
+ +
   void blkdev_put(struct block_device *bdev, void *holder)
   {
         struct gendisk *disk = bdev->bd_disk;
@@@ -947,13 -903,6 +947,13 @@@
   }
   EXPORT_SYMBOL(blkdev_put);
   
+ +void bdev_release(struct bdev_handle *handle)
+ +{
+ +      blkdev_put(handle->bdev, handle->holder);
+ +      kfree(handle);
+ +}
+ +EXPORT_SYMBOL(bdev_release);
+ +
   /**
    * lookup_bdev() - Look up a struct block_device by name.
    * @pathname: Name of the block device in the filesystem.
@@@ -1012,20 -961,20 +1012,20 @@@ void bdev_mark_dead(struct block_devic
         mutex_lock(&bdev->bd_holder_lock);
         if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
                 bdev->bd_holder_ops->mark_dead(bdev, surprise);
- -      else
+ +      else {
+ +              mutex_unlock(&bdev->bd_holder_lock);
                 sync_blockdev(bdev);
- -      mutex_unlock(&bdev->bd_holder_lock);
+ +      }
   
         invalidate_bdev(bdev);
   }
- -#ifdef CONFIG_DASD_MODULE
   /*
- - * Drivers should not use this directly, but the DASD driver has historically
- - * had a shutdown to offline mode that doesn't actually remove the gendisk
- - * that otherwise looks a lot like a safe device removal.
+ + * New drivers should not use this directly.  There are some drivers however
+ + * that needs this for historical reasons. For example, the DASD driver has
+ + * historically had a shutdown to offline mode that doesn't actually remove the
+ + * gendisk that otherwise looks a lot like a safe device removal.
    */
   EXPORT_SYMBOL_GPL(bdev_mark_dead);
- -#endif
   
   void sync_bdevs(bool wait)
   {
diff --combined drivers/accel/ivpu/ivpu_job.c

index 689dc0d13b8fadece4cbc0e56cbc21c4e2cc3509,76f468c9f761bfedfcb5cc5f17825e7b5ce4d0db..8983e3a4fdf91a73893adfd0d1d526852dbe02f2
--- 1/drivers/accel/ivpu/ivpu_job.c
--- 2/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@@ -48,10 -48,10 +48,10 @@@ static struct ivpu_cmdq *ivpu_cmdq_allo
                 goto cmdq_free;
   
         cmdq->db_id = file_priv->ctx.id + engine * ivpu_get_context_count(vdev);
- -      cmdq->entry_count = (u32)((cmdq->mem->base.size - sizeof(struct vpu_job_queue_header)) /
+ +      cmdq->entry_count = (u32)((ivpu_bo_size(cmdq->mem) - sizeof(struct vpu_job_queue_header)) /
                                   sizeof(struct vpu_job_queue_entry));
   
- -      cmdq->jobq = (struct vpu_job_queue *)cmdq->mem->kvaddr;
+ +      cmdq->jobq = (struct vpu_job_queue *)ivpu_bo_vaddr(cmdq->mem);
         jobq_header = &cmdq->jobq->header;
         jobq_header->engine_idx = engine;
         jobq_header->head = 0;
@@@ -93,7 -93,7 +93,7 @@@ static struct ivpu_cmdq *ivpu_cmdq_acqu
                 return cmdq;
   
         ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id,
- -                                 cmdq->mem->vpu_addr, cmdq->mem->base.size);
+ +                                 cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem));
         if (ret)
                 return NULL;
   
@@@ -453,7 -453,7 +453,7 @@@ ivpu_job_prepare_bos_for_submit(struct 
                 return -EBUSY;
         }
   
- -      if (commands_offset >= bo->base.size) {
+ +      if (commands_offset >= ivpu_bo_size(bo)) {
                 ivpu_warn(vdev, "Invalid command buffer offset %u\n", commands_offset);
                 return -EINVAL;
         }
@@@ -618,6 -618,5 +618,5 @@@ int ivpu_job_done_thread_init(struct iv
   
   void ivpu_job_done_thread_fini(struct ivpu_device *vdev)
   {
-       kthread_stop(vdev->job_done_thread);
-       put_task_struct(vdev->job_done_thread);
+       kthread_stop_put(vdev->job_done_thread);
   }
diff --combined drivers/gpu/drm/i915/gt/selftest_migrate.c

index 1a34cbe04fb64692832a3fe0474f1dc19ff311de,0fb07f073baa61f0fb68138b3cf429b4ff4b0ee4..3eff364ccf3ac7a1a0ea9afd54c744298b8c5b89
--- 1/drivers/gpu/drm/i915/gt/selftest_migrate.c
--- 2/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@@ -710,7 -710,7 +710,7 @@@ static int threaded_migrate(struct inte
                 thread[i].tsk = tsk;
         }
   
- -      msleep(10); /* start all threads before we kthread_stop() */
+ +      msleep(10 * n_cpus); /* start all threads before we kthread_stop() */
   
         for (i = 0; i < n_cpus; ++i) {
                 struct task_struct *tsk = thread[i].tsk;
@@@ -719,11 -719,9 +719,9 @@@
                 if (IS_ERR_OR_NULL(tsk))
                         continue;
   
-               status = kthread_stop(tsk);
+               status = kthread_stop_put(tsk);
                 if (status && !err)
                         err = status;
- 
-               put_task_struct(tsk);
         }
   
         kfree(thread);
diff --combined drivers/net/xen-netback/interface.c

index db304f178136cdcf8e63ec35cd124388da045255,33c8143619f002ae9a42c63cd150fde3471f529e..7cff90aa8d24c280cdd36afad282fb08877f8d86
--- 1/drivers/net/xen-netback/interface.c
--- 2/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@@ -41,6 -41,8 +41,6 @@@
   #include <asm/xen/hypercall.h>
   #include <xen/balloon.h>
   
- -#define XENVIF_QUEUE_LENGTH 32
- -
   /* Number of bytes allowed on the internal guest Rx queue. */
   #define XENVIF_RX_QUEUE_BYTES (XEN_NETIF_RX_RING_SIZE/2 * PAGE_SIZE)
   
@@@ -252,9 -254,6 +252,9 @@@ xenvif_start_xmit(struct sk_buff *skb, 
         if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE)
                 skb_clear_hash(skb);
   
+ +      /* timestamp packet in software */
+ +      skb_tx_timestamp(skb);
+ +
         if (!xenvif_rx_queue_tail(queue, skb))
                 goto drop;
   
@@@ -461,7 -460,7 +461,7 @@@ static void xenvif_get_strings(struct n
   
   static const struct ethtool_ops xenvif_ethtool_ops = {
         .get_link       = ethtool_op_get_link,
- -
+ +      .get_ts_info    = ethtool_op_get_ts_info,
         .get_sset_count = xenvif_get_sset_count,
         .get_ethtool_stats = xenvif_get_ethtool_stats,
         .get_strings = xenvif_get_strings,
@@@ -531,6 -530,8 +531,6 @@@ struct xenvif *xenvif_alloc(struct devi
         dev->features = dev->hw_features | NETIF_F_RXCSUM;
         dev->ethtool_ops = &xenvif_ethtool_ops;
   
- -      dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
- -
         dev->min_mtu = ETH_MIN_MTU;
         dev->max_mtu = ETH_MAX_MTU - VLAN_ETH_HLEN;
   
@@@ -671,8 -672,7 +671,7 @@@ err
   static void xenvif_disconnect_queue(struct xenvif_queue *queue)
   {
         if (queue->task) {
-               kthread_stop(queue->task);
-               put_task_struct(queue->task);
+               kthread_stop_put(queue->task);
                 queue->task = NULL;
         }
   
diff --combined drivers/scsi/hisi_sas/hisi_sas_v3_hw.c

index ccc5acb39f5a6af954bd452913e4a4dcdb49df12,5bb35c3ea4e521ce7b7799cef6ec4f677be40c2d..d8437a98037b96874c0d7a5544ba457a5384e83f
--- 1/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
--- 2/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@@ -558,7 -558,8 +558,7 @@@ static int experimental_iopoll_q_cnt
   module_param(experimental_iopoll_q_cnt, int, 0444);
   MODULE_PARM_DESC(experimental_iopoll_q_cnt, "number of queues to be used as poll mode, def=0");
   
- -static void debugfs_work_handler_v3_hw(struct work_struct *work);
- -static void debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba);
+ +static int debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba);
   
   static u32 hisi_sas_read32(struct hisi_hba *hisi_hba, u32 off)
   {
@@@ -3387,6 -3388,7 +3387,6 @@@ hisi_sas_shost_alloc_pci(struct pci_de
         hisi_hba = shost_priv(shost);
   
         INIT_WORK(&hisi_hba->rst_work, hisi_sas_rst_work_handler);
- -      INIT_WORK(&hisi_hba->debugfs_work, debugfs_work_handler_v3_hw);
         hisi_hba->hw = &hisi_sas_v3_hw;
         hisi_hba->pci_dev = pdev;
         hisi_hba->dev = dev;
@@@ -3858,6 -3860,37 +3858,6 @@@ static void debugfs_create_files_v3_hw(
                             &debugfs_ras_v3_hw_fops);
   }
   
- -static void debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba)
- -{
- -      int debugfs_dump_index = hisi_hba->debugfs_dump_index;
- -      struct device *dev = hisi_hba->dev;
- -      u64 timestamp = local_clock();
- -
- -      if (debugfs_dump_index >= hisi_sas_debugfs_dump_count) {
- -              dev_warn(dev, "dump count exceeded!\n");
- -              return;
- -      }
- -
- -      do_div(timestamp, NSEC_PER_MSEC);
- -      hisi_hba->debugfs_timestamp[debugfs_dump_index] = timestamp;
- -
- -      debugfs_snapshot_prepare_v3_hw(hisi_hba);
- -
- -      debugfs_snapshot_global_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_port_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_axi_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_ras_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_cq_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_dq_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_itct_reg_v3_hw(hisi_hba);
- -      debugfs_snapshot_iost_reg_v3_hw(hisi_hba);
- -
- -      debugfs_create_files_v3_hw(hisi_hba);
- -
- -      debugfs_snapshot_restore_v3_hw(hisi_hba);
- -      hisi_hba->debugfs_dump_index++;
- -}
- -
   static ssize_t debugfs_trigger_dump_v3_hw_write(struct file *file,
                                                 const char __user *user_buf,
                                                 size_t count, loff_t *ppos)
@@@ -3865,6 -3898,9 +3865,6 @@@
         struct hisi_hba *hisi_hba = file->f_inode->i_private;
         char buf[8];
   
- -      if (hisi_hba->debugfs_dump_index >= hisi_sas_debugfs_dump_count)
- -              return -EFAULT;
- -
         if (count > 8)
                 return -EFAULT;
   
@@@ -3874,12 -3910,7 +3874,12 @@@
         if (buf[0] != '1')
                 return -EFAULT;
   
- -      queue_work(hisi_hba->wq, &hisi_hba->debugfs_work);
+ +      down(&hisi_hba->sem);
+ +      if (debugfs_snapshot_regs_v3_hw(hisi_hba)) {
+ +              up(&hisi_hba->sem);
+ +              return -EFAULT;
+ +      }
+ +      up(&hisi_hba->sem);
   
         return count;
   }
@@@ -3959,22 -3990,7 +3959,7 @@@ static ssize_t debugfs_bist_linkrate_v3
   
         return count;
   }
- 
- static int debugfs_bist_linkrate_v3_hw_open(struct inode *inode,
-                                           struct file *filp)
- {
-       return single_open(filp, debugfs_bist_linkrate_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_bist_linkrate_v3_hw_fops = {
-       .open = debugfs_bist_linkrate_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_linkrate_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_linkrate_v3_hw);
   
   static const struct {
         int             value;
@@@ -4049,22 -4065,7 +4034,7 @@@ static ssize_t debugfs_bist_code_mode_v
   
         return count;
   }
- 
- static int debugfs_bist_code_mode_v3_hw_open(struct inode *inode,
-                                            struct file *filp)
- {
-       return single_open(filp, debugfs_bist_code_mode_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_bist_code_mode_v3_hw_fops = {
-       .open = debugfs_bist_code_mode_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_code_mode_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_code_mode_v3_hw);
   
   static ssize_t debugfs_bist_phy_v3_hw_write(struct file *filp,
                                             const char __user *buf,
@@@ -4098,22 -4099,7 +4068,7 @@@ static int debugfs_bist_phy_v3_hw_show(
   
         return 0;
   }
- 
- static int debugfs_bist_phy_v3_hw_open(struct inode *inode,
-                                      struct file *filp)
- {
-       return single_open(filp, debugfs_bist_phy_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_bist_phy_v3_hw_fops = {
-       .open = debugfs_bist_phy_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_phy_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_phy_v3_hw);
   
   static ssize_t debugfs_bist_cnt_v3_hw_write(struct file *filp,
                                         const char __user *buf,
@@@ -4146,22 -4132,7 +4101,7 @@@ static int debugfs_bist_cnt_v3_hw_show(
   
         return 0;
   }
- 
- static int debugfs_bist_cnt_v3_hw_open(struct inode *inode,
-                                         struct file *filp)
- {
-       return single_open(filp, debugfs_bist_cnt_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_bist_cnt_v3_hw_ops = {
-       .open = debugfs_bist_cnt_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_cnt_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_cnt_v3_hw);
   
   static const struct {
         int             value;
@@@ -4225,22 -4196,7 +4165,7 @@@ static ssize_t debugfs_bist_mode_v3_hw_
   
         return count;
   }
- 
- static int debugfs_bist_mode_v3_hw_open(struct inode *inode,
-                                       struct file *filp)
- {
-       return single_open(filp, debugfs_bist_mode_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_bist_mode_v3_hw_fops = {
-       .open = debugfs_bist_mode_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_mode_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_mode_v3_hw);
   
   static ssize_t debugfs_bist_enable_v3_hw_write(struct file *filp,
                                                const char __user *buf,
@@@ -4278,22 -4234,7 +4203,7 @@@ static int debugfs_bist_enable_v3_hw_sh
   
         return 0;
   }
- 
- static int debugfs_bist_enable_v3_hw_open(struct inode *inode,
-                                         struct file *filp)
- {
-       return single_open(filp, debugfs_bist_enable_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_bist_enable_v3_hw_fops = {
-       .open = debugfs_bist_enable_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_bist_enable_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_bist_enable_v3_hw);
   
   static const struct {
         char *name;
@@@ -4331,21 -4272,7 +4241,7 @@@ static int debugfs_v3_hw_show(struct se
   
         return 0;
   }
- 
- static int debugfs_v3_hw_open(struct inode *inode, struct file *filp)
- {
-       return single_open(filp, debugfs_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_v3_hw_fops = {
-       .open = debugfs_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_v3_hw);
   
   static ssize_t debugfs_phy_down_cnt_v3_hw_write(struct file *filp,
                                                 const char __user *buf,
@@@ -4376,22 -4303,7 +4272,7 @@@ static int debugfs_phy_down_cnt_v3_hw_s
   
         return 0;
   }
- 
- static int debugfs_phy_down_cnt_v3_hw_open(struct inode *inode,
-                                          struct file *filp)
- {
-       return single_open(filp, debugfs_phy_down_cnt_v3_hw_show,
-                          inode->i_private);
- }
- 
- static const struct file_operations debugfs_phy_down_cnt_v3_hw_fops = {
-       .open = debugfs_phy_down_cnt_v3_hw_open,
-       .read = seq_read,
-       .write = debugfs_phy_down_cnt_v3_hw_write,
-       .llseek = seq_lseek,
-       .release = single_release,
-       .owner = THIS_MODULE,
- };
+ DEFINE_SHOW_STORE_ATTRIBUTE(debugfs_phy_down_cnt_v3_hw);
   
   enum fifo_dump_mode_v3_hw {
         FIFO_DUMP_FORVER =              (1U << 0),
@@@ -4630,6 -4542,14 +4511,6 @@@ static void debugfs_fifo_init_v3_hw(str
         }
   }
   
- -static void debugfs_work_handler_v3_hw(struct work_struct *work)
- -{
- -      struct hisi_hba *hisi_hba =
- -              container_of(work, struct hisi_hba, debugfs_work);
- -
- -      debugfs_snapshot_regs_v3_hw(hisi_hba);
- -}
- -
   static void debugfs_release_v3_hw(struct hisi_hba *hisi_hba, int dump_index)
   {
         struct device *dev = hisi_hba->dev;
@@@ -4664,7 -4584,7 +4545,7 @@@ static int debugfs_alloc_v3_hw(struct h
   {
         const struct hisi_sas_hw *hw = hisi_hba->hw;
         struct device *dev = hisi_hba->dev;
- -      int p, c, d, r, i;
+ +      int p, c, d, r;
         size_t sz;
   
         for (r = 0; r < DEBUGFS_REGS_NUM; r++) {
@@@ -4744,48 -4664,11 +4625,48 @@@
   
         return 0;
   fail:
- -      for (i = 0; i < hisi_sas_debugfs_dump_count; i++)
- -              debugfs_release_v3_hw(hisi_hba, i);
+ +      debugfs_release_v3_hw(hisi_hba, dump_index);
         return -ENOMEM;
   }
   
+ +static int debugfs_snapshot_regs_v3_hw(struct hisi_hba *hisi_hba)
+ +{
+ +      int debugfs_dump_index = hisi_hba->debugfs_dump_index;
+ +      struct device *dev = hisi_hba->dev;
+ +      u64 timestamp = local_clock();
+ +
+ +      if (debugfs_dump_index >= hisi_sas_debugfs_dump_count) {
+ +              dev_warn(dev, "dump count exceeded!\n");
+ +              return -EINVAL;
+ +      }
+ +
+ +      if (debugfs_alloc_v3_hw(hisi_hba, debugfs_dump_index)) {
+ +              dev_warn(dev, "failed to alloc memory\n");
+ +              return -ENOMEM;
+ +      }
+ +
+ +      do_div(timestamp, NSEC_PER_MSEC);
+ +      hisi_hba->debugfs_timestamp[debugfs_dump_index] = timestamp;
+ +
+ +      debugfs_snapshot_prepare_v3_hw(hisi_hba);
+ +
+ +      debugfs_snapshot_global_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_port_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_axi_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_ras_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_cq_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_dq_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_itct_reg_v3_hw(hisi_hba);
+ +      debugfs_snapshot_iost_reg_v3_hw(hisi_hba);
+ +
+ +      debugfs_create_files_v3_hw(hisi_hba);
+ +
+ +      debugfs_snapshot_restore_v3_hw(hisi_hba);
+ +      hisi_hba->debugfs_dump_index++;
+ +
+ +      return 0;
+ +}
+ +
   static void debugfs_phy_down_cnt_init_v3_hw(struct hisi_hba *hisi_hba)
   {
         struct dentry *dir = debugfs_create_dir("phy_down_cnt",
@@@ -4830,7 -4713,7 +4711,7 @@@ static void debugfs_bist_init_v3_hw(str
                             hisi_hba, &debugfs_bist_phy_v3_hw_fops);
   
         debugfs_create_file("cnt", 0600, hisi_hba->debugfs_bist_dentry,
-                           hisi_hba, &debugfs_bist_cnt_v3_hw_ops);
+                           hisi_hba, &debugfs_bist_cnt_v3_hw_fops);
   
         debugfs_create_file("loopback_mode", 0600,
                             hisi_hba->debugfs_bist_dentry,
@@@ -4863,15 -4746,10 +4744,15 @@@
         hisi_hba->debugfs_bist_linkrate = SAS_LINK_RATE_1_5_GBPS;
   }
   
+ +static void debugfs_exit_v3_hw(struct hisi_hba *hisi_hba)
+ +{
+ +      debugfs_remove_recursive(hisi_hba->debugfs_dir);
+ +      hisi_hba->debugfs_dir = NULL;
+ +}
+ +
   static void debugfs_init_v3_hw(struct hisi_hba *hisi_hba)
   {
         struct device *dev = hisi_hba->dev;
- -      int i;
   
         hisi_hba->debugfs_dir = debugfs_create_dir(dev_name(dev),
                                                    hisi_sas_debugfs_dir);
@@@ -4888,6 -4766,19 +4769,6 @@@
   
         debugfs_phy_down_cnt_init_v3_hw(hisi_hba);
         debugfs_fifo_init_v3_hw(hisi_hba);
- -
- -      for (i = 0; i < hisi_sas_debugfs_dump_count; i++) {
- -              if (debugfs_alloc_v3_hw(hisi_hba, i)) {
- -                      debugfs_remove_recursive(hisi_hba->debugfs_dir);
- -                      dev_dbg(dev, "failed to init debugfs!\n");
- -                      break;
- -              }
- -      }
- -}
- -
- -static void debugfs_exit_v3_hw(struct hisi_hba *hisi_hba)
- -{
- -      debugfs_remove_recursive(hisi_hba->debugfs_dir);
   }
   
   static int
diff --combined fs/buffer.c

index 657a62bab73d753016d235b08f8b6b5e8397ff11,a19fef583116d1fb3a2de262bc53474ee8ccd64e..967f34b70aa8f73aa56beec71a93a08b20546698
--- 1/fs/buffer.c
--- 2/fs/buffer.c
+++ b/fs/buffer.c
@@@ -282,7 -282,13 +282,7 @@@ static void end_buffer_async_read(struc
         } while (tmp != bh);
         spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
   
- -      /*
- -       * If all of the buffers are uptodate then we can set the page
- -       * uptodate.
- -       */
- -      if (folio_uptodate)
- -              folio_mark_uptodate(folio);
- -      folio_unlock(folio);
+ +      folio_end_read(folio, folio_uptodate);
         return;
   
   still_busy:
@@@ -909,12 -915,16 +909,12 @@@ int remove_inode_buffers(struct inode *
    * which may not fail from ordinary buffer allocations.
    */
   struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
- -                                      bool retry)
+ +                                      gfp_t gfp)
   {
         struct buffer_head *bh, *head;
- -      gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
         long offset;
         struct mem_cgroup *memcg, *old_memcg;
   
- -      if (retry)
- -              gfp |= __GFP_NOFAIL;
- -
         /* The folio lock pins the memcg */
         memcg = folio_memcg(folio);
         old_memcg = set_active_memcg(memcg);
@@@ -957,11 -967,7 +957,11 @@@ EXPORT_SYMBOL_GPL(folio_alloc_buffers)
   struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                                        bool retry)
   {
- -      return folio_alloc_buffers(page_folio(page), size, retry);
+ +      gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
+ +      if (retry)
+ +              gfp |= __GFP_NOFAIL;
+ +
+ +      return folio_alloc_buffers(page_folio(page), size, gfp);
   }
   EXPORT_SYMBOL_GPL(alloc_page_buffers);
   
@@@ -1037,11 -1043,20 +1037,11 @@@ grow_dev_page(struct block_device *bdev
         struct buffer_head *bh;
         sector_t end_block;
         int ret = 0;
- -      gfp_t gfp_mask;
- -
- -      gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
- -
- -      /*
- -       * XXX: __getblk_slow() can not really deal with failure and
- -       * will endlessly loop on improvised global reclaim.  Prefer
- -       * looping in the allocator rather than here, at least that
- -       * code knows what it's doing.
- -       */
- -      gfp_mask |= __GFP_NOFAIL;
   
         folio = __filemap_get_folio(inode->i_mapping, index,
- -                      FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask);
+ +                      FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
+ +      if (IS_ERR(folio))
+ +              return PTR_ERR(folio);
   
         bh = folio_buffers(folio);
         if (bh) {
@@@ -1054,10 -1069,7 +1054,10 @@@
                         goto failed;
         }
   
- -      bh = folio_alloc_buffers(folio, size, true);
+ +      ret = -ENOMEM;
+ +      bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
+ +      if (!bh)
+ +              goto failed;
   
         /*
          * Link the folio to the buffers and initialise them.  Take the
@@@ -1408,36 -1420,33 +1408,36 @@@ __find_get_block(struct block_device *b
   }
   EXPORT_SYMBOL(__find_get_block);
   
- -/*
- - * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
- - * which corresponds to the passed block_device, block and size. The
- - * returned buffer has its reference count incremented.
+ +/**
+ + * bdev_getblk - Get a buffer_head in a block device's buffer cache.
+ + * @bdev: The block device.
+ + * @block: The block number.
+ + * @size: The size of buffer_heads for this @bdev.
+ + * @gfp: The memory allocation flags to use.
    *
- - * __getblk_gfp() will lock up the machine if grow_dev_page's
- - * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
+ + * Return: The buffer head, or NULL if memory could not be allocated.
    */
- -struct buffer_head *
- -__getblk_gfp(struct block_device *bdev, sector_t block,
- -           unsigned size, gfp_t gfp)
+ +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+ +              unsigned size, gfp_t gfp)
   {
         struct buffer_head *bh = __find_get_block(bdev, block, size);
   
- -      might_sleep();
- -      if (bh == NULL)
- -              bh = __getblk_slow(bdev, block, size, gfp);
- -      return bh;
+ +      might_alloc(gfp);
+ +      if (bh)
+ +              return bh;
+ +
+ +      return __getblk_slow(bdev, block, size, gfp);
   }
- -EXPORT_SYMBOL(__getblk_gfp);
+ +EXPORT_SYMBOL(bdev_getblk);
   
   /*
    * Do async read-ahead on a buffer..
    */
   void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
   {
- -      struct buffer_head *bh = __getblk(bdev, block, size);
+ +      struct buffer_head *bh = bdev_getblk(bdev, block, size,
+ +                      GFP_NOWAIT | __GFP_MOVABLE);
+ +
         if (likely(bh)) {
                 bh_readahead(bh, REQ_RAHEAD);
                 brelse(bh);
@@@ -1461,17 -1470,7 +1461,17 @@@ struct buffer_head 
   __bread_gfp(struct block_device *bdev, sector_t block,
                    unsigned size, gfp_t gfp)
   {
- -      struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ +      struct buffer_head *bh;
+ +
+ +      gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+ +
+ +      /*
+ +       * Prefer looping in the allocator rather than here, at least that
+ +       * code knows what it's doing.
+ +       */
+ +      gfp |= __GFP_NOFAIL;
+ +
+ +      bh = bdev_getblk(bdev, block, size, gfp);
   
         if (likely(bh) && !buffer_uptodate(bh))
                 bh = __bread_slow(bh);
@@@ -1641,13 -1640,12 +1641,13 @@@ EXPORT_SYMBOL(block_invalidate_folio)
    * block_dirty_folio() via private_lock.  try_to_free_buffers
    * is already excluded via the folio lock.
    */
- -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
- -                              unsigned long b_state)
+ +struct buffer_head *create_empty_buffers(struct folio *folio,
+ +              unsigned long blocksize, unsigned long b_state)
   {
         struct buffer_head *bh, *head, *tail;
+ +      gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
   
- -      head = folio_alloc_buffers(folio, blocksize, true);
+ +      head = folio_alloc_buffers(folio, blocksize, gfp);
         bh = head;
         do {
                 bh->b_state |= b_state;
@@@ -1669,8 -1667,13 +1669,8 @@@
         }
         folio_attach_private(folio, head);
         spin_unlock(&folio->mapping->private_lock);
- -}
- -EXPORT_SYMBOL(folio_create_empty_buffers);
   
- -void create_empty_buffers(struct page *page,
- -                      unsigned long blocksize, unsigned long b_state)
- -{
- -      folio_create_empty_buffers(page_folio(page), blocksize, b_state);
+ +      return head;
   }
   EXPORT_SYMBOL(create_empty_buffers);
   
@@@ -1765,15 -1768,13 +1765,15 @@@ static struct buffer_head *folio_create
                                                 struct inode *inode,
                                                 unsigned int b_state)
   {
+ +      struct buffer_head *bh;
+ +
         BUG_ON(!folio_test_locked(folio));
   
- -      if (!folio_buffers(folio))
- -              folio_create_empty_buffers(folio,
- -                                         1 << READ_ONCE(inode->i_blkbits),
- -                                         b_state);
- -      return folio_buffers(folio);
+ +      bh = folio_buffers(folio);
+ +      if (!bh)
+ +              bh = create_empty_buffers(folio,
+ +                              1 << READ_ONCE(inode->i_blkbits), b_state);
+ +      return bh;
   }
   
   /*
@@@ -2424,10 -2425,12 +2424,10 @@@ int block_read_full_folio(struct folio 
   
         if (!nr) {
                 /*
- -               * All buffers are uptodate - we can set the folio uptodate
- -               * as well. But not if get_block() returned an error.
+ +               * All buffers are uptodate or get_block() returned an
+ +               * error when trying to map them - we can finish the read.
                  */
- -              if (!page_error)
- -                      folio_mark_uptodate(folio);
- -              folio_unlock(folio);
+ +              folio_end_read(folio, !page_error);
                 return 0;
         }
   
@@@ -2673,8 -2676,10 +2673,8 @@@ int block_truncate_page(struct address_
                 return PTR_ERR(folio);
   
         bh = folio_buffers(folio);
- -      if (!bh) {
- -              folio_create_empty_buffers(folio, blocksize, 0);
- -              bh = folio_buffers(folio);
- -      }
+ +      if (!bh)
+ +              bh = create_empty_buffers(folio, blocksize, 0);
   
         /* Find the buffer that contains "offset" */
         offset = offset_in_folio(folio, from);
@@@ -2983,13 -2988,13 +2983,13 @@@ EXPORT_SYMBOL(try_to_free_buffers)
   /*
    * Buffer-head allocation
    */
- static struct kmem_cache *bh_cachep __read_mostly;
+ static struct kmem_cache *bh_cachep __ro_after_init;
   
   /*
    * Once the number of bh's in the machine exceeds this level, we start
    * stripping them in writeback.
    */
- static unsigned long max_buffer_heads;
+ static unsigned long max_buffer_heads __ro_after_init;
   
   int buffer_heads_over_limit;
   
diff --combined fs/char_dev.c

index 6ba032442b39d37e0d3901b6f7219237e8758c40,3d52f3d3ae77548b8aa908e02bd619403d4c8cf9..57cc096c498a29fc3bc2278a142a494ffbe5f3fd
--- 1/fs/char_dev.c
--- 2/fs/char_dev.c
+++ b/fs/char_dev.c
@@@ -25,7 -25,7 +25,7 @@@
   
   #include "internal.h"
   
- static struct kobj_map *cdev_map;
+ static struct kobj_map *cdev_map __ro_after_init;
   
   static DEFINE_MUTEX(chrdevs_lock);
   
@@@ -350,7 -350,7 +350,7 @@@ static struct kobject *cdev_get(struct 
         struct module *owner = p->owner;
         struct kobject *kobj;
   
- -      if (owner && !try_module_get(owner))
+ +      if (!try_module_get(owner))
                 return NULL;
         kobj = kobject_get_unless_zero(&p->kobj);
         if (!kobj)
diff --combined fs/dcache.c

index 796e23761ba0505748faf160011f49736ac26522,0650ccdaa3357f21ff468ca87af3bc46a86e5f7c..c82ae731df9af780e58db42b45ba198375be2819
--- 1/fs/dcache.c
--- 2/fs/dcache.c
+++ b/fs/dcache.c
@@@ -78,7 -78,7 +78,7 @@@ __cacheline_aligned_in_smp DEFINE_SEQLO
   
   EXPORT_SYMBOL(rename_lock);
   
- static struct kmem_cache *dentry_cache __read_mostly;
+ static struct kmem_cache *dentry_cache __ro_after_init;
   
   const struct qstr empty_name = QSTR_INIT("", 0);
   EXPORT_SYMBOL(empty_name);
@@@ -96,9 -96,9 +96,9 @@@ EXPORT_SYMBOL(dotdot_name)
    * information, yet avoid using a prime hash-size or similar.
    */
   
- static unsigned int d_hash_shift __read_mostly;
+ static unsigned int d_hash_shift __ro_after_init;
   
- static struct hlist_bl_head *dentry_hashtable __read_mostly;
+ static struct hlist_bl_head *dentry_hashtable __ro_after_init;
   
   static inline struct hlist_bl_head *d_hash(unsigned int hash)
   {
@@@ -3246,10 -3246,11 +3246,10 @@@ void d_genocide(struct dentry *parent
         d_walk(parent, parent, d_genocide_kill);
   }
   
- -void d_tmpfile(struct file *file, struct inode *inode)
+ +void d_mark_tmpfile(struct file *file, struct inode *inode)
   {
         struct dentry *dentry = file->f_path.dentry;
   
- -      inode_dec_link_count(inode);
         BUG_ON(dentry->d_name.name != dentry->d_iname ||
                 !hlist_unhashed(&dentry->d_u.d_alias) ||
                 !d_unlinked(dentry));
@@@ -3259,15 -3260,6 +3259,15 @@@
                                 (unsigned long long)inode->i_ino);
         spin_unlock(&dentry->d_lock);
         spin_unlock(&dentry->d_parent->d_lock);
+ +}
+ +EXPORT_SYMBOL(d_mark_tmpfile);
+ +
+ +void d_tmpfile(struct file *file, struct inode *inode)
+ +{
+ +      struct dentry *dentry = file->f_path.dentry;
+ +
+ +      inode_dec_link_count(inode);
+ +      d_mark_tmpfile(file, inode);
         d_instantiate(dentry, inode);
   }
   EXPORT_SYMBOL(d_tmpfile);
@@@ -3332,7 -3324,7 +3332,7 @@@ static void __init dcache_init(void
   }
   
   /* SLAB cache for __getname() consumers */
- struct kmem_cache *names_cachep __read_mostly;
+ struct kmem_cache *names_cachep __ro_after_init;
   EXPORT_SYMBOL(names_cachep);
   
   void __init vfs_caches_init_early(void)
diff --combined fs/file_table.c

index fa92743ba6a91691f3621e0d8d050460c24649ef,687d33865035ffbc39491d7bb1e23b0506c612a0..de4a2915bfd4941281915be92ce7dbdd8f67ac3b
--- 1/fs/file_table.c
--- 2/fs/file_table.c
+++ b/fs/file_table.c
@@@ -40,14 -40,14 +40,14 @@@ static struct files_stat_struct files_s
   };
   
   /* SLAB cache for file structures */
- static struct kmem_cache *filp_cachep __read_mostly;
+ static struct kmem_cache *filp_cachep __ro_after_init;
   
   static struct percpu_counter nr_files __cacheline_aligned_in_smp;
   
- -/* Container for backing file with optional real path */
+ +/* Container for backing file with optional user path */
   struct backing_file {
         struct file file;
- -      struct path real_path;
+ +      struct path user_path;
   };
   
   static inline struct backing_file *backing_file(struct file *f)
@@@ -55,36 -55,31 +55,36 @@@
         return container_of(f, struct backing_file, file);
   }
   
- -struct path *backing_file_real_path(struct file *f)
+ +struct path *backing_file_user_path(struct file *f)
   {
- -      return &backing_file(f)->real_path;
+ +      return &backing_file(f)->user_path;
   }
- -EXPORT_SYMBOL_GPL(backing_file_real_path);
+ +EXPORT_SYMBOL_GPL(backing_file_user_path);
   
- -static void file_free_rcu(struct rcu_head *head)
+ +static inline void file_free(struct file *f)
   {
- -      struct file *f = container_of(head, struct file, f_rcuhead);
- -
+ +      security_file_free(f);
+ +      if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ +              percpu_counter_dec(&nr_files);
         put_cred(f->f_cred);
- -      if (unlikely(f->f_mode & FMODE_BACKING))
+ +      if (unlikely(f->f_mode & FMODE_BACKING)) {
+ +              path_put(backing_file_user_path(f));
                 kfree(backing_file(f));
- -      else
+ +      } else {
                 kmem_cache_free(filp_cachep, f);
+ +      }
   }
   
- -static inline void file_free(struct file *f)
+ +void release_empty_file(struct file *f)
   {
- -      security_file_free(f);
- -      if (unlikely(f->f_mode & FMODE_BACKING))
- -              path_put(backing_file_real_path(f));
- -      if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- -              percpu_counter_dec(&nr_files);
- -      call_rcu(&f->f_rcuhead, file_free_rcu);
+ +      WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+ +      if (atomic_long_dec_and_test(&f->f_count)) {
+ +              security_file_free(f);
+ +              put_cred(f->f_cred);
+ +              if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+ +                      percpu_counter_dec(&nr_files);
+ +              kmem_cache_free(filp_cachep, f);
+ +      }
   }
   
   /*
@@@ -169,6 -164,7 +169,6 @@@ static int init_file(struct file *f, in
                 return error;
         }
   
- -      atomic_long_set(&f->f_count, 1);
         rwlock_init(&f->f_owner.lock);
         spin_lock_init(&f->f_lock);
         mutex_init(&f->f_pos_lock);
@@@ -176,12 -172,6 +176,12 @@@
         f->f_mode = OPEN_FMODE(flags);
         /* f->f_version: 0 */
   
+ +      /*
+ +       * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+ +       * fget-rcu pattern users need to be able to handle spurious
+ +       * refcount bumps we should reinitialize the reused file first.
+ +       */
+ +      atomic_long_set(&f->f_count, 1);
         return 0;
   }
   
@@@ -481,8 -471,7 +481,8 @@@ EXPORT_SYMBOL(__fput_sync)
   void __init files_init(void)
   {
         filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- -                      SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ +                              SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+ +                              SLAB_PANIC | SLAB_ACCOUNT, NULL);
         percpu_counter_init(&nr_files, 0, GFP_KERNEL);
   }
   
diff --combined fs/inode.c

index 4f8984b97df082fff002e27d795b78dacce882e2,2b1d473a363154fe3ba39fa104af6911242c2ca7..edcd8a61975f34c7a4cf467589848870430b3b8a
--- 1/fs/inode.c
--- 2/fs/inode.c
+++ b/fs/inode.c
@@@ -54,9 -54,9 +54,9 @@@
    *   inode_hash_lock
    */
   
- static unsigned int i_hash_mask __read_mostly;
- static unsigned int i_hash_shift __read_mostly;
- static struct hlist_head *inode_hashtable __read_mostly;
+ static unsigned int i_hash_mask __ro_after_init;
+ static unsigned int i_hash_shift __ro_after_init;
+ static struct hlist_head *inode_hashtable __ro_after_init;
   static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
   
   /*
@@@ -70,7 -70,7 +70,7 @@@ EXPORT_SYMBOL(empty_aops)
   static DEFINE_PER_CPU(unsigned long, nr_inodes);
   static DEFINE_PER_CPU(unsigned long, nr_unused);
   
- static struct kmem_cache *inode_cachep __read_mostly;
+ static struct kmem_cache *inode_cachep __ro_after_init;
   
   static long get_nr_inodes(void)
   {
@@@ -1837,29 -1837,27 +1837,29 @@@ EXPORT_SYMBOL(bmap)
   static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                              struct timespec64 now)
   {
- -      struct timespec64 ctime;
+ +      struct timespec64 atime, mtime, ctime;
   
         if (!(mnt->mnt_flags & MNT_RELATIME))
                 return 1;
         /*
          * Is mtime younger than or equal to atime? If yes, update atime:
          */
- -      if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+ +      atime = inode_get_atime(inode);
+ +      mtime = inode_get_mtime(inode);
+ +      if (timespec64_compare(&mtime, &atime) >= 0)
                 return 1;
         /*
          * Is ctime younger than or equal to atime? If yes, update atime:
          */
         ctime = inode_get_ctime(inode);
- -      if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
+ +      if (timespec64_compare(&ctime, &atime) >= 0)
                 return 1;
   
         /*
          * Is the previous atime value older than a day? If yes,
          * update atime:
          */
- -      if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+ +      if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
                 return 1;
         /*
          * Good, we can skip the atime update:
@@@ -1890,13 -1888,12 +1890,13 @@@ int inode_update_timestamps(struct inod
   
         if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
                 struct timespec64 ctime = inode_get_ctime(inode);
+ +              struct timespec64 mtime = inode_get_mtime(inode);
   
                 now = inode_set_ctime_current(inode);
                 if (!timespec64_equal(&now, &ctime))
                         updated |= S_CTIME;
- -              if (!timespec64_equal(&now, &inode->i_mtime)) {
- -                      inode->i_mtime = now;
+ +              if (!timespec64_equal(&now, &mtime)) {
+ +                      inode_set_mtime_to_ts(inode, now);
                         updated |= S_MTIME;
                 }
                 if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
@@@ -1906,10 -1903,8 +1906,10 @@@
         }
   
         if (flags & S_ATIME) {
- -              if (!timespec64_equal(&now, &inode->i_atime)) {
- -                      inode->i_atime = now;
+ +              struct timespec64 atime = inode_get_atime(inode);
+ +
+ +              if (!timespec64_equal(&now, &atime)) {
+ +                      inode_set_atime_to_ts(inode, now);
                         updated |= S_ATIME;
                 }
         }
@@@ -1968,7 -1963,7 +1968,7 @@@ EXPORT_SYMBOL(inode_update_time)
   bool atime_needs_update(const struct path *path, struct inode *inode)
   {
         struct vfsmount *mnt = path->mnt;
- -      struct timespec64 now;
+ +      struct timespec64 now, atime;
   
         if (inode->i_flags & S_NOATIME)
                 return false;
@@@ -1994,8 -1989,7 +1994,8 @@@
         if (!relatime_need_update(mnt, inode, now))
                 return false;
   
- -      if (timespec64_equal(&inode->i_atime, &now))
+ +      atime = inode_get_atime(inode);
+ +      if (timespec64_equal(&atime, &now))
                 return false;
   
         return true;
@@@ -2012,7 -2006,7 +2012,7 @@@ void touch_atime(const struct path *pat
         if (!sb_start_write_trylock(inode->i_sb))
                 return;
   
- -      if (__mnt_want_write(mnt) != 0)
+ +      if (mnt_get_write_access(mnt) != 0)
                 goto skip_update;
         /*
          * File systems can error out when updating inodes if they need to
@@@ -2024,7 -2018,7 +2024,7 @@@
          * of the fs read only, e.g. subvolumes in Btrfs.
          */
         inode_update_time(inode, S_ATIME);
- -      __mnt_drop_write(mnt);
+ +      mnt_put_write_access(mnt);
   skip_update:
         sb_end_write(inode->i_sb);
   }
@@@ -2112,18 -2106,17 +2112,18 @@@ static int inode_needs_update_time(stru
   {
         int sync_it = 0;
         struct timespec64 now = current_time(inode);
- -      struct timespec64 ctime;
+ +      struct timespec64 ts;
   
         /* First try to exhaust all avenues to not sync */
         if (IS_NOCMTIME(inode))
                 return 0;
   
- -      if (!timespec64_equal(&inode->i_mtime, &now))
+ +      ts = inode_get_mtime(inode);
+ +      if (!timespec64_equal(&ts, &now))
                 sync_it = S_MTIME;
   
- -      ctime = inode_get_ctime(inode);
- -      if (!timespec64_equal(&ctime, &now))
+ +      ts = inode_get_ctime(inode);
+ +      if (!timespec64_equal(&ts, &now))
                 sync_it |= S_CTIME;
   
         if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@@ -2138,9 -2131,9 +2138,9 @@@ static int __file_update_time(struct fi
         struct inode *inode = file_inode(file);
   
         /* try to update time settings */
- -      if (!__mnt_want_write_file(file)) {
+ +      if (!mnt_get_write_access_file(file)) {
                 ret = inode_update_time(inode, sync_mode);
- -              __mnt_drop_write_file(file);
+ +              mnt_put_write_access_file(file);
         }
   
         return ret;
diff --combined fs/kernfs/mount.c

index 79b96e74a8a09d3ce3e73e326ca897d309ae00d4,43aea0ad95c8493ff839dbc97bcbe94bba3bb42d..4628edde2e7e1ad7a4e59b7f1117331a83d29b38
--- 1/fs/kernfs/mount.c
--- 2/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@@ -21,8 -21,9 +21,9 @@@
   
   #include "kernfs-internal.h"
   
- struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
- struct kernfs_global_locks *kernfs_locks;
+ struct kmem_cache *kernfs_node_cache __ro_after_init;
+ struct kmem_cache *kernfs_iattrs_cache __ro_after_init;
+ struct kernfs_global_locks *kernfs_locks __ro_after_init;
   
   static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
   {
@@@ -265,7 -266,7 +266,7 @@@ static int kernfs_fill_super(struct sup
         sb->s_time_gran = 1;
   
         /* sysfs dentries and inodes don't require IO to create */
- -      sb->s_shrink.seeks = 0;
+ +      sb->s_shrink->seeks = 0;
   
         /* get root inode, initialize and unlock it */
         down_read(&kf_root->kernfs_rwsem);
diff --combined fs/locks.c

index d4e49a990a8daaffebfd1f3dd22b5e2973812684,dbd2fb1f74949ac27577b0d7830155e5f2c23871..46d88b9e222cf2c50897e120248f7231821f579f
--- 1/fs/locks.c
--- 2/fs/locks.c
+++ b/fs/locks.c
@@@ -167,8 -167,8 +167,8 @@@ static DEFINE_HASHTABLE(blocked_hash, B
    */
   static DEFINE_SPINLOCK(blocked_lock_lock);
   
- static struct kmem_cache *flctx_cache __read_mostly;
- static struct kmem_cache *filelock_cache __read_mostly;
+ static struct kmem_cache *flctx_cache __ro_after_init;
+ static struct kmem_cache *filelock_cache __ro_after_init;
   
   static struct file_lock_context *
   locks_get_lock_context(struct inode *inode, int type)
@@@ -2264,13 -2264,11 +2264,13 @@@ out
    * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
    * locks, the ->lock() interface may return asynchronously, before the lock has
    * been granted or denied by the underlying filesystem, if (and only if)
- - * lm_grant is set. Callers expecting ->lock() to return asynchronously
- - * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- - * the request is for a blocking lock. When ->lock() does return asynchronously,
- - * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- - * request completes.
+ + * lm_grant is set. Additionally, EXPORT_OP_ASYNC_LOCK needs to be set in the
+ + * export_operations flags.
+ + *
+ + * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ + * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ + * blocking lock. When ->lock() does return asynchronously, it must return
+ + * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
    * If the request is for non-blocking lock the file system should return
    * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
    * with the result. If the request timed out the callback routine will return a
diff --combined fs/namespace.c

index 6bde71735efa79e03fdb07fe5938efe21d70eb79,69df848fbc177bff81e6227ad96dc7e0022acbdf..fbf0e596fcd30c9bae8d8cc1fbe09cf309f02334
--- 1/fs/namespace.c
--- 2/fs/namespace.c
+++ b/fs/namespace.c
@@@ -39,10 -39,10 +39,10 @@@
   /* Maximum number of mounts in a mount namespace */
   static unsigned int sysctl_mount_max __read_mostly = 100000;
   
- static unsigned int m_hash_mask __read_mostly;
- static unsigned int m_hash_shift __read_mostly;
- static unsigned int mp_hash_mask __read_mostly;
- static unsigned int mp_hash_shift __read_mostly;
+ static unsigned int m_hash_mask __ro_after_init;
+ static unsigned int m_hash_shift __ro_after_init;
+ static unsigned int mp_hash_mask __ro_after_init;
+ static unsigned int mp_hash_shift __ro_after_init;
   
   static __initdata unsigned long mhash_entries;
   static int __init set_mhash_entries(char *str)
@@@ -68,9 -68,9 +68,9 @@@ static u64 event
   static DEFINE_IDA(mnt_id_ida);
   static DEFINE_IDA(mnt_group_ida);
   
- static struct hlist_head *mount_hashtable __read_mostly;
- static struct hlist_head *mountpoint_hashtable __read_mostly;
- static struct kmem_cache *mnt_cache __read_mostly;
+ static struct hlist_head *mount_hashtable __ro_after_init;
+ static struct hlist_head *mountpoint_hashtable __ro_after_init;
+ static struct kmem_cache *mnt_cache __ro_after_init;
   static DECLARE_RWSEM(namespace_sem);
   static HLIST_HEAD(unmounted); /* protected by namespace_sem */
   static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
@@@ -86,7 -86,7 +86,7 @@@ struct mount_kattr 
   };
   
   /* /sys/fs */
- struct kobject *fs_kobj;
+ struct kobject *fs_kobj __ro_after_init;
   EXPORT_SYMBOL_GPL(fs_kobj);
   
   /*
@@@ -330,16 -330,16 +330,16 @@@ static int mnt_is_readonly(struct vfsmo
    * can determine when writes are able to occur to a filesystem.
    */
   /**
- - * __mnt_want_write - get write access to a mount without freeze protection
+ + * mnt_get_write_access - get write access to a mount without freeze protection
    * @m: the mount on which to take a write
    *
    * This tells the low-level filesystem that a write is about to be performed to
    * it, and makes sure that writes are allowed (mnt it read-write) before
    * returning success. This operation does not protect against filesystem being
- - * frozen. When the write operation is finished, __mnt_drop_write() must be
+ + * frozen. When the write operation is finished, mnt_put_write_access() must be
    * called. This is effectively a refcount.
    */
- -int __mnt_want_write(struct vfsmount *m)
+ +int mnt_get_write_access(struct vfsmount *m)
   {
         struct mount *mnt = real_mount(m);
         int ret = 0;
@@@ -386,7 -386,6 +386,7 @@@
   
         return ret;
   }
+ +EXPORT_SYMBOL_GPL(mnt_get_write_access);
   
   /**
    * mnt_want_write - get write access to a mount
@@@ -402,7 -401,7 +402,7 @@@ int mnt_want_write(struct vfsmount *m
         int ret;
   
         sb_start_write(m->mnt_sb);
- -      ret = __mnt_want_write(m);
+ +      ret = mnt_get_write_access(m);
         if (ret)
                 sb_end_write(m->mnt_sb);
         return ret;
@@@ -410,15 -409,15 +410,15 @@@
   EXPORT_SYMBOL_GPL(mnt_want_write);
   
   /**
- - * __mnt_want_write_file - get write access to a file's mount
+ + * mnt_get_write_access_file - get write access to a file's mount
    * @file: the file who's mount on which to take a write
    *
- - * This is like __mnt_want_write, but if the file is already open for writing it
+ + * This is like mnt_get_write_access, but if @file is already open for write it
    * skips incrementing mnt_writers (since the open file already has a reference)
    * and instead only does the check for emergency r/o remounts.  This must be
- - * paired with __mnt_drop_write_file.
+ + * paired with mnt_put_write_access_file.
    */
- -int __mnt_want_write_file(struct file *file)
+ +int mnt_get_write_access_file(struct file *file)
   {
         if (file->f_mode & FMODE_WRITER) {
                 /*
@@@ -429,7 -428,7 +429,7 @@@
                         return -EROFS;
                 return 0;
         }
- -      return __mnt_want_write(file->f_path.mnt);
+ +      return mnt_get_write_access(file->f_path.mnt);
   }
   
   /**
@@@ -446,7 -445,7 +446,7 @@@ int mnt_want_write_file(struct file *fi
         int ret;
   
         sb_start_write(file_inode(file)->i_sb);
- -      ret = __mnt_want_write_file(file);
+ +      ret = mnt_get_write_access_file(file);
         if (ret)
                 sb_end_write(file_inode(file)->i_sb);
         return ret;
@@@ -454,20 -453,19 +454,20 @@@
   EXPORT_SYMBOL_GPL(mnt_want_write_file);
   
   /**
- - * __mnt_drop_write - give up write access to a mount
+ + * mnt_put_write_access - give up write access to a mount
    * @mnt: the mount on which to give up write access
    *
    * Tells the low-level filesystem that we are done
    * performing writes to it.  Must be matched with
- - * __mnt_want_write() call above.
+ + * mnt_get_write_access() call above.
    */
- -void __mnt_drop_write(struct vfsmount *mnt)
+ +void mnt_put_write_access(struct vfsmount *mnt)
   {
         preempt_disable();
         mnt_dec_writers(real_mount(mnt));
         preempt_enable();
   }
+ +EXPORT_SYMBOL_GPL(mnt_put_write_access);
   
   /**
    * mnt_drop_write - give up write access to a mount
@@@ -479,20 -477,20 +479,20 @@@
    */
   void mnt_drop_write(struct vfsmount *mnt)
   {
- -      __mnt_drop_write(mnt);
+ +      mnt_put_write_access(mnt);
         sb_end_write(mnt->mnt_sb);
   }
   EXPORT_SYMBOL_GPL(mnt_drop_write);
   
- -void __mnt_drop_write_file(struct file *file)
+ +void mnt_put_write_access_file(struct file *file)
   {
         if (!(file->f_mode & FMODE_WRITER))
- -              __mnt_drop_write(file->f_path.mnt);
+ +              mnt_put_write_access(file->f_path.mnt);
   }
   
   void mnt_drop_write_file(struct file *file)
   {
- -      __mnt_drop_write_file(file);
+ +      mnt_put_write_access_file(file);
         sb_end_write(file_inode(file)->i_sb);
   }
   EXPORT_SYMBOL(mnt_drop_write_file);
@@@ -1346,9 -1344,9 +1346,9 @@@ void mntput(struct vfsmount *mnt
   {
         if (mnt) {
                 struct mount *m = real_mount(mnt);
- -              /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+ +              /* avoid cacheline pingpong */
                 if (unlikely(m->mnt_expiry_mark))
- -                      m->mnt_expiry_mark = 0;
+ +                      WRITE_ONCE(m->mnt_expiry_mark, 0);
                 mntput_no_expire(m);
         }
   }
diff --combined fs/notify/dnotify/dnotify.c

index 869b016014d2c2bc3882feac344b0828ce26a50b,7914d223289a9c5e5b3d235e1dd45e20cc095a4d..1cb9ad7e884e1516320b7dd7f01ba686757db159
--- 1/fs/notify/dnotify/dnotify.c
--- 2/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@@ -39,9 -39,9 +39,9 @@@ static void __init dnotify_sysctl_init(
   #define dnotify_sysctl_init() do { } while (0)
   #endif
   
- static struct kmem_cache *dnotify_struct_cache __read_mostly;
- static struct kmem_cache *dnotify_mark_cache __read_mostly;
- static struct fsnotify_group *dnotify_group __read_mostly;
+ static struct kmem_cache *dnotify_struct_cache __ro_after_init;
+ static struct kmem_cache *dnotify_mark_cache __ro_after_init;
+ static struct fsnotify_group *dnotify_group __ro_after_init;
   
   /*
    * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@@ -265,7 -265,7 +265,7 @@@ int fcntl_dirnotify(int fd, struct fil
         struct dnotify_struct *dn;
         struct inode *inode;
         fl_owner_t id = current->files;
- -      struct file *f;
+ +      struct file *f = NULL;
         int destroy = 0, error = 0;
         __u32 mask;
   
@@@ -345,7 -345,7 +345,7 @@@
         }
   
         rcu_read_lock();
- -      f = lookup_fd_rcu(fd);
+ +      f = lookup_fdget_rcu(fd);
         rcu_read_unlock();
   
         /* if (f != filp) means that we lost a race and another task/thread
@@@ -392,8 -392,6 +392,8 @@@ out_err
                 fsnotify_put_mark(new_fsn_mark);
         if (dn)
                 kmem_cache_free(dnotify_struct_cache, dn);
+ +      if (f)
+ +              fput(f);
         return error;
   }
   
diff --combined fs/notify/fanotify/fanotify_user.c

index 62fe0b679e586ccbe181000fe94ed5e2b142b203,614b435c4a8cfc09283c80811e9a9c0c83386038..45aecdc302f4d9a41fee3cc364a553f2aeb59a3f
--- 1/fs/notify/fanotify/fanotify_user.c
--- 2/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@@ -112,10 -112,10 +112,10 @@@ static void __init fanotify_sysctls_ini
   
   extern const struct fsnotify_ops fanotify_fsnotify_ops;
   
- struct kmem_cache *fanotify_mark_cache __read_mostly;
- struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
- struct kmem_cache *fanotify_path_event_cachep __read_mostly;
- struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+ struct kmem_cache *fanotify_mark_cache __ro_after_init;
+ struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
+ struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
+ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
   
   #define FANOTIFY_EVENT_ALIGN 4
   #define FANOTIFY_FID_INFO_HDR_LEN \
@@@ -1585,25 -1585,16 +1585,25 @@@ static int fanotify_test_fsid(struct de
   }
   
   /* Check if filesystem can encode a unique fid */
- -static int fanotify_test_fid(struct dentry *dentry)
+ +static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
   {
+ +      unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ +      const struct export_operations *nop = dentry->d_sb->s_export_op;
+ +
+ +      /*
+ +       * We need to make sure that the filesystem supports encoding of
+ +       * file handles so user can use name_to_handle_at() to compare fids
+ +       * reported with events to the file handle of watched objects.
+ +       */
+ +      if (!nop)
+ +              return -EOPNOTSUPP;
+ +
         /*
- -       * We need to make sure that the file system supports at least
- -       * encoding a file handle so user can use name_to_handle_at() to
- -       * compare fid returned with event to the file handle of watched
- -       * objects. However, even the relaxed AT_HANDLE_FID flag requires
- -       * at least empty export_operations for ecoding unique file ids.
+ +       * For sb/mount mark, we also need to make sure that the filesystem
+ +       * supports decoding file handles, so user has a way to map back the
+ +       * reported fids to filesystem objects.
          */
- -      if (!dentry->d_sb->s_export_op)
+ +      if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
                 return -EOPNOTSUPP;
   
         return 0;
@@@ -1821,7 -1812,7 +1821,7 @@@ static int do_fanotify_mark(int fanotif
                 if (ret)
                         goto path_put_and_out;
   
- -              ret = fanotify_test_fid(path.dentry);
+ +              ret = fanotify_test_fid(path.dentry, flags);
                 if (ret)
                         goto path_put_and_out;
   
diff --combined fs/ocfs2/alloc.c

index f0937902f7b46e48d51b6dbcd0a46dc1ae5b569a,dea3de833b4781530a64b0c9eb5ff702aa5fecca..91b32b2377acc9336cbe03a5c7be2d4cfe4e9cda
--- 1/fs/ocfs2/alloc.c
--- 2/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@@ -967,7 -967,14 +967,14 @@@ int ocfs2_num_free_extents(struct ocfs2
                 el = &eb->h_list;
         }
   
-       BUG_ON(el->l_tree_depth != 0);
+       if (el->l_tree_depth != 0) {
+               retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+                               "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n",
+                               (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+                               (unsigned long long)last_eb_blk,
+                               le16_to_cpu(el->l_tree_depth));
+               goto bail;
+       }
   
         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
   bail:
@@@ -7436,10 -7443,10 +7443,10 @@@ int ocfs2_truncate_inline(struct inode 
         }
   
         inode->i_blocks = ocfs2_inode_sector_count(inode);
- -      inode->i_mtime = inode_set_ctime_current(inode);
+ +      inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
   
- -      di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- -      di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ +      di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode));
+ +      di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
   
         ocfs2_update_inode_fsync_trans(handle, inode, 1);
         ocfs2_journal_dirty(handle, di_bh);
@@@ -7642,7 -7649,7 +7649,7 @@@ out_mutex
                 goto next_group;
         }
   out:
-       range->len = trimmed * sb->s_blocksize;
+       range->len = trimmed * osb->s_clustersize;
         return ret;
   }
   
diff --combined fs/ocfs2/dlmfs/dlmfs.c

index 9b57d012fd5cfe6c76a4d74694bbddcf2addfedf,b38776ba3306876938effbe1724a36bcad0ba55d..85215162c9dd59f25347049f2546c1a1cfd10409
--- 1/fs/ocfs2/dlmfs/dlmfs.c
--- 2/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@@ -80,8 -80,7 +80,7 @@@ static int param_set_dlmfs_capabilities
   static int param_get_dlmfs_capabilities(char *buffer,
                                         const struct kernel_param *kp)
   {
-       return strlcpy(buffer, DLMFS_CAPABILITIES,
-                      strlen(DLMFS_CAPABILITIES) + 1);
+       return sysfs_emit(buffer, DLMFS_CAPABILITIES);
   }
   module_param_call(capabilities, param_set_dlmfs_capabilities,
                   param_get_dlmfs_capabilities, NULL, 0444);
@@@ -337,7 -336,7 +336,7 @@@ static struct inode *dlmfs_get_root_ino
         if (inode) {
                 inode->i_ino = get_next_ino();
                 inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- -              inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ +              simple_inode_init_ts(inode);
                 inc_nlink(inode);
   
                 inode->i_fop = &simple_dir_operations;
@@@ -360,7 -359,7 +359,7 @@@ static struct inode *dlmfs_get_inode(st
   
         inode->i_ino = get_next_ino();
         inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- -      inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ +      simple_inode_init_ts(inode);
   
         ip = DLMFS_I(inode);
         ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --combined fs/ocfs2/namei.c

index 681e9501cdd35a8a08ce37aec13a6f7e27325cfa,836c4279a979b8e20625b2529b0bd4368c0f688f..814733ba2f4ba0fba52b985301725fb323efda13
--- 1/fs/ocfs2/namei.c
--- 2/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@@ -795,8 -795,8 +795,8 @@@ static int ocfs2_link(struct dentry *ol
         inc_nlink(inode);
         inode_set_ctime_current(inode);
         ocfs2_set_links_count(fe, inode->i_nlink);
- -      fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
- -      fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
+ +      fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
+ +      fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
         ocfs2_journal_dirty(handle, fe_bh);
   
         err = ocfs2_add_entry(handle, dentry, inode,
@@@ -995,7 -995,7 +995,7 @@@ static int ocfs2_unlink(struct inode *d
         ocfs2_set_links_count(fe, inode->i_nlink);
         ocfs2_journal_dirty(handle, fe_bh);
   
- -      dir->i_mtime = inode_set_ctime_current(dir);
+ +      inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
         if (S_ISDIR(inode->i_mode))
                 drop_nlink(dir);
   
@@@ -1550,8 -1550,8 +1550,8 @@@ static int ocfs2_rename(struct mnt_idma
         if (status >= 0) {
                 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
   
- -              old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
- -              old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
+ +              old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode));
+ +              old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode));
                 ocfs2_journal_dirty(handle, old_inode_bh);
         } else
                 mlog_errno(status);
@@@ -1592,11 -1592,15 +1592,15 @@@
                 drop_nlink(new_inode);
                 inode_set_ctime_current(new_inode);
         }
- -      old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ +      inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
   
         if (update_dot_dot) {
                 status = ocfs2_update_entry(old_inode, handle,
                                             &old_inode_dot_dot_res, new_dir);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
                 drop_nlink(old_dir);
                 if (new_inode) {
                         drop_nlink(new_inode);
@@@ -1614,8 -1618,8 +1618,8 @@@
   
         if (old_dir != new_dir) {
                 /* Keep the same times on both directories.*/
- -              new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
- -                                                       inode_get_ctime(old_dir));
+ +              inode_set_mtime_to_ts(new_dir,
+ +                                    inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir)));
   
                 /*
                  * This will also pick up the i_nlink change from the
@@@ -1636,6 -1640,10 +1640,10 @@@
                                                          INODE_CACHE(old_dir),
                                                          old_dir_bh,
                                                          OCFS2_JOURNAL_ACCESS_WRITE);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               goto bail;
+                       }
                         fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
                         ocfs2_set_links_count(fe, old_dir->i_nlink);
                         ocfs2_journal_dirty(handle, old_dir_bh);
diff --combined fs/pipe.c

index 8916c455a469c18e0b56125dac5f553330938fe7,6b279abf01296525c2f7a177da2be3112ca913ee..804a7d78945217efd3b5394a8a8e6e7d605a937a
--- 1/fs/pipe.c
--- 2/fs/pipe.c
+++ b/fs/pipe.c
@@@ -227,36 -227,6 +227,36 @@@ static inline bool pipe_readable(const 
         return !pipe_empty(head, tail) || !writers;
   }
   
+ +static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+ +                                          struct pipe_buffer *buf,
+ +                                          unsigned int tail)
+ +{
+ +      pipe_buf_release(pipe, buf);
+ +
+ +      /*
+ +       * If the pipe has a watch_queue, we need additional protection
+ +       * by the spinlock because notifications get posted with only
+ +       * this spinlock, no mutex
+ +       */
+ +      if (pipe_has_watch_queue(pipe)) {
+ +              spin_lock_irq(&pipe->rd_wait.lock);
+ +#ifdef CONFIG_WATCH_QUEUE
+ +              if (buf->flags & PIPE_BUF_FLAG_LOSS)
+ +                      pipe->note_loss = true;
+ +#endif
+ +              pipe->tail = ++tail;
+ +              spin_unlock_irq(&pipe->rd_wait.lock);
+ +              return tail;
+ +      }
+ +
+ +      /*
+ +       * Without a watch_queue, we can simply increment the tail
+ +       * without the spinlock - the mutex is enough.
+ +       */
+ +      pipe->tail = ++tail;
+ +      return tail;
+ +}
+ +
   static ssize_t
   pipe_read(struct kiocb *iocb, struct iov_iter *to)
   {
@@@ -350,8 -320,17 +350,8 @@@
                                 buf->len = 0;
                         }
   
- -                      if (!buf->len) {
- -                              pipe_buf_release(pipe, buf);
- -                              spin_lock_irq(&pipe->rd_wait.lock);
- -#ifdef CONFIG_WATCH_QUEUE
- -                              if (buf->flags & PIPE_BUF_FLAG_LOSS)
- -                                      pipe->note_loss = true;
- -#endif
- -                              tail++;
- -                              pipe->tail = tail;
- -                              spin_unlock_irq(&pipe->rd_wait.lock);
- -                      }
+ +                      if (!buf->len)
+ +                              tail = pipe_update_tail(pipe, buf, tail);
                         total_len -= chars;
                         if (!total_len)
                                 break;  /* common path: read succeeded */
@@@ -458,10 -437,12 +458,10 @@@ pipe_write(struct kiocb *iocb, struct i
                 goto out;
         }
   
- -#ifdef CONFIG_WATCH_QUEUE
- -      if (pipe->watch_queue) {
+ +      if (pipe_has_watch_queue(pipe)) {
                 ret = -EXDEV;
                 goto out;
         }
- -#endif
   
         /*
          * If it wasn't empty we try to merge new data into
@@@ -526,7 -507,16 +526,7 @@@
                          * it, either the reader will consume it or it'll still
                          * be there for the next write.
                          */
- -                      spin_lock_irq(&pipe->rd_wait.lock);
- -
- -                      head = pipe->head;
- -                      if (pipe_full(head, pipe->tail, pipe->max_usage)) {
- -                              spin_unlock_irq(&pipe->rd_wait.lock);
- -                              continue;
- -                      }
- -
                         pipe->head = head + 1;
- -                      spin_unlock_irq(&pipe->rd_wait.lock);
   
                         /* Insert it into the buffer array */
                         buf = &pipe->bufs[head & mask];
@@@ -864,7 -854,7 +864,7 @@@ void free_pipe_info(struct pipe_inode_i
         kfree(pipe);
   }
   
- static struct vfsmount *pipe_mnt __read_mostly;
+ static struct vfsmount *pipe_mnt __ro_after_init;
   
   /*
    * pipefs_dname() is called from d_path().
@@@ -908,7 -898,7 +908,7 @@@ static struct inode * get_pipe_inode(vo
         inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
         inode->i_uid = current_fsuid();
         inode->i_gid = current_fsgid();
- -      inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ +      simple_inode_init_ts(inode);
   
         return inode;
   
@@@ -1334,8 -1324,10 +1334,8 @@@ static long pipe_set_size(struct pipe_i
         unsigned int nr_slots, size;
         long ret = 0;
   
- -#ifdef CONFIG_WATCH_QUEUE
- -      if (pipe->watch_queue)
+ +      if (pipe_has_watch_queue(pipe))
                 return -EBUSY;
- -#endif
   
         size = round_pipe_size(arg);
         nr_slots = size >> PAGE_SHIFT;
@@@ -1387,8 -1379,10 +1387,8 @@@ struct pipe_inode_info *get_pipe_info(s
   
         if (file->f_op != &pipefifo_fops || !pipe)
                 return NULL;
- -#ifdef CONFIG_WATCH_QUEUE
- -      if (for_splice && pipe->watch_queue)
+ +      if (for_splice && pipe_has_watch_queue(pipe))
                 return NULL;
- -#endif
         return pipe;
   }
   
diff --combined fs/proc/base.c

index 83396ab149985496eb3a20092a1f43720ab7670c,b13d3e804debaea4487e0cedba091cf9ad328625..dd31e3b6bf77cc84d6e05a223e989bc3ea3bb6f7
--- 1/fs/proc/base.c
--- 2/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -1153,11 -1153,10 +1153,10 @@@ err_unlock
   static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
   {
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
         int oom_adj;
         int err;
   
-       memset(buffer, 0, sizeof(buffer));
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
         if (copy_from_user(buffer, buf, count)) {
@@@ -1213,11 -1212,10 +1212,10 @@@ static ssize_t oom_score_adj_read(struc
   static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                         size_t count, loff_t *ppos)
   {
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
         int oom_score_adj;
         int err;
   
-       memset(buffer, 0, sizeof(buffer));
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
         if (copy_from_user(buffer, buf, count)) {
@@@ -1358,13 -1356,13 +1356,13 @@@ static ssize_t proc_fault_inject_write(
                         const char __user * buf, size_t count, loff_t *ppos)
   {
         struct task_struct *task;
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
         int make_it_fail;
         int rv;
   
         if (!capable(CAP_SYS_RESOURCE))
                 return -EPERM;
-       memset(buffer, 0, sizeof(buffer));
+ 
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
         if (copy_from_user(buffer, buf, count))
@@@ -1509,11 -1507,10 +1507,10 @@@ sched_autogroup_write(struct file *file
   {
         struct inode *inode = file_inode(file);
         struct task_struct *p;
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
         int nice;
         int err;
   
-       memset(buffer, 0, sizeof(buffer));
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
         if (copy_from_user(buffer, buf, count))
@@@ -1666,10 -1663,9 +1663,9 @@@ static ssize_t comm_write(struct file *
   {
         struct inode *inode = file_inode(file);
         struct task_struct *p;
-       char buffer[TASK_COMM_LEN];
+       char buffer[TASK_COMM_LEN] = {};
         const size_t maxlen = sizeof(buffer) - 1;
   
-       memset(buffer, 0, sizeof(buffer));
         if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
                 return -EFAULT;
   
@@@ -1902,7 -1898,7 +1898,7 @@@ struct inode *proc_pid_make_inode(struc
         ei = PROC_I(inode);
         inode->i_mode = mode;
         inode->i_ino = get_next_ino();
- -      inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ +      simple_inode_init_ts(inode);
         inode->i_op = &proc_def_inode_operations;
   
         /*
@@@ -2218,7 -2214,7 +2214,7 @@@ static int map_files_get_link(struct de
         rc = -ENOENT;
         vma = find_exact_vma(mm, vm_start, vm_end);
         if (vma && vma->vm_file) {
- -              *path = vma->vm_file->f_path;
+ +              *path = *file_user_path(vma->vm_file);
                 path_get(path);
                 rc = 0;
         }
@@@ -2976,8 -2972,7 +2972,7 @@@ static const struct file_operations pro
   #ifdef CONFIG_TASK_IO_ACCOUNTING
   static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
   {
-       struct task_io_accounting acct = task->ioac;
-       unsigned long flags;
+       struct task_io_accounting acct;
         int result;
   
         result = down_read_killable(&task->signal->exec_update_lock);
@@@ -2989,15 -2984,28 +2984,28 @@@
                 goto out_unlock;
         }
   
-       if (whole && lock_task_sighand(task, &flags)) {
-               struct task_struct *t = task;
+       if (whole) {
+               struct signal_struct *sig = task->signal;
+               struct task_struct *t;
+               unsigned int seq = 1;
+               unsigned long flags;
+ 
+               rcu_read_lock();
+               do {
+                       seq++; /* 2 on the 1st/lockless path, otherwise odd */
+                       flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
   
-               task_io_accounting_add(&acct, &task->signal->ioac);
-               while_each_thread(task, t)
-                       task_io_accounting_add(&acct, &t->ioac);
+                       acct = sig->ioac;
+                       __for_each_thread(sig, t)
+                               task_io_accounting_add(&acct, &t->ioac);
   
-               unlock_task_sighand(task, &flags);
+               } while (need_seqretry(&sig->stats_lock, seq));
+               done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+               rcu_read_unlock();
+       } else {
+               acct = task->ioac;
         }
+ 
         seq_printf(m,
                    "rchar: %llu\n"
                    "wchar: %llu\n"
@@@ -3818,7 -3826,7 +3826,7 @@@ static struct task_struct *first_tid(st
         for_each_thread(task, pos) {
                 if (!nr--)
                         goto found;
-       };
+       }
   fail:
         pos = NULL;
         goto out;
@@@ -3840,10 -3848,8 +3848,8 @@@ static struct task_struct *next_tid(str
         struct task_struct *pos = NULL;
         rcu_read_lock();
         if (pid_alive(start)) {
-               pos = next_thread(start);
-               if (thread_group_leader(pos))
-                       pos = NULL;
-               else
+               pos = __next_thread(start);
+               if (pos)
                         get_task_struct(pos);
         }
         rcu_read_unlock();
diff --combined fs/proc/inode.c

index 592ed2516f47881458b275951412072ab68d3217,5933c78af6de19b67bb486fd4bdb7f2b8efe2b1c..b33e490e3fd9f88f569e3453d603041e665cf6bf
--- 1/fs/proc/inode.c
--- 2/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@@ -110,18 -110,15 +110,15 @@@ void __init proc_init_kmemcache(void
   
   void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
   {
-       struct inode *inode;
-       struct proc_inode *ei;
         struct hlist_node *node;
         struct super_block *old_sb = NULL;
   
         rcu_read_lock();
-       for (;;) {
+       while ((node = hlist_first_rcu(inodes))) {
+               struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes);
                 struct super_block *sb;
-               node = hlist_first_rcu(inodes);
-               if (!node)
-                       break;
-               ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+               struct inode *inode;
+ 
                 spin_lock(lock);
                 hlist_del_init_rcu(&ei->sibling_inodes);
                 spin_unlock(lock);
@@@ -660,7 -657,7 +657,7 @@@ struct inode *proc_get_inode(struct sup
   
         inode->i_private = de->data;
         inode->i_ino = de->low_ino;
- -      inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ +      simple_inode_init_ts(inode);
         PROC_I(inode)->pde = de;
         if (is_empty_pde(de)) {
                 make_empty_dir_inode(inode);
diff --combined fs/proc/task_mmu.c

index 4abd51053f76d92d0998a4e07f5e021a263c1026,d4d55d5bae51b016883eb6e4ff0feceb462fd620..ef2eb12906da88c6fe3a227e82598020f0badc44
--- 1/fs/proc/task_mmu.c
--- 2/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@@ -20,8 -20,6 +20,8 @@@
   #include <linux/shmem_fs.h>
   #include <linux/uaccess.h>
   #include <linux/pkeys.h>
+ +#include <linux/minmax.h>
+ +#include <linux/overflow.h>
   
   #include <asm/elf.h>
   #include <asm/tlb.h>
@@@ -298,7 -296,7 +298,7 @@@ show_map_vma(struct seq_file *m, struc
                 if (anon_name)
                         seq_printf(m, "[anon_shmem:%s]", anon_name->name);
                 else
- -                      seq_file_path(m, file, "\n");
+ +                      seq_path(m, file_user_path(file), "\n");
                 goto done;
         }
   
@@@ -851,9 -849,7 +851,7 @@@ static void __show_smap(struct seq_fil
   static int show_smap(struct seq_file *m, void *v)
   {
         struct vm_area_struct *vma = v;
-       struct mem_size_stats mss;
- 
-       memset(&mss, 0, sizeof(mss));
+       struct mem_size_stats mss = {};
   
         smap_gather_stats(vma, &mss, 0);
   
@@@ -879,7 -875,7 +877,7 @@@
   static int show_smaps_rollup(struct seq_file *m, void *v)
   {
         struct proc_maps_private *priv = m->private;
-       struct mem_size_stats mss;
+       struct mem_size_stats mss = {};
         struct mm_struct *mm = priv->mm;
         struct vm_area_struct *vma;
         unsigned long vma_start = 0, last_vma_end = 0;
@@@ -895,8 -891,6 +893,6 @@@
                 goto out_put_task;
         }
   
-       memset(&mss, 0, sizeof(mss));
- 
         ret = mmap_read_lock_killable(mm);
         if (ret)
                 goto out_put_mm;
@@@ -1248,14 -1242,13 +1244,13 @@@ static ssize_t clear_refs_write(struct 
                                 size_t count, loff_t *ppos)
   {
         struct task_struct *task;
-       char buffer[PROC_NUMBUF];
+       char buffer[PROC_NUMBUF] = {};
         struct mm_struct *mm;
         struct vm_area_struct *vma;
         enum clear_refs_types type;
         int itype;
         int rv;
   
-       memset(buffer, 0, sizeof(buffer));
         if (count > sizeof(buffer) - 1)
                 count = sizeof(buffer) - 1;
         if (copy_from_user(buffer, buf, count))
@@@ -1763,737 -1756,11 +1758,737 @@@ static int pagemap_release(struct inod
         return 0;
   }
   
+ +/*
+ + * Every PAGE_IS_* bit a scan request may name in its category masks;
+ + * pagemap_scan_get_args() rejects requests carrying any other bit.
+ + */
+ +#define PM_SCAN_CATEGORIES    (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |  \
+ +                               PAGE_IS_FILE | PAGE_IS_PRESENT |       \
+ +                               PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |    \
+ +                               PAGE_IS_HUGE)
+ +/* Every bit accepted in pm_scan_arg.flags; anything else is -EINVAL. */
+ +#define PM_SCAN_FLAGS         (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
+ +
+ +/* State shared by the pagemap_scan_* page-walk callbacks (walk->private). */
+ +struct pagemap_scan_private {
+ +      /* The scan request: masks, flags, max_pages, walk_end, ... */
+ +      struct pm_scan_arg arg;
+ +      /*
+ +       * masks_of_interest: consulted to skip the PAGE_IS_FILE lookup when
+ +       * that category is not wanted.
+ +       * cur_vma_category: VMA-level category bits, set by
+ +       * pagemap_scan_test_walk() for the VMA currently being walked.
+ +       */
+ +      unsigned long masks_of_interest, cur_vma_category;
+ +      /* Array of output regions filled by pagemap_scan_push_range(). */
+ +      struct page_region *vec_buf;
+ +      /*
+ +       * vec_buf_len: number of entries in vec_buf.
+ +       * vec_buf_index: entry currently being filled/extended.
+ +       * found_pages: pages accepted so far, bounded by arg.max_pages.
+ +       */
+ +      unsigned long vec_buf_len, vec_buf_index, found_pages;
+ +      /* User-space output vector; NULL selects the WP-only fast path. */
+ +      struct page_region __user *vec_out;
+ +};
+ +
+ +/*
+ + * Translate one PTE value into its PAGE_IS_* category bits.
+ + *
+ + * PAGE_IS_FILE is only worked out when the request's masks_of_interest
+ + * includes it. A PTE that is neither present nor a swap PTE yields 0.
+ + */
+ +static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
+ +                                         struct vm_area_struct *vma,
+ +                                         unsigned long addr, pte_t pte)
+ +{
+ +      const bool want_file = p->masks_of_interest & PAGE_IS_FILE;
+ +      unsigned long cats;
+ +
+ +      if (pte_present(pte)) {
+ +              cats = PAGE_IS_PRESENT;
+ +
+ +              if (!pte_uffd_wp(pte))
+ +                      cats |= PAGE_IS_WRITTEN;
+ +
+ +              if (want_file) {
+ +                      struct page *pg = vm_normal_page(vma, addr, pte);
+ +
+ +                      if (pg && !PageAnon(pg))
+ +                              cats |= PAGE_IS_FILE;
+ +              }
+ +
+ +              if (is_zero_pfn(pte_pfn(pte)))
+ +                      cats |= PAGE_IS_PFNZERO;
+ +
+ +              return cats;
+ +      }
+ +
+ +      if (!is_swap_pte(pte))
+ +              return 0;
+ +
+ +      cats = PAGE_IS_SWAPPED;
+ +
+ +      if (!pte_swp_uffd_wp_any(pte))
+ +              cats |= PAGE_IS_WRITTEN;
+ +
+ +      if (want_file) {
+ +              swp_entry_t entry = pte_to_swp_entry(pte);
+ +
+ +              if (is_pfn_swap_entry(entry) &&
+ +                  !PageAnon(pfn_swap_entry_to_page(entry)))
+ +                      cats |= PAGE_IS_FILE;
+ +      }
+ +
+ +      return cats;
+ +}
+ +
+ +/*
+ + * Mark the PTE at @addr as uffd write-protected.
+ + *
+ + * Three cases, chosen from the current PTE value:
+ + *  - present PTE: set the uffd-wp bit inside a
+ + *    ptep_modify_prot_start()/ptep_modify_prot_commit() pair;
+ + *  - swap PTE: set the swap-form uffd-wp bit and rewrite the entry;
+ + *  - anything else: install a PTE_MARKER_UFFD_WP marker.
+ + */
+ +static void make_uffd_wp_pte(struct vm_area_struct *vma,
+ +                           unsigned long addr, pte_t *pte)
+ +{
+ +      pte_t ptent = ptep_get(pte);
+ +
+ +      if (pte_present(ptent)) {
+ +              pte_t old_pte;
+ +
+ +              /*
+ +               * NOTE(review): the new value is built from the snapshot
+ +               * taken above, not from old_pte - confirm that is intended.
+ +               */
+ +              old_pte = ptep_modify_prot_start(vma, addr, pte);
+ +              ptent = pte_mkuffd_wp(ptent);
+ +              ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+ +      } else if (is_swap_pte(ptent)) {
+ +              ptent = pte_swp_mkuffd_wp(ptent);
+ +              set_pte_at(vma->vm_mm, addr, pte, ptent);
+ +      } else {
+ +              set_pte_at(vma->vm_mm, addr, pte,
+ +                         make_pte_marker(PTE_MARKER_UFFD_WP));
+ +      }
+ +}
+ +
+ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ +/*
+ + * Translate a huge PMD value into its PAGE_IS_* category bits.
+ + *
+ + * PAGE_IS_HUGE is always reported. PAGE_IS_FILE is only worked out when
+ + * the request's masks_of_interest includes it.
+ + */
+ +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
+ +                                        struct vm_area_struct *vma,
+ +                                        unsigned long addr, pmd_t pmd)
+ +{
+ +      const bool want_file = p->masks_of_interest & PAGE_IS_FILE;
+ +      unsigned long cats = PAGE_IS_HUGE;
+ +
+ +      if (pmd_present(pmd)) {
+ +              cats |= PAGE_IS_PRESENT;
+ +
+ +              if (!pmd_uffd_wp(pmd))
+ +                      cats |= PAGE_IS_WRITTEN;
+ +
+ +              if (want_file) {
+ +                      struct page *pg = vm_normal_page_pmd(vma, addr, pmd);
+ +
+ +                      if (pg && !PageAnon(pg))
+ +                              cats |= PAGE_IS_FILE;
+ +              }
+ +
+ +              if (is_zero_pfn(pmd_pfn(pmd)))
+ +                      cats |= PAGE_IS_PFNZERO;
+ +
+ +              return cats;
+ +      }
+ +
+ +      if (!is_swap_pmd(pmd))
+ +              return cats;
+ +
+ +      cats |= PAGE_IS_SWAPPED;
+ +
+ +      if (!pmd_swp_uffd_wp(pmd))
+ +              cats |= PAGE_IS_WRITTEN;
+ +
+ +      if (want_file) {
+ +              swp_entry_t entry = pmd_to_swp_entry(pmd);
+ +
+ +              if (is_pfn_swap_entry(entry) &&
+ +                  !PageAnon(pfn_swap_entry_to_page(entry)))
+ +                      cats |= PAGE_IS_FILE;
+ +      }
+ +
+ +      return cats;
+ +}
+ +
+ +/*
+ + * Mark the huge PMD at @addr as uffd write-protected.
+ + *
+ + * A present PMD is invalidated first and then rewritten with the uffd-wp
+ + * bit set; a migration entry gets the swap-form uffd-wp bit. Any other
+ + * PMD value is left untouched.
+ + */
+ +static void make_uffd_wp_pmd(struct vm_area_struct *vma,
+ +                           unsigned long addr, pmd_t *pmdp)
+ +{
+ +      pmd_t old, pmd = *pmdp;
+ +
+ +      if (pmd_present(pmd)) {
+ +              /* Build the new value from what the invalidation returned. */
+ +              old = pmdp_invalidate_ad(vma, addr, pmdp);
+ +              pmd = pmd_mkuffd_wp(old);
+ +              set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ +      } else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ +              pmd = pmd_swp_mkuffd_wp(pmd);
+ +              set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+ +      }
+ +}
+ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ +
+ +#ifdef CONFIG_HUGETLB_PAGE
+ +/*
+ + * Translate a HugeTLB PTE value into its PAGE_IS_* category bits.
+ + * PAGE_IS_HUGE is always reported.
+ + */
+ +static unsigned long pagemap_hugetlb_category(pte_t pte)
+ +{
+ +      unsigned long cats = PAGE_IS_HUGE;
+ +
+ +      if (pte_present(pte)) {
+ +              cats |= PAGE_IS_PRESENT;
+ +
+ +              if (!huge_pte_uffd_wp(pte))
+ +                      cats |= PAGE_IS_WRITTEN;
+ +
+ +              if (!PageAnon(pte_page(pte)))
+ +                      cats |= PAGE_IS_FILE;
+ +
+ +              if (is_zero_pfn(pte_pfn(pte)))
+ +                      cats |= PAGE_IS_PFNZERO;
+ +
+ +              return cats;
+ +      }
+ +
+ +      /*
+ +       * According to pagemap_hugetlb_range(), file-backed HugeTLB
+ +       * page cannot be swapped. So PAGE_IS_FILE is not checked for
+ +       * swapped pages.
+ +       */
+ +      if (is_swap_pte(pte)) {
+ +              cats |= PAGE_IS_SWAPPED;
+ +
+ +              if (!pte_swp_uffd_wp_any(pte))
+ +                      cats |= PAGE_IS_WRITTEN;
+ +      }
+ +
+ +      return cats;
+ +}
+ +
+ +/*
+ + * Mark the HugeTLB entry at @addr as uffd write-protected.
+ + *
+ + * @ptent is the entry value the caller already read from @ptep.
+ + * Hwpoisoned entries and existing PTE markers are left alone. Otherwise:
+ + *  - migration entry: set the swap-form uffd-wp bit;
+ + *  - any other non-none entry: commit the value with the huge uffd-wp bit;
+ + *  - none entry: install a PTE_MARKER_UFFD_WP marker.
+ + */
+ +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
+ +                                unsigned long addr, pte_t *ptep,
+ +                                pte_t ptent)
+ +{
+ +      unsigned long psize;
+ +
+ +      if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
+ +              return;
+ +
+ +      psize = huge_page_size(hstate_vma(vma));
+ +
+ +      if (is_hugetlb_entry_migration(ptent))
+ +              set_huge_pte_at(vma->vm_mm, addr, ptep,
+ +                              pte_swp_mkuffd_wp(ptent), psize);
+ +      else if (!huge_pte_none(ptent))
+ +              /*
+ +               * NOTE(review): committed without a matching
+ +               * huge_ptep_modify_prot_start() in this function - confirm
+ +               * the caller's locking makes that safe.
+ +               */
+ +              huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
+ +                                           huge_pte_mkuffd_wp(ptent));
+ +      else
+ +              set_huge_pte_at(vma->vm_mm, addr, ptep,
+ +                              make_pte_marker(PTE_MARKER_UFFD_WP), psize);
+ +}
+ +#endif /* CONFIG_HUGETLB_PAGE */
+ +
+ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+ +/*
+ + * Undo the bookkeeping a preceding pagemap_scan_output() call did for
+ + * [@addr, @end): trim (or clear) the region currently being filled and
+ + * take the pages back out of found_pages.
+ + *
+ + * pagemap_scan_output() records nothing when there is no output buffer,
+ + * so in that case there is nothing to undo - and no region to touch.
+ + */
+ +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
+ +                                     unsigned long addr, unsigned long end)
+ +{
+ +      struct page_region *cur_buf;
+ +
+ +      /* No buffer: nothing was pushed or counted, and cur_buf would be bogus. */
+ +      if (!p->vec_buf)
+ +              return;
+ +
+ +      cur_buf = &p->vec_buf[p->vec_buf_index];
+ +
+ +      if (cur_buf->start != addr)
+ +              cur_buf->end = addr;    /* range was merged: cut it back off */
+ +      else
+ +              cur_buf->start = cur_buf->end = 0;      /* range was the whole entry */
+ +
+ +      p->found_pages -= (end - addr) / PAGE_SIZE;
+ +}
+ +#endif
+ +
+ +/*
+ + * Decide whether a page with the given category bits matches the request.
+ + *
+ + * After flipping the bits named in category_inverted, every bit of
+ + * category_mask must be set and, if category_anyof_mask is non-zero, at
+ + * least one of its bits must be set too.
+ + */
+ +static bool pagemap_scan_is_interesting_page(unsigned long categories,
+ +                                           const struct pagemap_scan_private *p)
+ +{
+ +      const struct pm_scan_arg *arg = &p->arg;
+ +
+ +      categories ^= arg->category_inverted;
+ +
+ +      if ((categories & arg->category_mask) != arg->category_mask)
+ +              return false;
+ +
+ +      return !arg->category_anyof_mask ||
+ +             (categories & arg->category_anyof_mask);
+ +}
+ +
+ +/*
+ + * VMA-level pre-filter: only the PAGE_IS_WPALLOWED part of category_mask
+ + * is checked here (after applying category_inverted), so a VMA is
+ + * rejected only when the request requires that bit and it is not set.
+ + */
+ +static bool pagemap_scan_is_interesting_vma(unsigned long categories,
+ +                                          const struct pagemap_scan_private *p)
+ +{
+ +      unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED;
+ +
+ +      categories ^= p->arg.category_inverted;
+ +
+ +      return (categories & required) == required;
+ +}
+ +
+ +/*
+ + * test_walk callback: decide per VMA whether it should be scanned.
+ + *
+ + * Returns 0 to walk the VMA, 1 to skip it, or -EPERM when the caller set
+ + * PM_SCAN_CHECK_WPASYNC and the VMA is not set up for async uffd-wp.
+ + *
+ + * A VMA that does not allow uffd write-protection must never reach the
+ + * make_uffd_wp_*() helpers, so when PM_SCAN_WP_MATCHING is requested such
+ + * VMAs are skipped instead of being walked and write-protected.
+ + */
+ +static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
+ +                                struct mm_walk *walk)
+ +{
+ +      struct pagemap_scan_private *p = walk->private;
+ +      struct vm_area_struct *vma = walk->vma;
+ +      unsigned long vma_category = 0;
+ +      bool wp_allowed = userfaultfd_wp_async(vma) &&
+ +                        userfaultfd_wp_use_markers(vma);
+ +
+ +      if (!wp_allowed) {
+ +              /* User requested explicit failure over wp-async capability */
+ +              if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
+ +                      return -EPERM;
+ +              /*
+ +               * User requires wr-protect, and allows silently skipping
+ +               * unsupported vmas.
+ +               */
+ +              if (p->arg.flags & PM_SCAN_WP_MATCHING)
+ +                      return 1;
+ +              /*
+ +               * Otherwise the request involves no wr-protect at all:
+ +               * fall through to the remaining checks and allow the walk.
+ +               */
+ +      }
+ +
+ +      if (vma->vm_flags & VM_PFNMAP)
+ +              return 1;
+ +
+ +      if (wp_allowed)
+ +              vma_category |= PAGE_IS_WPALLOWED;
+ +
+ +      if (!pagemap_scan_is_interesting_vma(vma_category, p))
+ +              return 1;
+ +
+ +      /* Remembered so the per-entry callbacks can OR it into page categories. */
+ +      p->cur_vma_category = vma_category;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Record [@addr, @end) with @categories in the output region array.
+ + *
+ + * The range is merged into the entry being filled when it starts where
+ + * that entry ends and carries the same categories; otherwise it goes into
+ + * the next free entry. Returns false when the array has no entry left.
+ + */
+ +static bool pagemap_scan_push_range(unsigned long categories,
+ +                                  struct pagemap_scan_private *p,
+ +                                  unsigned long addr, unsigned long end)
+ +{
+ +      struct page_region *slot = &p->vec_buf[p->vec_buf_index];
+ +
+ +      /*
+ +       * When there is no output buffer provided at all, the sentinel values
+ +       * won't match here. There is no other way for `slot->end` to be
+ +       * non-zero other than it being non-empty.
+ +       */
+ +      if (addr == slot->end && categories == slot->categories) {
+ +              slot->end = end;
+ +              return true;
+ +      }
+ +
+ +      /* The current entry is in use: step to the next one, if there is one. */
+ +      if (slot->end) {
+ +              if (p->vec_buf_index >= p->vec_buf_len - 1)
+ +                      return false;
+ +
+ +              p->vec_buf_index++;
+ +              slot = &p->vec_buf[p->vec_buf_index];
+ +      }
+ +
+ +      slot->start = addr;
+ +      slot->end = end;
+ +      slot->categories = categories;
+ +
+ +      return true;
+ +}
+ +
+ +/*
+ + * Account the range [@addr, *@end) as found and record it for output.
+ + *
+ + * @categories is masked with arg.return_mask before being recorded.
+ + * *@end is an in/out parameter: it is pulled back when only part of the
+ + * range fits under arg.max_pages, and set to @addr when the range could
+ + * not be recorded at all.
+ + *
+ + * Returns 0 on success (also immediately, touching nothing, when there is
+ + * no vec_buf) or -ENOSPC when the range was clipped or rejected; in the
+ + * -ENOSPC case arg.walk_end is set to the resulting *@end.
+ + */
+ +static int pagemap_scan_output(unsigned long categories,
+ +                             struct pagemap_scan_private *p,
+ +                             unsigned long addr, unsigned long *end)
+ +{
+ +      unsigned long n_pages, total_pages;
+ +      int ret = 0;
+ +
+ +      if (!p->vec_buf)
+ +              return 0;
+ +
+ +      categories &= p->arg.return_mask;
+ +
+ +      n_pages = (*end - addr) / PAGE_SIZE;
+ +      /*
+ +       * NOTE(review): if check_add_overflow() reports overflow,
+ +       * total_pages holds the wrapped sum, so n_too_much below is derived
+ +       * from a wrapped value - confirm this cannot happen in practice.
+ +       */
+ +      if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
+ +          total_pages > p->arg.max_pages) {
+ +              size_t n_too_much = total_pages - p->arg.max_pages;
+ +              *end -= n_too_much * PAGE_SIZE;
+ +              n_pages -= n_too_much;
+ +              ret = -ENOSPC;
+ +      }
+ +
+ +      /* No room in the region array: report that nothing was consumed. */
+ +      if (!pagemap_scan_push_range(categories, p, addr, *end)) {
+ +              *end = addr;
+ +              n_pages = 0;
+ +              ret = -ENOSPC;
+ +      }
+ +
+ +      p->found_pages += n_pages;
+ +      if (ret)
+ +              p->arg.walk_end = *end;
+ +
+ +      return ret;
+ +}
+ +
+ +/*
+ + * Handle a PMD that maps a transparent huge page.
+ + *
+ + * Returns -ENOENT when the PMD could not be locked as a huge PMD (or THP
+ + * is not configured), or after the huge PMD was split because only part
+ + * of it may be write-protected; pagemap_scan_pmd_entry() treats -ENOENT
+ + * as "process this range PTE by PTE". Otherwise returns the result of
+ + * pagemap_scan_output(), or 0 when the page is not interesting.
+ + */
+ +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
+ +                                unsigned long end, struct mm_walk *walk)
+ +{
+ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ +      struct pagemap_scan_private *p = walk->private;
+ +      struct vm_area_struct *vma = walk->vma;
+ +      unsigned long categories;
+ +      spinlock_t *ptl;
+ +      int ret = 0;
+ +
+ +      ptl = pmd_trans_huge_lock(pmd, vma);
+ +      if (!ptl)
+ +              return -ENOENT;
+ +
+ +      categories = p->cur_vma_category |
+ +                   pagemap_thp_category(p, vma, start, *pmd);
+ +
+ +      if (!pagemap_scan_is_interesting_page(categories, p))
+ +              goto out_unlock;
+ +
+ +      /* May pull 'end' back; start == end means nothing was accepted. */
+ +      ret = pagemap_scan_output(categories, p, start, &end);
+ +      if (start == end)
+ +              goto out_unlock;
+ +
+ +      /* Write-protect only when asked to, and only pages seen as written. */
+ +      if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ +              goto out_unlock;
+ +      if (~categories & PAGE_IS_WRITTEN)
+ +              goto out_unlock;
+ +
+ +      /*
+ +       * Break huge page into small pages if the WP operation
+ +       * needs to be performed on a portion of the huge page.
+ +       */
+ +      if (end != start + HPAGE_SIZE) {
+ +              spin_unlock(ptl);
+ +              split_huge_pmd(vma, pmd, start);
+ +              /* The PTE path will re-report the range, so undo ours. */
+ +              pagemap_scan_backout_range(p, start, end);
+ +              /* Report as if there was no THP */
+ +              return -ENOENT;
+ +      }
+ +
+ +      make_uffd_wp_pmd(vma, start, pmd);
+ +      flush_tlb_range(vma, start, end);
+ +out_unlock:
+ +      spin_unlock(ptl);
+ +      return ret;
+ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+ +      return -ENOENT;
+ +#endif
+ +}
+ +
+ +/*
+ + * pmd_entry callback: scan the PTEs mapped by one PMD.
+ + *
+ + * A huge PMD is handed to pagemap_scan_thp_entry() first; only when that
+ + * returns -ENOENT is the range processed PTE by PTE, using one of three
+ + * loops:
+ + *  1. WP-only fast path: write-protection was requested and there is no
+ + *     output vector, so every PTE not yet uffd-wp is write-protected;
+ + *  2. "written pages only" fast path: the request selects and returns
+ + *     exactly PAGE_IS_WRITTEN, so full categorisation is skipped;
+ + *  3. generic path: categorise each PTE, filter, output, optionally WP.
+ + *
+ + * Returns 0, or the error from pagemap_scan_output() that stopped the
+ + * scan. The TLB is flushed for the span of PTEs that were modified.
+ + */
+ +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
+ +                                unsigned long end, struct mm_walk *walk)
+ +{
+ +      struct pagemap_scan_private *p = walk->private;
+ +      struct vm_area_struct *vma = walk->vma;
+ +      unsigned long addr, flush_end = 0;
+ +      pte_t *pte, *start_pte;
+ +      spinlock_t *ptl;
+ +      int ret;
+ +
+ +      arch_enter_lazy_mmu_mode();
+ +
+ +      ret = pagemap_scan_thp_entry(pmd, start, end, walk);
+ +      if (ret != -ENOENT) {
+ +              arch_leave_lazy_mmu_mode();
+ +              return ret;
+ +      }
+ +
+ +      ret = 0;
+ +      start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+ +      if (!pte) {
+ +              /* The page table went away under us: have the walker retry. */
+ +              arch_leave_lazy_mmu_mode();
+ +              walk->action = ACTION_AGAIN;
+ +              return 0;
+ +      }
+ +
+ +      /*
+ +       * The WP-only fast path must not be taken merely because no output
+ +       * vector was supplied: without PM_SCAN_WP_MATCHING the caller did
+ +       * not ask for any page to be write-protected.
+ +       */
+ +      if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
+ +              /* Fast path for performing exclusive WP */
+ +              for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ +                      /*
+ +                       * NOTE(review): pte_uffd_wp() is applied without a
+ +                       * pte_present() check - confirm it is meaningful
+ +                       * for swap/marker entries.
+ +                       */
+ +                      if (pte_uffd_wp(ptep_get(pte)))
+ +                              continue;
+ +                      make_uffd_wp_pte(vma, addr, pte);
+ +                      /* Remember the first modified address for the flush. */
+ +                      if (!flush_end)
+ +                              start = addr;
+ +                      flush_end = addr + PAGE_SIZE;
+ +              }
+ +              goto flush_and_return;
+ +      }
+ +
+ +      if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ +          p->arg.category_mask == PAGE_IS_WRITTEN &&
+ +          p->arg.return_mask == PAGE_IS_WRITTEN) {
+ +              for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
+ +                      unsigned long next = addr + PAGE_SIZE;
+ +
+ +                      if (pte_uffd_wp(ptep_get(pte)))
+ +                              continue;
+ +                      ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN,
+ +                                                p, addr, &next);
+ +                      /* Output is full: this page was not accepted. */
+ +                      if (next == addr)
+ +                              break;
+ +                      if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ +                              continue;
+ +                      make_uffd_wp_pte(vma, addr, pte);
+ +                      if (!flush_end)
+ +                              start = addr;
+ +                      flush_end = next;
+ +              }
+ +              goto flush_and_return;
+ +      }
+ +
+ +      for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
+ +              unsigned long categories = p->cur_vma_category |
+ +                                         pagemap_page_category(p, vma, addr, ptep_get(pte));
+ +              unsigned long next = addr + PAGE_SIZE;
+ +
+ +              if (!pagemap_scan_is_interesting_page(categories, p))
+ +                      continue;
+ +
+ +              ret = pagemap_scan_output(categories, p, addr, &next);
+ +              /* Output is full: this page was not accepted. */
+ +              if (next == addr)
+ +                      break;
+ +
+ +              if (~p->arg.flags & PM_SCAN_WP_MATCHING)
+ +                      continue;
+ +              if (~categories & PAGE_IS_WRITTEN)
+ +                      continue;
+ +
+ +              make_uffd_wp_pte(vma, addr, pte);
+ +              if (!flush_end)
+ +                      start = addr;
+ +              flush_end = next;
+ +      }
+ +
+ +flush_and_return:
+ +      /* 'start' was moved to the first modified PTE; 'addr' is where we stopped. */
+ +      if (flush_end)
+ +              flush_tlb_range(vma, start, addr);
+ +
+ +      pte_unmap_unlock(start_pte, ptl);
+ +      arch_leave_lazy_mmu_mode();
+ +
+ +      cond_resched();
+ +      return ret;
+ +}
+ +
+ +#ifdef CONFIG_HUGETLB_PAGE
+ +/*
+ + * hugetlb_entry callback: scan one HugeTLB page table entry.
+ + *
+ + * Without PM_SCAN_WP_MATCHING the entry is only read, categorised and
+ + * output, with no locks taken here. With it, the mapping's i_mmap lock
+ + * and the huge PTE lock are held while the entry is categorised, output
+ + * and - if it counts as written and the whole huge page was accepted -
+ + * write-protected.
+ + *
+ + * Returns 0 or the result of pagemap_scan_output().
+ + */
+ +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+ +                                    unsigned long start, unsigned long end,
+ +                                    struct mm_walk *walk)
+ +{
+ +      struct pagemap_scan_private *p = walk->private;
+ +      struct vm_area_struct *vma = walk->vma;
+ +      unsigned long categories;
+ +      spinlock_t *ptl;
+ +      int ret = 0;
+ +      pte_t pte;
+ +
+ +      if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
+ +              /* Go the short route when not write-protecting pages. */
+ +
+ +              pte = huge_ptep_get(ptep);
+ +              categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+ +
+ +              if (!pagemap_scan_is_interesting_page(categories, p))
+ +                      return 0;
+ +
+ +              return pagemap_scan_output(categories, p, start, &end);
+ +      }
+ +
+ +      i_mmap_lock_write(vma->vm_file->f_mapping);
+ +      ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
+ +
+ +      pte = huge_ptep_get(ptep);
+ +      categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+ +
+ +      if (!pagemap_scan_is_interesting_page(categories, p))
+ +              goto out_unlock;
+ +
+ +      /* May pull 'end' back; start == end means nothing was accepted. */
+ +      ret = pagemap_scan_output(categories, p, start, &end);
+ +      if (start == end)
+ +              goto out_unlock;
+ +
+ +      if (~categories & PAGE_IS_WRITTEN)
+ +              goto out_unlock;
+ +
+ +      if (end != start + HPAGE_SIZE) {
+ +              /* Partial HugeTLB page WP isn't possible. */
+ +              pagemap_scan_backout_range(p, start, end);
+ +              /* Report the walk as having stopped before this huge page. */
+ +              p->arg.walk_end = start;
+ +              ret = 0;
+ +              goto out_unlock;
+ +      }
+ +
+ +      make_uffd_wp_huge_pte(vma, start, ptep, pte);
+ +      flush_hugetlb_tlb_range(vma, start, end);
+ +
+ +out_unlock:
+ +      spin_unlock(ptl);
+ +      i_mmap_unlock_write(vma->vm_file->f_mapping);
+ +
+ +      return ret;
+ +}
+ +#else
+ +#define pagemap_scan_hugetlb_entry NULL
+ +#endif
+ +
+ +/*
+ + * pte_hole callback: handle a range with no page table entries.
+ + *
+ + * Inside a VMA the hole is reported with the VMA-level categories only
+ + * and, when PM_SCAN_WP_MATCHING is set, write-protected through
+ + * uffd_wp_range(). Holes outside any VMA are ignored.
+ + *
+ + * Returns the pagemap_scan_output() result, or a negative error from
+ + * uffd_wp_range().
+ + */
+ +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
+ +                               int depth, struct mm_walk *walk)
+ +{
+ +      struct pagemap_scan_private *priv = walk->private;
+ +      struct vm_area_struct *vma = walk->vma;
+ +      int out, wp_err;
+ +
+ +      if (!vma)
+ +              return 0;
+ +      if (!pagemap_scan_is_interesting_page(priv->cur_vma_category, priv))
+ +              return 0;
+ +
+ +      out = pagemap_scan_output(priv->cur_vma_category, priv, addr, &end);
+ +
+ +      /* Nothing accepted, or no write-protection requested: we are done. */
+ +      if (addr == end || (~priv->arg.flags & PM_SCAN_WP_MATCHING))
+ +              return out;
+ +
+ +      wp_err = uffd_wp_range(vma, addr, end - addr, true);
+ +
+ +      return wp_err < 0 ? wp_err : out;
+ +}
+ +
+ +/* Page-walk callbacks used by do_pagemap_scan() via walk_page_range(). */
+ +static const struct mm_walk_ops pagemap_scan_ops = {
+ +      /* per-VMA filter, run before descending into the VMA */
+ +      .test_walk = pagemap_scan_test_walk,
+ +      /* ranges without page table entries */
+ +      .pte_hole = pagemap_scan_pte_hole,
+ +      /* regular and PMD-mapped pages */
+ +      .pmd_entry = pagemap_scan_pmd_entry,
+ +      /* HugeTLB pages; NULL when the handler is compiled out */
+ +      .hugetlb_entry = pagemap_scan_hugetlb_entry,
+ +};
+ +
+ +/*
+ + * Copy the PAGEMAP_SCAN argument structure from userspace and validate it.
+ + *
+ + * @arg:  kernel copy to fill in
+ + * @uarg: userspace address of the struct pm_scan_arg
+ + *
+ + * On success the addresses in @arg are untagged, arg->end is rounded up
+ + * to a page boundary, arg->walk_end is reset to 0 and a zero
+ + * arg->max_pages is replaced by ULONG_MAX (no limit).
+ + *
+ + * Returns 0 on success, -EFAULT if userspace memory is not accessible,
+ + * -EINVAL if the structure size, flags, categories or pointers are invalid.
+ + */
+ +static int pagemap_scan_get_args(struct pm_scan_arg *arg,
+ +                               unsigned long uarg)
+ +{
+ +      if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
+ +              return -EFAULT;
+ +
+ +      if (arg->size != sizeof(struct pm_scan_arg))
+ +              return -EINVAL;
+ +
+ +      /* Validate requested features */
+ +      if (arg->flags & ~PM_SCAN_FLAGS)
+ +              return -EINVAL;
+ +      if ((arg->category_inverted | arg->category_mask |
+ +           arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
+ +              return -EINVAL;
+ +
+ +      arg->start = untagged_addr((unsigned long)arg->start);
+ +      arg->end = untagged_addr((unsigned long)arg->end);
+ +      arg->vec = untagged_addr((unsigned long)arg->vec);
+ +
+ +      /* Validate memory pointers */
+ +      if (!IS_ALIGNED(arg->start, PAGE_SIZE))
+ +              return -EINVAL;
+ +      if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start))
+ +              return -EFAULT;
+ +      if (!arg->vec && arg->vec_len)
+ +              return -EINVAL;
+ +      if (arg->vec) {
+ +              /*
+ +               * vec_len comes straight from userspace.  Bound it before
+ +               * multiplying so the byte count can neither wrap nor be
+ +               * truncated when passed to access_ok(); a vector that large
+ +               * cannot be valid user memory anyway.
+ +               */
+ +              if (arg->vec_len > ULONG_MAX / sizeof(struct page_region))
+ +                      return -EFAULT;
+ +              if (!access_ok((void __user *)(long)arg->vec,
+ +                             arg->vec_len * sizeof(struct page_region)))
+ +                      return -EFAULT;
+ +      }
+ +
+ +      /* Fixup default values */
+ +      arg->end = ALIGN(arg->end, PAGE_SIZE);
+ +      arg->walk_end = 0;
+ +      if (!arg->max_pages)
+ +              arg->max_pages = ULONG_MAX;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Report back to userspace where the walk stopped.  Only the walk_end
+ + * member of the user's struct pm_scan_arg is written.
+ + *
+ + * Returns 0 on success or -EFAULT if the copy fails.
+ + */
+ +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
+ +                                     unsigned long uargl)
+ +{
+ +      struct pm_scan_arg __user *dst = (void __user *)uargl;
+ +      unsigned long not_copied;
+ +
+ +      not_copied = copy_to_user(&dst->walk_end, &arg->walk_end,
+ +                                sizeof(arg->walk_end));
+ +
+ +      return not_copied ? -EFAULT : 0;
+ +}
+ +
+ +/*
+ + * Allocate the kernel-side staging buffer for output regions.
+ + *
+ + * No buffer is allocated when the caller supplied no output vector
+ + * (vec_len == 0).  Otherwise the buffer holds at most
+ + * PAGEMAP_WALK_SIZE >> PAGE_SHIFT entries, or vec_len entries if that
+ + * is smaller.  The first entry is initialised as an empty region and
+ + * vec_out is pointed at the start of the user vector.
+ + *
+ + * Returns 0 on success or -ENOMEM if the allocation fails.
+ + */
+ +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
+ +{
+ +      struct page_region *first;
+ +
+ +      if (p->arg.vec_len == 0)
+ +              return 0;
+ +
+ +      p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
+ +                             p->arg.vec_len);
+ +      p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
+ +                                 GFP_KERNEL);
+ +      if (p->vec_buf == NULL)
+ +              return -ENOMEM;
+ +
+ +      /* kmalloc'ed memory is not zeroed: mark the first slot empty. */
+ +      first = p->vec_buf;
+ +      first->end = 0;
+ +      first->start = 0;
+ +
+ +      p->vec_out = (struct page_region __user *)(long)p->arg.vec;
+ +
+ +      return 0;
+ +}
+ +
+ +/*
+ + * Copy the regions collected in the staging buffer to the user vector
+ + * and reset the buffer for the next walk chunk.
+ + *
+ + * The entry at vec_buf_index is counted only if it is non-empty
+ + * (start != end).  After a successful copy the remaining user capacity
+ + * (arg.vec_len) and the output cursor (vec_out) are advanced, and the
+ + * buffer length is clamped to what the user vector can still take.
+ + *
+ + * Returns the number of regions copied (0 if there is no buffer or
+ + * nothing to copy), or -EFAULT if the copy to userspace fails.
+ + */
+ +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
+ +{
+ +      const struct page_region *regions = p->vec_buf;
+ +      long count;
+ +
+ +      if (!regions)
+ +              return 0;
+ +
+ +      count = p->vec_buf_index;
+ +      if (regions[count].start != regions[count].end)
+ +              count++;
+ +
+ +      if (count == 0)
+ +              return 0;
+ +
+ +      if (copy_to_user(p->vec_out, regions, count * sizeof(*regions)))
+ +              return -EFAULT;
+ +
+ +      /* Account for what was handed over to userspace. */
+ +      p->vec_out += count;
+ +      p->arg.vec_len -= count;
+ +
+ +      /* Start over with a single empty region. */
+ +      p->vec_buf_index = 0;
+ +      p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
+ +      p->vec_buf[0].end = 0;
+ +      p->vec_buf[0].start = 0;
+ +
+ +      return count;
+ +}
+ +
+ +/*
+ + * Implementation of the PAGEMAP_SCAN ioctl.
+ + *
+ + * @mm:   address space to scan
+ + * @uarg: userspace address of the struct pm_scan_arg
+ + *
+ + * The range [arg.start, arg.end) is walked in chunks.  Each chunk is
+ + * walked under the mmap read lock; after every chunk the staged regions
+ + * are flushed to the user vector.  The walk is resumed from arg.walk_end
+ + * for as long as the page walk stops with -ENOSPC and there is still room
+ + * in the user vector and in the max_pages budget.
+ + *
+ + * Returns the number of regions written to the user vector, or a
+ + * negative error code.  arg.walk_end is always written back to
+ + * userspace; a failure to do so turns the result into -EFAULT.
+ + */
+ +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
+ +{
+ +      struct pagemap_scan_private p = {0};
+ +      unsigned long walk_start;
+ +      size_t n_ranges_out = 0;
+ +      int ret;
+ +
+ +      ret = pagemap_scan_get_args(&p.arg, uarg);
+ +      if (ret)
+ +              return ret;
+ +
+ +      p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask |
+ +                            p.arg.return_mask;
+ +      ret = pagemap_scan_init_bounce_buffer(&p);
+ +      if (ret)
+ +              return ret;
+ +
+ +      for (walk_start = p.arg.start; walk_start < p.arg.end;
+ +                      walk_start = p.arg.walk_end) {
+ +              struct mmu_notifier_range range;
+ +              long n_out;
+ +
+ +              if (fatal_signal_pending(current)) {
+ +                      ret = -EINTR;
+ +                      break;
+ +              }
+ +
+ +              ret = mmap_read_lock_killable(mm);
+ +              if (ret)
+ +                      break;
+ +
+ +              /*
+ +               * Protection change for the range is going to happen.
+ +               * Keep the notifier start/end pair inside the mmap lock,
+ +               * and limited to the part that is still to be walked, so
+ +               * that the invalidation covers exactly one locked walk.
+ +               */
+ +              if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ +                      mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ +                                              mm, walk_start, p.arg.end);
+ +                      mmu_notifier_invalidate_range_start(&range);
+ +              }
+ +
+ +              ret = walk_page_range(mm, walk_start, p.arg.end,
+ +                                    &pagemap_scan_ops, &p);
+ +
+ +              if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ +                      mmu_notifier_invalidate_range_end(&range);
+ +
+ +              mmap_read_unlock(mm);
+ +
+ +              n_out = pagemap_scan_flush_buffer(&p);
+ +              if (n_out < 0)
+ +                      ret = n_out;
+ +              else
+ +                      n_ranges_out += n_out;
+ +
+ +              /* Only an early stop (-ENOSPC) can be resumed. */
+ +              if (ret != -ENOSPC)
+ +                      break;
+ +
+ +              if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
+ +                      break;
+ +      }
+ +
+ +      /* ENOSPC signifies early stop (buffer full) from the walk. */
+ +      if (!ret || ret == -ENOSPC)
+ +              ret = n_ranges_out;
+ +
+ +      /* The walk_end isn't set when ret is zero */
+ +      if (!p.arg.walk_end)
+ +              p.arg.walk_end = p.arg.end;
+ +      if (pagemap_scan_writeback_args(&p.arg, uarg))
+ +              ret = -EFAULT;
+ +
+ +      kfree(p.vec_buf);
+ +      return ret;
+ +}
+ +
+ +/*
+ + * ioctl entry point for the pagemap file.  PAGEMAP_SCAN is the only
+ + * command handled; any other command yields -EINVAL.  The mm to operate
+ + * on is taken from file->private_data.
+ + */
+ +static long do_pagemap_cmd(struct file *file, unsigned int cmd,
+ +                         unsigned long arg)
+ +{
+ +      struct mm_struct *mm = file->private_data;
+ +
+ +      if (cmd != PAGEMAP_SCAN)
+ +              return -EINVAL;
+ +
+ +      return do_pagemap_scan(mm, arg);
+ +}
+ +
   const struct file_operations proc_pagemap_operations = {
         .llseek         = mem_lseek, /* borrow this */
         .read           = pagemap_read,
         .open           = pagemap_open,
         .release        = pagemap_release,
+ +      .unlocked_ioctl = do_pagemap_cmd,
+ +      .compat_ioctl   = do_pagemap_cmd,
   };
   #endif /* CONFIG_PROC_PAGE_MONITOR */
   
@@@ -2673,9 -1940,8 +2668,9 @@@ static int show_numa_map(struct seq_fil
         struct numa_maps *md = &numa_priv->md;
         struct file *file = vma->vm_file;
         struct mm_struct *mm = vma->vm_mm;
- -      struct mempolicy *pol;
         char buffer[64];
+ +      struct mempolicy *pol;
+ +      pgoff_t ilx;
         int nid;
   
         if (!mm)
@@@ -2684,7 -1950,7 +2679,7 @@@
         /* Ensure we start with an empty set of numa_maps statistics. */
         memset(md, 0, sizeof(*md));
   
- -      pol = __get_vma_policy(vma, vma->vm_start);
+ +      pol = __get_vma_policy(vma, vma->vm_start, &ilx);
         if (pol) {
                 mpol_to_str(buffer, sizeof(buffer), pol);
                 mpol_cond_put(pol);
@@@ -2696,7 -1962,7 +2691,7 @@@
   
         if (file) {
                 seq_puts(m, " file=");
- -              seq_file_path(m, file, "\n\t= ");
+ +              seq_path(m, file_user_path(file), "\n\t= ");
         } else if (vma_is_initial_heap(vma)) {
                 seq_puts(m, " heap");
         } else if (vma_is_initial_stack(vma)) {
diff --combined fs/userfaultfd.c

index ac616cfbacf5a8ea64e948c8340de08bfe2462c1,ed09d70027a07c8463bd7d26393a51b0deda1382..e8af40b0554956e16e9bdd0af00c33c04c424ec6
--- 1/fs/userfaultfd.c
--- 2/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@@ -49,7 -49,7 +49,7 @@@ static struct ctl_table vm_userfaultfd_
   };
   #endif
   
- static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+ static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
   
   /*
    * Start with fault_pending_wqh and fault_wqh so they're more likely
@@@ -123,11 -123,6 +123,11 @@@ static bool userfaultfd_is_initialized(
         return ctx->features & UFFD_FEATURE_INITIALIZED;
   }
   
+ +/*
+ + * Whether UFFD_FEATURE_WP_ASYNC is set on the uffd context.  A NULL
+ + * context is accepted and reported as false.
+ + */
+ +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
+ +{
+ +      if (!ctx)
+ +              return false;
+ +
+ +      return (ctx->features & UFFD_FEATURE_WP_ASYNC) != 0;
+ +}
+ +
   /*
    * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
    * meaningful when userfaultfd_wp()==true on the vma and when it's
@@@ -927,15 -922,20 +927,15 @@@ static int userfaultfd_release(struct i
                         continue;
                 }
                 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- -              prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
- -                               new_flags, vma->anon_vma,
- -                               vma->vm_file, vma->vm_pgoff,
- -                               vma_policy(vma),
- -                               NULL_VM_UFFD_CTX, anon_vma_name(vma));
- -              if (prev) {
- -                      vma = prev;
- -              } else {
- -                      prev = vma;
- -              }
+ +              vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start,
+ +                                          vma->vm_end, new_flags,
+ +                                          NULL_VM_UFFD_CTX);
   
                 vma_start_write(vma);
                 userfaultfd_set_vm_flags(vma, new_flags);
                 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ +
+ +              prev = vma;
         }
         mmap_write_unlock(mm);
         mmput(mm);
@@@ -1325,7 -1325,7 +1325,7 @@@ static int userfaultfd_register(struct 
         bool basic_ioctls;
         unsigned long start, end, vma_end;
         struct vma_iterator vmi;
- -      pgoff_t pgoff;
+ +      bool wp_async = userfaultfd_wp_async_ctx(ctx);
   
         user_uffdio_register = (struct uffdio_register __user *) arg;
   
@@@ -1399,7 -1399,7 +1399,7 @@@
   
                 /* check not compatible vmas */
                 ret = -EINVAL;
- -              if (!vma_can_userfault(cur, vm_flags))
+ +              if (!vma_can_userfault(cur, vm_flags, wp_async))
                         goto out_unlock;
   
                 /*
@@@ -1460,7 -1460,7 +1460,7 @@@
         for_each_vma_range(vmi, vma, end) {
                 cond_resched();
   
- -              BUG_ON(!vma_can_userfault(vma, vm_flags));
+ +              BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
                 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
                        vma->vm_userfaultfd_ctx.ctx != ctx);
                 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@@ -1478,14 -1478,28 +1478,14 @@@
                 vma_end = min(end, vma->vm_end);
   
                 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
- -              pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- -              prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- -                               vma->anon_vma, vma->vm_file, pgoff,
- -                               vma_policy(vma),
- -                               ((struct vm_userfaultfd_ctx){ ctx }),
- -                               anon_vma_name(vma));
- -              if (prev) {
- -                      /* vma_merge() invalidated the mas */
- -                      vma = prev;
- -                      goto next;
- -              }
- -              if (vma->vm_start < start) {
- -                      ret = split_vma(&vmi, vma, start, 1);
- -                      if (ret)
- -                              break;
- -              }
- -              if (vma->vm_end > end) {
- -                      ret = split_vma(&vmi, vma, end, 0);
- -                      if (ret)
- -                              break;
+ +              vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ +                                          new_flags,
+ +                                          (struct vm_userfaultfd_ctx){ctx});
+ +              if (IS_ERR(vma)) {
+ +                      ret = PTR_ERR(vma);
+ +                      break;
                 }
- -      next:
+ +
                 /*
                  * In the vma_merge() successful mprotect-like case 8:
                  * the next vma was merged into the current one and
@@@ -1547,7 -1561,7 +1547,7 @@@ static int userfaultfd_unregister(struc
         unsigned long start, end, vma_end;
         const void __user *buf = (void __user *)arg;
         struct vma_iterator vmi;
- -      pgoff_t pgoff;
+ +      bool wp_async = userfaultfd_wp_async_ctx(ctx);
   
         ret = -EFAULT;
         if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@@ -1601,7 -1615,7 +1601,7 @@@
                  * provides for more strict behavior to notice
                  * unregistration errors.
                  */
- -              if (!vma_can_userfault(cur, cur->vm_flags))
+ +              if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
                         goto out_unlock;
   
                 found = true;
@@@ -1617,7 -1631,7 +1617,7 @@@
         for_each_vma_range(vmi, vma, end) {
                 cond_resched();
   
- -              BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
+ +              BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
   
                 /*
                  * Nothing to do: this vma is already registered into this
@@@ -1650,13 -1664,26 +1650,13 @@@
                         uffd_wp_range(vma, start, vma_end - start, false);
   
                 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
- -              pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- -              prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
- -                               vma->anon_vma, vma->vm_file, pgoff,
- -                               vma_policy(vma),
- -                               NULL_VM_UFFD_CTX, anon_vma_name(vma));
- -              if (prev) {
- -                      vma = prev;
- -                      goto next;
- -              }
- -              if (vma->vm_start < start) {
- -                      ret = split_vma(&vmi, vma, start, 1);
- -                      if (ret)
- -                              break;
- -              }
- -              if (vma->vm_end > end) {
- -                      ret = split_vma(&vmi, vma, end, 0);
- -                      if (ret)
- -                              break;
+ +              vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ +                                          new_flags, NULL_VM_UFFD_CTX);
+ +              if (IS_ERR(vma)) {
+ +                      ret = PTR_ERR(vma);
+ +                      break;
                 }
- -      next:
+ +
                 /*
                  * In the vma_merge() successful mprotect-like case 8:
                  * the next vma was merged into the current one and
@@@ -1991,11 -2018,6 +1991,11 @@@ out
         return ret;
   }
   
+ +/*
+ + * Whether the uffd context attached to @vma (if any) has
+ + * UFFD_FEATURE_WP_ASYNC enabled.
+ + */
+ +bool userfaultfd_wp_async(struct vm_area_struct *vma)
+ +{
+ +      struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+ +
+ +      return userfaultfd_wp_async_ctx(ctx);
+ +}
+ +
   static inline unsigned int uffd_ctx_features(__u64 user_features)
   {
         /*
@@@ -2029,11 -2051,6 +2029,11 @@@ static int userfaultfd_api(struct userf
         ret = -EPERM;
         if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
                 goto err_out;
+ +
+ +      /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
+ +      if (features & UFFD_FEATURE_WP_ASYNC)
+ +              features |= UFFD_FEATURE_WP_UNPOPULATED;
+ +
         /* report all available features and ioctls to userland */
         uffdio_api.features = UFFD_API_FEATURES;
   #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@@ -2046,7 -2063,6 +2046,7 @@@
   #ifndef CONFIG_PTE_MARKER_UFFD_WP
         uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
         uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
+ +      uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
   #endif
         uffdio_api.ioctls = UFFD_API_IOCTLS;
         ret = -EFAULT;
diff --combined include/linux/crash_core.h

index 08704c29fdb4fc35114c5f8d280b4b73017c2a48,3426f6eef60b49dd00d2085ed83e90c9444137d2..5126a4fecb442846b589981b3a7b82f6d3846d36
--- 1/include/linux/crash_core.h
--- 2/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@@ -5,6 -5,14 +5,14 @@@
   #include <linux/linkage.h>
   #include <linux/elfcore.h>
   #include <linux/elf.h>
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #include <asm/crash_core.h>
+ #endif
+ 
+ /* Location of a reserved region to hold the crash kernel.
+  */
+ extern struct resource crashk_res;
+ extern struct resource crashk_low_res;
   
   #define CRASH_CORE_NOTE_NAME     "CORE"
   #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
@@@ -79,12 -87,43 +87,43 @@@ Elf_Word *append_elf_note(Elf_Word *buf
                           void *data, size_t data_len);
   void final_note(Elf_Word *buf);
   
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+ #define DEFAULT_CRASH_KERNEL_LOW_SIZE  (128UL << 20)
+ #endif
+ #endif
+ 
   int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
-               unsigned long long *crash_size, unsigned long long *crash_base);
- int parse_crashkernel_high(char *cmdline, unsigned long long system_ram,
-               unsigned long long *crash_size, unsigned long long *crash_base);
- int parse_crashkernel_low(char *cmdline, unsigned long long system_ram,
-               unsigned long long *crash_size, unsigned long long *crash_base);
+               unsigned long long *crash_size, unsigned long long *crash_base,
+               unsigned long long *low_size, bool *high);
+ 
+ #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+ #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+ #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+ #endif
+ #ifndef CRASH_ALIGN
+ #define CRASH_ALIGN                   SZ_2M
+ #endif
+ #ifndef CRASH_ADDR_LOW_MAX
+ #define CRASH_ADDR_LOW_MAX            SZ_4G
+ #endif
+ #ifndef CRASH_ADDR_HIGH_MAX
+ #define CRASH_ADDR_HIGH_MAX           memblock_end_of_DRAM()
+ #endif
+ 
+ void __init reserve_crashkernel_generic(char *cmdline,
+               unsigned long long crash_size,
+               unsigned long long crash_base,
+               unsigned long long crash_low_size,
+               bool high);
+ #else
+ static inline void __init reserve_crashkernel_generic(char *cmdline,
+               unsigned long long crash_size,
+               unsigned long long crash_base,
+               unsigned long long crash_low_size,
+               bool high)
+ {}
+ #endif
   
   /* Alignment required for elf header segment */
   #define ELF_CORE_HEADER_ALIGN   4096
@@@ -92,7 -131,7 +131,7 @@@
   struct crash_mem {
         unsigned int max_nr_ranges;
         unsigned int nr_ranges;
- -      struct range ranges[];
+ +      struct range ranges[] __counted_by(max_nr_ranges);
   };
   
   extern int crash_exclude_mem_range(struct crash_mem *mem,
diff --combined include/linux/fortify-string.h

index 1e7711185ec694a596b0f221f528a27e88514aac,442ee9170259f2a4da0876d9c416e70f2b4593c7..79ef6ac4c02113e92454d94e80565b06073c4722
--- 1/include/linux/fortify-string.h
--- 2/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@@ -93,9 -93,13 +93,9 @@@ extern char *__underlying_strncpy(char 
   #if __has_builtin(__builtin_dynamic_object_size)
   #define POS                   __pass_dynamic_object_size(1)
   #define POS0                  __pass_dynamic_object_size(0)
- -#define __struct_size(p)      __builtin_dynamic_object_size(p, 0)
- -#define __member_size(p)      __builtin_dynamic_object_size(p, 1)
   #else
   #define POS                   __pass_object_size(1)
   #define POS0                  __pass_object_size(0)
- -#define __struct_size(p)      __builtin_object_size(p, 0)
- -#define __member_size(p)      __builtin_object_size(p, 1)
   #endif
   
   #define __compiletime_lessthan(bounds, length)        (       \
@@@ -639,7 -643,7 +639,7 @@@ __FORTIFY_INLINE bool fortify_memcpy_ch
                                      __q_size_field, #op),              \
                   #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
                   __fortify_size,                                       \
-                 "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
+                 "field \"" #p "\" at " FILE_LINE,                     \
                   __p_size_field);                                      \
         __underlying_##op(p, q, __fortify_size);                        \
   })
diff --combined include/linux/sched.h

index b49ca40f633550b191dd60e33d86fbf097f17a97,6d1341b1673f5c48c730a6133c56258d8bc7a20b..292c316972485dae579a2518b4a2b847d35b961c
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -63,6 -63,7 +63,6 @@@ struct robust_list_head
   struct root_domain;
   struct rq;
   struct sched_attr;
- -struct sched_param;
   struct seq_file;
   struct sighand_struct;
   struct signal_struct;
@@@ -369,10 -370,6 +369,10 @@@ extern struct root_domain def_root_doma
   extern struct mutex sched_domains_mutex;
   #endif
   
+ +struct sched_param {
+ +      int sched_priority;
+ +};
+ +
   struct sched_info {
   #ifdef CONFIG_SCHED_INFO
         /* Cumulative counters: */
@@@ -753,8 -750,10 +753,8 @@@ struct task_struct 
   #endif
         unsigned int                    __state;
   
- -#ifdef CONFIG_PREEMPT_RT
         /* saved state for "spinlock sleepers" */
         unsigned int                    saved_state;
- -#endif
   
         /*
          * This begins the randomizable portion of task_struct. Only
@@@ -876,7 -875,6 +876,7 @@@
   
         struct mm_struct                *mm;
         struct mm_struct                *active_mm;
+ +      struct address_space            *faults_disabled_mapping;
   
         int                             exit_state;
         int                             exit_code;
@@@ -913,9 -911,6 +913,9 @@@
          * ->sched_remote_wakeup gets used, so it can be in this word.
          */
         unsigned                        sched_remote_wakeup:1;
+ +#ifdef CONFIG_RT_MUTEXES
+ +      unsigned                        sched_rt_mutex:1;
+ +#endif
   
         /* Bit to tell LSMs we're in execve(): */
         unsigned                        in_execve:1;
@@@ -1007,7 -1002,6 +1007,6 @@@
         /* PID/PID hash table linkage. */
         struct pid                      *thread_pid;
         struct hlist_node               pid_links[PIDTYPE_MAX];
-       struct list_head                thread_group;
         struct list_head                thread_node;
   
         struct completion               *vfork_done;
@@@ -1448,10 -1442,6 +1447,10 @@@
         struct mem_cgroup               *active_memcg;
   #endif
   
+ +#ifdef CONFIG_MEMCG_KMEM
+ +      struct obj_cgroup               *objcg;
+ +#endif
+ +
   #ifdef CONFIG_BLK_CGROUP
         struct gendisk                  *throttle_disk;
   #endif
diff --combined include/linux/sched/signal.h

index 3b28cff24cc12f708e338ae9a3ae45cd1fb5fb80,d7fa3ca2fa5326ebd6cf2322b49256ba35fbdc5e..3499c1a8b9295a0b3366f4261a30eb11c38d577c
--- 1/include/linux/sched/signal.h
--- 2/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@@ -303,11 -303,20 +303,11 @@@ static inline void kernel_signal_stop(v
   
         schedule();
   }
- -#ifdef __ia64__
- -# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3
- -#else
- -# define ___ARCH_SI_IA64(_a1, _a2, _a3)
- -#endif
   
- -int force_sig_fault_to_task(int sig, int code, void __user *addr
- -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- -      , struct task_struct *t);
- -int force_sig_fault(int sig, int code, void __user *addr
- -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr));
- -int send_sig_fault(int sig, int code, void __user *addr
- -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- -      , struct task_struct *t);
+ +int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ +                          struct task_struct *t);
+ +int force_sig_fault(int sig, int code, void __user *addr);
+ +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);
   
   int force_sig_mceerr(int code, void __user *, short);
   int send_sig_mceerr(int code, void __user *, short, struct task_struct *);
@@@ -647,8 -656,7 +647,8 @@@ extern bool current_is_single_threaded(
         while ((t = next_thread(t)) != g)
   
   #define __for_each_thread(signal, t)  \
- -      list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+ +      list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
+ +              lockdep_is_held(&tasklist_lock))
   
   #define for_each_thread(p, t)         \
         __for_each_thread((p)->signal, t)
@@@ -707,15 -715,26 +707,26 @@@ bool same_thread_group(struct task_stru
         return p1->signal == p2->signal;
   }
   
- static inline struct task_struct *next_thread(const struct task_struct *p)
+ /*
+  * returns NULL if p is the last thread in the thread group
+  */
+ static inline struct task_struct *__next_thread(struct task_struct *p)
+ {
+       return list_next_or_null_rcu(&p->signal->thread_head,
+                                       &p->thread_node,
+                                       struct task_struct,
+                                       thread_node);
+ }
+ 
+ static inline struct task_struct *next_thread(struct task_struct *p)
   {
-       return list_entry_rcu(p->thread_group.next,
-                             struct task_struct, thread_group);
+       return __next_thread(p) ?: p->group_leader;
   }
   
   static inline int thread_group_empty(struct task_struct *p)
   {
-       return list_empty(&p->thread_group);
+       return thread_group_leader(p) &&
+              list_is_last(&p->thread_node, &p->signal->thread_head);
   }
   
   #define delay_group_leader(p) \
diff --combined init/init_task.c

index f703116e052301b7171304eaf25fe692547f5e60,c0de0200fd56277ebc99fd7860b7827384d1b7b5..5727d42149c334a989a2e657b3f71e26ae2899fc
--- 1/init/init_task.c
--- 2/init/init_task.c
+++ b/init/init_task.c
@@@ -85,7 -85,6 +85,7 @@@ struct task_struct init_tas
         .nr_cpus_allowed= NR_CPUS,
         .mm             = NULL,
         .active_mm      = &init_mm,
+ +      .faults_disabled_mapping = NULL,
         .restart_block  = {
                 .fn = do_no_restart_syscall,
         },
@@@ -133,7 -132,6 +133,6 @@@
         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
         .timer_slack_ns = 50000, /* 50 usec default slack */
         .thread_pid     = &init_struct_pid,
-       .thread_group   = LIST_HEAD_INIT(init_task.thread_group),
         .thread_node    = LIST_HEAD_INIT(init_signals.thread_head),
   #ifdef CONFIG_AUDIT
         .loginuid       = INVALID_UID,
diff --combined kernel/audit_tree.c

index 85a5b306733b08fba26357ee5472ab0ae47dae74,b21b9652c1a8661326dfc10bbf9f38a568bf90e6..1b07e6f12a07a85a3fc97df244d1120bc4e4c150
--- 1/kernel/audit_tree.c
--- 2/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@@ -34,7 -34,7 +34,7 @@@ struct audit_chunk 
                 struct list_head list;
                 struct audit_tree *owner;
                 unsigned index;         /* index; upper bit indicates 'will prune' */
- -      } owners[];
+ +      } owners[] __counted_by(count);
   };
   
   struct audit_tree_mark {
@@@ -87,8 -87,8 +87,8 @@@ static struct task_struct *prune_thread
    * that makes a difference.  Some.
    */
   
- static struct fsnotify_group *audit_tree_group;
- static struct kmem_cache *audit_tree_mark_cachep __read_mostly;
+ static struct fsnotify_group *audit_tree_group __ro_after_init;
+ static struct kmem_cache *audit_tree_mark_cachep __ro_after_init;
   
   static struct audit_tree *alloc_tree(const char *s)
   {
diff --combined kernel/exit.c

index 61ebba96909b98408d4bb802436385e4b25aad75,f3ba4b97a7d97cbcf7e45054c591f0c625d33972..ee9f43bed49a240ac60c9f4054c663374e36ccf0
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -74,8 -74,6 +74,8 @@@
   #include <asm/unistd.h>
   #include <asm/mmu_context.h>
   
+ +#include "exit.h"
+ +
   /*
    * The default value should be high enough to not crash a system that randomly
    * crashes its kernel from time to time, but low enough to at least not permit
@@@ -135,7 -133,6 +135,6 @@@ static void __unhash_process(struct tas
                 list_del_init(&p->sibling);
                 __this_cpu_dec(process_counts);
         }
-       list_del_rcu(&p->thread_group);
         list_del_rcu(&p->thread_node);
   }
   
@@@ -541,6 -538,7 +540,6 @@@ static void exit_mm(void
         exit_mm_release(current, mm);
         if (!mm)
                 return;
- -      sync_mm_rss(mm);
         mmap_read_lock(mm);
         mmgrab_lazy_tlb(mm);
         BUG_ON(mm != current->active_mm);
@@@ -830,6 -828,9 +829,6 @@@ void __noreturn do_exit(long code
         io_uring_files_cancel();
         exit_signals(tsk);  /* sets PF_EXITING */
   
- -      /* sync mm's RSS info before statistics gathering */
- -      if (tsk->mm)
- -              sync_mm_rss(tsk->mm);
         acct_update_integrals(tsk);
         group_dead = atomic_dec_and_test(&tsk->signal->live);
         if (group_dead) {
@@@ -1035,6 -1036,26 +1034,6 @@@ SYSCALL_DEFINE1(exit_group, int, error_
         return 0;
   }
   
- -struct waitid_info {
- -      pid_t pid;
- -      uid_t uid;
- -      int status;
- -      int cause;
- -};
- -
- -struct wait_opts {
- -      enum pid_type           wo_type;
- -      int                     wo_flags;
- -      struct pid              *wo_pid;
- -
- -      struct waitid_info      *wo_info;
- -      int                     wo_stat;
- -      struct rusage           *wo_rusage;
- -
- -      wait_queue_entry_t              child_wait;
- -      int                     notask_error;
- -};
- -
   static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
   {
         return  wo->wo_type == PIDTYPE_MAX ||
@@@ -1498,17 -1519,6 +1497,17 @@@ static int ptrace_do_wait(struct wait_o
         return 0;
   }
   
+ +/*
+ + * Decide whether the waiter described by @wo should be woken for task @p.
+ + *
+ + * @p must pass eligible_pid().  In addition, when __WNOTHREAD is set in
+ + * wo->wo_flags, @p's parent must be the task stored in
+ + * wo->child_wait.private.
+ + */
+ +bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+ +{
+ +      if (!eligible_pid(wo, p))
+ +              return false;
+ +
+ +      return !(wo->wo_flags & __WNOTHREAD) ||
+ +             wo->child_wait.private == p->parent;
+ +}
+ +
   static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                 int sync, void *key)
   {
@@@ -1516,10 -1526,13 +1515,10 @@@
                                                 child_wait);
         struct task_struct *p = key;
   
- -      if (!eligible_pid(wo, p))
- -              return 0;
+ +      if (pid_child_should_wake(wo, p))
+ +              return default_wake_function(wait, mode, sync, key);
   
- -      if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- -              return 0;
- -
- -      return default_wake_function(wait, mode, sync, key);
+ +      return 0;
   }
   
   void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@@ -1568,10 -1581,16 +1567,10 @@@ static int do_wait_pid(struct wait_opt
         return 0;
   }
   
- -static long do_wait(struct wait_opts *wo)
+ +long __do_wait(struct wait_opts *wo)
   {
- -      int retval;
- -
- -      trace_sched_process_wait(wo->wo_pid);
+ +      long retval;
   
- -      init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- -      wo->child_wait.private = current;
- -      add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
- -repeat:
         /*
          * If there is nothing that can match our criteria, just get out.
          * We will clear ->notask_error to zero if we see any child that
@@@ -1583,23 -1602,24 +1582,23 @@@
            (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
                 goto notask;
   
- -      set_current_state(TASK_INTERRUPTIBLE);
         read_lock(&tasklist_lock);
   
         if (wo->wo_type == PIDTYPE_PID) {
                 retval = do_wait_pid(wo);
                 if (retval)
- -                      goto end;
+ +                      return retval;
         } else {
                 struct task_struct *tsk = current;
   
                 do {
                         retval = do_wait_thread(wo, tsk);
                         if (retval)
- -                              goto end;
+ +                              return retval;
   
                         retval = ptrace_do_wait(wo, tsk);
                         if (retval)
- -                              goto end;
+ +                              return retval;
   
                         if (wo->wo_flags & __WNOTHREAD)
                                 break;
@@@ -1609,44 -1629,27 +1608,44 @@@
   
   notask:
         retval = wo->notask_error;
- -      if (!retval && !(wo->wo_flags & WNOHANG)) {
- -              retval = -ERESTARTSYS;
- -              if (!signal_pending(current)) {
- -                      schedule();
- -                      goto repeat;
- -              }
- -      }
- -end:
+ +      if (!retval && !(wo->wo_flags & WNOHANG))
+ +              return -ERESTARTSYS;
+ +
+ +      return retval;
+ +}
+ +
+ +static long do_wait(struct wait_opts *wo)
+ +{
+ +      int retval;
+ +
+ +      trace_sched_process_wait(wo->wo_pid);
+ +
+ +      init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ +      wo->child_wait.private = current;
+ +      add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+ +
+ +      do {
+ +              set_current_state(TASK_INTERRUPTIBLE);
+ +              retval = __do_wait(wo);
+ +              if (retval != -ERESTARTSYS)
+ +                      break;
+ +              if (signal_pending(current))
+ +                      break;
+ +              schedule();
+ +      } while (1);
+ +
         __set_current_state(TASK_RUNNING);
         remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
         return retval;
   }
   
- -static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
- -                        int options, struct rusage *ru)
+ +int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ +                        struct waitid_info *infop, int options,
+ +                        struct rusage *ru)
   {
- -      struct wait_opts wo;
+ +      unsigned int f_flags = 0;
         struct pid *pid = NULL;
         enum pid_type type;
- -      long ret;
- -      unsigned int f_flags = 0;
   
         if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
                         __WNOTHREAD|__WCLONE|__WALL))
@@@ -1689,32 -1692,19 +1688,32 @@@
                 return -EINVAL;
         }
   
- -      wo.wo_type      = type;
- -      wo.wo_pid       = pid;
- -      wo.wo_flags     = options;
- -      wo.wo_info      = infop;
- -      wo.wo_rusage    = ru;
+ +      wo->wo_type     = type;
+ +      wo->wo_pid      = pid;
+ +      wo->wo_flags    = options;
+ +      wo->wo_info     = infop;
+ +      wo->wo_rusage   = ru;
         if (f_flags & O_NONBLOCK)
- -              wo.wo_flags |= WNOHANG;
+ +              wo->wo_flags |= WNOHANG;
+ +
+ +      return 0;
+ +}
+ +
+ +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ +                        int options, struct rusage *ru)
+ +{
+ +      struct wait_opts wo;
+ +      long ret;
+ +
+ +      ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ +      if (ret)
+ +              return ret;
   
         ret = do_wait(&wo);
- -      if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+ +      if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
                 ret = -EAGAIN;
   
- -      put_pid(pid);
+ +      put_pid(wo.wo_pid);
         return ret;
   }
   
diff --combined kernel/fork.c

index 373fa2f739bc41ced8dc9074d84ec1ce5336483a,b9d3aa493bbd2dffae2dd716ae05effaaabbbb4e..10917c3e1f0366b5fbf60d98c2042c636cd74c87
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -733,7 -733,7 +733,7 @@@ static __latent_entropy int dup_mmap(st
   
                         get_file(file);
                         i_mmap_lock_write(mapping);
- -                      if (tmp->vm_flags & VM_SHARED)
+ +                      if (vma_is_shared_maywrite(tmp))
                                 mapping_allow_writable(mapping);
                         flush_dcache_mmap_lock(mapping);
                         /* insert tmp into the share list, just after mpnt */
@@@ -1288,7 -1288,7 +1288,7 @@@ static struct mm_struct *mm_init(struc
         hugetlb_count_init(mm);
   
         if (current->mm) {
- -              mm->flags = current->mm->flags & MMF_INIT_MASK;
+ +              mm->flags = mmf_init_flags(current->mm->flags);
                 mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
         } else {
                 mm->flags = default_dump_filter;
@@@ -1393,8 -1393,6 +1393,8 @@@ EXPORT_SYMBOL_GPL(mmput_async)
   
   /**
    * set_mm_exe_file - change a reference to the mm's executable file
+ + * @mm: The mm to change.
+ + * @new_exe_file: The new file to use.
    *
    * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
    *
@@@ -1434,8 -1432,6 +1434,8 @@@ int set_mm_exe_file(struct mm_struct *m
   
   /**
    * replace_mm_exe_file - replace a reference to the mm's executable file
+ + * @mm: The mm to change.
+ + * @new_exe_file: The new file to use.
    *
    * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
    *
@@@ -1487,7 -1483,6 +1487,7 @@@ int replace_mm_exe_file(struct mm_struc
   
   /**
    * get_mm_exe_file - acquire a reference to the mm's executable file
+ + * @mm: The mm of interest.
    *
    * Returns %NULL if mm has no associated executable file.
    * User must release file via fput().
@@@ -1497,14 -1492,15 +1497,14 @@@ struct file *get_mm_exe_file(struct mm_
         struct file *exe_file;
   
         rcu_read_lock();
- -      exe_file = rcu_dereference(mm->exe_file);
- -      if (exe_file && !get_file_rcu(exe_file))
- -              exe_file = NULL;
+ +      exe_file = get_file_rcu(&mm->exe_file);
         rcu_read_unlock();
         return exe_file;
   }
   
   /**
    * get_task_exe_file - acquire a reference to the task's executable file
+ + * @task: The task.
    *
    * Returns %NULL if task's mm (if any) has no associated executable file or
    * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@@ -1527,7 -1523,6 +1527,7 @@@ struct file *get_task_exe_file(struct t
   
   /**
    * get_task_mm - acquire a reference to the task's mm
+ + * @task: The task.
    *
    * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
    * this kernel workthread has transiently adopted a user mm with use_mm,
@@@ -2107,11 -2102,11 +2107,11 @@@ const struct file_operations pidfd_fop
    * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
    * @pid:   the struct pid for which to create a pidfd
    * @flags: flags of the new @pidfd
- - * @pidfd: the pidfd to return
+ + * @ret: Where to return the file for the pidfd.
    *
    * Allocate a new file that stashes @pid and reserve a new pidfd number in the
    * caller's file descriptor table. The pidfd is reserved but not installed yet.
- -
+ + *
    * The helper doesn't perform checks on @pid which makes it useful for pidfds
    * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
    * pidfd file are prepared.
@@@ -2158,7 -2153,7 +2158,7 @@@ static int __pidfd_prepare(struct pid *
    * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
    * @pid:   the struct pid for which to create a pidfd
    * @flags: flags of the new @pidfd
- - * @pidfd: the pidfd to return
+ + * @ret: Where to return the pidfd.
    *
    * Allocate a new file that stashes @pid and reserve a new pidfd number in the
    * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@@ -2411,6 -2406,10 +2411,6 @@@ __latent_entropy struct task_struct *co
         p->io_uring = NULL;
   #endif
   
- -#if defined(SPLIT_RSS_COUNTING)
- -      memset(&p->rss_stat, 0, sizeof(p->rss_stat));
- -#endif
- -
         p->default_timer_slack_ns = current->timer_slack_ns;
   
   #ifdef CONFIG_PSI
@@@ -2577,7 -2576,6 +2577,6 @@@
         p->dirty_paused_when = 0;
   
         p->pdeath_signal = 0;
-       INIT_LIST_HEAD(&p->thread_group);
         p->task_works = NULL;
         clear_posix_cputimers_work(p);
   
@@@ -2705,8 -2703,6 +2704,6 @@@
                         atomic_inc(&current->signal->live);
                         refcount_inc(&current->signal->sigcnt);
                         task_join_group_stop(p);
-                       list_add_tail_rcu(&p->thread_group,
-                                         &p->group_leader->thread_group);
                         list_add_tail_rcu(&p->thread_node,
                                           &p->signal->thread_head);
                 }
@@@ -3145,7 -3141,7 +3142,7 @@@ static inline bool clone3_stack_valid(s
                 if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                         return false;
   
- -#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+ +#if !defined(CONFIG_STACK_GROWSUP)
                 kargs->stack += kargs->stack_size;
   #endif
         }
@@@ -3182,7 -3178,7 +3179,7 @@@ static bool clone3_args_valid(struct ke
   }
   
   /**
- - * clone3 - create a new process with specific properties
+ + * sys_clone3 - create a new process with specific properties
    * @uargs: argument structure
    * @size:  size of @uargs
    *
diff --combined kernel/kthread.c

index c46128ec0c0a2aa9790897af2a32e9cfeaf3641f,290cbc845225e41d39994399e50ae3e82852a685..c5e40830c1f2d5e91dea786c24fd70f8b76ee488
--- 1/kernel/kthread.c
--- 2/kernel/kthread.c
+++ b/kernel/kthread.c
@@@ -715,6 -715,24 +715,24 @@@ int kthread_stop(struct task_struct *k
   }
   EXPORT_SYMBOL(kthread_stop);
   
+ /**
+  * kthread_stop_put - stop a thread and put its task struct
+  * @k: thread created by kthread_create().
+  *
+  * Stops a thread created by kthread_create() and put its task_struct.
+  * Only use when holding an extra task struct reference obtained by
+  * calling get_task_struct().
+  */
+ int kthread_stop_put(struct task_struct *k)
+ {
+       int ret;
+ 
+       ret = kthread_stop(k);
+       put_task_struct(k);
+       return ret;
+ }
+ EXPORT_SYMBOL(kthread_stop_put);
+ 
   int kthreadd(void *unused)
   {
         struct task_struct *tsk = current;
@@@ -1469,6 -1487,7 +1487,6 @@@ void kthread_unuse_mm(struct mm_struct 
          * clearing tsk->mm.
          */
         smp_mb__after_spinlock();
- -      sync_mm_rss(mm);
         local_irq_disable();
         tsk->mm = NULL;
         membarrier_update_current_mm(NULL);
diff --combined kernel/sched/core.c

index 3d7e2d7026998af5cb85fe12fb0b5442e90f6f63,7d4cf741e0879b89df3107188d5d466f48625f4c..a708d225c28e861922997ed27a25d1b21f072d2a
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -85,6 -85,7 +85,6 @@@
   
   #include "sched.h"
   #include "stats.h"
- -#include "autogroup.h"
   
   #include "autogroup.h"
   #include "pelt.h"
@@@ -113,7 -114,6 +113,7 @@@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_over
   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
   EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
+ +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
   
   DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
   
@@@ -919,13 -919,14 +919,13 @@@ static bool set_nr_if_polling(struct ta
         struct thread_info *ti = task_thread_info(p);
         typeof(ti->flags) val = READ_ONCE(ti->flags);
   
- -      for (;;) {
+ +      do {
                 if (!(val & _TIF_POLLING_NRFLAG))
                         return false;
                 if (val & _TIF_NEED_RESCHED)
                         return true;
- -              if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
- -                      break;
- -      }
+ +      } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED));
+ +
         return true;
   }
   
@@@ -1479,12 -1480,16 +1479,12 @@@ static void __uclamp_update_util_min_rt
   
   static void uclamp_update_util_min_rt_default(struct task_struct *p)
   {
- -      struct rq_flags rf;
- -      struct rq *rq;
- -
         if (!rt_task(p))
                 return;
   
         /* Protect updates to p->uclamp_* */
- -      rq = task_rq_lock(p, &rf);
+ +      guard(task_rq_lock)(p);
         __uclamp_update_util_min_rt_default(p);
- -      task_rq_unlock(rq, p, &rf);
   }
   
   static inline struct uclamp_se
@@@ -1780,8 -1785,9 +1780,8 @@@ static void uclamp_update_root_tg(void
         uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
                       sysctl_sched_uclamp_util_max, false);
   
- -      rcu_read_lock();
+ +      guard(rcu)();
         cpu_util_update_eff(&root_task_group.css);
- -      rcu_read_unlock();
   }
   #else
   static void uclamp_update_root_tg(void) { }
@@@ -1808,9 -1814,10 +1808,9 @@@ static void uclamp_sync_util_min_rt_def
         smp_mb__after_spinlock();
         read_unlock(&tasklist_lock);
   
- -      rcu_read_lock();
+ +      guard(rcu)();
         for_each_process_thread(g, p)
                 uclamp_update_util_min_rt_default(p);
- -      rcu_read_unlock();
   }
   
   static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
@@@ -2211,10 -2218,10 +2211,10 @@@ static inline void check_class_changed(
                 p->sched_class->prio_changed(rq, p, oldprio);
   }
   
- -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+ +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
   {
         if (p->sched_class == rq->curr->sched_class)
- -              rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+ +              rq->curr->sched_class->wakeup_preempt(rq, p, flags);
         else if (sched_class_above(p->sched_class, rq->curr->sched_class))
                 resched_curr(rq);
   
@@@ -2232,21 -2239,31 +2232,21 @@@ int __task_state_match(struct task_stru
         if (READ_ONCE(p->__state) & state)
                 return 1;
   
- -#ifdef CONFIG_PREEMPT_RT
         if (READ_ONCE(p->saved_state) & state)
                 return -1;
- -#endif
+ +
         return 0;
   }
   
   static __always_inline
   int task_state_match(struct task_struct *p, unsigned int state)
   {
- -#ifdef CONFIG_PREEMPT_RT
- -      int match;
- -
         /*
- -       * Serialize against current_save_and_set_rtlock_wait_state() and
- -       * current_restore_rtlock_saved_state().
+ +       * Serialize against current_save_and_set_rtlock_wait_state(),
+ +       * current_restore_rtlock_saved_state(), and __refrigerator().
          */
- -      raw_spin_lock_irq(&p->pi_lock);
- -      match = __task_state_match(p, state);
- -      raw_spin_unlock_irq(&p->pi_lock);
- -
- -      return match;
- -#else
+ +      guard(raw_spinlock_irq)(&p->pi_lock);
         return __task_state_match(p, state);
- -#endif
   }
   
   /*
@@@ -2400,9 -2417,10 +2400,9 @@@ void migrate_disable(void
                 return;
         }
   
- -      preempt_disable();
+ +      guard(preempt)();
         this_rq()->nr_pinned++;
         p->migration_disabled = 1;
- -      preempt_enable();
   }
   EXPORT_SYMBOL_GPL(migrate_disable);
   
@@@ -2426,7 -2444,7 +2426,7 @@@ void migrate_enable(void
          * Ensure stop_task runs either before or after this, and that
          * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
          */
- -      preempt_disable();
+ +      guard(preempt)();
         if (p->cpus_ptr != &p->cpus_mask)
                 __set_cpus_allowed_ptr(p, &ac);
         /*
@@@ -2437,6 -2455,7 +2437,6 @@@
         barrier();
         p->migration_disabled = 0;
         this_rq()->nr_pinned--;
- -      preempt_enable();
   }
   EXPORT_SYMBOL_GPL(migrate_enable);
   
@@@ -2508,7 -2527,7 +2508,7 @@@ static struct rq *move_queued_task(stru
         rq_lock(rq, rf);
         WARN_ON_ONCE(task_cpu(p) != new_cpu);
         activate_task(rq, p, 0);
- -      check_preempt_curr(rq, p, 0);
+ +      wakeup_preempt(rq, p, 0);
   
         return rq;
   }
@@@ -2645,11 -2664,9 +2645,11 @@@ static int migration_cpu_stop(void *dat
                  * it.
                  */
                 WARN_ON_ONCE(!pending->stop_pending);
+ +              preempt_disable();
                 task_rq_unlock(rq, p, &rf);
                 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
                                     &pending->arg, &pending->stop_work);
+ +              preempt_enable();
                 return 0;
         }
   out:
@@@ -2969,13 -2986,12 +2969,13 @@@ static int affine_move_task(struct rq *
                         complete = true;
                 }
   
+ +              preempt_disable();
                 task_rq_unlock(rq, p, rf);
- -
                 if (push_task) {
                         stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
                                             p, &rq->push_work);
                 }
+ +              preempt_enable();
   
                 if (complete)
                         complete_all(&pending->done);
@@@ -3041,13 -3057,12 +3041,13 @@@
                 if (flags & SCA_MIGRATE_ENABLE)
                         p->migration_flags &= ~MDF_PUSH;
   
+ +              preempt_disable();
                 task_rq_unlock(rq, p, rf);
- -
                 if (!stop_pending) {
                         stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
                                             &pending->arg, &pending->stop_work);
                 }
+ +              preempt_enable();
   
                 if (flags & SCA_MIGRATE_ENABLE)
                         return 0;
@@@ -3394,7 -3409,7 +3394,7 @@@ static void __migrate_swap_task(struct 
                 deactivate_task(src_rq, p, 0);
                 set_task_cpu(p, cpu);
                 activate_task(dst_rq, p, 0);
- -              check_preempt_curr(dst_rq, p, 0);
+ +              wakeup_preempt(dst_rq, p, 0);
   
                 rq_unpin_lock(dst_rq, &drf);
                 rq_unpin_lock(src_rq, &srf);
@@@ -3501,11 -3516,13 +3501,11 @@@ out
    */
   void kick_process(struct task_struct *p)
   {
- -      int cpu;
+ +      guard(preempt)();
+ +      int cpu = task_cpu(p);
   
- -      preempt_disable();
- -      cpu = task_cpu(p);
         if ((cpu != smp_processor_id()) && task_curr(p))
                 smp_send_reschedule(cpu);
- -      preempt_enable();
   }
   EXPORT_SYMBOL_GPL(kick_process);
   
@@@ -3768,7 -3785,7 +3768,7 @@@ ttwu_do_activate(struct rq *rq, struct 
         }
   
         activate_task(rq, p, en_flags);
- -      check_preempt_curr(rq, p, wake_flags);
+ +      wakeup_preempt(rq, p, wake_flags);
   
         ttwu_do_wakeup(p);
   
@@@ -3792,6 -3809,9 +3792,6 @@@
                 if (rq->avg_idle > max)
                         rq->avg_idle = max;
   
- -              rq->wake_stamp = jiffies;
- -              rq->wake_avg_idle = rq->avg_idle / 2;
- -
                 rq->idle_stamp = 0;
         }
   #endif
@@@ -3836,7 -3856,7 +3836,7 @@@ static int ttwu_runnable(struct task_st
                          * it should preempt the task that is current now.
                          */
                         update_rq_clock(rq);
- -                      check_preempt_curr(rq, p, wake_flags);
+ +                      wakeup_preempt(rq, p, wake_flags);
                 }
                 ttwu_do_wakeup(p);
                 ret = 1;
@@@ -3936,18 -3956,6 +3936,18 @@@ bool cpus_share_cache(int this_cpu, in
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
   }
   
+ +/*
+ + * Whether CPUs are share cache resources, which means LLC on non-cluster
+ + * machines and LLC tag or L2 on machines with clusters.
+ + */
+ +bool cpus_share_resources(int this_cpu, int that_cpu)
+ +{
+ +      if (this_cpu == that_cpu)
+ +              return true;
+ +
+ +      return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+ +}
+ +
   static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
   {
         /*
@@@ -4028,17 -4036,13 +4028,17 @@@ static void ttwu_queue(struct task_stru
    * The caller holds p::pi_lock if p != current or has preemption
    * disabled when p == current.
    *
- - * The rules of PREEMPT_RT saved_state:
+ + * The rules of saved_state:
    *
    *   The related locking code always holds p::pi_lock when updating
    *   p::saved_state, which means the code is fully serialized in both cases.
    *
- - *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
- - *   bits set. This allows to distinguish all wakeup scenarios.
+ + *   For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
+ + *   No other bits set. This allows to distinguish all wakeup scenarios.
+ + *
+ + *   For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
+ + *   allows us to prevent early wakeup of tasks before they can be run on
+ + *   asymmetric ISA architectures (eg ARMv9).
    */
   static __always_inline
   bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
@@@ -4052,13 -4056,13 +4052,13 @@@
   
         *success = !!(match = __task_state_match(p, state));
   
- -#ifdef CONFIG_PREEMPT_RT
         /*
          * Saved state preserves the task state across blocking on
- -       * an RT lock.  If the state matches, set p::saved_state to
- -       * TASK_RUNNING, but do not wake the task because it waits
- -       * for a lock wakeup. Also indicate success because from
- -       * the regular waker's point of view this has succeeded.
+ +       * an RT lock or TASK_FREEZABLE tasks.  If the state matches,
+ +       * set p::saved_state to TASK_RUNNING, but do not wake the task
+ +       * because it waits for a lock wakeup or __thaw_task(). Also
+ +       * indicate success because from the regular waker's point of
+ +       * view this has succeeded.
          *
          * After acquiring the lock the task will restore p::__state
          * from p::saved_state which ensures that the regular
@@@ -4068,7 -4072,7 +4068,7 @@@
          */
         if (match < 0)
                 p->saved_state = TASK_RUNNING;
- -#endif
+ +
         return match > 0;
   }
   
@@@ -4250,7 -4254,7 +4250,7 @@@ int try_to_wake_up(struct task_struct *
                  * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
                  * __schedule().  See the comment for smp_mb__after_spinlock().
                  *
- -               * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
+ +               * A similar smp_rmb() lives in __task_needs_rq_lock().
                  */
                 smp_rmb();
                 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
@@@ -4867,7 -4871,7 +4867,7 @@@ void wake_up_new_task(struct task_struc
   
         activate_task(rq, p, ENQUEUE_NOCLOCK);
         trace_sched_wakeup_new(p);
- -      check_preempt_curr(rq, p, WF_FORK);
+ +      wakeup_preempt(rq, p, WF_FORK);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_woken) {
                 /*
@@@ -5370,6 -5374,8 +5370,6 @@@ context_switch(struct rq *rq, struct ta
         /* switch_mm_cid() requires the memory barriers above. */
         switch_mm_cid(rq, prev, next);
   
- -      rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- -
         prepare_lock_switch(rq, next, rf);
   
         /* Here we just switch the register state and the stack. */
@@@ -5910,7 -5916,8 +5910,7 @@@ static noinline void __schedule_bug(str
         print_modules();
         if (irqs_disabled())
                 print_irqtrace_events(prev);
- -      if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
- -          && in_atomic_preempt_off()) {
+ +      if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
                 pr_err("Preemption disabled at:");
                 print_ip_sym(KERN_ERR, preempt_disable_ip);
         }
@@@ -6361,9 -6368,8 +6361,9 @@@ static void sched_core_balance(struct r
         struct sched_domain *sd;
         int cpu = cpu_of(rq);
   
- -      preempt_disable();
- -      rcu_read_lock();
+ +      guard(preempt)();
+ +      guard(rcu)();
+ +
         raw_spin_rq_unlock_irq(rq);
         for_each_domain(cpu, sd) {
                 if (need_resched())
@@@ -6373,6 -6379,8 +6373,6 @@@
                         break;
         }
         raw_spin_rq_lock_irq(rq);
- -      rcu_read_unlock();
- -      preempt_enable();
   }
   
   static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
@@@ -6607,7 -6615,6 +6607,7 @@@ static void __sched notrace __schedule(
         /* Promote REQ to ACT */
         rq->clock_update_flags <<= 1;
         update_rq_clock(rq);
+ +      rq->clock_update_flags = RQCF_UPDATED;
   
         switch_count = &prev->nivcsw;
   
@@@ -6687,6 -6694,8 +6687,6 @@@
                 /* Also unlocks the rq: */
                 rq = context_switch(rq, prev, next, &rf);
         } else {
- -              rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- -
                 rq_unpin_lock(rq, &rf);
                 __balance_callbacks(rq);
                 raw_spin_rq_unlock_irq(rq);
@@@ -6711,24 -6720,22 +6711,24 @@@ void __noreturn do_task_dead(void
   
   static inline void sched_submit_work(struct task_struct *tsk)
   {
+ +      static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
         unsigned int task_flags;
   
- -      if (task_is_running(tsk))
- -              return;
+ +      /*
+ +       * Establish LD_WAIT_CONFIG context to ensure none of the code called
+ +       * will use a blocking primitive -- which would lead to recursion.
+ +       */
+ +      lock_map_acquire_try(&sched_map);
   
         task_flags = tsk->flags;
         /*
          * If a worker goes to sleep, notify and ask workqueue whether it
          * wants to wake up a task to maintain concurrency.
          */
- -      if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
- -              if (task_flags & PF_WQ_WORKER)
- -                      wq_worker_sleeping(tsk);
- -              else
- -                      io_wq_worker_sleeping(tsk);
- -      }
+ +      if (task_flags & PF_WQ_WORKER)
+ +              wq_worker_sleeping(tsk);
+ +      else if (task_flags & PF_IO_WORKER)
+ +              io_wq_worker_sleeping(tsk);
   
         /*
          * spinlock and rwlock must not flush block requests.  This will
@@@ -6742,8 -6749,6 +6742,8 @@@
          * make sure to submit it to avoid deadlocks.
          */
         blk_flush_plug(tsk->plug, true);
+ +
+ +      lock_map_release(&sched_map);
   }
   
   static void sched_update_worker(struct task_struct *tsk)
@@@ -6756,26 -6761,16 +6756,26 @@@
         }
   }
   
- -asmlinkage __visible void __sched schedule(void)
+ +static __always_inline void __schedule_loop(unsigned int sched_mode)
   {
- -      struct task_struct *tsk = current;
- -
- -      sched_submit_work(tsk);
         do {
                 preempt_disable();
- -              __schedule(SM_NONE);
+ +              __schedule(sched_mode);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
+ +}
+ +
+ +asmlinkage __visible void __sched schedule(void)
+ +{
+ +      struct task_struct *tsk = current;
+ +
+ +#ifdef CONFIG_RT_MUTEXES
+ +      lockdep_assert(!tsk->sched_rt_mutex);
+ +#endif
+ +
+ +      if (!task_is_running(tsk))
+ +              sched_submit_work(tsk);
+ +      __schedule_loop(SM_NONE);
         sched_update_worker(tsk);
   }
   EXPORT_SYMBOL(schedule);
@@@ -6839,7 -6834,11 +6839,7 @@@ void __sched schedule_preempt_disabled(
   #ifdef CONFIG_PREEMPT_RT
   void __sched notrace schedule_rtlock(void)
   {
- -      do {
- -              preempt_disable();
- -              __schedule(SM_RTLOCK_WAIT);
- -              sched_preempt_enable_no_resched();
- -      } while (need_resched());
+ +      __schedule_loop(SM_RTLOCK_WAIT);
   }
   NOKPROBE_SYMBOL(schedule_rtlock);
   #endif
@@@ -7035,32 -7034,6 +7035,32 @@@ static void __setscheduler_prio(struct 
   
   #ifdef CONFIG_RT_MUTEXES
   
+ +/*
+ + * Would be more useful with typeof()/auto_type but they don't mix with
+ + * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ + * name such that if someone were to implement this function we get to compare
+ + * notes.
+ + */
+ +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+ +
+ +void rt_mutex_pre_schedule(void)
+ +{
+ +      lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+ +      sched_submit_work(current);
+ +}
+ +
+ +void rt_mutex_schedule(void)
+ +{
+ +      lockdep_assert(current->sched_rt_mutex);
+ +      __schedule_loop(SM_NONE);
+ +}
+ +
+ +void rt_mutex_post_schedule(void)
+ +{
+ +      sched_update_worker(current);
+ +      lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+ +}
+ +
   static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
   {
         if (pi_task)
@@@ -7214,8 -7187,9 +7214,8 @@@ static inline int rt_effective_prio(str
   void set_user_nice(struct task_struct *p, long nice)
   {
         bool queued, running;
- -      int old_prio;
- -      struct rq_flags rf;
         struct rq *rq;
+ +      int old_prio;
   
         if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
                 return;
@@@ -7223,9 -7197,7 +7223,9 @@@
          * We have to be careful, if called from sys_setpriority(),
          * the task might be in the middle of scheduling on another CPU.
          */
- -      rq = task_rq_lock(p, &rf);
+ +      CLASS(task_rq_lock, rq_guard)(p);
+ +      rq = rq_guard.rq;
+ +
         update_rq_clock(rq);
   
         /*
@@@ -7236,9 -7208,8 +7236,9 @@@
          */
         if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
- -              goto out_unlock;
+ +              return;
         }
+ +
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
@@@ -7261,6 -7232,9 +7261,6 @@@
          * lowered its priority, then reschedule its CPU:
          */
         p->sched_class->prio_changed(rq, p, old_prio);
- -
- -out_unlock:
- -      task_rq_unlock(rq, p, &rf);
   }
   EXPORT_SYMBOL(set_user_nice);
   
@@@ -7533,21 -7507,6 +7533,21 @@@ static struct task_struct *find_process
         return pid ? find_task_by_vpid(pid) : current;
   }
   
+ +static struct task_struct *find_get_task(pid_t pid)
+ +{
+ +      struct task_struct *p;
+ +      guard(rcu)();
+ +
+ +      p = find_process_by_pid(pid);
+ +      if (likely(p))
+ +              get_task_struct(p);
+ +
+ +      return p;
+ +}
+ +
+ +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ +           find_get_task(pid), pid_t pid)
+ +
   /*
    * sched_setparam() passes in -1 for its policy, to let the functions
    * it calls know not to change it.
@@@ -7585,11 -7544,14 +7585,11 @@@ static void __setscheduler_params(struc
   static bool check_same_owner(struct task_struct *p)
   {
         const struct cred *cred = current_cred(), *pcred;
- -      bool match;
+ +      guard(rcu)();
   
- -      rcu_read_lock();
         pcred = __task_cred(p);
- -      match = (uid_eq(cred->euid, pcred->euid) ||
- -               uid_eq(cred->euid, pcred->uid));
- -      rcu_read_unlock();
- -      return match;
+ +      return (uid_eq(cred->euid, pcred->euid) ||
+ +              uid_eq(cred->euid, pcred->uid));
   }
   
   /*
@@@ -8001,17 -7963,27 +8001,17 @@@ static in
   do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
   {
         struct sched_param lparam;
- -      struct task_struct *p;
- -      int retval;
   
         if (!param || pid < 0)
                 return -EINVAL;
         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
                 return -EFAULT;
   
- -      rcu_read_lock();
- -      retval = -ESRCH;
- -      p = find_process_by_pid(pid);
- -      if (likely(p))
- -              get_task_struct(p);
- -      rcu_read_unlock();
- -
- -      if (likely(p)) {
- -              retval = sched_setscheduler(p, policy, &lparam);
- -              put_task_struct(p);
- -      }
+ +      CLASS(find_get_task, p)(pid);
+ +      if (!p)
+ +              return -ESRCH;
   
- -      return retval;
+ +      return sched_setscheduler(p, policy, &lparam);
   }
   
   /*
@@@ -8107,6 -8079,7 +8107,6 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
                                unsigned int, flags)
   {
         struct sched_attr attr;
- -      struct task_struct *p;
         int retval;
   
         if (!uattr || pid < 0 || flags)
@@@ -8121,14 -8094,21 +8121,14 @@@
         if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
                 attr.sched_policy = SETPARAM_POLICY;
   
- -      rcu_read_lock();
- -      retval = -ESRCH;
- -      p = find_process_by_pid(pid);
- -      if (likely(p))
- -              get_task_struct(p);
- -      rcu_read_unlock();
+ +      CLASS(find_get_task, p)(pid);
+ +      if (!p)
+ +              return -ESRCH;
   
- -      if (likely(p)) {
- -              if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- -                      get_params(p, &attr);
- -              retval = sched_setattr(p, &attr);
- -              put_task_struct(p);
- -      }
+ +      if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ +              get_params(p, &attr);
   
- -      return retval;
+ +      return sched_setattr(p, &attr);
   }
   
   /**
@@@ -8146,17 -8126,16 +8146,17 @@@ SYSCALL_DEFINE1(sched_getscheduler, pid
         if (pid < 0)
                 return -EINVAL;
   
- -      retval = -ESRCH;
- -      rcu_read_lock();
+ +      guard(rcu)();
         p = find_process_by_pid(pid);
- -      if (p) {
- -              retval = security_task_getscheduler(p);
- -              if (!retval)
- -                      retval = p->policy
- -                              | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+ +      if (!p)
+ +              return -ESRCH;
+ +
+ +      retval = security_task_getscheduler(p);
+ +      if (!retval) {
+ +              retval = p->policy;
+ +              if (p->sched_reset_on_fork)
+ +                      retval |= SCHED_RESET_ON_FORK;
         }
- -      rcu_read_unlock();
         return retval;
   }
   
@@@ -8177,23 -8156,30 +8177,23 @@@ SYSCALL_DEFINE2(sched_getparam, pid_t, 
         if (!param || pid < 0)
                 return -EINVAL;
   
- -      rcu_read_lock();
- -      p = find_process_by_pid(pid);
- -      retval = -ESRCH;
- -      if (!p)
- -              goto out_unlock;
+ +      scoped_guard (rcu) {
+ +              p = find_process_by_pid(pid);
+ +              if (!p)
+ +                      return -ESRCH;
   
- -      retval = security_task_getscheduler(p);
- -      if (retval)
- -              goto out_unlock;
+ +              retval = security_task_getscheduler(p);
+ +              if (retval)
+ +                      return retval;
   
- -      if (task_has_rt_policy(p))
- -              lp.sched_priority = p->rt_priority;
- -      rcu_read_unlock();
+ +              if (task_has_rt_policy(p))
+ +                      lp.sched_priority = p->rt_priority;
+ +      }
   
         /*
          * This one might sleep, we cannot do it with a spinlock held ...
          */
- -      retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
- -
- -      return retval;
- -
- -out_unlock:
- -      rcu_read_unlock();
- -      return retval;
+ +      return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
   }
   
   /*
@@@ -8253,38 -8239,46 +8253,38 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
             usize < SCHED_ATTR_SIZE_VER0 || flags)
                 return -EINVAL;
   
- -      rcu_read_lock();
- -      p = find_process_by_pid(pid);
- -      retval = -ESRCH;
- -      if (!p)
- -              goto out_unlock;
+ +      scoped_guard (rcu) {
+ +              p = find_process_by_pid(pid);
+ +              if (!p)
+ +                      return -ESRCH;
   
- -      retval = security_task_getscheduler(p);
- -      if (retval)
- -              goto out_unlock;
+ +              retval = security_task_getscheduler(p);
+ +              if (retval)
+ +                      return retval;
   
- -      kattr.sched_policy = p->policy;
- -      if (p->sched_reset_on_fork)
- -              kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- -      get_params(p, &kattr);
- -      kattr.sched_flags &= SCHED_FLAG_ALL;
+ +              kattr.sched_policy = p->policy;
+ +              if (p->sched_reset_on_fork)
+ +                      kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ +              get_params(p, &kattr);
+ +              kattr.sched_flags &= SCHED_FLAG_ALL;
   
   #ifdef CONFIG_UCLAMP_TASK
- -      /*
- -       * This could race with another potential updater, but this is fine
- -       * because it'll correctly read the old or the new value. We don't need
- -       * to guarantee who wins the race as long as it doesn't return garbage.
- -       */
- -      kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- -      kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ +              /*
+ +               * This could race with another potential updater, but this is fine
+ +               * because it'll correctly read the old or the new value. We don't need
+ +               * to guarantee who wins the race as long as it doesn't return garbage.
+ +               */
+ +              kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ +              kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
   #endif
- -
- -      rcu_read_unlock();
+ +      }
   
         return sched_attr_copy_to_user(uattr, &kattr, usize);
- -
- -out_unlock:
- -      rcu_read_unlock();
- -      return retval;
   }
   
   #ifdef CONFIG_SMP
   int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
   {
- -      int ret = 0;
- -
         /*
          * If the task isn't a deadline task or admission control is
          * disabled then we don't care about affinity changes.
@@@ -8298,11 -8292,11 +8298,11 @@@
          * tasks allowed to run on all the CPUs in the task's
          * root_domain.
          */
- -      rcu_read_lock();
+ +      guard(rcu)();
         if (!cpumask_subset(task_rq(p)->rd->span, mask))
- -              ret = -EBUSY;
- -      rcu_read_unlock();
- -      return ret;
+ +              return -EBUSY;
+ +
+ +      return 0;
   }
   #endif
   
@@@ -8372,24 -8366,39 +8372,24 @@@ long sched_setaffinity(pid_t pid, cons
   {
         struct affinity_context ac;
         struct cpumask *user_mask;
- -      struct task_struct *p;
         int retval;
   
- -      rcu_read_lock();
- -
- -      p = find_process_by_pid(pid);
- -      if (!p) {
- -              rcu_read_unlock();
+ +      CLASS(find_get_task, p)(pid);
+ +      if (!p)
                 return -ESRCH;
- -      }
- -
- -      /* Prevent p going away */
- -      get_task_struct(p);
- -      rcu_read_unlock();
   
- -      if (p->flags & PF_NO_SETAFFINITY) {
- -              retval = -EINVAL;
- -              goto out_put_task;
- -      }
+ +      if (p->flags & PF_NO_SETAFFINITY)
+ +              return -EINVAL;
   
         if (!check_same_owner(p)) {
- -              rcu_read_lock();
- -              if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
- -                      rcu_read_unlock();
- -                      retval = -EPERM;
- -                      goto out_put_task;
- -              }
- -              rcu_read_unlock();
+ +              guard(rcu)();
+ +              if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ +                      return -EPERM;
         }
   
         retval = security_task_setscheduler(p);
         if (retval)
- -              goto out_put_task;
+ +              return retval;
   
         /*
          * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
@@@ -8399,7 -8408,8 +8399,7 @@@
         if (user_mask) {
                 cpumask_copy(user_mask, in_mask);
         } else if (IS_ENABLED(CONFIG_SMP)) {
- -              retval = -ENOMEM;
- -              goto out_put_task;
+ +              return -ENOMEM;
         }
   
         ac = (struct affinity_context){
@@@ -8411,6 -8421,8 +8411,6 @@@
         retval = __sched_setaffinity(p, &ac);
         kfree(ac.user_mask);
   
- -out_put_task:
- -      put_task_struct(p);
         return retval;
   }
   
@@@ -8452,21 -8464,28 +8452,21 @@@ SYSCALL_DEFINE3(sched_setaffinity, pid_
   long sched_getaffinity(pid_t pid, struct cpumask *mask)
   {
         struct task_struct *p;
- -      unsigned long flags;
         int retval;
   
- -      rcu_read_lock();
- -
- -      retval = -ESRCH;
+ +      guard(rcu)();
         p = find_process_by_pid(pid);
         if (!p)
- -              goto out_unlock;
+ +              return -ESRCH;
   
         retval = security_task_getscheduler(p);
         if (retval)
- -              goto out_unlock;
+ +              return retval;
   
- -      raw_spin_lock_irqsave(&p->pi_lock, flags);
+ +      guard(raw_spinlock_irqsave)(&p->pi_lock);
         cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
- -      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
- -out_unlock:
- -      rcu_read_unlock();
- -
- -      return retval;
+ +      return 0;
   }
   
   /**
@@@ -8913,46 -8932,55 +8913,46 @@@ int __sched yield_to(struct task_struc
   {
         struct task_struct *curr = current;
         struct rq *rq, *p_rq;
- -      unsigned long flags;
         int yielded = 0;
   
- -      local_irq_save(flags);
- -      rq = this_rq();
+ +      scoped_guard (irqsave) {
+ +              rq = this_rq();
   
   again:
- -      p_rq = task_rq(p);
- -      /*
- -       * If we're the only runnable task on the rq and target rq also
- -       * has only one task, there's absolutely no point in yielding.
- -       */
- -      if (rq->nr_running == 1 && p_rq->nr_running == 1) {
- -              yielded = -ESRCH;
- -              goto out_irq;
- -      }
+ +              p_rq = task_rq(p);
+ +              /*
+ +               * If we're the only runnable task on the rq and target rq also
+ +               * has only one task, there's absolutely no point in yielding.
+ +               */
+ +              if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ +                      return -ESRCH;
   
- -      double_rq_lock(rq, p_rq);
- -      if (task_rq(p) != p_rq) {
- -              double_rq_unlock(rq, p_rq);
- -              goto again;
- -      }
+ +              guard(double_rq_lock)(rq, p_rq);
+ +              if (task_rq(p) != p_rq)
+ +                      goto again;
   
- -      if (!curr->sched_class->yield_to_task)
- -              goto out_unlock;
+ +              if (!curr->sched_class->yield_to_task)
+ +                      return 0;
   
- -      if (curr->sched_class != p->sched_class)
- -              goto out_unlock;
+ +              if (curr->sched_class != p->sched_class)
+ +                      return 0;
   
- -      if (task_on_cpu(p_rq, p) || !task_is_running(p))
- -              goto out_unlock;
+ +              if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ +                      return 0;
   
- -      yielded = curr->sched_class->yield_to_task(rq, p);
- -      if (yielded) {
- -              schedstat_inc(rq->yld_count);
- -              /*
- -               * Make p's CPU reschedule; pick_next_entity takes care of
- -               * fairness.
- -               */
- -              if (preempt && rq != p_rq)
- -                      resched_curr(p_rq);
+ +              yielded = curr->sched_class->yield_to_task(rq, p);
+ +              if (yielded) {
+ +                      schedstat_inc(rq->yld_count);
+ +                      /*
+ +                       * Make p's CPU reschedule; pick_next_entity
+ +                       * takes care of fairness.
+ +                       */
+ +                      if (preempt && rq != p_rq)
+ +                              resched_curr(p_rq);
+ +              }
         }
   
- -out_unlock:
- -      double_rq_unlock(rq, p_rq);
- -out_irq:
- -      local_irq_restore(flags);
- -
- -      if (yielded > 0)
+ +      if (yielded)
                 schedule();
   
         return yielded;
@@@ -9055,30 -9083,38 +9055,30 @@@ SYSCALL_DEFINE1(sched_get_priority_min
   
   static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
   {
- -      struct task_struct *p;
- -      unsigned int time_slice;
- -      struct rq_flags rf;
- -      struct rq *rq;
+ +      unsigned int time_slice = 0;
         int retval;
   
         if (pid < 0)
                 return -EINVAL;
   
- -      retval = -ESRCH;
- -      rcu_read_lock();
- -      p = find_process_by_pid(pid);
- -      if (!p)
- -              goto out_unlock;
+ +      scoped_guard (rcu) {
+ +              struct task_struct *p = find_process_by_pid(pid);
+ +              if (!p)
+ +                      return -ESRCH;
   
- -      retval = security_task_getscheduler(p);
- -      if (retval)
- -              goto out_unlock;
+ +              retval = security_task_getscheduler(p);
+ +              if (retval)
+ +                      return retval;
   
- -      rq = task_rq_lock(p, &rf);
- -      time_slice = 0;
- -      if (p->sched_class->get_rr_interval)
- -              time_slice = p->sched_class->get_rr_interval(rq, p);
- -      task_rq_unlock(rq, p, &rf);
+ +              scoped_guard (task_rq_lock, p) {
+ +                      struct rq *rq = scope.rq;
+ +                      if (p->sched_class->get_rr_interval)
+ +                              time_slice = p->sched_class->get_rr_interval(rq, p);
+ +              }
+ +      }
   
- -      rcu_read_unlock();
         jiffies_to_timespec64(time_slice, t);
         return 0;
- -
- -out_unlock:
- -      rcu_read_unlock();
- -      return retval;
   }
   
   /**
@@@ -9137,9 -9173,9 +9137,9 @@@ void sched_show_task(struct task_struc
         if (pid_alive(p))
                 ppid = task_pid_nr(rcu_dereference(p->real_parent));
         rcu_read_unlock();
- -      pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
- -              free, task_pid_nr(p), ppid,
- -              read_task_thread_flags(p));
+ +      pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ +              free, task_pid_nr(p), task_tgid_nr(p),
+ +              ppid, read_task_thread_flags(p));
   
         print_worker_info(KERN_INFO, p);
         print_stop_info(KERN_INFO, p);
@@@ -9469,11 -9505,9 +9469,11 @@@ static void balance_push(struct rq *rq
          * Temporarily drop rq->lock such that we can wake-up the stop task.
          * Both preemption and IRQs are still disabled.
          */
+ +      preempt_disable();
         raw_spin_rq_unlock(rq);
         stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
                             this_cpu_ptr(&push_work));
+ +      preempt_enable();
         /*
          * At this point need_resched() is true and we'll take the loop in
          * schedule(). The next pick is obviously going to be the stop task
@@@ -9869,7 -9903,7 +9869,7 @@@ struct task_group root_task_group
   LIST_HEAD(task_groups);
   
   /* Cacheline aligned slab cache for task_group */
- static struct kmem_cache *task_group_cache __read_mostly;
+ static struct kmem_cache *task_group_cache __ro_after_init;
   #endif
   
   void __init sched_init(void)
@@@ -9979,7 -10013,7 +9979,7 @@@
   #ifdef CONFIG_SMP
                 rq->sd = NULL;
                 rq->rd = NULL;
- -              rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
+ +              rq->cpu_capacity = SCHED_CAPACITY_SCALE;
                 rq->balance_callback = &balance_push_callback;
                 rq->active_balance = 0;
                 rq->next_balance = jiffies;
@@@ -9988,6 -10022,8 +9988,6 @@@
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
- -              rq->wake_stamp = jiffies;
- -              rq->wake_avg_idle = rq->avg_idle;
                 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
   
                 INIT_LIST_HEAD(&rq->cfs_tasks);
@@@ -10253,9 -10289,9 +10253,9 @@@ void normalize_rt_tasks(void
   
   #endif /* CONFIG_MAGIC_SYSRQ */
   
- -#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+ +#if defined(CONFIG_KGDB_KDB)
   /*
- - * These functions are only useful for the IA64 MCA handling, or kdb.
+ + * These functions are only useful for kdb.
    *
    * They can only be called when the whole system has been
    * stopped - every CPU needs to be quiescent, and no scheduling
@@@ -10277,7 -10313,30 +10277,7 @@@ struct task_struct *curr_task(int cpu
         return cpu_curr(cpu);
   }
   
- -#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
- -
- -#ifdef CONFIG_IA64
- -/**
- - * ia64_set_curr_task - set the current task for a given CPU.
- - * @cpu: the processor in question.
- - * @p: the task pointer to set.
- - *
- - * Description: This function must only be used when non-maskable interrupts
- - * are serviced on a separate stack. It allows the architecture to switch the
- - * notion of the current task on a CPU in a non-blocking manner. This function
- - * must be called with all CPU's synchronized, and interrupts disabled, the
- - * and caller must save the original value of the current task (see
- - * curr_task() above) and restore that value before reenabling interrupts and
- - * re-starting the system.
- - *
- - * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- - */
- -void ia64_set_curr_task(int cpu, struct task_struct *p)
- -{
- -      cpu_curr(cpu) = p;
- -}
- -
- -#endif
+ +#endif /* defined(CONFIG_KGDB_KDB) */
   
   #ifdef CONFIG_CGROUP_SCHED
   /* task_group_lock serializes the addition/removal of task groups */
@@@ -10439,18 -10498,17 +10439,18 @@@ void sched_move_task(struct task_struc
         int queued, running, queue_flags =
                 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
         struct task_group *group;
- -      struct rq_flags rf;
         struct rq *rq;
   
- -      rq = task_rq_lock(tsk, &rf);
+ +      CLASS(task_rq_lock, rq_guard)(tsk);
+ +      rq = rq_guard.rq;
+ +
         /*
          * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
          * group changes.
          */
         group = sched_get_task_group(tsk);
         if (group == tsk->sched_task_group)
- -              goto unlock;
+ +              return;
   
         update_rq_clock(rq);
   
@@@ -10475,6 -10533,9 +10475,6 @@@
                  */
                 resched_curr(rq);
         }
- -
- -unlock:
- -      task_rq_unlock(rq, tsk, &rf);
   }
   
   static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@@ -10511,9 -10572,11 +10511,9 @@@ static int cpu_cgroup_css_online(struc
   
   #ifdef CONFIG_UCLAMP_TASK_GROUP
         /* Propagate the effective uclamp value for the new group */
- -      mutex_lock(&uclamp_mutex);
- -      rcu_read_lock();
+ +      guard(mutex)(&uclamp_mutex);
+ +      guard(rcu)();
         cpu_util_update_eff(css);
- -      rcu_read_unlock();
- -      mutex_unlock(&uclamp_mutex);
   #endif
   
         return 0;
@@@ -10664,8 -10727,8 +10664,8 @@@ static ssize_t cpu_uclamp_write(struct 
   
         static_branch_enable(&sched_uclamp_used);
   
- -      mutex_lock(&uclamp_mutex);
- -      rcu_read_lock();
+ +      guard(mutex)(&uclamp_mutex);
+ +      guard(rcu)();
   
         tg = css_tg(of_css(of));
         if (tg->uclamp_req[clamp_id].value != req.util)
@@@ -10680,6 -10743,9 +10680,6 @@@
         /* Update effective clamps to track the most restrictive value */
         cpu_util_update_eff(of_css(of));
   
- -      rcu_read_unlock();
- -      mutex_unlock(&uclamp_mutex);
- -
         return nbytes;
   }
   
@@@ -10705,10 -10771,10 +10705,10 @@@ static inline void cpu_uclamp_print(str
         u64 percent;
         u32 rem;
   
- -      rcu_read_lock();
- -      tg = css_tg(seq_css(sf));
- -      util_clamp = tg->uclamp_req[clamp_id].value;
- -      rcu_read_unlock();
+ +      scoped_guard (rcu) {
+ +              tg = css_tg(seq_css(sf));
+ +              util_clamp = tg->uclamp_req[clamp_id].value;
+ +      }
   
         if (util_clamp == SCHED_CAPACITY_SCALE) {
                 seq_puts(sf, "max\n");
@@@ -10799,12 -10865,11 +10799,12 @@@ static int tg_set_cfs_bandwidth(struct 
          * Prevent race between setting of cfs_rq->runtime_enabled and
          * unthrottle_offline_cfs_rqs().
          */
- -      cpus_read_lock();
- -      mutex_lock(&cfs_constraints_mutex);
+ +      guard(cpus_read_lock)();
+ +      guard(mutex)(&cfs_constraints_mutex);
+ +
         ret = __cfs_schedulable(tg, period, quota);
         if (ret)
- -              goto out_unlock;
+ +              return ret;
   
         runtime_enabled = quota != RUNTIME_INF;
         runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@@ -10814,38 -10879,39 +10814,38 @@@
          */
         if (runtime_enabled && !runtime_was_enabled)
                 cfs_bandwidth_usage_inc();
- -      raw_spin_lock_irq(&cfs_b->lock);
- -      cfs_b->period = ns_to_ktime(period);
- -      cfs_b->quota = quota;
- -      cfs_b->burst = burst;
   
- -      __refill_cfs_bandwidth_runtime(cfs_b);
+ +      scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
+ +              cfs_b->period = ns_to_ktime(period);
+ +              cfs_b->quota = quota;
+ +              cfs_b->burst = burst;
   
- -      /* Restart the period timer (if active) to handle new period expiry: */
- -      if (runtime_enabled)
- -              start_cfs_bandwidth(cfs_b);
+ +              __refill_cfs_bandwidth_runtime(cfs_b);
   
- -      raw_spin_unlock_irq(&cfs_b->lock);
+ +              /*
+ +               * Restart the period timer (if active) to handle new
+ +               * period expiry:
+ +               */
+ +              if (runtime_enabled)
+ +                      start_cfs_bandwidth(cfs_b);
+ +      }
   
         for_each_online_cpu(i) {
                 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                 struct rq *rq = cfs_rq->rq;
- -              struct rq_flags rf;
   
- -              rq_lock_irq(rq, &rf);
+ +              guard(rq_lock_irq)(rq);
                 cfs_rq->runtime_enabled = runtime_enabled;
                 cfs_rq->runtime_remaining = 0;
   
                 if (cfs_rq->throttled)
                         unthrottle_cfs_rq(cfs_rq);
- -              rq_unlock_irq(rq, &rf);
         }
+ +
         if (runtime_was_enabled && !runtime_enabled)
                 cfs_bandwidth_usage_dec();
- -out_unlock:
- -      mutex_unlock(&cfs_constraints_mutex);
- -      cpus_read_unlock();
   
- -      return ret;
+ +      return 0;
   }
   
   static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@@ -11030,6 -11096,7 +11030,6 @@@ static int tg_cfs_schedulable_down(stru
   
   static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
   {
- -      int ret;
         struct cfs_schedulable_data data = {
                 .tg = tg,
                 .period = period,
@@@ -11041,8 -11108,11 +11041,8 @@@
                 do_div(data.quota, NSEC_PER_USEC);
         }
   
- -      rcu_read_lock();
- -      ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
- -      rcu_read_unlock();
- -
- -      return ret;
+ +      guard(rcu)();
+ +      return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
   }
   
   static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
@@@ -11647,12 -11717,14 +11647,12 @@@ int __sched_mm_cid_migrate_from_fetch_c
          * are not the last task to be migrated from this cpu for this mm, so
          * there is no need to move src_cid to the destination cpu.
          */
- -      rcu_read_lock();
+ +      guard(rcu)();
         src_task = rcu_dereference(src_rq->curr);
         if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- -              rcu_read_unlock();
                 t->last_mm_cid = -1;
                 return -1;
         }
- -      rcu_read_unlock();
   
         return src_cid;
   }
@@@ -11696,17 -11768,18 +11696,17 @@@ int __sched_mm_cid_migrate_from_try_ste
          * the lazy-put flag, this task will be responsible for transitioning
          * from lazy-put flag set to MM_CID_UNSET.
          */
- -      rcu_read_lock();
- -      src_task = rcu_dereference(src_rq->curr);
- -      if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- -              rcu_read_unlock();
- -              /*
- -               * We observed an active task for this mm, there is therefore
- -               * no point in moving this cid to the destination cpu.
- -               */
- -              t->last_mm_cid = -1;
- -              return -1;
+ +      scoped_guard (rcu) {
+ +              src_task = rcu_dereference(src_rq->curr);
+ +              if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ +                      /*
+ +                       * We observed an active task for this mm, there is therefore
+ +                       * no point in moving this cid to the destination cpu.
+ +                       */
+ +                      t->last_mm_cid = -1;
+ +                      return -1;
+ +              }
         }
- -      rcu_read_unlock();
   
         /*
          * The src_cid is unused, so it can be unset.
@@@ -11779,6 -11852,7 +11779,6 @@@ static void sched_mm_cid_remote_clear(s
   {
         struct rq *rq = cpu_rq(cpu);
         struct task_struct *t;
- -      unsigned long flags;
         int cid, lazy_cid;
   
         cid = READ_ONCE(pcpu_cid->cid);
@@@ -11813,21 -11887,23 +11813,21 @@@
          * the lazy-put flag, that task will be responsible for transitioning
          * from lazy-put flag set to MM_CID_UNSET.
          */
- -      rcu_read_lock();
- -      t = rcu_dereference(rq->curr);
- -      if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
- -              rcu_read_unlock();
- -              return;
+ +      scoped_guard (rcu) {
+ +              t = rcu_dereference(rq->curr);
+ +              if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
+ +                      return;
         }
- -      rcu_read_unlock();
   
         /*
          * The cid is unused, so it can be unset.
          * Disable interrupts to keep the window of cid ownership without rq
          * lock small.
          */
- -      local_irq_save(flags);
- -      if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- -              __mm_cid_put(mm, cid);
- -      local_irq_restore(flags);
+ +      scoped_guard (irqsave) {
+ +              if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ +                      __mm_cid_put(mm, cid);
+ +      }
   }
   
   static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
@@@ -11849,13 -11925,14 +11849,13 @@@
          * snapshot associated with this cid if an active task using the mm is
          * observed on this rq.
          */
- -      rcu_read_lock();
- -      curr = rcu_dereference(rq->curr);
- -      if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- -              WRITE_ONCE(pcpu_cid->time, rq_clock);
- -              rcu_read_unlock();
- -              return;
+ +      scoped_guard (rcu) {
+ +              curr = rcu_dereference(rq->curr);
+ +              if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ +                      WRITE_ONCE(pcpu_cid->time, rq_clock);
+ +                      return;
+ +              }
         }
- -      rcu_read_unlock();
   
         if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
                 return;
@@@ -11949,6 -12026,7 +11949,6 @@@ void task_tick_mm_cid(struct rq *rq, st
   void sched_mm_cid_exit_signals(struct task_struct *t)
   {
         struct mm_struct *mm = t->mm;
- -      struct rq_flags rf;
         struct rq *rq;
   
         if (!mm)
@@@ -11956,7 -12034,7 +11956,7 @@@
   
         preempt_disable();
         rq = this_rq();
- -      rq_lock_irqsave(rq, &rf);
+ +      guard(rq_lock_irqsave)(rq);
         preempt_enable_no_resched();    /* holding spinlock */
         WRITE_ONCE(t->mm_cid_active, 0);
         /*
@@@ -11966,11 -12044,13 +11966,11 @@@
         smp_mb();
         mm_cid_put(mm);
         t->last_mm_cid = t->mm_cid = -1;
- -      rq_unlock_irqrestore(rq, &rf);
   }
   
   void sched_mm_cid_before_execve(struct task_struct *t)
   {
         struct mm_struct *mm = t->mm;
- -      struct rq_flags rf;
         struct rq *rq;
   
         if (!mm)
@@@ -11978,7 -12058,7 +11978,7 @@@
   
         preempt_disable();
         rq = this_rq();
- -      rq_lock_irqsave(rq, &rf);
+ +      guard(rq_lock_irqsave)(rq);
         preempt_enable_no_resched();    /* holding spinlock */
         WRITE_ONCE(t->mm_cid_active, 0);
         /*
@@@ -11988,11 -12068,13 +11988,11 @@@
         smp_mb();
         mm_cid_put(mm);
         t->last_mm_cid = t->mm_cid = -1;
- -      rq_unlock_irqrestore(rq, &rf);
   }
   
   void sched_mm_cid_after_execve(struct task_struct *t)
   {
         struct mm_struct *mm = t->mm;
- -      struct rq_flags rf;
         struct rq *rq;
   
         if (!mm)
@@@ -12000,16 -12082,16 +12000,16 @@@
   
         preempt_disable();
         rq = this_rq();
- -      rq_lock_irqsave(rq, &rf);
- -      preempt_enable_no_resched();    /* holding spinlock */
- -      WRITE_ONCE(t->mm_cid_active, 1);
- -      /*
- -       * Store t->mm_cid_active before loading per-mm/cpu cid.
- -       * Matches barrier in sched_mm_cid_remote_clear_old().
- -       */
- -      smp_mb();
- -      t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
- -      rq_unlock_irqrestore(rq, &rf);
+ +      scoped_guard (rq_lock_irqsave, rq) {
+ +              preempt_enable_no_resched();    /* holding spinlock */
+ +              WRITE_ONCE(t->mm_cid_active, 1);
+ +              /*
+ +               * Store t->mm_cid_active before loading per-mm/cpu cid.
+ +               * Matches barrier in sched_mm_cid_remote_clear_old().
+ +               */
+ +              smp_mb();
+ +              t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ +      }
         rseq_set_notify_resume(t);
   }
   
diff --combined kernel/signal.c

index 83fcbaf0e82de0be624c64bcebf6bbdaeda855c6,edaf39382d21d8d851002a4962e1bc8df0c5d040..47a7602dfe8df43e786b7b33119b71599372b151
--- 1/kernel/signal.c
--- 2/kernel/signal.c
+++ b/kernel/signal.c
@@@ -415,7 -415,7 +415,7 @@@ __sigqueue_alloc(int sig, struct task_s
                  int override_rlimit, const unsigned int sigqueue_flags)
   {
         struct sigqueue *q = NULL;
-       struct ucounts *ucounts = NULL;
+       struct ucounts *ucounts;
         long sigpending;
   
         /*
@@@ -1058,12 -1058,11 +1058,11 @@@ static void complete_signal(int sig, st
                         signal->flags = SIGNAL_GROUP_EXIT;
                         signal->group_exit_code = sig;
                         signal->group_stop_count = 0;
-                       t = p;
-                       do {
+                       __for_each_thread(signal, t) {
                                 task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                                 sigaddset(&t->pending.signal, SIGKILL);
                                 signal_wake_up(t, 1);
-                       } while_each_thread(p, t);
+                       }
                         return;
                 }
         }
@@@ -1471,16 -1470,21 +1470,21 @@@ int group_send_sig_info(int sig, struc
   int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
   {
         struct task_struct *p = NULL;
-       int retval, success;
+       int ret = -ESRCH;
   
-       success = 0;
-       retval = -ESRCH;
         do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                 int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
-               success |= !err;
-               retval = err;
+               /*
+                * If group_send_sig_info() succeeds at least once ret
+                * becomes 0 and after that the code below has no effect.
+                * Otherwise we return the last err or -ESRCH if this
+                * process group is empty.
+                */
+               if (ret)
+                       ret = err;
         } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
-       return success ? 0 : retval;
+ 
+       return ret;
   }
   
   int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
@@@ -1718,8 -1722,9 +1722,8 @@@ void force_sigsegv(int sig
                 force_sig(SIGSEGV);
   }
   
- -int force_sig_fault_to_task(int sig, int code, void __user *addr
- -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- -      , struct task_struct *t)
+ +int force_sig_fault_to_task(int sig, int code, void __user *addr,
+ +                          struct task_struct *t)
   {
         struct kernel_siginfo info;
   
@@@ -1728,15 -1733,24 +1732,15 @@@
         info.si_errno = 0;
         info.si_code  = code;
         info.si_addr  = addr;
- -#ifdef __ia64__
- -      info.si_imm = imm;
- -      info.si_flags = flags;
- -      info.si_isr = isr;
- -#endif
         return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
   }
   
- -int force_sig_fault(int sig, int code, void __user *addr
- -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+ +int force_sig_fault(int sig, int code, void __user *addr)
   {
- -      return force_sig_fault_to_task(sig, code, addr
- -                                     ___ARCH_SI_IA64(imm, flags, isr), current);
+ +      return force_sig_fault_to_task(sig, code, addr, current);
   }
   
- -int send_sig_fault(int sig, int code, void __user *addr
- -      ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
- -      , struct task_struct *t)
+ +int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
   {
         struct kernel_siginfo info;
   
@@@ -1745,6 -1759,11 +1749,6 @@@
         info.si_errno = 0;
         info.si_code  = code;
         info.si_addr  = addr;
- -#ifdef __ia64__
- -      info.si_imm = imm;
- -      info.si_flags = flags;
- -      info.si_isr = isr;
- -#endif
         return send_sig_info(info.si_signo, &info, t);
   }
   
@@@ -2314,38 -2333,15 +2318,38 @@@ static int ptrace_stop(int exit_code, i
                 do_notify_parent_cldstop(current, false, why);
   
         /*
- -       * Don't want to allow preemption here, because
- -       * sys_ptrace() needs this task to be inactive.
+ +       * The previous do_notify_parent_cldstop() invocation woke ptracer.
+ +       * On a PREEMPTION kernel this can result in preemption requirement
+ +       * which will be fulfilled after read_unlock() and the ptracer will be
+ +       * put on the CPU.
+ +       * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
+ +       * this task to wait in schedule(). If this task gets preempted then it
+ +       * remains enqueued on the runqueue. The ptracer will observe this and
+ +       * then sleep for a delay of one HZ tick. In the meantime this task
+ +       * gets scheduled, enters schedule() and will wait for the ptracer.
+ +       *
+ +       * This preemption point is not bad from a correctness point of
+ +       * view but extends the runtime by one HZ tick time due to the
+ +       * ptracer's sleep.  The preempt-disable section ensures that there
+ +       * will be no preemption between unlock and schedule() and so
+ +       * improving the performance since the ptracer will observe that
+ +       * the tracee is scheduled out once it gets on the CPU.
          *
- -       * XXX: implement read_unlock_no_resched().
+ +       * On PREEMPT_RT locking tasklist_lock does not disable preemption.
+ +       * Therefore the task can be preempted after do_notify_parent_cldstop()
+ +       * before unlocking tasklist_lock so there is no benefit in doing this.
+ +       *
+ +       * In fact disabling preemption is harmful on PREEMPT_RT because
+ +       * the spinlock_t in cgroup_enter_frozen() must not be acquired
+ +       * with preemption disabled due to the 'sleeping' spinlock
+ +       * substitution of RT.
          */
- -      preempt_disable();
+ +      if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ +              preempt_disable();
         read_unlock(&tasklist_lock);
         cgroup_enter_frozen();
- -      preempt_enable_no_resched();
+ +      if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ +              preempt_enable_no_resched();
         schedule();
         cgroup_leave_frozen(true);
   
diff --combined kernel/sys.c

index 4a8073c1b2558e6d5f050316b21c3da7e15c4331,67436d465be45160772e6977c3b97dd20194fd43..420d9cb9cc8e203f50014bb2ec564f6598d9869c
--- 1/kernel/sys.c
--- 2/kernel/sys.c
+++ b/kernel/sys.c
@@@ -1786,6 -1786,7 +1786,7 @@@ void getrusage(struct task_struct *p, i
         unsigned long flags;
         u64 tgutime, tgstime, utime, stime;
         unsigned long maxrss = 0;
+       struct signal_struct *sig = p->signal;
   
         memset((char *)r, 0, sizeof (*r));
         utime = stime = 0;
@@@ -1793,7 -1794,7 +1794,7 @@@
         if (who == RUSAGE_THREAD) {
                 task_cputime_adjusted(current, &utime, &stime);
                 accumulate_thread_rusage(p, r);
-               maxrss = p->signal->maxrss;
+               maxrss = sig->maxrss;
                 goto out;
         }
   
@@@ -1803,15 -1804,15 +1804,15 @@@
         switch (who) {
         case RUSAGE_BOTH:
         case RUSAGE_CHILDREN:
-               utime = p->signal->cutime;
-               stime = p->signal->cstime;
-               r->ru_nvcsw = p->signal->cnvcsw;
-               r->ru_nivcsw = p->signal->cnivcsw;
-               r->ru_minflt = p->signal->cmin_flt;
-               r->ru_majflt = p->signal->cmaj_flt;
-               r->ru_inblock = p->signal->cinblock;
-               r->ru_oublock = p->signal->coublock;
-               maxrss = p->signal->cmaxrss;
+               utime = sig->cutime;
+               stime = sig->cstime;
+               r->ru_nvcsw = sig->cnvcsw;
+               r->ru_nivcsw = sig->cnivcsw;
+               r->ru_minflt = sig->cmin_flt;
+               r->ru_majflt = sig->cmaj_flt;
+               r->ru_inblock = sig->cinblock;
+               r->ru_oublock = sig->coublock;
+               maxrss = sig->cmaxrss;
   
                 if (who == RUSAGE_CHILDREN)
                         break;
@@@ -1821,18 -1822,16 +1822,16 @@@
                 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                 utime += tgutime;
                 stime += tgstime;
-               r->ru_nvcsw += p->signal->nvcsw;
-               r->ru_nivcsw += p->signal->nivcsw;
-               r->ru_minflt += p->signal->min_flt;
-               r->ru_majflt += p->signal->maj_flt;
-               r->ru_inblock += p->signal->inblock;
-               r->ru_oublock += p->signal->oublock;
-               if (maxrss < p->signal->maxrss)
-                       maxrss = p->signal->maxrss;
-               t = p;
-               do {
+               r->ru_nvcsw += sig->nvcsw;
+               r->ru_nivcsw += sig->nivcsw;
+               r->ru_minflt += sig->min_flt;
+               r->ru_majflt += sig->maj_flt;
+               r->ru_inblock += sig->inblock;
+               r->ru_oublock += sig->oublock;
+               if (maxrss < sig->maxrss)
+                       maxrss = sig->maxrss;
+               __for_each_thread(sig, t)
                         accumulate_thread_rusage(t, r);
-               } while_each_thread(p, t);
                 break;
   
         default:
@@@ -2368,41 -2367,19 +2367,41 @@@ static int prctl_set_vma(unsigned long 
   }
   #endif /* CONFIG_ANON_VMA_NAME */
   
+ +static inline unsigned long get_current_mdwe(void)
+ +{
+ +      unsigned long ret = 0;
+ +
+ +      if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
+ +              ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+ +      if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags))
+ +              ret |= PR_MDWE_NO_INHERIT;
+ +
+ +      return ret;
+ +}
+ +
   static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
                                  unsigned long arg4, unsigned long arg5)
   {
+ +      unsigned long current_bits;
+ +
         if (arg3 || arg4 || arg5)
                 return -EINVAL;
   
- -      if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ +      if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+ +              return -EINVAL;
+ +
+ +      /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+ +      if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
                 return -EINVAL;
   
+ +      current_bits = get_current_mdwe();
+ +      if (current_bits && current_bits != bits)
+ +              return -EPERM; /* Cannot unset the flags */
+ +
+ +      if (bits & PR_MDWE_NO_INHERIT)
+ +              set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags);
         if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
                 set_bit(MMF_HAS_MDWE, &current->mm->flags);
- -      else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
- -              return -EPERM; /* Cannot unset the flag */
   
         return 0;
   }
@@@ -2412,7 -2389,9 +2411,7 @@@ static inline int prctl_get_mdwe(unsign
   {
         if (arg2 || arg3 || arg4 || arg5)
                 return -EINVAL;
- -
- -      return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
- -              PR_MDWE_REFUSE_EXEC_GAIN : 0;
+ +      return get_current_mdwe();
   }
   
   static int prctl_get_auxv(void __user *addr, unsigned long len)
diff --combined kernel/user_namespace.c

index d52a894ecf57d0abfb33e196d8fae879c6c6f884,bf2cb8c115711a10d6a07b2aa6a948eeb2f82290..eabe8bcc70426519bdfef4b08e53a86451ba76c2
--- 1/kernel/user_namespace.c
--- 2/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@@ -22,7 -22,7 +22,7 @@@
   #include <linux/bsearch.h>
   #include <linux/sort.h>
   
- static struct kmem_cache *user_ns_cachep __read_mostly;
+ static struct kmem_cache *user_ns_cachep __ro_after_init;
   static DEFINE_MUTEX(userns_state_mutex);
   
   static bool new_idmap_permitted(const struct file *file,
@@@ -213,9 -213,6 +213,9 @@@ static void free_user_ns(struct work_st
                         kfree(ns->projid_map.forward);
                         kfree(ns->projid_map.reverse);
                 }
+ +#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ +              kfree(ns->binfmt_misc);
+ +#endif
                 retire_userns_sysctls(ns);
                 key_free_user_ns(ns);
                 ns_free_inum(&ns->ns);
diff --combined kernel/workqueue.c

index 0f682da96e1c52ea42aa8a23590cf8448fa27334,96b89f0edbe3ae56aab0e152fa45d4f0af59b737..6e578f576a6f2b73b98817b0f5489a79c9d85524
--- 1/kernel/workqueue.c
--- 2/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@@ -418,21 -418,21 +418,21 @@@ static struct workqueue_attrs *ordered_
    * process context while holding a pool lock. Bounce to a dedicated kthread
    * worker to avoid A-A deadlocks.
    */
- static struct kthread_worker *pwq_release_worker;
+ static struct kthread_worker *pwq_release_worker __ro_after_init;
   
- struct workqueue_struct *system_wq __read_mostly;
+ struct workqueue_struct *system_wq __ro_after_init;
   EXPORT_SYMBOL(system_wq);
- struct workqueue_struct *system_highpri_wq __read_mostly;
+ struct workqueue_struct *system_highpri_wq __ro_after_init;
   EXPORT_SYMBOL_GPL(system_highpri_wq);
- struct workqueue_struct *system_long_wq __read_mostly;
+ struct workqueue_struct *system_long_wq __ro_after_init;
   EXPORT_SYMBOL_GPL(system_long_wq);
- struct workqueue_struct *system_unbound_wq __read_mostly;
+ struct workqueue_struct *system_unbound_wq __ro_after_init;
   EXPORT_SYMBOL_GPL(system_unbound_wq);
- struct workqueue_struct *system_freezable_wq __read_mostly;
+ struct workqueue_struct *system_freezable_wq __ro_after_init;
   EXPORT_SYMBOL_GPL(system_freezable_wq);
- struct workqueue_struct *system_power_efficient_wq __read_mostly;
+ struct workqueue_struct *system_power_efficient_wq __ro_after_init;
   EXPORT_SYMBOL_GPL(system_power_efficient_wq);
- struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+ struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
   EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
   
   static int worker_thread(void *__worker);
@@@ -2166,7 -2166,7 +2166,7 @@@ static struct worker *create_worker(str
   {
         struct worker *worker;
         int id;
- -      char id_buf[16];
+ +      char id_buf[23];
   
         /* ID is needed to determine kthread name */
         id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
@@@ -4600,22 -4600,12 +4600,22 @@@ static int alloc_and_link_pwqs(struct w
         }
         cpus_read_unlock();
   
+ +      /* for unbound pwq, flushing the pwq_release_worker ensures that the
+ +       * pwq_release_workfn() completes before calling kfree(wq).
+ +       */
+ +      if (ret)
+ +              kthread_flush_worker(pwq_release_worker);
+ +
         return ret;
   
   enomem:
         if (wq->cpu_pwq) {
- -              for_each_possible_cpu(cpu)
- -                      kfree(*per_cpu_ptr(wq->cpu_pwq, cpu));
+ +              for_each_possible_cpu(cpu) {
+ +                      struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
+ +
+ +                      if (pwq)
+ +                              kmem_cache_free(pwq_cache, pwq);
+ +              }
                 free_percpu(wq->cpu_pwq);
                 wq->cpu_pwq = NULL;
         }
@@@ -5622,54 -5612,50 +5622,54 @@@ static void work_for_cpu_fn(struct work
   }
   
   /**
- - * work_on_cpu - run a function in thread context on a particular cpu
+ + * work_on_cpu_key - run a function in thread context on a particular cpu
    * @cpu: the cpu to run on
    * @fn: the function to run
    * @arg: the function arg
+ + * @key: The lock class key for lock debugging purposes
    *
    * It is up to the caller to ensure that the cpu doesn't go offline.
    * The caller must not hold any locks which would prevent @fn from completing.
    *
    * Return: The value @fn returns.
    */
- -long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
+ +long work_on_cpu_key(int cpu, long (*fn)(void *),
+ +                   void *arg, struct lock_class_key *key)
   {
         struct work_for_cpu wfc = { .fn = fn, .arg = arg };
   
- -      INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ +      INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
         schedule_work_on(cpu, &wfc.work);
         flush_work(&wfc.work);
         destroy_work_on_stack(&wfc.work);
         return wfc.ret;
   }
- -EXPORT_SYMBOL_GPL(work_on_cpu);
+ +EXPORT_SYMBOL_GPL(work_on_cpu_key);
   
   /**
- - * work_on_cpu_safe - run a function in thread context on a particular cpu
+ + * work_on_cpu_safe_key - run a function in thread context on a particular cpu
    * @cpu: the cpu to run on
    * @fn:  the function to run
    * @arg: the function argument
+ + * @key: The lock class key for lock debugging purposes
    *
    * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
    * any locks which would prevent @fn from completing.
    *
    * Return: The value @fn returns.
    */
- -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+ +long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
+ +                        void *arg, struct lock_class_key *key)
   {
         long ret = -ENODEV;
   
         cpus_read_lock();
         if (cpu_online(cpu))
- -              ret = work_on_cpu(cpu, fn, arg);
+ +              ret = work_on_cpu_key(cpu, fn, arg, key);
         cpus_read_unlock();
         return ret;
   }
- -EXPORT_SYMBOL_GPL(work_on_cpu_safe);
+ +EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
   #endif /* CONFIG_SMP */
   
   #ifdef CONFIG_FREEZER
@@@ -5796,13 -5782,9 +5796,13 @@@ static int workqueue_apply_unbound_cpum
         list_for_each_entry(wq, &workqueues, list) {
                 if (!(wq->flags & WQ_UNBOUND))
                         continue;
+ +
                 /* creating multiple pwqs breaks ordering guarantee */
- -              if (wq->flags & __WQ_ORDERED)
- -                      continue;
+ +              if (!list_empty(&wq->pwqs)) {
+ +                      if (wq->flags & __WQ_ORDERED_EXPLICIT)
+ +                              continue;
+ +                      wq->flags &= ~__WQ_ORDERED;
+ +              }
   
                 ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
                 if (IS_ERR(ctx)) {
diff --combined mm/damon/core.c

index aa2dc7087cd93192bc21d5d8d6977823ff43c949,2f54f153d7f53228fa183ee4f025e3975d72d42f..630077d95dc60721015ea4b195c85e1c73f484ce
--- 1/mm/damon/core.c
--- 2/mm/damon/core.c
+++ b/mm/damon/core.c
@@@ -128,7 -128,6 +128,7 @@@ struct damon_region *damon_new_region(u
         region->ar.start = start;
         region->ar.end = end;
         region->nr_accesses = 0;
+ +      region->nr_accesses_bp = 0;
         INIT_LIST_HEAD(&region->list);
   
         region->age = 0;
@@@ -313,9 -312,7 +313,9 @@@ static struct damos_quota *damos_quota_
   }
   
   struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
- -                      enum damos_action action, struct damos_quota *quota,
+ +                      enum damos_action action,
+ +                      unsigned long apply_interval_us,
+ +                      struct damos_quota *quota,
                         struct damos_watermarks *wmarks)
   {
         struct damos *scheme;
@@@ -325,13 -322,6 +325,13 @@@
                 return NULL;
         scheme->pattern = *pattern;
         scheme->action = action;
+ +      scheme->apply_interval_us = apply_interval_us;
+ +      /*
+ +       * next_apply_sis will be set when kdamond starts.  While kdamond is
+ +       * running, it will also be updated when it is added to the DAMON
+ +       * context, or damon_attrs are updated.
+ +       */
+ +      scheme->next_apply_sis = 0;
         INIT_LIST_HEAD(&scheme->filters);
         scheme->stat = (struct damos_stat){};
         INIT_LIST_HEAD(&scheme->list);
@@@ -344,21 -334,9 +344,21 @@@
         return scheme;
   }
   
+ +static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx)
+ +{
+ +      unsigned long sample_interval = ctx->attrs.sample_interval ?
+ +              ctx->attrs.sample_interval : 1;
+ +      unsigned long apply_interval = s->apply_interval_us ?
+ +              s->apply_interval_us : ctx->attrs.aggr_interval;
+ +
+ +      s->next_apply_sis = ctx->passed_sample_intervals +
+ +              apply_interval / sample_interval;
+ +}
+ +
   void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
   {
         list_add_tail(&s->list, &ctx->schemes);
+ +      damos_set_next_apply_sis(s, ctx);
   }
   
   static void damon_del_scheme(struct damos *s)
@@@ -449,10 -427,8 +449,10 @@@ struct damon_ctx *damon_new_ctx(void
         ctx->attrs.aggr_interval = 100 * 1000;
         ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
   
- -      ktime_get_coarse_ts64(&ctx->last_aggregation);
- -      ctx->last_ops_update = ctx->last_aggregation;
+ +      ctx->passed_sample_intervals = 0;
+ +      /* These will be set from kdamond_init_intervals_sis() */
+ +      ctx->next_aggregation_sis = 0;
+ +      ctx->next_ops_update_sis = 0;
   
         mutex_init(&ctx->kdamond_lock);
   
@@@ -500,14 -476,20 +500,14 @@@ static unsigned int damon_age_for_new_a
   static unsigned int damon_accesses_bp_to_nr_accesses(
                 unsigned int accesses_bp, struct damon_attrs *attrs)
   {
- -      unsigned int max_nr_accesses =
- -              attrs->aggr_interval / attrs->sample_interval;
- -
- -      return accesses_bp * max_nr_accesses / 10000;
+ +      return accesses_bp * damon_max_nr_accesses(attrs) / 10000;
   }
   
   /* convert nr_accesses to access ratio in bp (per 10,000) */
   static unsigned int damon_nr_accesses_to_accesses_bp(
                 unsigned int nr_accesses, struct damon_attrs *attrs)
   {
- -      unsigned int max_nr_accesses =
- -              attrs->aggr_interval / attrs->sample_interval;
- -
- -      return nr_accesses * 10000 / max_nr_accesses;
+ +      return nr_accesses * 10000 / damon_max_nr_accesses(attrs);
   }
   
   static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses,
@@@ -524,7 -506,6 +524,7 @@@ static void damon_update_monitoring_res
   {
         r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses,
                         old_attrs, new_attrs);
+ +      r->nr_accesses_bp = r->nr_accesses * 10000;
         r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs);
   }
   
@@@ -560,21 -541,13 +560,21 @@@ static void damon_update_monitoring_res
    * @ctx:              monitoring context
    * @attrs:            monitoring attributes
    *
- - * This function should not be called while the kdamond is running.
+ + * This function should be called while the kdamond is not running, or an
+ + * access check results aggregation is not ongoing (e.g., from
+ + * &struct damon_callback->after_aggregation or
+ + * &struct damon_callback->after_wmarks_check callbacks).
+ + *
    * Every time interval is in micro-seconds.
    *
    * Return: 0 on success, negative error code otherwise.
    */
   int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
   {
+ +      unsigned long sample_interval = attrs->sample_interval ?
+ +              attrs->sample_interval : 1;
+ +      struct damos *s;
+ +
         if (attrs->min_nr_regions < 3)
                 return -EINVAL;
         if (attrs->min_nr_regions > attrs->max_nr_regions)
@@@ -582,17 -555,8 +582,17 @@@
         if (attrs->sample_interval > attrs->aggr_interval)
                 return -EINVAL;
   
+ +      ctx->next_aggregation_sis = ctx->passed_sample_intervals +
+ +              attrs->aggr_interval / sample_interval;
+ +      ctx->next_ops_update_sis = ctx->passed_sample_intervals +
+ +              attrs->ops_update_interval / sample_interval;
+ +
         damon_update_monitoring_results(ctx, attrs);
         ctx->attrs = *attrs;
+ +
+ +      damon_for_each_scheme(s, ctx)
+ +              damos_set_next_apply_sis(s, ctx);
+ +
         return 0;
   }
   
@@@ -735,8 -699,7 +735,7 @@@ static int __damon_stop(struct damon_ct
         if (tsk) {
                 get_task_struct(tsk);
                 mutex_unlock(&ctx->kdamond_lock);
-               kthread_stop(tsk);
-               put_task_struct(tsk);
+               kthread_stop_put(tsk);
                 return 0;
         }
         mutex_unlock(&ctx->kdamond_lock);
@@@ -764,6 -727,38 +763,6 @@@ int damon_stop(struct damon_ctx **ctxs
         return err;
   }
   
- -/*
- - * damon_check_reset_time_interval() - Check if a time interval is elapsed.
- - * @baseline: the time to check whether the interval has elapsed since
- - * @interval: the time interval (microseconds)
- - *
- - * See whether the given time interval has passed since the given baseline
- - * time.  If so, it also updates the baseline to current time for next check.
- - *
- - * Return:    true if the time interval has passed, or false otherwise.
- - */
- -static bool damon_check_reset_time_interval(struct timespec64 *baseline,
- -              unsigned long interval)
- -{
- -      struct timespec64 now;
- -
- -      ktime_get_coarse_ts64(&now);
- -      if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) <
- -                      interval * 1000)
- -              return false;
- -      *baseline = now;
- -      return true;
- -}
- -
- -/*
- - * Check whether it is time to flush the aggregated information
- - */
- -static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
- -{
- -      return damon_check_reset_time_interval(&ctx->last_aggregation,
- -                      ctx->attrs.aggr_interval);
- -}
- -
   /*
    * Reset the aggregated monitoring results ('nr_accesses' of each region).
    */
@@@ -776,7 -771,7 +775,7 @@@ static void kdamond_reset_aggregated(st
                 struct damon_region *r;
   
                 damon_for_each_region(r, t) {
- -                      trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
+ +                      trace_damon_aggregated(ti, r, damon_nr_regions(t));
                         r->last_nr_accesses = r->nr_accesses;
                         r->nr_accesses = 0;
                 }
@@@ -790,13 -785,12 +789,13 @@@ static void damon_split_region_at(struc
   static bool __damos_valid_target(struct damon_region *r, struct damos *s)
   {
         unsigned long sz;
+ +      unsigned int nr_accesses = r->nr_accesses_bp / 10000;
   
         sz = damon_sz_region(r);
         return s->pattern.min_sz_region <= sz &&
                 sz <= s->pattern.max_sz_region &&
- -              s->pattern.min_nr_accesses <= r->nr_accesses &&
- -              r->nr_accesses <= s->pattern.max_nr_accesses &&
+ +              s->pattern.min_nr_accesses <= nr_accesses &&
+ +              nr_accesses <= s->pattern.max_nr_accesses &&
                 s->pattern.min_age_region <= r->age &&
                 r->age <= s->pattern.max_age_region;
   }
@@@ -951,33 -945,6 +950,33 @@@ static void damos_apply_scheme(struct d
         struct timespec64 begin, end;
         unsigned long sz_applied = 0;
         int err = 0;
+ +      /*
+ +       * We plan to support multiple contexts per kdamond, as DAMON sysfs
+ +       * implies with 'nr_contexts' file.  Nevertheless, only single context
+ +       * per kdamond is supported for now.  So, we can simply use '0' context
+ +       * index here.
+ +       */
+ +      unsigned int cidx = 0;
+ +      struct damos *siter;            /* schemes iterator */
+ +      unsigned int sidx = 0;
+ +      struct damon_target *titer;     /* targets iterator */
+ +      unsigned int tidx = 0;
+ +      bool do_trace = false;
+ +
+ +      /* get indices for trace_damos_before_apply() */
+ +      if (trace_damos_before_apply_enabled()) {
+ +              damon_for_each_scheme(siter, c) {
+ +                      if (siter == s)
+ +                              break;
+ +                      sidx++;
+ +              }
+ +              damon_for_each_target(titer, c) {
+ +                      if (titer == t)
+ +                              break;
+ +                      tidx++;
+ +              }
+ +              do_trace = true;
+ +      }
   
         if (c->ops.apply_scheme) {
                 if (quota->esz && quota->charged_sz + sz > quota->esz) {
@@@ -992,11 -959,8 +991,11 @@@
                 ktime_get_coarse_ts64(&begin);
                 if (c->callback.before_damos_apply)
                         err = c->callback.before_damos_apply(c, t, r, s);
- -              if (!err)
+ +              if (!err) {
+ +                      trace_damos_before_apply(cidx, sidx, tidx, r,
+ +                                      damon_nr_regions(t), do_trace);
                         sz_applied = c->ops.apply_scheme(c, t, r, s);
+ +              }
                 ktime_get_coarse_ts64(&end);
                 quota->total_charged_ns += timespec64_to_ns(&end) -
                         timespec64_to_ns(&begin);
@@@ -1114,29 -1078,14 +1113,29 @@@ static void kdamond_apply_schemes(struc
         struct damon_target *t;
         struct damon_region *r, *next_r;
         struct damos *s;
+ +      unsigned long sample_interval = c->attrs.sample_interval ?
+ +              c->attrs.sample_interval : 1;
+ +      bool has_schemes_to_apply = false;
   
         damon_for_each_scheme(s, c) {
+ +              if (c->passed_sample_intervals != s->next_apply_sis)
+ +                      continue;
+ +
+ +              s->next_apply_sis +=
+ +                      (s->apply_interval_us ? s->apply_interval_us :
+ +                       c->attrs.aggr_interval) / sample_interval;
+ +
                 if (!s->wmarks.activated)
                         continue;
   
+ +              has_schemes_to_apply = true;
+ +
                 damos_adjust_quota(c, s);
         }
   
+ +      if (!has_schemes_to_apply)
+ +              return;
+ +
         damon_for_each_target(t, c) {
                 damon_for_each_region_safe(r, next_r, t)
                         damon_do_apply_schemes(c, t, r);
@@@ -1153,7 -1102,6 +1152,7 @@@ static void damon_merge_two_regions(str
   
         l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
                         (sz_l + sz_r);
+ +      l->nr_accesses_bp = l->nr_accesses * 10000;
         l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
         l->ar.end = r->ar.end;
         damon_destroy_region(r, t);
@@@ -1225,7 -1173,6 +1224,7 @@@ static void damon_split_region_at(struc
   
         new->age = r->age;
         new->last_nr_accesses = r->last_nr_accesses;
+ +      new->nr_accesses_bp = r->nr_accesses_bp;
   
         damon_insert_region(new, r, damon_next_region(r), t);
   }
@@@ -1292,6 -1239,18 +1291,6 @@@ static void kdamond_split_regions(struc
         last_nr_regions = nr_regions;
   }
   
- -/*
- - * Check whether it is time to check and apply the operations-related data
- - * structures.
- - *
- - * Returns true if it is.
- - */
- -static bool kdamond_need_update_operations(struct damon_ctx *ctx)
- -{
- -      return damon_check_reset_time_interval(&ctx->last_ops_update,
- -                      ctx->attrs.ops_update_interval);
- -}
- -
   /*
    * Check whether current monitoring should be stopped
    *
@@@ -1320,10 -1279,12 +1319,10 @@@ static bool kdamond_need_stop(struct da
   
   static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
   {
- -      struct sysinfo i;
- -
         switch (metric) {
         case DAMOS_WMARK_FREE_MEM_RATE:
- -              si_meminfo(&i);
- -              return i.freeram * 1000 / i.totalram;
+ +              return global_zone_page_state(NR_FREE_PAGES) * 1000 /
+ +                     totalram_pages();
         default:
                 break;
         }
@@@ -1401,25 -1362,6 +1400,25 @@@ static int kdamond_wait_activation(stru
         return -EBUSY;
   }
   
+ +static void kdamond_init_intervals_sis(struct damon_ctx *ctx)
+ +{
+ +      unsigned long sample_interval = ctx->attrs.sample_interval ?
+ +              ctx->attrs.sample_interval : 1;
+ +      unsigned long apply_interval;
+ +      struct damos *scheme;
+ +
+ +      ctx->passed_sample_intervals = 0;
+ +      ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval;
+ +      ctx->next_ops_update_sis = ctx->attrs.ops_update_interval /
+ +              sample_interval;
+ +
+ +      damon_for_each_scheme(scheme, ctx) {
+ +              apply_interval = scheme->apply_interval_us ?
+ +                      scheme->apply_interval_us : ctx->attrs.aggr_interval;
+ +              scheme->next_apply_sis = apply_interval / sample_interval;
+ +      }
+ +}
+ +
   /*
    * The monitoring daemon that runs as a kernel thread
    */
@@@ -1433,8 -1375,6 +1432,8 @@@ static int kdamond_fn(void *data
   
         pr_debug("kdamond (%d) starts\n", current->pid);
   
+ +      kdamond_init_intervals_sis(ctx);
+ +
         if (ctx->ops.init)
                 ctx->ops.init(ctx);
         if (ctx->callback.before_start && ctx->callback.before_start(ctx))
@@@ -1443,17 -1383,6 +1442,17 @@@
         sz_limit = damon_region_sz_limit(ctx);
   
         while (!kdamond_need_stop(ctx)) {
+ +              /*
+ +               * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
+ +               * be changed from after_wmarks_check() or after_aggregation()
+ +               * callbacks.  Read the values here, and use those for this
+ +               * iteration.  That is, damon_set_attrs() updated new values
+ +               * are respected from next iteration.
+ +               */
+ +              unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
+ +              unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
+ +              unsigned long sample_interval = ctx->attrs.sample_interval;
+ +
                 if (kdamond_wait_activation(ctx))
                         break;
   
@@@ -1463,44 -1392,27 +1462,44 @@@
                                 ctx->callback.after_sampling(ctx))
                         break;
   
- -              kdamond_usleep(ctx->attrs.sample_interval);
+ +              kdamond_usleep(sample_interval);
+ +              ctx->passed_sample_intervals++;
   
                 if (ctx->ops.check_accesses)
                         max_nr_accesses = ctx->ops.check_accesses(ctx);
   
- -              if (kdamond_aggregate_interval_passed(ctx)) {
+ +              if (ctx->passed_sample_intervals == next_aggregation_sis) {
                         kdamond_merge_regions(ctx,
                                         max_nr_accesses / 10,
                                         sz_limit);
                         if (ctx->callback.after_aggregation &&
                                         ctx->callback.after_aggregation(ctx))
                                 break;
- -                      if (!list_empty(&ctx->schemes))
- -                              kdamond_apply_schemes(ctx);
+ +              }
+ +
+ +              /*
+ +               * do kdamond_apply_schemes() after kdamond_merge_regions() if
+ +               * possible, to reduce overhead
+ +               */
+ +              if (!list_empty(&ctx->schemes))
+ +                      kdamond_apply_schemes(ctx);
+ +
+ +              sample_interval = ctx->attrs.sample_interval ?
+ +                      ctx->attrs.sample_interval : 1;
+ +              if (ctx->passed_sample_intervals == next_aggregation_sis) {
+ +                      ctx->next_aggregation_sis = next_aggregation_sis +
+ +                              ctx->attrs.aggr_interval / sample_interval;
+ +
                         kdamond_reset_aggregated(ctx);
                         kdamond_split_regions(ctx);
                         if (ctx->ops.reset_aggregated)
                                 ctx->ops.reset_aggregated(ctx);
                 }
   
- -              if (kdamond_need_update_operations(ctx)) {
+ +              if (ctx->passed_sample_intervals == next_ops_update_sis) {
+ +                      ctx->next_ops_update_sis = next_ops_update_sis +
+ +                              ctx->attrs.ops_update_interval /
+ +                              sample_interval;
                         if (ctx->ops.update)
                                 ctx->ops.update(ctx);
                         sz_limit = damon_region_sz_limit(ctx);
@@@ -1604,76 -1516,6 +1603,76 @@@ int damon_set_region_biggest_system_ram
         return damon_set_regions(t, &addr_range, 1);
   }
   
+ +/*
+ + * damon_moving_sum() - Calculate an inferred moving sum value.
+ + * @mvsum:    Inferred sum of the last @len_window values.
+ + * @nomvsum:  Non-moving sum of the last discrete @len_window window values.
+ + * @len_window:       The number of last values to take care of.
+ + * @new_value:        New value that will be added to the pseudo moving sum.
+ + *
+ + * Moving sum (moving average * window size) is good for handling noise, but
+ + * the cost of keeping past values can be high for arbitrary window size.  This
+ + * function implements a lightweight pseudo moving sum function that doesn't
+ + * keep the past window values.
+ + *
+ + * It simply assumes there was no noise in the past, and get the no-noise
+ + * assumed past value to drop from @nomvsum and @len_window.  @nomvsum is a
+ + * non-moving sum of the last window.  For example, if @len_window is 10 and we
+ + * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25
+ + * values.  Hence, this function simply drops @nomvsum / @len_window from
+ + * given @mvsum and add @new_value.
+ + *
+ + * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values for
+ + * the last window could vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20.  For
+ + * calculating next moving sum with a new value, we should drop 0 from 50 and
+ + * add the new value.  However, this function assumes it got value 5 for each
+ + * of the last ten times.  Based on the assumption, when the next value is
+ + * measured, it drops the assumed past value, 5, from the current sum, and adds
+ + * the new value to get the updated pseudo-moving sum.
+ + *
+ + * This means the value could have errors, but the errors will disappear
+ + * at every @len_window-aligned call.  For example, if @len_window is 10, the
+ + * pseudo moving sum with 11th value to 19th value would have an error.  But
+ + * the sum with 20th value will not have the error.
+ + *
+ + * Return: Pseudo-moving sum after getting the @new_value.
+ + */
+ +static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum,
+ +              unsigned int len_window, unsigned int new_value)
+ +{
+ +      return mvsum - nomvsum / len_window + new_value;
+ +}
+ +
+ +/**
+ + * damon_update_region_access_rate() - Update the access rate of a region.
+ + * @r:                The DAMON region to update for its access check result.
+ + * @accessed: Whether the region was accessed during last sampling interval.
+ + * @attrs:    The damon_attrs of the DAMON context.
+ + *
+ + * Update the access rate of a region with the region's last sampling interval
+ + * access check result.
+ + *
+ + * Usually this will be called by &damon_operations->check_accesses callback.
+ + */
+ +void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+ +              struct damon_attrs *attrs)
+ +{
+ +      unsigned int len_window = 1;
+ +
+ +      /*
+ +       * sample_interval can be zero, but cannot be larger than
+ +       * aggr_interval, owing to validation of damon_set_attrs().
+ +       */
+ +      if (attrs->sample_interval)
+ +              len_window = damon_max_nr_accesses(attrs);
+ +      r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp,
+ +                      r->last_nr_accesses * 10000, len_window,
+ +                      accessed ? 10000 : 0);
+ +
+ +      if (accessed)
+ +              r->nr_accesses++;
+ +}
+ +
   static int __init damon_init(void)
   {
         damon_region_cache = KMEM_CACHE(damon_region, 0);
diff --combined mm/khugepaged.c

index bc2d8ff269c7340283d7a316efe1466a0e10ffa0,cb3f1d738810ed4a1a00b18b2a33c5e14bd8c101..064654717843ea4bff84bef4a5bd9e8f174a44d9
--- 1/mm/khugepaged.c
--- 2/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@@ -91,7 -91,7 +91,7 @@@ static unsigned int khugepaged_max_ptes
   #define MM_SLOTS_HASH_BITS 10
   static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
   
- static struct kmem_cache *mm_slot_cache __read_mostly;
+ static struct kmem_cache *mm_slot_cache __ro_after_init;
   
   struct collapse_control {
         bool is_khugepaged;
@@@ -524,15 -524,15 +524,15 @@@ static void release_pte_pages(pte_t *pt
         }
   }
   
- -static bool is_refcount_suitable(struct page *page)
+ +static bool is_refcount_suitable(struct folio *folio)
   {
         int expected_refcount;
   
- -      expected_refcount = total_mapcount(page);
- -      if (PageSwapCache(page))
- -              expected_refcount += compound_nr(page);
+ +      expected_refcount = folio_mapcount(folio);
+ +      if (folio_test_swapcache(folio))
+ +              expected_refcount += folio_nr_pages(folio);
   
- -      return page_count(page) == expected_refcount;
+ +      return folio_ref_count(folio) == expected_refcount;
   }
   
   static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
@@@ -542,7 -542,6 +542,7 @@@
                                         struct list_head *compound_pagelist)
   {
         struct page *page = NULL;
+ +      struct folio *folio = NULL;
         pte_t *_pte;
         int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
         bool writable = false;
@@@ -577,8 -576,7 +577,8 @@@
                         goto out;
                 }
   
- -              VM_BUG_ON_PAGE(!PageAnon(page), page);
+ +              folio = page_folio(page);
+ +              VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
   
                 if (page_mapcount(page) > 1) {
                         ++shared;
@@@ -590,15 -588,16 +590,15 @@@
                         }
                 }
   
- -              if (PageCompound(page)) {
- -                      struct page *p;
- -                      page = compound_head(page);
+ +              if (folio_test_large(folio)) {
+ +                      struct folio *f;
   
                         /*
                          * Check if we have dealt with the compound page
                          * already
                          */
- -                      list_for_each_entry(p, compound_pagelist, lru) {
- -                              if (page == p)
+ +                      list_for_each_entry(f, compound_pagelist, lru) {
+ +                              if (folio == f)
                                         goto next;
                         }
                 }
@@@ -609,7 -608,7 +609,7 @@@
                  * is needed to serialize against split_huge_page
                  * when invoked from the VM.
                  */
- -              if (!trylock_page(page)) {
+ +              if (!folio_trylock(folio)) {
                         result = SCAN_PAGE_LOCK;
                         goto out;
                 }
@@@ -625,8 -624,8 +625,8 @@@
                  * but not from this process. The other process cannot write to
                  * the page, only trigger CoW.
                  */
- -              if (!is_refcount_suitable(page)) {
- -                      unlock_page(page);
+ +              if (!is_refcount_suitable(folio)) {
+ +                      folio_unlock(folio);
                         result = SCAN_PAGE_COUNT;
                         goto out;
                 }
@@@ -635,27 -634,27 +635,27 @@@
                  * Isolate the page to avoid collapsing an hugepage
                  * currently in use by the VM.
                  */
- -              if (!isolate_lru_page(page)) {
- -                      unlock_page(page);
+ +              if (!folio_isolate_lru(folio)) {
+ +                      folio_unlock(folio);
                         result = SCAN_DEL_PAGE_LRU;
                         goto out;
                 }
- -              mod_node_page_state(page_pgdat(page),
- -                              NR_ISOLATED_ANON + page_is_file_lru(page),
- -                              compound_nr(page));
- -              VM_BUG_ON_PAGE(!PageLocked(page), page);
- -              VM_BUG_ON_PAGE(PageLRU(page), page);
- -
- -              if (PageCompound(page))
- -                      list_add_tail(&page->lru, compound_pagelist);
+ +              node_stat_mod_folio(folio,
+ +                              NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ +                              folio_nr_pages(folio));
+ +              VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ +              VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ +
+ +              if (folio_test_large(folio))
+ +                      list_add_tail(&folio->lru, compound_pagelist);
   next:
                 /*
                  * If collapse was initiated by khugepaged, check that there is
                  * enough young pte to justify collapsing the page
                  */
                 if (cc->is_khugepaged &&
- -                  (pte_young(pteval) || page_is_young(page) ||
- -                   PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ +                  (pte_young(pteval) || folio_test_young(folio) ||
+ +                   folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                      address)))
                         referenced++;
   
@@@ -669,13 -668,13 +669,13 @@@
                 result = SCAN_LACK_REFERENCED_PAGE;
         } else {
                 result = SCAN_SUCCEED;
- -              trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ +              trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                                     referenced, writable, result);
                 return result;
         }
   out:
         release_pte_pages(pte, _pte, compound_pagelist);
- -      trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ +      trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                             referenced, writable, result);
         return result;
   }
@@@ -888,16 -887,16 +888,16 @@@ static int hpage_collapse_find_target_n
   }
   #endif
   
- -static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
+ +static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node,
                                       nodemask_t *nmask)
   {
- -      *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
- -      if (unlikely(!*hpage)) {
+ +      *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask);
+ +
+ +      if (unlikely(!*folio)) {
                 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                 return false;
         }
   
- -      folio_prep_large_rmappable((struct folio *)*hpage);
         count_vm_event(THP_COLLAPSE_ALLOC);
         return true;
   }
@@@ -1064,20 -1063,17 +1064,20 @@@ static int alloc_charge_hpage(struct pa
         int node = hpage_collapse_find_target_node(cc);
         struct folio *folio;
   
- -      if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
+ +      if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) {
+ +              *hpage = NULL;
                 return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ +      }
   
- -      folio = page_folio(*hpage);
         if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
                 folio_put(folio);
                 *hpage = NULL;
                 return SCAN_CGROUP_CHARGE_FAIL;
         }
- -      count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
   
+ +      count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
+ +
+ +      *hpage = folio_page(folio, 0);
         return SCAN_SUCCEED;
   }
   
@@@ -1251,7 -1247,6 +1251,7 @@@ static int hpage_collapse_scan_pmd(stru
         int result = SCAN_FAIL, referenced = 0;
         int none_or_zero = 0, shared = 0;
         struct page *page = NULL;
+ +      struct folio *folio = NULL;
         unsigned long _address;
         spinlock_t *ptl;
         int node = NUMA_NO_NODE, unmapped = 0;
@@@ -1338,28 -1333,29 +1338,28 @@@
                         }
                 }
   
- -              page = compound_head(page);
- -
+ +              folio = page_folio(page);
                 /*
                  * Record which node the original page is from and save this
                  * information to cc->node_load[].
                  * Khugepaged will allocate hugepage from the node has the max
                  * hit record.
                  */
- -              node = page_to_nid(page);
+ +              node = folio_nid(folio);
                 if (hpage_collapse_scan_abort(node, cc)) {
                         result = SCAN_SCAN_ABORT;
                         goto out_unmap;
                 }
                 cc->node_load[node]++;
- -              if (!PageLRU(page)) {
+ +              if (!folio_test_lru(folio)) {
                         result = SCAN_PAGE_LRU;
                         goto out_unmap;
                 }
- -              if (PageLocked(page)) {
+ +              if (folio_test_locked(folio)) {
                         result = SCAN_PAGE_LOCK;
                         goto out_unmap;
                 }
- -              if (!PageAnon(page)) {
+ +              if (!folio_test_anon(folio)) {
                         result = SCAN_PAGE_ANON;
                         goto out_unmap;
                 }
@@@ -1374,7 -1370,7 +1374,7 @@@
                  * has excessive GUP pins (i.e. 512).  Anyway the same check
                  * will be done again later the risk seems low.
                  */
- -              if (!is_refcount_suitable(page)) {
+ +              if (!is_refcount_suitable(folio)) {
                         result = SCAN_PAGE_COUNT;
                         goto out_unmap;
                 }
@@@ -1384,8 -1380,8 +1384,8 @@@
                  * enough young pte to justify collapsing the page
                  */
                 if (cc->is_khugepaged &&
- -                  (pte_young(pteval) || page_is_young(page) ||
- -                   PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ +                  (pte_young(pteval) || folio_test_young(folio) ||
+ +                   folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                      address)))
                         referenced++;
         }
@@@ -1407,7 -1403,7 +1407,7 @@@ out_unmap
                 *mmap_locked = false;
         }
   out:
- -      trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ +      trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
                                      none_or_zero, result, unmapped);
         return result;
   }
@@@ -1477,7 -1473,7 +1477,7 @@@ int collapse_pte_mapped_thp(struct mm_s
         bool notified = false;
         unsigned long haddr = addr & HPAGE_PMD_MASK;
         struct vm_area_struct *vma = vma_lookup(mm, haddr);
- -      struct page *hpage;
+ +      struct folio *folio;
         pte_t *start_pte, *pte;
         pmd_t *pmd, pgt_pmd;
         spinlock_t *pml = NULL, *ptl;
@@@ -1510,14 -1506,19 +1510,14 @@@
         if (userfaultfd_wp(vma))
                 return SCAN_PTE_UFFD_WP;
   
- -      hpage = find_lock_page(vma->vm_file->f_mapping,
+ +      folio = filemap_lock_folio(vma->vm_file->f_mapping,
                                linear_page_index(vma, haddr));
- -      if (!hpage)
+ +      if (IS_ERR(folio))
                 return SCAN_PAGE_NULL;
   
- -      if (!PageHead(hpage)) {
- -              result = SCAN_FAIL;
- -              goto drop_hpage;
- -      }
- -
- -      if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ +      if (folio_order(folio) != HPAGE_PMD_ORDER) {
                 result = SCAN_PAGE_COMPOUND;
- -              goto drop_hpage;
+ +              goto drop_folio;
         }
   
         result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
@@@ -1531,13 -1532,13 +1531,13 @@@
                  */
                 goto maybe_install_pmd;
         default:
- -              goto drop_hpage;
+ +              goto drop_folio;
         }
   
         result = SCAN_FAIL;
         start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
         if (!start_pte)         /* mmap_lock + page lock should prevent this */
- -              goto drop_hpage;
+ +              goto drop_folio;
   
         /* step 1: check all mapped PTEs are to the right huge page */
         for (i = 0, addr = haddr, pte = start_pte;
@@@ -1562,7 -1563,7 +1562,7 @@@
                  * Note that uprobe, debugger, or MAP_PRIVATE may change the
                  * page table, but the new page will not be a subpage of hpage.
                  */
- -              if (hpage + i != page)
+ +              if (folio_page(folio, i) != page)
                         goto abort;
         }
   
@@@ -1577,7 -1578,7 +1577,7 @@@
          * page_table_lock) ptl nests inside pml. The less time we hold pml,
          * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
          * inserts a valid as-if-COWed PTE without even looking up page cache.
- -       * So page lock of hpage does not protect from it, so we must not drop
+ +       * So page lock of folio does not protect from it, so we must not drop
          * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
          */
         if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
@@@ -1601,7 -1602,7 +1601,7 @@@
                         continue;
                 /*
                  * We dropped ptl after the first scan, to do the mmu_notifier:
- -               * page lock stops more PTEs of the hpage being faulted in, but
+ +               * page lock stops more PTEs of the folio being faulted in, but
                  * does not stop write faults COWing anon copies from existing
                  * PTEs; and does not stop those being swapped out or migrated.
                  */
@@@ -1610,7 -1611,7 +1610,7 @@@
                         goto abort;
                 }
                 page = vm_normal_page(vma, addr, ptent);
- -              if (hpage + i != page)
+ +              if (folio_page(folio, i) != page)
                         goto abort;
   
                 /*
@@@ -1629,8 -1630,8 +1629,8 @@@
   
         /* step 3: set proper refcount and mm_counters. */
         if (nr_ptes) {
- -              page_ref_sub(hpage, nr_ptes);
- -              add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ +              folio_ref_sub(folio, nr_ptes);
+ +              add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
         }
   
         /* step 4: remove empty page table */
@@@ -1654,14 -1655,14 +1654,14 @@@
   maybe_install_pmd:
         /* step 5: install pmd entry */
         result = install_pmd
- -                      ? set_huge_pmd(vma, haddr, pmd, hpage)
+ +                      ? set_huge_pmd(vma, haddr, pmd, &folio->page)
                         : SCAN_SUCCEED;
- -      goto drop_hpage;
+ +      goto drop_folio;
   abort:
         if (nr_ptes) {
                 flush_tlb_mm(mm);
- -              page_ref_sub(hpage, nr_ptes);
- -              add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes);
+ +              folio_ref_sub(folio, nr_ptes);
+ +              add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes);
         }
         if (start_pte)
                 pte_unmap_unlock(start_pte, ptl);
@@@ -1669,9 -1670,9 +1669,9 @@@
                 spin_unlock(pml);
         if (notified)
                 mmu_notifier_invalidate_range_end(&range);
- -drop_hpage:
- -      unlock_page(hpage);
- -      put_page(hpage);
+ +drop_folio:
+ +      folio_unlock(folio);
+ +      folio_put(folio);
         return result;
   }
   
diff --combined mm/shmem.c

index 71b8d957b63bec8384feb8c369289afcd95d65b4,389212972e726755a9c5a2641e31ef71ae5fed83..91e2620148b2f6d789420e6736daef7a53e2cc5c
--- 1/mm/shmem.c
--- 2/mm/shmem.c
+++ b/mm/shmem.c
@@@ -42,7 -42,7 +42,7 @@@
   #include <linux/iversion.h>
   #include "swap.h"
   
- static struct vfsmount *shm_mnt;
+ static struct vfsmount *shm_mnt __ro_after_init;
   
   #ifdef CONFIG_SHMEM
   /*
@@@ -146,8 -146,9 +146,8 @@@ static unsigned long shmem_default_max_
   #endif
   
   static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
- -                           struct folio **foliop, enum sgp_type sgp,
- -                           gfp_t gfp, struct vm_area_struct *vma,
- -                           vm_fault_t *fault_type);
+ +                      struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ +                      struct mm_struct *fault_mm, vm_fault_t *fault_type);
   
   static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
   {
@@@ -188,10 -189,10 +188,10 @@@ static inline int shmem_reacct_size(uns
   /*
    * ... whereas tmpfs objects are accounted incrementally as
    * pages are allocated, in order to allow large sparse files.
- - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
    * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
    */
- -static inline int shmem_acct_block(unsigned long flags, long pages)
+ +static inline int shmem_acct_blocks(unsigned long flags, long pages)
   {
         if (!(flags & VM_NORESERVE))
                 return 0;
@@@ -206,26 -207,26 +206,26 @@@ static inline void shmem_unacct_blocks(
                 vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
   }
   
- -static int shmem_inode_acct_block(struct inode *inode, long pages)
+ +static int shmem_inode_acct_blocks(struct inode *inode, long pages)
   {
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
         int err = -ENOSPC;
   
- -      if (shmem_acct_block(info->flags, pages))
+ +      if (shmem_acct_blocks(info->flags, pages))
                 return err;
   
         might_sleep();  /* when quotas */
         if (sbinfo->max_blocks) {
- -              if (percpu_counter_compare(&sbinfo->used_blocks,
- -                                         sbinfo->max_blocks - pages) > 0)
+ +              if (!percpu_counter_limited_add(&sbinfo->used_blocks,
+ +                                              sbinfo->max_blocks, pages))
                         goto unacct;
   
                 err = dquot_alloc_block_nodirty(inode, pages);
- -              if (err)
+ +              if (err) {
+ +                      percpu_counter_sub(&sbinfo->used_blocks, pages);
                         goto unacct;
- -
- -              percpu_counter_add(&sbinfo->used_blocks, pages);
+ +              }
         } else {
                 err = dquot_alloc_block_nodirty(inode, pages);
                 if (err)
@@@ -446,7 -447,7 +446,7 @@@ bool shmem_charge(struct inode *inode, 
   {
         struct address_space *mapping = inode->i_mapping;
   
- -      if (shmem_inode_acct_block(inode, pages))
+ +      if (shmem_inode_acct_blocks(inode, pages))
                 return false;
   
         /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
@@@ -755,14 -756,16 +755,14 @@@ static unsigned long shmem_unused_huge_
   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
   
   /*
- - * Like filemap_add_folio, but error if expected item has gone.
+ + * Somewhat like filemap_add_folio, but error if expected item has gone.
    */
   static int shmem_add_to_page_cache(struct folio *folio,
                                    struct address_space *mapping,
- -                                 pgoff_t index, void *expected, gfp_t gfp,
- -                                 struct mm_struct *charge_mm)
+ +                                 pgoff_t index, void *expected, gfp_t gfp)
   {
         XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
         long nr = folio_nr_pages(folio);
- -      int error;
   
         VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
         VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@@ -773,7 -776,16 +773,7 @@@
         folio->mapping = mapping;
         folio->index = index;
   
- -      if (!folio_test_swapcache(folio)) {
- -              error = mem_cgroup_charge(folio, charge_mm, gfp);
- -              if (error) {
- -                      if (folio_test_pmd_mappable(folio)) {
- -                              count_vm_event(THP_FILE_FALLBACK);
- -                              count_vm_event(THP_FILE_FALLBACK_CHARGE);
- -                      }
- -                      goto error;
- -              }
- -      }
+ +      gfp &= GFP_RECLAIM_MASK;
         folio_throttle_swaprate(folio, gfp);
   
         do {
@@@ -789,26 -801,31 +789,26 @@@
                 xas_store(&xas, folio);
                 if (xas_error(&xas))
                         goto unlock;
- -              if (folio_test_pmd_mappable(folio)) {
- -                      count_vm_event(THP_FILE_ALLOC);
+ +              if (folio_test_pmd_mappable(folio))
                         __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
- -              }
- -              mapping->nrpages += nr;
                 __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                 __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+ +              mapping->nrpages += nr;
   unlock:
                 xas_unlock_irq(&xas);
         } while (xas_nomem(&xas, gfp));
   
         if (xas_error(&xas)) {
- -              error = xas_error(&xas);
- -              goto error;
+ +              folio->mapping = NULL;
+ +              folio_ref_sub(folio, nr);
+ +              return xas_error(&xas);
         }
   
         return 0;
- -error:
- -      folio->mapping = NULL;
- -      folio_ref_sub(folio, nr);
- -      return error;
   }
   
   /*
- - * Like delete_from_page_cache, but substitutes swap for @folio.
+ + * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
    */
   static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
   {
@@@ -870,6 -887,7 +870,6 @@@ unsigned long shmem_partial_swap_usage(
                         cond_resched_rcu();
                 }
         }
- -
         rcu_read_unlock();
   
         return swapped << PAGE_SHIFT;
@@@ -1094,7 -1112,7 +1094,7 @@@ whole_folios
   void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
   {
         shmem_undo_range(inode, lstart, lend, false);
- -      inode->i_mtime = inode_set_ctime_current(inode);
+ +      inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
         inode_inc_iversion(inode);
   }
   EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@@ -1195,6 -1213,7 +1195,6 @@@ static int shmem_setattr(struct mnt_idm
         if (i_uid_needs_update(idmap, attr, inode) ||
             i_gid_needs_update(idmap, attr, inode)) {
                 error = dquot_transfer(idmap, inode, attr);
- -
                 if (error)
                         return error;
         }
@@@ -1205,7 -1224,7 +1205,7 @@@
         if (!error && update_ctime) {
                 inode_set_ctime_current(inode);
                 if (update_mtime)
- -                      inode->i_mtime = inode_get_ctime(inode);
+ +                      inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
                 inode_inc_iversion(inode);
         }
         return error;
@@@ -1307,8 -1326,10 +1307,8 @@@ static int shmem_unuse_swap_entries(str
   
                 if (!xa_is_value(folio))
                         continue;
- -              error = shmem_swapin_folio(inode, indices[i],
- -                                        &folio, SGP_CACHE,
- -                                        mapping_gfp_mask(mapping),
- -                                        NULL, NULL);
+ +              error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
+ +                                      mapping_gfp_mask(mapping), NULL, NULL);
                 if (error == 0) {
                         folio_unlock(folio);
                         folio_put(folio);
@@@ -1544,20 -1565,38 +1544,20 @@@ static inline struct mempolicy *shmem_g
         return NULL;
   }
   #endif /* CONFIG_NUMA && CONFIG_TMPFS */
- -#ifndef CONFIG_NUMA
- -#define vm_policy vm_private_data
- -#endif
   
- -static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
- -              struct shmem_inode_info *info, pgoff_t index)
- -{
- -      /* Create a pseudo vma that just contains the policy */
- -      vma_init(vma, NULL);
- -      /* Bias interleave by inode number to distribute better across nodes */
- -      vma->vm_pgoff = index + info->vfs_inode.i_ino;
- -      vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- -}
- -
- -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
- -{
- -      /* Drop reference taken by mpol_shared_policy_lookup() */
- -      mpol_cond_put(vma->vm_policy);
- -}
+ +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ +                      pgoff_t index, unsigned int order, pgoff_t *ilx);
   
- -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+ +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
                         struct shmem_inode_info *info, pgoff_t index)
   {
- -      struct vm_area_struct pvma;
+ +      struct mempolicy *mpol;
+ +      pgoff_t ilx;
         struct page *page;
- -      struct vm_fault vmf = {
- -              .vma = &pvma,
- -      };
   
- -      shmem_pseudo_vma_init(&pvma, info, index);
- -      page = swap_cluster_readahead(swap, gfp, &vmf);
- -      shmem_pseudo_vma_destroy(&pvma);
+ +      mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ +      page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+ +      mpol_cond_put(mpol);
   
         if (!page)
                 return NULL;
@@@ -1591,126 -1630,67 +1591,126 @@@ static gfp_t limit_gfp_mask(gfp_t huge_
   static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
                 struct shmem_inode_info *info, pgoff_t index)
   {
- -      struct vm_area_struct pvma;
- -      struct address_space *mapping = info->vfs_inode.i_mapping;
- -      pgoff_t hindex;
- -      struct folio *folio;
+ +      struct mempolicy *mpol;
+ +      pgoff_t ilx;
+ +      struct page *page;
   
- -      hindex = round_down(index, HPAGE_PMD_NR);
- -      if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
- -                                                              XA_PRESENT))
- -              return NULL;
+ +      mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
+ +      page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
+ +      mpol_cond_put(mpol);
   
- -      shmem_pseudo_vma_init(&pvma, info, hindex);
- -      folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
- -      shmem_pseudo_vma_destroy(&pvma);
- -      if (!folio)
- -              count_vm_event(THP_FILE_FALLBACK);
- -      return folio;
+ +      return page_rmappable_folio(page);
   }
   
   static struct folio *shmem_alloc_folio(gfp_t gfp,
- -                      struct shmem_inode_info *info, pgoff_t index)
+ +              struct shmem_inode_info *info, pgoff_t index)
   {
- -      struct vm_area_struct pvma;
- -      struct folio *folio;
+ +      struct mempolicy *mpol;
+ +      pgoff_t ilx;
+ +      struct page *page;
   
- -      shmem_pseudo_vma_init(&pvma, info, index);
- -      folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
- -      shmem_pseudo_vma_destroy(&pvma);
+ +      mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+ +      page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+ +      mpol_cond_put(mpol);
   
- -      return folio;
+ +      return (struct folio *)page;
   }
   
- -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
- -              pgoff_t index, bool huge)
+ +static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
+ +              struct inode *inode, pgoff_t index,
+ +              struct mm_struct *fault_mm, bool huge)
   {
+ +      struct address_space *mapping = inode->i_mapping;
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct folio *folio;
- -      int nr;
- -      int err;
+ +      long pages;
+ +      int error;
   
         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                 huge = false;
- -      nr = huge ? HPAGE_PMD_NR : 1;
   
- -      err = shmem_inode_acct_block(inode, nr);
- -      if (err)
- -              goto failed;
+ +      if (huge) {
+ +              pages = HPAGE_PMD_NR;
+ +              index = round_down(index, HPAGE_PMD_NR);
+ +
+ +              /*
+ +               * Check for conflict before waiting on a huge allocation.
+ +               * Conflict might be that a huge page has just been allocated
+ +               * and added to page cache by a racing thread, or that there
+ +               * is already at least one small page in the huge extent.
+ +               * Be careful to retry when appropriate, but not forever!
+ +               * Elsewhere -EEXIST would be the right code, but not here.
+ +               */
+ +              if (xa_find(&mapping->i_pages, &index,
+ +                              index + HPAGE_PMD_NR - 1, XA_PRESENT))
+ +                      return ERR_PTR(-E2BIG);
   
- -      if (huge)
                 folio = shmem_alloc_hugefolio(gfp, info, index);
- -      else
+ +              if (!folio)
+ +                      count_vm_event(THP_FILE_FALLBACK);
+ +      } else {
+ +              pages = 1;
                 folio = shmem_alloc_folio(gfp, info, index);
- -      if (folio) {
- -              __folio_set_locked(folio);
- -              __folio_set_swapbacked(folio);
- -              return folio;
         }
+ +      if (!folio)
+ +              return ERR_PTR(-ENOMEM);
   
- -      err = -ENOMEM;
- -      shmem_inode_unacct_blocks(inode, nr);
- -failed:
- -      return ERR_PTR(err);
+ +      __folio_set_locked(folio);
+ +      __folio_set_swapbacked(folio);
+ +
+ +      gfp &= GFP_RECLAIM_MASK;
+ +      error = mem_cgroup_charge(folio, fault_mm, gfp);
+ +      if (error) {
+ +              if (xa_find(&mapping->i_pages, &index,
+ +                              index + pages - 1, XA_PRESENT)) {
+ +                      error = -EEXIST;
+ +              } else if (huge) {
+ +                      count_vm_event(THP_FILE_FALLBACK);
+ +                      count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ +              }
+ +              goto unlock;
+ +      }
+ +
+ +      error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
+ +      if (error)
+ +              goto unlock;
+ +
+ +      error = shmem_inode_acct_blocks(inode, pages);
+ +      if (error) {
+ +              struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ +              long freed;
+ +              /*
+ +               * Try to reclaim some space by splitting a few
+ +               * large folios beyond i_size on the filesystem.
+ +               */
+ +              shmem_unused_huge_shrink(sbinfo, NULL, 2);
+ +              /*
+ +               * And do a shmem_recalc_inode() to account for freed pages:
+ +               * except our folio is there in cache, so not quite balanced.
+ +               */
+ +              spin_lock(&info->lock);
+ +              freed = pages + info->alloced - info->swapped -
+ +                      READ_ONCE(mapping->nrpages);
+ +              if (freed > 0)
+ +                      info->alloced -= freed;
+ +              spin_unlock(&info->lock);
+ +              if (freed > 0)
+ +                      shmem_inode_unacct_blocks(inode, freed);
+ +              error = shmem_inode_acct_blocks(inode, pages);
+ +              if (error) {
+ +                      filemap_remove_folio(folio);
+ +                      goto unlock;
+ +              }
+ +      }
+ +
+ +      shmem_recalc_inode(inode, pages, 0);
+ +      folio_add_lru(folio);
+ +      return folio;
+ +
+ +unlock:
+ +      folio_unlock(folio);
+ +      folio_put(folio);
+ +      return ERR_PTR(error);
   }
   
   /*
@@@ -1832,11 -1812,12 +1832,11 @@@ static void shmem_set_folio_swapin_erro
    */
   static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                              struct folio **foliop, enum sgp_type sgp,
- -                           gfp_t gfp, struct vm_area_struct *vma,
+ +                           gfp_t gfp, struct mm_struct *fault_mm,
                              vm_fault_t *fault_type)
   {
         struct address_space *mapping = inode->i_mapping;
         struct shmem_inode_info *info = SHMEM_I(inode);
- -      struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
         struct swap_info_struct *si;
         struct folio *folio = NULL;
         swp_entry_t swap;
@@@ -1864,10 -1845,10 +1864,10 @@@
                 if (fault_type) {
                         *fault_type |= VM_FAULT_MAJOR;
                         count_vm_event(PGMAJFAULT);
- -                      count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ +                      count_memcg_event_mm(fault_mm, PGMAJFAULT);
                 }
                 /* Here we actually start the io */
- -              folio = shmem_swapin(swap, gfp, info, index);
+ +              folio = shmem_swapin_cluster(swap, gfp, info, index);
                 if (!folio) {
                         error = -ENOMEM;
                         goto failed;
@@@ -1901,7 -1882,8 +1901,7 @@@
         }
   
         error = shmem_add_to_page_cache(folio, mapping, index,
- -                                      swp_to_radix_entry(swap), gfp,
- -                                      charge_mm);
+ +                                      swp_to_radix_entry(swap), gfp);
         if (error)
                 goto failed;
   
@@@ -1939,29 -1921,37 +1939,29 @@@ unlock
    * vm. If we swap it in we mark it dirty since we also free the swap
    * entry since a page cannot live in both the swap and page cache.
    *
- - * vma, vmf, and fault_type are only supplied by shmem_fault:
- - * otherwise they are NULL.
+ + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
    */
   static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                 struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- -              struct vm_area_struct *vma, struct vm_fault *vmf,
- -              vm_fault_t *fault_type)
+ +              struct vm_fault *vmf, vm_fault_t *fault_type)
   {
- -      struct address_space *mapping = inode->i_mapping;
- -      struct shmem_inode_info *info = SHMEM_I(inode);
- -      struct shmem_sb_info *sbinfo;
- -      struct mm_struct *charge_mm;
+ +      struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
+ +      struct mm_struct *fault_mm;
         struct folio *folio;
- -      pgoff_t hindex;
- -      gfp_t huge_gfp;
         int error;
- -      int once = 0;
- -      int alloced = 0;
+ +      bool alloced;
   
         if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                 return -EFBIG;
   repeat:
         if (sgp <= SGP_CACHE &&
- -          ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
+ +          ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
                 return -EINVAL;
- -      }
   
- -      sbinfo = SHMEM_SB(inode->i_sb);
- -      charge_mm = vma ? vma->vm_mm : NULL;
+ +      alloced = false;
+ +      fault_mm = vma ? vma->vm_mm : NULL;
   
- -      folio = filemap_get_entry(mapping, index);
+ +      folio = filemap_get_entry(inode->i_mapping, index);
         if (folio && vma && userfaultfd_minor(vma)) {
                 if (!xa_is_value(folio))
                         folio_put(folio);
@@@ -1971,7 -1961,7 +1971,7 @@@
   
         if (xa_is_value(folio)) {
                 error = shmem_swapin_folio(inode, index, &folio,
- -                                        sgp, gfp, vma, fault_type);
+ +                                         sgp, gfp, fault_mm, fault_type);
                 if (error == -EEXIST)
                         goto repeat;
   
@@@ -1983,7 -1973,7 +1983,7 @@@
                 folio_lock(folio);
   
                 /* Has the folio been truncated or swapped out? */
- -              if (unlikely(folio->mapping != mapping)) {
+ +              if (unlikely(folio->mapping != inode->i_mapping)) {
                         folio_unlock(folio);
                         folio_put(folio);
                         goto repeat;
@@@ -2018,38 -2008,58 +2018,38 @@@
                 return 0;
         }
   
- -      if (!shmem_is_huge(inode, index, false,
- -                         vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
- -              goto alloc_nohuge;
+ +      if (shmem_is_huge(inode, index, false, fault_mm,
+ +                        vma ? vma->vm_flags : 0)) {
+ +              gfp_t huge_gfp;
   
- -      huge_gfp = vma_thp_gfp_mask(vma);
- -      huge_gfp = limit_gfp_mask(huge_gfp, gfp);
- -      folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
- -      if (IS_ERR(folio)) {
- -alloc_nohuge:
- -              folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
+ +              huge_gfp = vma_thp_gfp_mask(vma);
+ +              huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ +              folio = shmem_alloc_and_add_folio(huge_gfp,
+ +                              inode, index, fault_mm, true);
+ +              if (!IS_ERR(folio)) {
+ +                      count_vm_event(THP_FILE_ALLOC);
+ +                      goto alloced;
+ +              }
+ +              if (PTR_ERR(folio) == -EEXIST)
+ +                      goto repeat;
         }
- -      if (IS_ERR(folio)) {
- -              int retry = 5;
   
+ +      folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false);
+ +      if (IS_ERR(folio)) {
                 error = PTR_ERR(folio);
+ +              if (error == -EEXIST)
+ +                      goto repeat;
                 folio = NULL;
- -              if (error != -ENOSPC)
- -                      goto unlock;
- -              /*
- -               * Try to reclaim some space by splitting a large folio
- -               * beyond i_size on the filesystem.
- -               */
- -              while (retry--) {
- -                      int ret;
- -
- -                      ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- -                      if (ret == SHRINK_STOP)
- -                              break;
- -                      if (ret)
- -                              goto alloc_nohuge;
- -              }
                 goto unlock;
         }
   
- -      hindex = round_down(index, folio_nr_pages(folio));
- -
- -      if (sgp == SGP_WRITE)
- -              __folio_set_referenced(folio);
- -
- -      error = shmem_add_to_page_cache(folio, mapping, hindex,
- -                                      NULL, gfp & GFP_RECLAIM_MASK,
- -                                      charge_mm);
- -      if (error)
- -              goto unacct;
- -
- -      folio_add_lru(folio);
- -      shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
+ +alloced:
         alloced = true;
- -
         if (folio_test_pmd_mappable(folio) &&
             DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                                         folio_next_index(folio) - 1) {
+ +              struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ +              struct shmem_inode_info *info = SHMEM_I(inode);
                 /*
                  * Part of the large folio is beyond i_size: subject
                  * to shrink under memory pressure.
@@@ -2067,8 -2077,6 +2067,8 @@@
                 spin_unlock(&sbinfo->shrinklist_lock);
         }
   
+ +      if (sgp == SGP_WRITE)
+ +              folio_set_referenced(folio);
         /*
          * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
          */
@@@ -2092,6 -2100,11 +2092,6 @@@ clear
         /* Perhaps the file has been truncated since we checked */
         if (sgp <= SGP_CACHE &&
             ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- -              if (alloced) {
- -                      folio_clear_dirty(folio);
- -                      filemap_remove_folio(folio);
- -                      shmem_recalc_inode(inode, 0, 0);
- -              }
                 error = -EINVAL;
                 goto unlock;
         }
@@@ -2102,14 -2115,25 +2102,14 @@@ out
         /*
          * Error recovery.
          */
- -unacct:
- -      shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
- -
- -      if (folio_test_large(folio)) {
- -              folio_unlock(folio);
- -              folio_put(folio);
- -              goto alloc_nohuge;
- -      }
   unlock:
+ +      if (alloced)
+ +              filemap_remove_folio(folio);
+ +      shmem_recalc_inode(inode, 0, 0);
         if (folio) {
                 folio_unlock(folio);
                 folio_put(folio);
         }
- -      if (error == -ENOSPC && !once++) {
- -              shmem_recalc_inode(inode, 0, 0);
- -              goto repeat;
- -      }
- -      if (error == -EEXIST)
- -              goto repeat;
         return error;
   }
   
@@@ -2117,7 -2141,7 +2117,7 @@@ int shmem_get_folio(struct inode *inode
                 enum sgp_type sgp)
   {
         return shmem_get_folio_gfp(inode, index, foliop, sgp,
- -                      mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+ +                      mapping_gfp_mask(inode->i_mapping), NULL, NULL);
   }
   
   /*
@@@ -2125,99 -2149,87 +2125,99 @@@
    * entry unconditionally - even if something else had already woken the
    * target.
    */
- -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+ +static int synchronous_wake_function(wait_queue_entry_t *wait,
+ +                      unsigned int mode, int sync, void *key)
   {
         int ret = default_wake_function(wait, mode, sync, key);
         list_del_init(&wait->entry);
         return ret;
   }
   
+ +/*
+ + * Trinity finds that probing a hole which tmpfs is punching can
+ + * prevent the hole-punch from ever completing: which in turn
+ + * locks writers out with its hold on i_rwsem.  So refrain from
+ + * faulting pages into the hole while it's being punched.  Although
+ + * shmem_undo_range() does remove the additions, it may be unable to
+ + * keep up, as each new page needs its own unmap_mapping_range() call,
+ + * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ + *
+ + * It does not matter if we sometimes reach this check just before the
+ + * hole-punch begins, so that one fault then races with the punch:
+ + * we just need to make racing faults a rare case.
+ + *
+ + * The implementation below would be much simpler if we just used a
+ + * standard mutex or completion: but we cannot take i_rwsem in fault,
+ + * and bloating every shmem inode for this unlikely case would be sad.
+ + */
+ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
+ +{
+ +      struct shmem_falloc *shmem_falloc;
+ +      struct file *fpin = NULL;
+ +      vm_fault_t ret = 0;
+ +
+ +      spin_lock(&inode->i_lock);
+ +      shmem_falloc = inode->i_private;
+ +      if (shmem_falloc &&
+ +          shmem_falloc->waitq &&
+ +          vmf->pgoff >= shmem_falloc->start &&
+ +          vmf->pgoff < shmem_falloc->next) {
+ +              wait_queue_head_t *shmem_falloc_waitq;
+ +              DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
+ +
+ +              ret = VM_FAULT_NOPAGE;
+ +              fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ +              shmem_falloc_waitq = shmem_falloc->waitq;
+ +              prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ +                              TASK_UNINTERRUPTIBLE);
+ +              spin_unlock(&inode->i_lock);
+ +              schedule();
+ +
+ +              /*
+ +               * shmem_falloc_waitq points into the shmem_fallocate()
+ +               * stack of the hole-punching task: shmem_falloc_waitq
+ +               * is usually invalid by the time we reach here, but
+ +               * finish_wait() does not dereference it in that case;
+ +               * though i_lock needed lest racing with wake_up_all().
+ +               */
+ +              spin_lock(&inode->i_lock);
+ +              finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ +      }
+ +      spin_unlock(&inode->i_lock);
+ +      if (fpin) {
+ +              fput(fpin);
+ +              ret = VM_FAULT_RETRY;
+ +      }
+ +      return ret;
+ +}
+ +
   static vm_fault_t shmem_fault(struct vm_fault *vmf)
   {
- -      struct vm_area_struct *vma = vmf->vma;
- -      struct inode *inode = file_inode(vma->vm_file);
+ +      struct inode *inode = file_inode(vmf->vma->vm_file);
         gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
         struct folio *folio = NULL;
+ +      vm_fault_t ret = 0;
         int err;
- -      vm_fault_t ret = VM_FAULT_LOCKED;
   
         /*
          * Trinity finds that probing a hole which tmpfs is punching can
- -       * prevent the hole-punch from ever completing: which in turn
- -       * locks writers out with its hold on i_rwsem.  So refrain from
- -       * faulting pages into the hole while it's being punched.  Although
- -       * shmem_undo_range() does remove the additions, it may be unable to
- -       * keep up, as each new page needs its own unmap_mapping_range() call,
- -       * and the i_mmap tree grows ever slower to scan if new vmas are added.
- -       *
- -       * It does not matter if we sometimes reach this check just before the
- -       * hole-punch begins, so that one fault then races with the punch:
- -       * we just need to make racing faults a rare case.
- -       *
- -       * The implementation below would be much simpler if we just used a
- -       * standard mutex or completion: but we cannot take i_rwsem in fault,
- -       * and bloating every shmem inode for this unlikely case would be sad.
+ +       * prevent the hole-punch from ever completing: noted in i_private.
          */
         if (unlikely(inode->i_private)) {
- -              struct shmem_falloc *shmem_falloc;
- -
- -              spin_lock(&inode->i_lock);
- -              shmem_falloc = inode->i_private;
- -              if (shmem_falloc &&
- -                  shmem_falloc->waitq &&
- -                  vmf->pgoff >= shmem_falloc->start &&
- -                  vmf->pgoff < shmem_falloc->next) {
- -                      struct file *fpin;
- -                      wait_queue_head_t *shmem_falloc_waitq;
- -                      DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
- -
- -                      ret = VM_FAULT_NOPAGE;
- -                      fpin = maybe_unlock_mmap_for_io(vmf, NULL);
- -                      if (fpin)
- -                              ret = VM_FAULT_RETRY;
- -
- -                      shmem_falloc_waitq = shmem_falloc->waitq;
- -                      prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
- -                                      TASK_UNINTERRUPTIBLE);
- -                      spin_unlock(&inode->i_lock);
- -                      schedule();
- -
- -                      /*
- -                       * shmem_falloc_waitq points into the shmem_fallocate()
- -                       * stack of the hole-punching task: shmem_falloc_waitq
- -                       * is usually invalid by the time we reach here, but
- -                       * finish_wait() does not dereference it in that case;
- -                       * though i_lock needed lest racing with wake_up_all().
- -                       */
- -                      spin_lock(&inode->i_lock);
- -                      finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
- -                      spin_unlock(&inode->i_lock);
- -
- -                      if (fpin)
- -                              fput(fpin);
+ +              ret = shmem_falloc_wait(vmf, inode);
+ +              if (ret)
                         return ret;
- -              }
- -              spin_unlock(&inode->i_lock);
         }
   
+ +      WARN_ON_ONCE(vmf->page != NULL);
         err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
- -                                gfp, vma, vmf, &ret);
+ +                                gfp, vmf, &ret);
         if (err)
                 return vmf_error(err);
- -      if (folio)
+ +      if (folio) {
                 vmf->page = folio_file_page(folio, vmf->pgoff);
+ +              ret |= VM_FAULT_LOCKED;
+ +      }
         return ret;
   }
   
@@@ -2318,41 -2330,15 +2318,41 @@@ static int shmem_set_policy(struct vm_a
   }
   
   static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
- -                                        unsigned long addr)
+ +                                        unsigned long addr, pgoff_t *ilx)
   {
         struct inode *inode = file_inode(vma->vm_file);
         pgoff_t index;
   
+ +      /*
+ +       * Bias interleave by inode number to distribute better across nodes;
+ +       * but this interface is independent of which page order is used, so
+ +       * supplies only that bias, letting caller apply the offset (adjusted
+ +       * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+ +       */
+ +      *ilx = inode->i_ino;
         index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
         return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
   }
- -#endif
+ +
+ +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ +                      pgoff_t index, unsigned int order, pgoff_t *ilx)
+ +{
+ +      struct mempolicy *mpol;
+ +
+ +      /* Bias interleave by inode number to distribute better across nodes */
+ +      *ilx = info->vfs_inode.i_ino + (index >> order);
+ +
+ +      mpol = mpol_shared_policy_lookup(&info->policy, index);
+ +      return mpol ? mpol : get_task_policy(current);
+ +}
+ +#else
+ +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+ +                      pgoff_t index, unsigned int order, pgoff_t *ilx)
+ +{
+ +      *ilx = 0;
+ +      return NULL;
+ +}
+ +#endif /* CONFIG_NUMA */
   
   int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
   {
@@@ -2388,7 -2374,7 +2388,7 @@@ static int shmem_mmap(struct file *file
         struct shmem_inode_info *info = SHMEM_I(inode);
         int ret;
   
- -      ret = seal_check_future_write(info->seals, vma);
+ +      ret = seal_check_write(info->seals, vma);
         if (ret)
                 return ret;
   
@@@ -2459,6 -2445,7 +2459,6 @@@ static struct inode *__shmem_get_inode(
         if (err)
                 return ERR_PTR(err);
   
- -
         inode = new_inode(sb);
         if (!inode) {
                 shmem_free_inode(sb, 0);
@@@ -2468,7 -2455,7 +2468,7 @@@
         inode->i_ino = ino;
         inode_init_owner(idmap, inode, dir, mode);
         inode->i_blocks = 0;
- -      inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
+ +      simple_inode_init_ts(inode);
         inode->i_generation = get_random_u32();
         info = SHMEM_I(inode);
         memset(info, 0, (char *)inode - (char *)info);
@@@ -2476,17 -2463,18 +2476,17 @@@
         atomic_set(&info->stop_eviction, 0);
         info->seals = F_SEAL_SEAL;
         info->flags = flags & VM_NORESERVE;
- -      info->i_crtime = inode->i_mtime;
+ +      info->i_crtime = inode_get_mtime(inode);
         info->fsflags = (dir == NULL) ? 0 :
                 SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
         if (info->fsflags)
                 shmem_set_inode_flags(inode, info->fsflags);
         INIT_LIST_HEAD(&info->shrinklist);
         INIT_LIST_HEAD(&info->swaplist);
- -      INIT_LIST_HEAD(&info->swaplist);
- -      if (sbinfo->noswap)
- -              mapping_set_unevictable(inode->i_mapping);
         simple_xattrs_init(&info->xattrs);
         cache_no_acl(inode);
+ +      if (sbinfo->noswap)
+ +              mapping_set_unevictable(inode->i_mapping);
         mapping_set_large_folios(inode->i_mapping);
   
         switch (mode & S_IFMT) {
@@@ -2577,7 -2565,7 +2577,7 @@@ int shmem_mfill_atomic_pte(pmd_t *dst_p
         int ret;
         pgoff_t max_off;
   
- -      if (shmem_inode_acct_block(inode, 1)) {
+ +      if (shmem_inode_acct_blocks(inode, 1)) {
                 /*
                  * We may have got a page, returned -ENOENT triggering a retry,
                  * and now we find ourselves with -ENOMEM. Release the page, to
@@@ -2649,10 -2637,8 +2649,10 @@@
         if (unlikely(pgoff >= max_off))
                 goto out_release;
   
- -      ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
- -                                    gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
+ +      ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
+ +      if (ret)
+ +              goto out_release;
+ +      ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
         if (ret)
                 goto out_release;
   
@@@ -2700,6 -2686,7 +2700,6 @@@ shmem_write_begin(struct file *file, st
         }
   
         ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
- -
         if (ret)
                 return ret;
   
@@@ -3231,7 -3218,8 +3231,7 @@@ shmem_mknod(struct mnt_idmap *idmap, st
         error = simple_acl_create(dir, inode);
         if (error)
                 goto out_iput;
- -      error = security_inode_init_security(inode, dir,
- -                                           &dentry->d_name,
+ +      error = security_inode_init_security(inode, dir, &dentry->d_name,
                                              shmem_initxattrs, NULL);
         if (error && error != -EOPNOTSUPP)
                 goto out_iput;
@@@ -3241,7 -3229,7 +3241,7 @@@
                 goto out_iput;
   
         dir->i_size += BOGO_DIRENT_SIZE;
- -      dir->i_mtime = inode_set_ctime_current(dir);
+ +      inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
         inode_inc_iversion(dir);
         d_instantiate(dentry, inode);
         dget(dentry); /* Extra count - pin the dentry in core */
@@@ -3260,11 -3248,14 +3260,11 @@@ shmem_tmpfile(struct mnt_idmap *idmap, 
         int error;
   
         inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
- -
         if (IS_ERR(inode)) {
                 error = PTR_ERR(inode);
                 goto err_out;
         }
- -
- -      error = security_inode_init_security(inode, dir,
- -                                           NULL,
+ +      error = security_inode_init_security(inode, dir, NULL,
                                              shmem_initxattrs, NULL);
         if (error && error != -EOPNOTSUPP)
                 goto out_iput;
@@@ -3301,8 -3292,7 +3301,8 @@@ static int shmem_create(struct mnt_idma
   /*
    * Link a file..
    */
- -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+ +static int shmem_link(struct dentry *old_dentry, struct inode *dir,
+ +                    struct dentry *dentry)
   {
         struct inode *inode = d_inode(old_dentry);
         int ret = 0;
@@@ -3328,12 -3318,12 +3328,12 @@@
         }
   
         dir->i_size += BOGO_DIRENT_SIZE;
- -      dir->i_mtime = inode_set_ctime_to_ts(dir,
- -                                           inode_set_ctime_current(inode));
+ +      inode_set_mtime_to_ts(dir,
+ +                            inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
         inode_inc_iversion(dir);
         inc_nlink(inode);
         ihold(inode);   /* New dentry reference */
- -      dget(dentry);           /* Extra pinning count for the created dentry */
+ +      dget(dentry);   /* Extra pinning count for the created dentry */
         d_instantiate(dentry, inode);
   out:
         return ret;
@@@ -3349,11 -3339,11 +3349,11 @@@ static int shmem_unlink(struct inode *d
         simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
   
         dir->i_size -= BOGO_DIRENT_SIZE;
- -      dir->i_mtime = inode_set_ctime_to_ts(dir,
- -                                           inode_set_ctime_current(inode));
+ +      inode_set_mtime_to_ts(dir,
+ +                            inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
         inode_inc_iversion(dir);
         drop_nlink(inode);
- -      dput(dentry);   /* Undo the count from "create" - this does all the work */
+ +      dput(dentry);   /* Undo the count from "create" - does all the work */
         return 0;
   }
   
@@@ -3463,6 -3453,7 +3463,6 @@@ static int shmem_symlink(struct mnt_idm
   
         inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
                                 VM_NORESERVE);
- -
         if (IS_ERR(inode))
                 return PTR_ERR(inode);
   
@@@ -3497,7 -3488,7 +3497,7 @@@
                 folio_put(folio);
         }
         dir->i_size += BOGO_DIRENT_SIZE;
- -      dir->i_mtime = inode_set_ctime_current(dir);
+ +      inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
         inode_inc_iversion(dir);
         d_instantiate(dentry, inode);
         dget(dentry);
@@@ -3516,7 -3507,8 +3516,7 @@@ static void shmem_put_link(void *arg
         folio_put(arg);
   }
   
- -static const char *shmem_get_link(struct dentry *dentry,
- -                                struct inode *inode,
+ +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
                                   struct delayed_call *done)
   {
         struct folio *folio = NULL;
@@@ -3590,7 -3582,8 +3590,7 @@@ static int shmem_fileattr_set(struct mn
    * Callback for security_inode_init_security() for acquiring xattrs.
    */
   static int shmem_initxattrs(struct inode *inode,
- -                          const struct xattr *xattr_array,
- -                          void *fs_info)
+ +                          const struct xattr *xattr_array, void *fs_info)
   {
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@@ -3721,7 -3714,7 +3721,7 @@@ static const struct xattr_handler shmem
         .set = shmem_xattr_handler_set,
   };
   
- -static const struct xattr_handler *shmem_xattr_handlers[] = {
+ +static const struct xattr_handler * const shmem_xattr_handlers[] = {
         &shmem_security_xattr_handler,
         &shmem_trusted_xattr_handler,
         &shmem_user_xattr_handler,
@@@ -3774,6 -3767,7 +3774,6 @@@ static struct dentry *shmem_find_alias(
         return alias ?: d_find_any_alias(inode);
   }
   
- -
   static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
                 struct fid *fid, int fh_len, int fh_type)
   {
@@@ -4357,8 -4351,8 +4357,8 @@@ static int shmem_fill_super(struct supe
         }
   #endif /* CONFIG_TMPFS_QUOTA */
   
- -      inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0,
- -                              VM_NORESERVE);
+ +      inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
+ +                              S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
         if (IS_ERR(inode)) {
                 error = PTR_ERR(inode);
                 goto failed;
@@@ -4400,7 -4394,7 +4400,7 @@@ static const struct fs_context_operatio
   #endif
   };
   
- static struct kmem_cache *shmem_inode_cachep;
+ static struct kmem_cache *shmem_inode_cachep __ro_after_init;
   
   static struct inode *shmem_alloc_inode(struct super_block *sb)
   {
@@@ -4432,14 -4426,14 +4432,14 @@@ static void shmem_init_inode(void *foo
         inode_init_once(&info->vfs_inode);
   }
   
- static void shmem_init_inodecache(void)
+ static void __init shmem_init_inodecache(void)
   {
         shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                 sizeof(struct shmem_inode_info),
                                 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
   }
   
- static void shmem_destroy_inodecache(void)
+ static void __init shmem_destroy_inodecache(void)
   {
         kmem_cache_destroy(shmem_inode_cachep);
   }
@@@ -4591,7 -4585,11 +4591,7 @@@ static struct file_system_type shmem_fs
         .parameters     = shmem_fs_parameters,
   #endif
         .kill_sb        = kill_litter_super,
- -#ifdef CONFIG_SHMEM
         .fs_flags       = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
- -#else
- -      .fs_flags       = FS_USERNS_MOUNT,
- -#endif
   };
   
   void __init shmem_init(void)
@@@ -4657,9 -4655,11 +4657,9 @@@ static ssize_t shmem_enabled_show(struc
   
         for (i = 0; i < ARRAY_SIZE(values); i++) {
                 len += sysfs_emit_at(buf, len,
- -                                   shmem_huge == values[i] ? "%s[%s]" : "%s%s",
- -                                   i ? " " : "",
- -                                   shmem_format_huge(values[i]));
+ +                              shmem_huge == values[i] ? "%s[%s]" : "%s%s",
+ +                              i ? " " : "", shmem_format_huge(values[i]));
         }
- -
         len += sysfs_emit_at(buf, len, "\n");
   
         return len;
@@@ -4756,9 -4756,8 +4756,9 @@@ EXPORT_SYMBOL_GPL(shmem_truncate_range)
   #define shmem_acct_size(flags, size)          0
   #define shmem_unacct_size(flags, size)                do {} while (0)
   
- -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir,
- -                                          umode_t mode, dev_t dev, unsigned long flags)
+ +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+ +                              struct super_block *sb, struct inode *dir,
+ +                              umode_t mode, dev_t dev, unsigned long flags)
   {
         struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
         return inode ? inode : ERR_PTR(-ENOSPC);
@@@ -4768,8 -4767,8 +4768,8 @@@
   
   /* common code */
   
- -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
- -                                     unsigned long flags, unsigned int i_flags)
+ +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
+ +                      loff_t size, unsigned long flags, unsigned int i_flags)
   {
         struct inode *inode;
         struct file *res;
@@@ -4788,6 -4787,7 +4788,6 @@@
   
         inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
                                 S_IFREG | S_IRWXUGO, 0, flags);
- -
         if (IS_ERR(inode)) {
                 shmem_unacct_size(flags, size);
                 return ERR_CAST(inode);
@@@ -4897,7 -4897,7 +4897,7 @@@ struct folio *shmem_read_folio_gfp(stru
   
         BUG_ON(!shmem_mapping(mapping));
         error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
- -                                gfp, NULL, NULL, NULL);
+ +                                  gfp, NULL, NULL);
         if (error)
                 return ERR_PTR(error);
   
diff --combined net/core/pktgen.c

index 8afcfadf8d5a28d2e195484f9dd75194c03cf45f,826250a0f5b16e9fea1432faef56967a360f53a5..57cea67b75624696dd521904fb6eb36864c1e7eb
--- 1/net/core/pktgen.c
--- 2/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@@ -200,7 -200,6 +200,7 @@@
         pf(VID_RND)             /* Random VLAN ID */                    \
         pf(SVID_RND)            /* Random SVLAN ID */                   \
         pf(NODE)                /* Node memory alloc*/                  \
+ +      pf(SHARED)              /* Shared SKB */                        \
   
   #define pf(flag)              flag##_SHIFT,
   enum pkt_flags {
@@@ -670,19 -669,19 +670,19 @@@ static int pktgen_if_show(struct seq_fi
         seq_puts(seq, "     Flags: ");
   
         for (i = 0; i < NR_PKT_FLAGS; i++) {
- -              if (i == F_FLOW_SEQ)
+ +              if (i == FLOW_SEQ_SHIFT)
                         if (!pkt_dev->cflows)
                                 continue;
   
- -              if (pkt_dev->flags & (1 << i))
+ +              if (pkt_dev->flags & (1 << i)) {
                         seq_printf(seq, "%s  ", pkt_flag_names[i]);
- -              else if (i == F_FLOW_SEQ)
- -                      seq_puts(seq, "FLOW_RND  ");
- -
   #ifdef CONFIG_XFRM
- -              if (i == F_IPSEC && pkt_dev->spi)
- -                      seq_printf(seq, "spi:%u", pkt_dev->spi);
+ +                      if (i == IPSEC_SHIFT && pkt_dev->spi)
+ +                              seq_printf(seq, "spi:%u  ", pkt_dev->spi);
   #endif
+ +              } else if (i == FLOW_SEQ_SHIFT) {
+ +                      seq_puts(seq, "FLOW_RND  ");
+ +              }
         }
   
         seq_puts(seq, "\n");
@@@ -1199,8 -1198,7 +1199,8 @@@ static ssize_t pktgen_if_write(struct f
                     ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
                      !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
                         return -ENOTSUPP;
- -              if (value > 0 && pkt_dev->n_imix_entries > 0)
+ +              if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ +                                !(pkt_dev->flags & F_SHARED)))
                         return -EINVAL;
   
                 i += len;
@@@ -1259,10 -1257,6 +1259,10 @@@
                      ((pkt_dev->xmit_mode == M_START_XMIT) &&
                      (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
                         return -ENOTSUPP;
+ +
+ +              if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ +                      return -EINVAL;
+ +
                 pkt_dev->burst = value < 1 ? 1 : value;
                 sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
                 return count;
@@@ -1324,10 -1318,9 +1324,10 @@@
                 return count;
         }
         if (!strcmp(name, "flag")) {
+ +              bool disable = false;
                 __u32 flag;
                 char f[32];
- -              bool disable = false;
+ +              char *end;
   
                 memset(f, 0, 32);
                 len = strn_len(&user_buffer[i], sizeof(f) - 1);
@@@ -1339,42 -1332,28 +1339,42 @@@
                 i += len;
   
                 flag = pktgen_read_flag(f, &disable);
- -
                 if (flag) {
- -                      if (disable)
+ +                      if (disable) {
+ +                              /* If "clone_skb", or "burst" parameters are
+ +                               * configured, it means that the skb still
+ +                               * needs to be referenced by the pktgen, so
+ +                               * the skb must be shared.
+ +                               */
+ +                              if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ +                                                       pkt_dev->burst > 1))
+ +                                      return -EINVAL;
                                 pkt_dev->flags &= ~flag;
- -                      else
+ +                      } else {
                                 pkt_dev->flags |= flag;
- -              } else {
- -                      sprintf(pg_result,
- -                              "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- -                              f,
- -                              "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- -                              "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
- -                              "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
- -                              "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
- -                              "NO_TIMESTAMP, "
- -#ifdef CONFIG_XFRM
- -                              "IPSEC, "
- -#endif
- -                              "NODE_ALLOC\n");
+ +                      }
+ +
+ +                      sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
                         return count;
                 }
- -              sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+ +
+ +              /* Unknown flag */
+ +              end = pkt_dev->result + sizeof(pkt_dev->result);
+ +              pg_result += sprintf(pg_result,
+ +                      "Flag -:%s:- unknown\n"
+ +                      "Available flags, (prepend ! to un-set flag):\n", f);
+ +
+ +              for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ +                      if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ +                              continue;
+ +                      pg_result += snprintf(pg_result, end - pg_result,
+ +                                            "%s, ", pkt_flag_names[n]);
+ +              }
+ +              if (!WARN_ON_ONCE(pg_result >= end)) {
+ +                      /* Remove the comma and whitespace at the end */
+ +                      *(pg_result - 2) = '\0';
+ +              }
+ +
                 return count;
         }
         if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
@@@ -3461,24 -3440,12 +3461,24 @@@ static void pktgen_wait_for_skb(struct 
   
   static void pktgen_xmit(struct pktgen_dev *pkt_dev)
   {
- -      unsigned int burst = READ_ONCE(pkt_dev->burst);
+ +      bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
         struct net_device *odev = pkt_dev->odev;
         struct netdev_queue *txq;
+ +      unsigned int burst = 1;
         struct sk_buff *skb;
+ +      int clone_skb = 0;
         int ret;
   
+ +      /* If 'skb_shared' is false, the read of possible
+ +       * new values (if any) for 'burst' and 'clone_skb' will be skipped to
+ +       * prevent some concurrent changes from slipping in. And the stabilized
+ +       * config will be read in during the next run of pktgen_xmit.
+ +       */
+ +      if (skb_shared) {
+ +              burst = READ_ONCE(pkt_dev->burst);
+ +              clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ +      }
+ +
         /* If device is offline, then don't send */
         if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
                 pktgen_stop_device(pkt_dev);
@@@ -3495,7 -3462,7 +3495,7 @@@
   
         /* If no skb or clone count exhausted then get new one */
         if (!pkt_dev->skb || (pkt_dev->last_ok &&
- -                            ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ +                            ++pkt_dev->clone_count >= clone_skb)) {
                 /* build a new pkt */
                 kfree_skb(pkt_dev->skb);
   
@@@ -3516,8 -3483,7 +3516,8 @@@
         if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
                 skb = pkt_dev->skb;
                 skb->protocol = eth_type_trans(skb, skb->dev);
- -              refcount_add(burst, &skb->users);
+ +              if (skb_shared)
+ +                      refcount_add(burst, &skb->users);
                 local_bh_disable();
                 do {
                         ret = netif_receive_skb(skb);
@@@ -3525,10 -3491,6 +3525,10 @@@
                                 pkt_dev->errors++;
                         pkt_dev->sofar++;
                         pkt_dev->seq_num++;
+ +                      if (unlikely(!skb_shared)) {
+ +                              pkt_dev->skb = NULL;
+ +                              break;
+ +                      }
                         if (refcount_read(&skb->users) != burst) {
                                 /* skb was queued by rps/rfs or taps,
                                  * so cannot reuse this skb
@@@ -3547,14 -3509,9 +3547,14 @@@
                 goto out; /* Skips xmit_mode M_START_XMIT */
         } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
                 local_bh_disable();
- -              refcount_inc(&pkt_dev->skb->users);
+ +              if (skb_shared)
+ +                      refcount_inc(&pkt_dev->skb->users);
   
                 ret = dev_queue_xmit(pkt_dev->skb);
+ +
+ +              if (!skb_shared && dev_xmit_complete(ret))
+ +                      pkt_dev->skb = NULL;
+ +
                 switch (ret) {
                 case NET_XMIT_SUCCESS:
                         pkt_dev->sofar++;
@@@ -3592,15 -3549,11 +3592,15 @@@
                 pkt_dev->last_ok = 0;
                 goto unlock;
         }
- -      refcount_add(burst, &pkt_dev->skb->users);
+ +      if (skb_shared)
+ +              refcount_add(burst, &pkt_dev->skb->users);
   
   xmit_more:
         ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
   
+ +      if (!skb_shared && dev_xmit_complete(ret))
+ +              pkt_dev->skb = NULL;
+ +
         switch (ret) {
         case NETDEV_TX_OK:
                 pkt_dev->last_ok = 1;
@@@ -3622,8 -3575,7 +3622,8 @@@
                 fallthrough;
         case NETDEV_TX_BUSY:
                 /* Retry it next time */
- -              refcount_dec(&(pkt_dev->skb->users));
+ +              if (skb_shared)
+ +                      refcount_dec(&pkt_dev->skb->users);
                 pkt_dev->last_ok = 0;
         }
         if (unlikely(burst))
@@@ -3636,8 -3588,7 +3636,8 @@@ out
   
         /* If pkt_dev->count is zero, then run forever */
         if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- -              pktgen_wait_for_skb(pkt_dev);
+ +              if (pkt_dev->skb)
+ +                      pktgen_wait_for_skb(pkt_dev);
   
                 /* Done with this */
                 pktgen_stop_device(pkt_dev);
@@@ -3820,7 -3771,6 +3820,7 @@@ static int pktgen_add_device(struct pkt
         pkt_dev->svlan_id = 0xffff;
         pkt_dev->burst = 1;
         pkt_dev->node = NUMA_NO_NODE;
+ +      pkt_dev->flags = F_SHARED;      /* SKB shared by default */
   
         err = pktgen_setup_dev(t->net, pkt_dev, ifname);
         if (err)
@@@ -4032,8 -3982,7 +4032,7 @@@ static void __net_exit pg_net_exit(stru
         list_for_each_safe(q, n, &list) {
                 t = list_entry(q, struct pktgen_thread, th_list);
                 list_del(&t->th_list);
-               kthread_stop(t->tsk);
-               put_task_struct(t->tsk);
+               kthread_stop_put(t->tsk);
                 kfree(t);
         }
   
diff --combined security/integrity/iint.c

index 27ea19fb1f54c7b0ec384ab66a6dea10977ddbd5,5b1c2de6cc54617bf82b6b4b8c9eabfd97e02b53..d4419a2a1e24be840d71b5ca8927c3d52f9704f7
--- 1/security/integrity/iint.c
--- 2/security/integrity/iint.c
+++ b/security/integrity/iint.c
@@@ -23,7 -23,7 +23,7 @@@
   
   static struct rb_root integrity_iint_tree = RB_ROOT;
   static DEFINE_RWLOCK(integrity_iint_lock);
- static struct kmem_cache *iint_cache __read_mostly;
+ static struct kmem_cache *iint_cache __ro_after_init;
   
   struct dentry *integrity_dir;
   
@@@ -66,32 -66,9 +66,32 @@@ struct integrity_iint_cache *integrity_
         return iint;
   }
   
- -static void iint_free(struct integrity_iint_cache *iint)
+ +#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH+1)
+ +
+ +/*
+ + * It is not clear that IMA should be nested at all, but as long as it measures
+ + * files both on overlayfs and on underlying fs, we need to annotate the iint
+ + * mutex to avoid lockdep false positives related to IMA + overlayfs.
+ + * See ovl_lockdep_annotate_inode_mutex_key() for more details.
+ + */
+ +static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint,
+ +                                       struct inode *inode)
+ +{
+ +#ifdef CONFIG_LOCKDEP
+ +      static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING];
+ +
+ +      int depth = inode->i_sb->s_stack_depth;
+ +
+ +      if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING))
+ +              depth = 0;
+ +
+ +      lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]);
+ +#endif
+ +}
+ +
+ +static void iint_init_always(struct integrity_iint_cache *iint,
+ +                           struct inode *inode)
   {
- -      kfree(iint->ima_hash);
         iint->ima_hash = NULL;
         iint->version = 0;
         iint->flags = 0UL;
@@@ -103,14 -80,6 +103,14 @@@
         iint->ima_creds_status = INTEGRITY_UNKNOWN;
         iint->evm_status = INTEGRITY_UNKNOWN;
         iint->measured_pcrs = 0;
+ +      mutex_init(&iint->mutex);
+ +      iint_lockdep_annotate(iint, inode);
+ +}
+ +
+ +static void iint_free(struct integrity_iint_cache *iint)
+ +{
+ +      kfree(iint->ima_hash);
+ +      mutex_destroy(&iint->mutex);
         kmem_cache_free(iint_cache, iint);
   }
   
@@@ -135,8 -104,6 +135,8 @@@ struct integrity_iint_cache *integrity_
         if (!iint)
                 return NULL;
   
+ +      iint_init_always(iint, inode);
+ +
         write_lock(&integrity_iint_lock);
   
         p = &integrity_iint_tree.rb_node;
@@@ -186,18 -153,25 +186,18 @@@ void integrity_inode_free(struct inode 
         iint_free(iint);
   }
   
- -static void init_once(void *foo)
+ +static void iint_init_once(void *foo)
   {
         struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo;
   
         memset(iint, 0, sizeof(*iint));
- -      iint->ima_file_status = INTEGRITY_UNKNOWN;
- -      iint->ima_mmap_status = INTEGRITY_UNKNOWN;
- -      iint->ima_bprm_status = INTEGRITY_UNKNOWN;
- -      iint->ima_read_status = INTEGRITY_UNKNOWN;
- -      iint->ima_creds_status = INTEGRITY_UNKNOWN;
- -      iint->evm_status = INTEGRITY_UNKNOWN;
- -      mutex_init(&iint->mutex);
   }
   
   static int __init integrity_iintcache_init(void)
   {
         iint_cache =
             kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache),
- -                            0, SLAB_PANIC, init_once);
+ +                            0, SLAB_PANIC, iint_init_once);
         return 0;
   }
   DEFINE_LSM(integrity) = {
diff --combined tools/testing/selftests/mm/run_vmtests.sh

index bf4c4cd46600e3a709f3993373be19d3a2e2e8c8,7d31718ce8343db04ad51351cc862020b7698d28..cc16f6ca85333225004f06d5e7083700dda1d8c9
--- 1/tools/testing/selftests/mm/run_vmtests.sh
--- 2/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@@ -56,8 -56,6 +56,8 @@@ separated by spaces
         memory protection key tests
   - soft_dirty
         test soft dirty page bit semantics
+ +- pagemap
+ +      test pagemap_scan IOCTL
   - cow
         test copy-on-write semantics
   - thp
@@@ -223,10 -221,6 +223,10 @@@ CATEGORY="hugetlb" run_test ./hugepage-
   CATEGORY="hugetlb" run_test ./hugepage-vmemmap
   CATEGORY="hugetlb" run_test ./hugetlb-madvise
   
+ +# For this test, we need one and just one huge page
+ +echo 1 > /proc/sys/vm/nr_hugepages
+ +CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv
+ +
   if test_selected "hugetlb"; then
         echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
         echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
@@@ -309,6 -303,7 +309,7 @@@ CATEGORY="hmm" run_test bash ./test_hmm
   # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
   CATEGORY="madv_populate" run_test ./madv_populate
   
+ echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope
   CATEGORY="memfd_secret" run_test ./memfd_secret
   
   # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
@@@ -348,8 -343,6 +349,8 @@@ the
         CATEGORY="soft_dirty" run_test ./soft-dirty
   fi
   
+ +CATEGORY="pagemap" run_test ./pagemap_ioctl
+ +
   # COW tests
   CATEGORY="cow" run_test ./cow
author	Linus Torvalds <[email protected]>
	Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
committer	Linus Torvalds <[email protected]>
	Fri, 3 Nov 2023 06:53:31 +0000 (20:53 -1000)
		1	2
.mailmap	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm64/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm64/mm/init.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/loongarch/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/riscv/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/bdev.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/accel/ivpu/ivpu_job.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/gpu/drm/i915/gt/selftest_migrate.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/net/xen-netback/interface.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/hisi_sas/hisi_sas_v3_hw.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/buffer.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/char_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/dcache.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/file_table.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/kernfs/mount.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/locks.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/namespace.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/notify/dnotify/dnotify.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/notify/fanotify/fanotify_user.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/dlmfs/dlmfs.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ocfs2/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/pipe.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/base.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/task_mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/userfaultfd.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/crash_core.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fortify-string.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched/signal.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/init_task.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/audit_tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/kthread.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/signal.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sys.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/user_namespace.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/workqueue.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/damon/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/khugepaged.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/shmem.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/core/pktgen.c	patch \|	diff1 \|	diff2 \|	blob \| history
security/integrity/iint.c	patch \|	diff1 \|	diff2 \|	blob \| history
tools/testing/selftests/mm/run_vmtests.sh	patch \|	diff1 \|	diff2 \|	blob \| history