#ifndef _ASM_X86_FPU_H
#define _ASM_X86_FPU_H
+ #include <asm/page_types.h>
+
/*
* The legacy x87 FPU state format, as saved by FSAVE and
* restored by the FRSTOR instructions:
*
* This master permission field is only to be used when
* task.fpu.fpstate based checks fail to validate whether the task
- * is allowed to expand it's xfeatures set which requires to
+ * is allowed to expand its xfeatures set which requires to
* allocate a larger sized fpstate buffer.
*
* Do not access this field directly. Use the provided helper
#include <asm/paravirt_types.h>
+ #ifndef __ASSEMBLY__
+ struct mm_struct;
+ #endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
static __always_inline unsigned long read_cr2(void)
{
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
- "mov %%cr2, %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr2, %%rax;", ALT_NOT_XEN);
}
static __always_inline void write_cr2(unsigned long x)
static inline unsigned long __read_cr3(void)
{
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_ALT_VCALL1(mmu.write_cr3, x,
- "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
}
static inline void __write_cr4(unsigned long x)
static __always_inline void wbinvd(void)
{
- PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}
static inline u64 paravirt_read_msr(unsigned msr)
static inline pte_t __pte(pteval_t val)
{
return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pteval_t pte_val(pte_t pte)
{
return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline pgd_t __pgd(pgdval_t val)
{
return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
static inline pmd_t __pmd(pmdval_t val)
{
return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void set_pud(pud_t *pudp, pud_t pud)
pudval_t ret;
ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void pud_clear(pud_t *pudp)
static inline p4d_t __p4d(p4dval_t val)
{
p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
static __always_inline unsigned long arch_local_save_flags(void)
{
return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_disable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_enable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}
static __always_inline unsigned long arch_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
-#define DEFINE_PARAVIRT_ASM(func, instr, sec) \
- asm (".pushsection " #sec ", \"ax\"\n" \
- ".global " #func "\n\t" \
- ".type " #func ", @function\n\t" \
- ASM_FUNC_ALIGN "\n" \
- #func ":\n\t" \
- ASM_ENDBR \
- instr "\n\t" \
- ASM_RET \
- ".size " #func ", . - " #func "\n\t" \
- ".popsection")
-
extern void default_banner(void);
void native_pv_lock_init(void) __init;
#else /* __ASSEMBLY__ */
-#define _PVSITE(ptype, ops, word, algn) \
-771:; \
- ops; \
-772:; \
- .pushsection .parainstructions,"a"; \
- .align algn; \
- word 771b; \
- .byte ptype; \
- .byte 772b-771b; \
- _ASM_ALIGN; \
- .popsection
-
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
-#define PARA_PATCH(off) ((off) / 8)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
- ANNOTATE_RETPOLINE_SAFE;
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
.endm
-#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
- ALT_NOT(X86_FEATURE_XENPV)
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \
+ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \
+ "pushf; pop %rax;", ALT_NOT_XEN
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
#ifndef _ASM_X86_PARAVIRT_TYPES_H
#define _ASM_X86_PARAVIRT_TYPES_H
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 type; /* type of this instruction */
- u8 len; /* length of original instruction */
-};
-#endif
-
#ifdef CONFIG_PARAVIRT
#ifndef __ASSEMBLY__
+ #include <linux/types.h>
#include <asm/desc_defs.h>
#include <asm/pgtable_types.h>
extern struct pv_info pv_info;
extern struct paravirt_patch_template pv_ops;
-#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op) \
- [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "m" (pv_ops.op)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type) \
- "771:\n\t" insn_string "\n" "772:\n" \
- ".pushsection .parainstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR " 771b\n" \
- " .byte " type "\n" \
- " .byte 772b-771b\n" \
- _ASM_ALIGN "\n" \
- ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]")
-
-/* Simple instruction patching code. */
-#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
+#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op)
int paravirt_disable_iospace(void);
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
+/* This generates an indirect call based on the operation type number. */
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
* However, x86_64 also has to clobber all caller saved registers, which
* unfortunately, are quite a bit (r8 - r11)
*
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
* Unfortunately there's no way to get gcc to generate the args setup
* for the call, and then allow the call itself to be generated by an
* inline asm. Because of this, we must do the complete arg setup and
__mask & __eax; \
})
-
+/*
+ * Use alternative patching for paravirt calls:
+ * - For replacing an indirect call with a direct one, use the "normal"
+ * ALTERNATIVE() macro with the indirect call as the initial code sequence,
+ * which will be replaced with the related direct call by using the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ * - In case the replacement is either a direct call or a short code sequence
+ * depending on a feature bit, the ALTERNATIVE_2() macro is being used.
+ * The indirect call is the initial code sequence again, while the special
+ * code sequence is selected with the specified feature bit. In case the
+ * feature is not active, the direct call is used as above via the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ */
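/*
 * Hedged illustration (editor's sketch, not part of the header): a pv_ops
 * call site emitted by the macros below starts life as an indirect call and
 * is rewritten by the alternatives patcher at boot. The instruction
 * sequences are only meant to convey the idea:
 *
 *	call *pv_ops+PV_IRQ_save_fl(%rip)	// initial indirect call
 *	  -> call <current pv_ops.irq.save_fl>	// via ALT_CALL_INSTR/ALT_CALL_ALWAYS
 *	  -> pushf; pop %rax			// when X86_FEATURE_XENPV is not set
 */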
#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \
+ ALT_CALL_ALWAYS) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
- alt, cond) \
+ asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \
+ ALT_CALL_INSTR, ALT_CALL_ALWAYS, \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-void _paravirt_nop(void);
-void paravirt_BUG(void);
unsigned long paravirt_ret0(void);
#ifdef CONFIG_PARAVIRT_XXL
u64 _paravirt_ident_64(u64);
unsigned long pv_native_read_cr2(void);
#endif
-#define paravirt_nop ((void *)_paravirt_nop)
-
-extern struct paravirt_patch_site __parainstructions[],
- __parainstructions_end[];
+#define paravirt_nop ((void *)nop_func)
#endif /* __ASSEMBLY__ */
+
+#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV)
+
#endif /* CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
+ #include <linux/rseq.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
*/
- t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
if (t->fs == p->fs)
n_fs++;
}
unsigned long usage_mask;
const struct lock_trace *usage_traces[LOCK_TRACE_STATES];
+ const char *name;
/*
* Generation counter, when doing certain classes of graph walking,
* to ensure that we check one node only once:
*/
int name_version;
- const char *name;
u8 wait_type_inner;
u8 wait_type_outer;
struct pin_cookie { unsigned int val; };
+ #define MAX_LOCKDEP_KEYS_BITS 13
+ #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
+ #define INITIAL_CHAIN_KEY -1
+
+ struct held_lock {
+ /*
+ * One-way hash of the dependency chain up to this point. We
+ * hash the hashes step by step as the dependency chain grows.
+ *
+ * We use it for dependency-caching and we skip detection
+ * passes and dependency-updates if there is a cache-hit, so
+ * it is absolutely critical for 100% coverage of the validator
+ * to have a unique key value for every unique dependency path
+ * that can occur in the system, to make a unique hash value
+ * as likely as possible - hence the 64-bit width.
+ *
+ * The task struct holds the current hash value (initialized
+ * with zero), here we store the previous hash value:
+ */
+ u64 prev_chain_key;
+ unsigned long acquire_ip;
+ struct lockdep_map *instance;
+ struct lockdep_map *nest_lock;
+ #ifdef CONFIG_LOCK_STAT
+ u64 waittime_stamp;
+ u64 holdtime_stamp;
+ #endif
+ /*
+ * class_idx is zero-indexed; it points to the element in
+ * lock_classes this held lock instance belongs to. class_idx is in
+ * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+ */
+ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS;
+ /*
+ * The lock-stack is unified in that the lock chains of interrupt
+ * contexts nest on top of process context chains, but we 'separate'
+ * the hashes by starting with 0 if we cross into an interrupt
+ * context, and we also do not add cross-context lock
+ * dependencies - the lock usage graph walking covers that area
+ * anyway, and we'd just unnecessarily increase the number of
+ * dependencies otherwise. [Note: hardirq and softirq contexts
+ * are separated from each other too.]
+ *
+ * The following field is used to detect when we cross into an
+ * interrupt context:
+ */
+ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
+ unsigned int trylock:1; /* 16 bits */
+
+ unsigned int read:2; /* see lock_acquire() comment */
+ unsigned int check:1; /* see lock_acquire() comment */
+ unsigned int hardirqs_off:1;
+ unsigned int sync:1;
+ unsigned int references:11; /* 32 bits */
+ unsigned int pin_count;
+ };
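/*
 * Hedged sketch of the chain-key scheme described in the comment above
 * (the real mixing is done by iterate_chain_key() in
 * kernel/locking/lockdep.c; hash_mix() is only a stand-in name):
 *
 *	u64 chain_key = INITIAL_CHAIN_KEY;
 *
 *	for each lock acquired, in order:
 *		hlock->prev_chain_key = chain_key;
 *		chain_key = hash_mix(chain_key, class index of hlock);
 */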
+
#else /* !CONFIG_LOCKDEP */
/*
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
#include <linux/cleanup.h>
+ #include <linux/mutex_types.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
#ifndef CONFIG_PREEMPT_RT
- /*
- * Simple, straightforward mutexes with strict semantics:
- *
- * - only one task can hold the mutex at a time
- * - only the owner can unlock the mutex
- * - multiple unlocks are not permitted
- * - recursive locking is not permitted
- * - a mutex object must be initialized via the API
- * - a mutex object must not be initialized via memset or copying
- * - task may not exit with mutex held
- * - memory areas where held locks reside must not be freed
- * - held mutexes must not be reinitialized
- * - mutexes may not be used in hardware or software interrupt
- * contexts such as tasklets and timers
- *
- * These semantics are fully enforced when DEBUG_MUTEXES is
- * enabled. Furthermore, besides enforcing the above rules, the mutex
- * debugging code also implements a number of additional features
- * that make lock debugging easier and faster:
- *
- * - uses symbolic names of mutexes, whenever they are printed in debug output
- * - point-of-acquire tracking, symbolic lookup of function names
- * - list of all locks held in the system, printout of them
- * - owner tracking
- * - detects self-recursing locks and prints out all relevant info
- * - detects multi-task circular deadlocks and prints out all affected
- * locks and tasks (and only those tasks)
- */
- struct mutex {
- atomic_long_t owner;
- raw_spinlock_t wait_lock;
- #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_queue osq; /* Spinner MCS lock */
- #endif
- struct list_head wait_list;
- #ifdef CONFIG_DEBUG_MUTEXES
- void *magic;
- #endif
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
- #endif
- };
-
#ifdef CONFIG_DEBUG_MUTEXES
#define __DEBUG_MUTEX_INITIALIZER(lockname) \
/*
* Preempt-RT variant based on rtmutexes.
*/
- #include <linux/rtmutex.h>
-
- struct mutex {
- struct rt_mutex_base rtmutex;
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
- #endif
- };
#define __MUTEX_INITIALIZER(mutexname) \
{ \
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
-DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T))
+DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
+DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)
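/*
 * Hedged usage sketch of the conditional guards defined above, assuming the
 * scoped_cond_guard() helper from <linux/cleanup.h>; the function and the
 * data it touches are illustrative only:
 */
static inline int example_bump_counter(struct mutex *lock, int *counter)
{
	scoped_cond_guard(mutex_intr, return -EINTR, lock) {
		(*counter)++;
	}
	return 0;
}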
#endif /* __LINUX_MUTEX_H */
#include <uapi/linux/sched.h>
#include <asm/current.h>
-
- #include <linux/pid.h>
- #include <linux/sem.h>
+ #include <asm/processor.h>
+ #include <linux/thread_info.h>
+ #include <linux/preempt.h>
+ #include <linux/cpumask.h>
+
+ #include <linux/cache.h>
+ #include <linux/irqflags_types.h>
+ #include <linux/smp_types.h>
+ #include <linux/pid_types.h>
+ #include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
- #include <linux/mutex.h>
- #include <linux/plist.h>
- #include <linux/hrtimer.h>
- #include <linux/irqflags.h>
- #include <linux/seccomp.h>
- #include <linux/nodemask.h>
- #include <linux/rcupdate.h>
- #include <linux/refcount.h>
+ #include <linux/mutex_types.h>
+ #include <linux/plist_types.h>
+ #include <linux/hrtimer_types.h>
+ #include <linux/timer_types.h>
+ #include <linux/seccomp_types.h>
+ #include <linux/nodemask_types.h>
+ #include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
- #include <linux/syscall_user_dispatch.h>
+ #include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
- #include <linux/posix-timers.h>
- #include <linux/rseq.h>
- #include <linux/seqlock.h>
+ #include <linux/posix-timers_types.h>
+ #include <linux/restart_block.h>
+ #include <uapi/linux/rseq.h>
+ #include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
+ #include <linux/uidgid_types.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct root_domain;
struct rq;
struct sched_attr;
+struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
+struct task_struct;
struct user_event_mm;
/*
u32 inv_weight;
};
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma: the Exponential Weighted Moving Average (EWMA)
- * utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task: the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
- unsigned int enqueued;
- unsigned int ewma;
-#define UTIL_EST_WEIGHT_SHIFT 2
-#define UTIL_AVG_UNCHANGED 0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
/*
* The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
unsigned long load_avg;
unsigned long runnable_avg;
unsigned long util_avg;
- struct util_est util_est;
+ unsigned int util_est;
} ____cacheline_aligned;
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT 2
+#define UTIL_AVG_UNCHANGED 0x80000000
+
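/*
 * Hedged sketch of the MSB encoding described above (the helpers are
 * illustrative; the real flag handling lives in kernel/sched/fair.c):
 */
static inline unsigned int example_util_est_mark(unsigned int est)
{
	return est | UTIL_AVG_UNCHANGED;	/* flag mapped in at dequeue time */
}

static inline unsigned int example_util_est_value(unsigned int est)
{
	return est & ~UTIL_AVG_UNCHANGED;	/* strip the flag to read the estimate */
}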
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
u64 block_max;
s64 sum_block_runtime;
- u64 exec_max;
+ s64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
struct load_weight load;
struct rb_node run_node;
u64 deadline;
- u64 min_deadline;
+ u64 min_vruntime;
struct list_head group_node;
unsigned int on_rq;
#endif
} __randomize_layout;
+typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
+typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
+
struct sched_dl_entity {
struct rb_node rb_node;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
+ unsigned int dl_server : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
* timer is needed to decrease the active utilization at the correct
* time.
*/
- struct hrtimer inactive_timer;
+ struct hrtimer inactive_timer;
+
+ /*
+ * Bits for DL-server functionality. Also see the comment near
+ * dl_server_update().
+ *
+ * @rq the runqueue this server is for
+ *
+ * @server_has_tasks() returns true if @server_pick returns a
+ * runnable task.
+ */
+ struct rq *rq;
+ dl_server_has_tasks_f server_has_tasks;
+ dl_server_pick_f server_pick;
#ifdef CONFIG_RT_MUTEXES
/*
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
+ struct sched_dl_entity *dl_server;
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
*/
};
- static inline struct pid *task_pid(struct task_struct *task)
- {
- return task->thread_pid;
- }
-
- /*
- * the helpers to get the task's different pids as they are seen
- * from various namespaces
- *
- * task_xid_nr() : global id, i.e. the id seen from the init namespace;
- * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
- * current.
- * task_xid_nr_ns() : id seen from the ns specified;
- *
- * see also pid_nr() etc in include/linux/pid.h
- */
- pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
-
- static inline pid_t task_pid_nr(struct task_struct *tsk)
- {
- return tsk->pid;
- }
-
- static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
- }
-
- static inline pid_t task_pid_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
- }
-
-
- static inline pid_t task_tgid_nr(struct task_struct *tsk)
- {
- return tsk->tgid;
- }
-
- /**
- * pid_alive - check that a task structure is not stale
- * @p: Task structure to be checked.
- *
- * Test if a process is not yet dead (at most zombie state)
- * If pid_alive fails, then pointers within the task structure
- * can be stale and must not be dereferenced.
- *
- * Return: 1 if the process is alive. 0 otherwise.
- */
- static inline int pid_alive(const struct task_struct *p)
- {
- return p->thread_pid != NULL;
- }
-
- static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
- }
-
- static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
- }
-
-
- static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
- }
-
- static inline pid_t task_session_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
- }
-
- static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
- }
-
- static inline pid_t task_tgid_vnr(struct task_struct *tsk)
- {
- return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
- }
-
- static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
- {
- pid_t pid = 0;
-
- rcu_read_lock();
- if (pid_alive(tsk))
- pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
- rcu_read_unlock();
-
- return pid;
- }
-
- static inline pid_t task_ppid_nr(const struct task_struct *tsk)
- {
- return task_ppid_nr_ns(tsk, &init_pid_ns);
- }
-
- /* Obsolete, do not use: */
- static inline pid_t task_pgrp_nr(struct task_struct *tsk)
- {
- return task_pgrp_nr_ns(tsk, &init_pid_ns);
- }
-
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
return task_index_to_char(task_state_index(tsk));
}
- /**
- * is_global_init - check if a task structure is init. Since init
- * is free to have sub-threads we need to check tgid.
- * @tsk: Task structure to be checked.
- *
- * Check if a task structure is the first user space task the kernel created.
- *
- * Return: 1 if the task structure is init. 0 otherwise.
- */
- static inline int is_global_init(struct task_struct *tsk)
- {
- return task_tgid_nr(tsk) == 1;
- }
-
extern struct pid *cad_pid;
/*
void yield(void);
union thread_union {
-#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
struct task_struct task;
-#endif
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
__cond_resched_rwlock_write(lock); \
})
- static inline void cond_resched_rcu(void)
- {
- #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
- #endif
- }
-
#ifdef CONFIG_PREEMPT_DYNAMIC
extern bool preempt_model_none(void);
return preempt_model_full() || preempt_model_rt();
}
- /*
- * Does a critical section need to be broken due to another
- * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
- * but a general need for low latency)
- */
- static inline int spin_needbreak(spinlock_t *lock)
- {
- #ifdef CONFIG_PREEMPTION
- return spin_is_contended(lock);
- #else
- return 0;
- #endif
- }
-
- /*
- * Check if a rwlock is contended.
- * Returns non-zero if there is another task waiting on the rwlock.
- * Returns zero if the lock is not contended or the system / underlying
- * rwlock implementation does not support contention detection.
- * Technically does not depend on CONFIG_PREEMPTION, but a general need
- * for low latency.
- */
- static inline int rwlock_needbreak(rwlock_t *lock)
- {
- #ifdef CONFIG_PREEMPTION
- return rwlock_is_contended(lock);
- #else
- return 0;
- #endif
- }
-
static __always_inline bool need_resched(void)
{
return unlikely(tif_need_resched());
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);
+ #include <linux/spinlock.h>
+
/*
* In order to reduce various lock holder preemption latencies provide an
* interface to see if a vCPU is currently running or not.
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
- #ifdef CONFIG_RSEQ
-
- /*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
- enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
- };
-
- enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
- };
-
- static inline void rseq_set_notify_resume(struct task_struct *t)
- {
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
- }
-
- void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
-
- static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
- }
-
- static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- preempt_enable();
- rseq_handle_notify_resume(ksig, regs);
- }
-
- /* rseq_preempt() requires preemption to be disabled. */
- static inline void rseq_preempt(struct task_struct *t)
- {
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
- }
-
- /* rseq_migrate() requires preemption to be disabled. */
- static inline void rseq_migrate(struct task_struct *t)
- {
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
- }
-
- /*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
- static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
- {
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
- } else {
- t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
- }
- }
-
- static inline void rseq_execve(struct task_struct *t)
- {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
- }
-
- #else
-
- static inline void rseq_set_notify_resume(struct task_struct *t)
- {
- }
- static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- }
- static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
- {
- }
- static inline void rseq_preempt(struct task_struct *t)
- {
- }
- static inline void rseq_migrate(struct task_struct *t)
- {
- }
- static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
- {
- }
- static inline void rseq_execve(struct task_struct *t)
- {
- }
-
- #endif
-
- #ifdef CONFIG_DEBUG_RSEQ
-
- void rseq_syscall(struct pt_regs *regs);
-
- #else
-
- static inline void rseq_syscall(struct pt_regs *regs)
- {
- }
-
- #endif
-
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
+ #include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>
* This is required every time the blocked sigset_t changes.
* callers must hold sighand->siglock.
*/
-extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
+#define for_other_threads(p, t) \
+ for (t = p; (t = next_thread(t)) != p; )
+
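/*
 * Hedged usage sketch for for_other_threads(): walk every thread of @p
 * except @p itself. As with while_each_thread(), the caller is expected to
 * hold the RCU read lock or tasklist_lock; inspect_thread() is illustrative:
 *
 *	struct task_struct *t;
 *
 *	rcu_read_lock();
 *	for_other_threads(p, t)
 *		inspect_thread(t);
 *	rcu_read_unlock();
 */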
#define __for_each_thread(signal, t) \
list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
lockdep_is_held(&tasklist_lock))
* functionality:
*/
+ #include <linux/rcupdate.h>
+ #include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
spin_unlock(&p->alloc_lock);
}
+DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
+
#endif /* _LINUX_SCHED_TASK_H */
return raw_spin_is_contended(&lock->rlock);
}
+ /*
+ * Does a critical section need to be broken due to another
+ * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
+ * but a general need for low latency)
+ */
+ static inline int spin_needbreak(spinlock_t *lock)
+ {
+ #ifdef CONFIG_PREEMPTION
+ return spin_is_contended(lock);
+ #else
+ return 0;
+ #endif
+ }
+
+ /*
+ * Check if a rwlock is contended.
+ * Returns non-zero if there is another task waiting on the rwlock.
+ * Returns zero if the lock is not contended or the system / underlying
+ * rwlock implementation does not support contention detection.
+ * Technically does not depend on CONFIG_PREEMPTION, but a general need
+ * for low latency.
+ */
+ static inline int rwlock_needbreak(rwlock_t *lock)
+ {
+ #ifdef CONFIG_PREEMPTION
+ return rwlock_is_contended(lock);
+ #else
+ return 0;
+ #endif
+ }
+
#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock)
#else /* !CONFIG_PREEMPT_RT */
raw_spin_lock(_T->lock),
raw_spin_unlock(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))
+
DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
raw_spin_unlock(_T->lock))
raw_spin_lock_irq(_T->lock),
raw_spin_unlock_irq(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))
+
DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
raw_spin_lock_irqsave(_T->lock, _T->flags),
raw_spin_unlock_irqrestore(_T->lock, _T->flags),
unsigned long flags)
+DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
+ raw_spin_trylock_irqsave(_T->lock, _T->flags))
+
DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
spin_lock(_T->lock),
spin_unlock(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))
+
DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
spin_lock_irq(_T->lock),
spin_unlock_irq(_T->lock))
+DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
+ spin_trylock_irq(_T->lock))
+
DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
spin_lock_irqsave(_T->lock, _T->flags),
spin_unlock_irqrestore(_T->lock, _T->flags),
unsigned long flags)
+DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
+ spin_trylock_irqsave(_T->lock, _T->flags))
+
+DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
+ read_lock(_T->lock),
+ read_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
+ read_lock_irq(_T->lock),
+ read_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
+ read_lock_irqsave(_T->lock, _T->flags),
+ read_unlock_irqrestore(_T->lock, _T->flags),
+ unsigned long flags)
+
+DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
+ write_lock(_T->lock),
+ write_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
+ write_lock_irq(_T->lock),
+ write_unlock_irq(_T->lock))
+
+DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
+ write_lock_irqsave(_T->lock, _T->flags),
+ write_unlock_irqrestore(_T->lock, _T->flags),
+ unsigned long flags)
+
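/*
 * Hedged usage sketch of the rwlock guards defined above, assuming guard()
 * from <linux/cleanup.h>; the functions and data are illustrative only:
 */
static inline void example_publish(rwlock_t *lock, int *shared, int val)
{
	guard(write_lock_irq)(lock);	/* write_unlock_irq() runs on scope exit */
	*shared = val;
}

static inline int example_snapshot(rwlock_t *lock, const int *shared)
{
	guard(read_lock)(lock);		/* read_unlock() runs on scope exit */
	return *shared;
}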
#undef __LINUX_INSIDE_SPINLOCK_H
#endif /* __LINUX_SPINLOCK_H */
* to detect when we overlook these differences.
*
*/
- #include <linux/types.h>
+ #include <linux/uidgid_types.h>
#include <linux/highuid.h>
struct user_namespace;
extern struct user_namespace init_user_ns;
+struct uid_gid_map;
- typedef struct {
- uid_t val;
- } kuid_t;
-
-
- typedef struct {
- gid_t val;
- } kgid_t;
-
#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }
return from_kgid(ns, gid) != (gid_t) -1;
}
+u32 map_id_down(struct uid_gid_map *map, u32 id);
+u32 map_id_up(struct uid_gid_map *map, u32 id);
+
#else
static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
return gid_valid(gid);
}
+static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+ return id;
+}
+
+static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ return id;
+}
#endif /* CONFIG_USER_NS */
#endif /* _LINUX_UIDGID_H */
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/rcupdate.h>
-
- struct workqueue_struct;
-
- struct work_struct;
- typedef void (*work_func_t)(struct work_struct *work);
- void delayed_work_timer_fn(struct timer_list *t);
+ #include <linux/workqueue_types.h>
/*
* The first word is the work queue pointer and the flags rolled into
#define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
- struct work_struct {
- atomic_long_t data;
- struct list_head entry;
- work_func_t func;
- #ifdef CONFIG_LOCKDEP
- struct lockdep_map lockdep_map;
- #endif
- };
-
#define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
#define WORK_DATA_STATIC_INIT() \
ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs);
-int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
+extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
struct work_struct *work);
#include <linux/audit.h>
#include <linux/numa.h>
#include <linux/scs.h>
+ #include <linux/plist.h>
#include <linux/uaccess.h>
};
#ifdef CONFIG_SHADOW_CALL_STACK
-unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)]
- __init_task_data = {
+unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
[(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
};
#endif
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
*/
-struct task_struct init_task
-#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
- __init_task_data
-#endif
- __aligned(L1_CACHE_BYTES)
-= {
+struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = REFCOUNT_INIT(1),
#include <linux/async.h>
#include <linux/atomic.h>
- #include <linux/ktime.h>
#include <linux/export.h>
- #include <linux/wait.h>
+ #include <linux/ktime.h>
+ #include <linux/pid.h>
#include <linux/sched.h>
#include <linux/slab.h>
+ #include <linux/wait.h>
#include <linux/workqueue.h>
#include "workqueue_internal.h"
wake_up(&async_done);
}
+static async_cookie_t __async_schedule_node_domain(async_func_t func,
+ void *data, int node,
+ struct async_domain *domain,
+ struct async_entry *entry)
+{
+ async_cookie_t newcookie;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
+ INIT_WORK(&entry->work, async_run_entry_fn);
+ entry->func = func;
+ entry->data = data;
+ entry->domain = domain;
+
+ spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
+ newcookie = entry->cookie = next_cookie++;
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
+ atomic_inc(&entry_count);
+ spin_unlock_irqrestore(&async_lock, flags);
+
+ /* schedule for execution */
+ queue_work_node(node, system_unbound_wq, &entry->work);
+
+ return newcookie;
+}
+
/**
* async_schedule_node_domain - NUMA specific version of async_schedule_domain
* @func: function to execute asynchronously
func(data, newcookie);
return newcookie;
}
- INIT_LIST_HEAD(&entry->domain_list);
- INIT_LIST_HEAD(&entry->global_list);
- INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = func;
- entry->data = data;
- entry->domain = domain;
-
- spin_lock_irqsave(&async_lock, flags);
-
- /* allocate cookie and queue */
- newcookie = entry->cookie = next_cookie++;
-
- list_add_tail(&entry->domain_list, &domain->pending);
- if (domain->registered)
- list_add_tail(&entry->global_list, &async_global_pending);
-
- atomic_inc(&entry_count);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* schedule for execution */
- queue_work_node(node, system_unbound_wq, &entry->work);
- return newcookie;
+ return __async_schedule_node_domain(func, data, node, domain, entry);
}
EXPORT_SYMBOL_GPL(async_schedule_node_domain);
}
EXPORT_SYMBOL_GPL(async_schedule_node);
+/**
+ * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
+ * @func: function to execute asynchronously
+ * @dev: device argument to be passed to function
+ *
+ * @dev is used as both the argument for the function and to provide NUMA
+ * context for where to run the function.
+ *
+ * If the asynchronous execution of @func is scheduled successfully, return
+ * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
+ * that will run the function synchronously then.
+ */
+bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
+{
+ struct async_entry *entry;
+
+ entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
+
+ /* Give up if there is no memory or too much work. */
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+ kfree(entry);
+ return false;
+ }
+
+ __async_schedule_node_domain(func, dev, dev_to_node(dev),
+ &async_dfl_domain, entry);
+ return true;
+}
+
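/*
 * Hedged usage sketch (caller and callback are illustrative, not taken from
 * the tree): try the asynchronous variant and fall back to calling the
 * function directly if scheduling fails.
 */
static void example_resume_fn(void *data, async_cookie_t cookie)
{
	struct device *dev = data;

	dev_info(dev, "resumed (cookie %llu)\n", cookie);
}

static void example_resume(struct device *dev)
{
	if (!async_schedule_dev_nocall(example_resume_fn, dev))
		example_resume_fn(dev, 0);	/* run synchronously instead */
}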
/**
* async_synchronize_full - synchronize all asynchronous function calls
*
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
-
#include <linux/uaccess.h>
+
+ #include <uapi/linux/wait.h>
+
#include <asm/unistd.h>
#include <asm/mmu_context.h>
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
- validate_creds_for_do_exit(tsk);
-
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
- validate_creds_for_do_exit(tsk);
exit_task_stack_account(tsk);
check_stack_usage();
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+ #include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
+ #include <linux/rseq.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
}
# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
- unsigned long *stack;
-
- stack = arch_alloc_thread_stack_node(tsk, node);
- tsk->stack = stack;
- return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
- arch_free_thread_stack(tsk);
- tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
int retval;
unsigned long charge = 0;
LIST_HEAD(uf);
- VMA_ITERATOR(old_vmi, oldmm, 0);
VMA_ITERATOR(vmi, mm, 0);
uprobe_start_dup_mmap();
goto out;
khugepaged_fork(mm, oldmm);
- retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
- if (retval)
+ /* Use __mt_dup() to efficiently build an identical maple tree. */
+ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
+ if (unlikely(retval))
goto out;
mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(old_vmi, mpnt) {
+ for_each_vma(vmi, mpnt) {
struct file *file;
vma_start_write(mpnt);
if (mpnt->vm_flags & VM_DONTCOPY) {
+ retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
+ mpnt->vm_end, GFP_KERNEL);
+ if (retval)
+ goto loop_out;
+
vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
continue;
}
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
- /* Link the vma into the MT */
- if (vma_iter_bulk_store(&vmi, tmp))
- goto fail_nomem_vmi_store;
+ /*
+ * Link the vma into the MT. After using __mt_dup(), memory
+ * allocation is not necessary here, so it cannot fail.
+ */
+ vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
- if (retval)
+ if (retval) {
+ mpnt = vma_next(&vmi);
goto loop_out;
+ }
}
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
- if (!retval)
+ if (!retval) {
mt_set_in_rcu(vmi.mas.tree);
+ } else if (mpnt) {
+ /*
+ * The entire maple tree has already been duplicated. If the
+ * mmap duplication fails, mark the failure point with
+ * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
+ * stop releasing VMAs that have not been duplicated after this
+ * point.
+ */
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ }
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
uprobe_end_dup_mmap();
return retval;
-fail_nomem_vmi_store:
- unlink_anon_vmas(tmp);
fail_nomem_anon_vma_fork:
mpol_put(vma_policy(tmp));
fail_nomem_policy:
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
- unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
+ #include <linux/rseq.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
if (cpu == smp_processor_id())
return;
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most archs, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event
+ * and testing of the above solutions didn't appear to report
+ * much benefit.
+ */
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
dequeue_task(rq, p, flags);
}
rq->idle_stamp = 0;
}
#endif
+
+ p->dl_server = NULL;
}
/*
memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p = pick_next_task_idle(rq);
}
+ /*
+ * This is the fast path; it cannot be a DL server pick;
+ * therefore even if @p == @prev, ->dl_server must be NULL.
+ */
+ if (p->dl_server)
+ p->dl_server = NULL;
+
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p)
+ unsigned long *min,
+ unsigned long *max)
{
- unsigned long dl_util, util, irq, max;
+ unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
- max = arch_scale_cpu_capacity(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time from the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
/*
* Because the time spent on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
+ util += cpu_util_dl(rq);
/*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
*/
- if (util + dl_util >= max)
- return max;
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
+ if (util >= scale)
+ return scale;
/*
* There is still idle time; further improve the number by using the
*              max - irq
*   U' = irq + --------- * U
*                 max
*/
- util = scale_irq_capacity(util, irq, max);
+ util = scale_irq_capacity(util, irq, scale);
util += irq;
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
+ #include <linux/rcupdate_wait.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
* ->i_pages lock (try_to_unmap_one)
* ->lruvec->lru_lock (follow_page->mark_page_accessed)
* ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
- * ->private_lock (page_remove_rmap->set_page_dirty)
- * ->i_pages lock (page_remove_rmap->set_page_dirty)
- * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
- * ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock)
+ * ->private_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
+ * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty)
+ * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->block_dirty_folio)
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
struct wait_queue_head *q = folio_waitqueue(folio);
- int ret = 0;
+ int ret;
wait->folio = folio;
wait->bit_nr = PG_locked;
if (nr) {
folio = fbatch->folios[nr - 1];
- *start = folio->index + folio_nr_pages(folio);
+ *start = folio_next_index(folio);
}
out:
rcu_read_unlock();
goto put_folios;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+ /*
+ * Pairs with a barrier in
+ * block_write_end()->mark_buffer_dirty() or other page
+ * dirtying routines like iomap_write_end() to ensure
+ * changes to page contents are visible before we see
+ * increased inode size.
+ */
+ smp_rmb();
+
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
+ #include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/ksm.h>
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_flags_enabled()) {
- if (hugepage_vma_check(vma, vm_flags, false, false, true))
+ if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+ PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
folio_putback_lru(folio);
}
-static void release_pte_page(struct page *page)
-{
- release_pte_folio(page_folio(page));
-}
-
static void release_pte_pages(pte_t *pte, pte_t *_pte,
struct list_head *compound_pagelist)
{
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ struct folio *src_folio;
struct page *src_page;
struct page *tmp;
pte_t *_pte;
}
} else {
src_page = pte_page(pteval);
- if (!PageCompound(src_page))
- release_pte_page(src_page);
+ src_folio = page_folio(src_page);
+ if (!folio_test_large(src_folio))
+ release_pte_folio(src_folio);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
- * inside page_remove_rmap().
+ * inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page, vma, false);
+ folio_remove_rmap_pte(src_folio, src_page, vma);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
if (!vma)
return SCAN_VMA_NULL;
- if (!transhuge_vma_suitable(vma, address))
+ if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
- cc->is_khugepaged))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
+ cc->is_khugepaged, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
* remapped to file after khugepaged reaquired the mmap_lock.
*
- * hugepage_vma_check may return true for qualified file
+ * thp_vma_allowable_order may return true for qualified file
* vmas.
*/
if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
+ struct folio *folio;
struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
int result = SCAN_FAIL;
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
* handled by the anon_vma lock + PG_lock.
+ *
+ * UFFDIO_MOVE is prevented to race as well thanks to the
+ * mmap_lock.
*/
mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
if (unlikely(result != SCAN_SUCCEED))
goto out_up_write;
+ folio = page_folio(hpage);
/*
- * spin_lock() below is not the equivalent of smp_wmb(), but
- * the smp_wmb() inside __SetPageUptodate() can be reused to
- * avoid the copy_huge_page writes to become visible after
- * the set_pmd_at() write.
+ * The smp_wmb() inside __folio_mark_uptodate() ensures the
+ * copy_huge_page writes become visible before the set_pmd_at()
+ * write.
*/
- __SetPageUptodate(hpage);
+ __folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
_pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(hpage, vma, address);
- lru_cache_add_inactive_or_unevictable(hpage, vma);
+ folio_add_new_anon_rmap(folio, vma, address);
+ folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
* analogously elide sysfs THP settings here.
*/
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
+ PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
ptep_clear(mm, addr, pte);
- page_remove_rmap(page, vma, false);
+ folio_remove_rmap_pte(folio, page, vma);
nr_ptes++;
}
xas_lock_irq(&xas);
}
- nr = thp_nr_pages(hpage);
+ folio = page_folio(hpage);
+ nr = folio_nr_pages(folio);
if (is_shmem)
- __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
else
- __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
if (nr_none) {
- __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
}
/*
* Mark hpage as uptodate before inserting it into the page cache so
* that it isn't mistaken for a fallocated but unwritten page.
*/
- folio = page_folio(hpage);
folio_mark_uptodate(folio);
folio_ref_add(folio, HPAGE_PMD_NR - 1);
/* Join all the small entries into a single multi-index entry. */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, hpage);
+ xas_store(&xas, folio);
WARN_ON_ONCE(xas_error(&xas));
xas_unlock_irq(&xas);
retract_page_tables(mapping, start);
if (cc && !cc->is_khugepaged)
result = SCAN_PTE_MAPPED_HUGEPAGE;
- unlock_page(hpage);
+ folio_unlock(folio);
/*
* The collapse has succeeded, so free the old pages.
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
+ true, PMD_ORDER)) {
skip:
progress++;
continue;
while (true) {
cond_resched();
- if (unlikely(kthread_should_stop() || try_to_freeze()))
+ if (unlikely(kthread_should_stop()))
break;
spin_lock(&khugepaged_mm_lock);
*prev = vma;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
+ PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
+ #include <linux/rcupdate_wait.h>
#include <linux/uaccess.h>
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
- truncate_inode_folio(mapping, folio);
+
+ if (!folio_test_large(folio)) {
+ truncate_inode_folio(mapping, folio);
+ } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
+ /*
+ * If we split a page, reset the loop so
+ * that we pick up the new sub pages.
+ * Otherwise the THP was entirely
+ * dropped or the target range was
+ * zeroed, so just continue the loop as
+ * is.
+ */
+ if (!folio_test_large(folio)) {
+ folio_unlock(folio);
+ index = start;
+ break;
+ }
+ }
}
folio_unlock(folio);
}
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- swap_writepage(&folio->page, wbc);
- return 0;
+ return swap_writepage(&folio->page, wbc);
}
mutex_unlock(&shmem_swaplist_mutex);
{
struct mempolicy *mpol;
pgoff_t ilx;
- struct page *page;
+ struct folio *folio;
mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
- page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+ folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
mpol_cond_put(mpol);
- if (!page)
- return NULL;
- return page_folio(page);
+ return folio;
}
/*
}
/* Keep the page in page cache instead of truncating it */
-static int shmem_error_remove_page(struct address_space *mapping,
- struct page *page)
+static int shmem_error_remove_folio(struct address_space *mapping,
+ struct folio *folio)
{
return 0;
}
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
#endif
- .error_remove_page = shmem_error_remove_page,
+ .error_remove_folio = shmem_error_remove_folio,
};
EXPORT_SYMBOL(shmem_aops);
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
+ #include <linux/plist.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
BUG();
}
-sector_t swap_page_sector(struct page *page)
+sector_t swap_folio_sector(struct folio *folio)
{
- struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_info_struct *sis = swp_swap_info(folio->swap);
struct swap_extent *se;
sector_t sector;
pgoff_t offset;
- offset = __page_file_index(page);
+ offset = swp_offset(folio->swap);
se = offset_to_swap_extent(sis, offset);
sector = se->start_block + (offset - se->start_page);
return sector << (PAGE_SHIFT - 9);
do {
page = list_next_entry(page, lru);
- map = kmap_atomic(page);
+ map = kmap_local_page(page);
tmp_count = map[offset];
- kunmap_atomic(map);
+ kunmap_local(map);
count += (tmp_count & ~COUNT_CONTINUED) * n;
n *= (SWAP_CONT_MAX + 1);
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct folio *folio)
{
- struct page *page = folio_file_page(folio, swp_offset(entry));
- struct page *swapcache;
+ struct page *page;
+ struct folio *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte, old_pte;
- bool hwpoisoned = PageHWPoison(page);
+ bool hwpoisoned = false;
int ret = 1;
- swapcache = page;
- page = ksm_might_need_to_copy(page, vma, addr);
- if (unlikely(!page))
+ swapcache = folio;
+ folio = ksm_might_need_to_copy(folio, vma, addr);
+ if (unlikely(!folio))
return -ENOMEM;
- else if (unlikely(PTR_ERR(page) == -EHWPOISON))
+ else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
+ hwpoisoned = true;
+ folio = swapcache;
+ }
+
+ page = folio_file_page(folio, swp_offset(entry));
+ if (PageHWPoison(page))
hwpoisoned = true;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
old_pte = ptep_get(pte);
- if (unlikely(hwpoisoned || !PageUptodate(page))) {
+ if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
swp_entry_t swp_entry;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
if (hwpoisoned) {
- swp_entry = make_hwpoison_entry(swapcache);
- page = swapcache;
+ swp_entry = make_hwpoison_entry(page);
} else {
swp_entry = make_poisoned_swp_entry();
}
* when reading from swap. This metadata may be indexed by swap entry
* so this must be called before swap_free().
*/
- arch_swap_restore(entry, page_folio(page));
-
- /* See do_swap_page() */
- BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
- BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+ arch_swap_restore(entry, folio);
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
- get_page(page);
- if (page == swapcache) {
+ folio_get(folio);
+ if (folio == swapcache) {
rmap_t rmap_flags = RMAP_NONE;
/*
- * See do_swap_page(): PageWriteback() would be problematic.
- * However, we do a wait_on_page_writeback() just before this
- * call and have the page locked.
+ * See do_swap_page(): writeback would be problematic.
+ * However, we do a folio_wait_writeback() just before this
+ * call and have the folio locked.
*/
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
if (pte_swp_exclusive(old_pte))
rmap_flags |= RMAP_EXCLUSIVE;
- page_add_anon_rmap(page, vma, addr, rmap_flags);
+ folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_new_anon_rmap(folio, vma, addr);
+ folio_add_lru_vma(folio, vma);
}
new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
if (pte_swp_soft_dirty(old_pte))
out:
if (pte)
pte_unmap_unlock(pte, ptl);
- if (page != swapcache) {
- unlock_page(page);
- put_page(page);
+ if (folio != swapcache) {
+ folio_unlock(folio);
+ folio_put(folio);
}
return ret;
}
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
* onto a contiguous range of disk blocks. An rbtree of swap extents is
- * built at swapon time and is then used at swap_writepage/swap_readpage
+ * built at swapon time and is then used at swap_writepage/swap_read_folio
* time for locating where on disk a page belongs.
*
* If the swapfile is an S_ISBLK block device, a single extent is installed.
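/*
 * Editorial worked example, not part of the patch: take a (hypothetical)
 * extent covering swap pages starting at start_page = 100 that begins at
 * disk block start_block = 5000. For a folio whose swap offset is 130,
 * swap_folio_sector() earlier in this series computes:
 *
 *	sector = start_block + (offset - start_page)
 *	       = 5000 + (130 - 100) = 5030		(PAGE_SIZE blocks)
 *	return 5030 << (PAGE_SHIFT - 9);		(512-byte sectors)
 */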
return swap_type_to_swap_info(swp_type(entry));
}
-struct swap_info_struct *page_swap_info(struct page *page)
-{
- swp_entry_t entry = page_swap_entry(page);
- return swp_swap_info(entry);
-}
-
/*
* out-of-line methods to avoid include hell.
*/
struct address_space *swapcache_mapping(struct folio *folio)
{
- return page_swap_info(&folio->page)->swap_file->f_mapping;
+ return swp_swap_info(folio->swap)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(swapcache_mapping);
if (!(count & COUNT_CONTINUED))
goto out_unlock_cont;
- map = kmap_atomic(list_page) + offset;
+ map = kmap_local_page(list_page) + offset;
count = *map;
- kunmap_atomic(map);
+ kunmap_local(map);
/*
* If this continuation count now has some space in it,
spin_lock(&si->cont_lock);
offset &= ~PAGE_MASK;
page = list_next_entry(head, lru);
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
goto init_map; /* jump over SWAP_CONT_MAX checks */
* Think of how you add 1 to 999
*/
while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
- kunmap_atomic(map);
+ kunmap_local(map);
page = list_next_entry(page, lru);
BUG_ON(page == head);
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
}
if (*map == SWAP_CONT_MAX) {
- kunmap_atomic(map);
+ kunmap_local(map);
page = list_next_entry(page, lru);
if (page == head) {
ret = false; /* add count continuation */
goto out;
}
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
init_map: *map = 0; /* we didn't zero the page */
}
*map += 1;
- kunmap_atomic(map);
+ kunmap_local(map);
while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
*map = COUNT_CONTINUED;
- kunmap_atomic(map);
+ kunmap_local(map);
}
ret = true; /* incremented */
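/*
 * Editorial worked example, not part of the patch: the "add 1 to 999"
 * comment above is the carry rule for continuation pages. Each page in
 * the chain stores one count byte per swap entry at the same offset, and
 * a byte saturates at SWAP_CONT_MAX before carrying. If every page in a
 * three-page chain already holds SWAP_CONT_MAX at this offset:
 *
 *	before:	[MAX] [MAX] [MAX]
 *	+1:	the walk finds no room, returns false, and the caller
 *		allocates a fresh continuation page, the analogue of
 *		999 + 1 needing a new leading digit.
 */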
*/
BUG_ON(count != COUNT_CONTINUED);
while (*map == COUNT_CONTINUED) {
- kunmap_atomic(map);
+ kunmap_local(map);
page = list_next_entry(page, lru);
BUG_ON(page == head);
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
}
BUG_ON(*map == 0);
*map -= 1;
if (*map == 0)
count = 0;
- kunmap_atomic(map);
+ kunmap_local(map);
while ((page = list_prev_entry(page, lru)) != head) {
- map = kmap_atomic(page) + offset;
+ map = kmap_local_page(page) + offset;
*map = SWAP_CONT_MAX | count;
count = COUNT_CONTINUED;
- kunmap_atomic(map);
+ kunmap_local(map);
}
ret = count == COUNT_CONTINUED;
}
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
+ #include <uapi/linux/shm.h>
#include <linux/bpf.h>
#include <linux/kernfs.h>
#include <linux/stringhash.h> /* for hashlen_string() */
#include <linux/fsnotify.h>
#include <linux/fanotify.h>
#include <linux/io_uring.h>
+#include <uapi/linux/lsm.h>
#include "avc.h"
#include "objsec.h"
struct inode_security_struct *isec;
u32 sid;
- validate_creds(cred);
-
if (unlikely(IS_PRIVATE(inode)))
return 0;
new_tsec->keycreate_sid = 0;
new_tsec->sockcreate_sid = 0;
+ /*
+ * Before policy is loaded, label any task outside kernel space
+ * as SECINITSID_INIT, so that any userspace tasks surviving from
+ * early boot end up with a label different from SECINITSID_KERNEL
+ * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL).
+ */
+ if (!selinux_initialized()) {
+ new_tsec->sid = SECINITSID_INIT;
+ /* also clear the exec_sid just in case */
+ new_tsec->exec_sid = 0;
+ return 0;
+ }
+
if (old_tsec->exec_sid) {
new_tsec->sid = old_tsec->exec_sid;
/* Reset exec SID on execve. */
struct inode_security_struct *isec;
u32 sid;
- validate_creds(cred);
-
ad.type = LSM_AUDIT_DATA_DENTRY;
ad.u.dentry = dentry;
sid = cred_sid(cred);
if (!mask)
return 0;
- validate_creds(cred);
-
if (unlikely(IS_PRIVATE(inode)))
return 0;
return error;
}
+static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ /*
+ * If we are in a 64-bit kernel running 32-bit userspace, we need to
+ * make sure we don't compare 32-bit flags to 64-bit flags.
+ */
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ case FS_IOC32_GETVERSION:
+ cmd = FS_IOC_GETVERSION;
+ break;
+ case FS_IOC32_SETVERSION:
+ cmd = FS_IOC_SETVERSION;
+ break;
+ default:
+ break;
+ }
+
+ return selinux_file_ioctl(file, cmd, arg);
+}
+
static int default_noexec __ro_after_init;
static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
if (sksec->sid == SECINITSID_KERNEL)
return 0;
+ /*
+ * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that
+ * inherited the kernel context from early boot used to be skipped
+ * here, so preserve that behavior unless the capability is set.
+ *
+ * By setting the capability the policy signals that it is ready
+ * for this quirk to be fixed. Note that sockets created by a kernel
+ * thread or a usermode helper executed without a transition will
+ * still be skipped in this check regardless of the policycap
+ * setting.
+ */
+ if (!selinux_policycap_userspace_initial_context() &&
+ sksec->sid == SECINITSID_INIT)
+ return 0;
+
ad_net_init_from_sk(&ad, &net, sk);
return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms,
return -EINVAL;
addr4 = (struct sockaddr_in *)address;
if (family_sa == AF_UNSPEC) {
+ if (family == PF_INET6) {
+ /* Length check from inet6_bind_sk() */
+ if (addrlen < SIN6_LEN_RFC2133)
+ return -EINVAL;
+ /* Family check from __inet6_bind() */
+ goto err_af;
+ }
/* see __inet_bind(), we only want to allow
* AF_UNSPEC if the address is INADDR_ANY
*/
inode_doinit_with_dentry(inode, dentry);
}
-static int selinux_getprocattr(struct task_struct *p,
- const char *name, char **value)
+static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p,
+ char **value)
{
const struct task_security_struct *__tsec;
u32 sid;
goto bad;
}
- if (!strcmp(name, "current"))
+ switch (attr) {
+ case LSM_ATTR_CURRENT:
sid = __tsec->sid;
- else if (!strcmp(name, "prev"))
+ break;
+ case LSM_ATTR_PREV:
sid = __tsec->osid;
- else if (!strcmp(name, "exec"))
+ break;
+ case LSM_ATTR_EXEC:
sid = __tsec->exec_sid;
- else if (!strcmp(name, "fscreate"))
+ break;
+ case LSM_ATTR_FSCREATE:
sid = __tsec->create_sid;
- else if (!strcmp(name, "keycreate"))
+ break;
+ case LSM_ATTR_KEYCREATE:
sid = __tsec->keycreate_sid;
- else if (!strcmp(name, "sockcreate"))
+ break;
+ case LSM_ATTR_SOCKCREATE:
sid = __tsec->sockcreate_sid;
- else {
- error = -EINVAL;
+ break;
+ default:
+ error = -EOPNOTSUPP;
goto bad;
}
rcu_read_unlock();
return error;
}
-static int selinux_setprocattr(const char *name, void *value, size_t size)
+static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
{
struct task_security_struct *tsec;
struct cred *new;
/*
* Basic control over ability to set these attributes at all.
*/
- if (!strcmp(name, "exec"))
+ switch (attr) {
+ case LSM_ATTR_EXEC:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETEXEC, NULL);
- else if (!strcmp(name, "fscreate"))
+ break;
+ case LSM_ATTR_FSCREATE:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETFSCREATE, NULL);
- else if (!strcmp(name, "keycreate"))
+ break;
+ case LSM_ATTR_KEYCREATE:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETKEYCREATE, NULL);
- else if (!strcmp(name, "sockcreate"))
+ break;
+ case LSM_ATTR_SOCKCREATE:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETSOCKCREATE, NULL);
- else if (!strcmp(name, "current"))
+ break;
+ case LSM_ATTR_CURRENT:
error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
PROCESS__SETCURRENT, NULL);
- else
- error = -EINVAL;
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ break;
+ }
if (error)
return error;
}
error = security_context_to_sid(value, size,
&sid, GFP_KERNEL);
- if (error == -EINVAL && !strcmp(name, "fscreate")) {
+ if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) {
if (!has_cap_mac_admin(true)) {
struct audit_buffer *ab;
size_t audit_size;
- /* We strip a nul only if it is at the end, otherwise the
- * context contains a nul and we should audit that */
+ /* We strip a nul only if it is at the end,
+ * otherwise the context contains a nul and
+ * we should audit that */
if (str[size - 1] == '\0')
audit_size = size - 1;
else
if (!ab)
return error;
audit_log_format(ab, "op=fscreate invalid_context=");
- audit_log_n_untrustedstring(ab, value, audit_size);
+ audit_log_n_untrustedstring(ab, value,
+ audit_size);
audit_log_end(ab);
return error;
checks and may_create for the file creation checks. The
operation will then fail if the context is not permitted. */
tsec = selinux_cred(new);
- if (!strcmp(name, "exec")) {
+ if (attr == LSM_ATTR_EXEC) {
tsec->exec_sid = sid;
- } else if (!strcmp(name, "fscreate")) {
+ } else if (attr == LSM_ATTR_FSCREATE) {
tsec->create_sid = sid;
- } else if (!strcmp(name, "keycreate")) {
+ } else if (attr == LSM_ATTR_KEYCREATE) {
if (sid) {
error = avc_has_perm(mysid, sid,
SECCLASS_KEY, KEY__CREATE, NULL);
goto abort_change;
}
tsec->keycreate_sid = sid;
- } else if (!strcmp(name, "sockcreate")) {
+ } else if (attr == LSM_ATTR_SOCKCREATE) {
tsec->sockcreate_sid = sid;
- } else if (!strcmp(name, "current")) {
+ } else if (attr == LSM_ATTR_CURRENT) {
error = -EINVAL;
if (sid == 0)
goto abort_change;
- /* Only allow single threaded processes to change context */
if (!current_is_single_threaded()) {
error = security_bounded_transition(tsec->sid, sid);
if (error)
return error;
}
+/**
+ * selinux_getselfattr - Get SELinux current task attributes
+ * @attr: the requested attribute
+ * @ctx: buffer to receive the result
+ * @size: buffer size (input), buffer size used (output)
+ * @flags: unused
+ *
+ * Fill the passed user space @ctx with the details of the requested
+ * attribute.
+ *
+ * Returns the number of attributes on success, an error code otherwise.
+ * There will only ever be one attribute.
+ */
+static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+ size_t *size, u32 flags)
+{
+ int rc;
+ char *val;
+ int val_len;
+
+ val_len = selinux_lsm_getattr(attr, current, &val);
+ if (val_len < 0)
+ return val_len;
+ rc = lsm_fill_user_ctx(ctx, size, val, val_len, LSM_ID_SELINUX, 0);
+ kfree(val);
+ return (!rc ? 1 : rc);
+}
+
+static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
+ size_t size, u32 flags)
+{
+ int rc;
+
+ rc = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len);
+ if (rc > 0)
+ return 0;
+ return rc;
+}
+
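/*
 * Editorial sketch, not part of the patch: a minimal userspace consumer of
 * the getselfattr hook above. It assumes the companion lsm_get_self_attr()
 * syscall takes (attr, ctx, &size, flags) mirroring the hook, that the
 * size argument is a 32-bit value, and that struct lsm_ctx from
 * <linux/lsm.h> is id/flags/len/ctx_len followed by the attribute bytes;
 * treat all of that as assumptions, not ABI documentation.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>
#include <linux/lsm.h>

int main(void)
{
	__u32 size = 4096;
	struct lsm_ctx *ctx = calloc(1, size);
	long n;

	if (!ctx)
		return 1;
	/* returns the number of lsm_ctx entries written (one per LSM) */
	n = syscall(__NR_lsm_get_self_attr, LSM_ATTR_CURRENT, ctx, &size, 0);
	if (n < 0) {
		perror("lsm_get_self_attr");
		free(ctx);
		return 1;
	}
	/* print the first entry, e.g. the SELinux context of this task */
	printf("lsm id %llu: %.*s\n", (unsigned long long)ctx->id,
	       (int)ctx->ctx_len, (const char *)ctx->ctx);
	free(ctx);
	return 0;
}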
+static int selinux_getprocattr(struct task_struct *p,
+ const char *name, char **value)
+{
+ unsigned int attr = lsm_name_to_attr(name);
+ int rc;
+
+ if (attr) {
+ rc = selinux_lsm_getattr(attr, p, value);
+ if (rc != -EOPNOTSUPP)
+ return rc;
+ }
+
+ return -EINVAL;
+}
+
+static int selinux_setprocattr(const char *name, void *value, size_t size)
+{
+ int attr = lsm_name_to_attr(name);
+
+ if (attr)
+ return selinux_lsm_setattr(attr, value, size);
+ return -EINVAL;
+}
+
static int selinux_ismaclabel(const char *name)
{
return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0);
}
#endif /* CONFIG_IO_URING */
+static const struct lsm_id selinux_lsmid = {
+ .name = "selinux",
+ .id = LSM_ID_SELINUX,
+};
+
/*
* IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
* 1. any hooks that don't belong to (2.) or (3.) below,
LSM_HOOK_INIT(file_permission, selinux_file_permission),
LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),
+ LSM_HOOK_INIT(getselfattr, selinux_getselfattr),
+ LSM_HOOK_INIT(setselfattr, selinux_setselfattr),
LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
LSM_HOOK_INIT(setprocattr, selinux_setprocattr),
hashtab_cache_init();
- security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");
+ security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks),
+ &selinux_lsmid);
if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
panic("SELinux: Unable to register AVC netcache callback\n");
#include <linux/personality.h>
#include <linux/msg.h>
#include <linux/shm.h>
+ #include <uapi/linux/shm.h>
#include <linux/binfmts.h>
#include <linux/parser.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/watch_queue.h>
#include <linux/io_uring.h>
+#include <uapi/linux/lsm.h>
#include "smack.h"
#define TRANS_TRUE "TRUE"
return;
}
+/**
+ * smack_getselfattr - Smack current process attribute
+ * @attr: which attribute to fetch
+ * @ctx: buffer to receive the result
+ * @size: available size in, actual size out
+ * @flags: unused
+ *
+ * Fill the passed user space @ctx with the details of the requested
+ * attribute.
+ *
+ * Returns the number of attributes on success, an error code otherwise.
+ * There will only ever be one attribute.
+ */
+static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
+ size_t *size, u32 flags)
+{
+ int rc;
+ struct smack_known *skp;
+
+ if (attr != LSM_ATTR_CURRENT)
+ return -EOPNOTSUPP;
+
+ skp = smk_of_current();
+ rc = lsm_fill_user_ctx(ctx, size,
+ skp->smk_known, strlen(skp->smk_known) + 1,
+ LSM_ID_SMACK, 0);
+ return (!rc ? 1 : rc);
+}
+
/**
* smack_getprocattr - Smack process attribute access
* @p: the object task
}
/**
- * smack_setprocattr - Smack process attribute setting
- * @name: the name of the attribute in /proc/.../attr
+ * do_setattr - Smack process attribute setting
+ * @attr: the ID of the attribute
* @value: the value to set
* @size: the size of the value
*
*
* Returns the length of the smack label or an error code
*/
-static int smack_setprocattr(const char *name, void *value, size_t size)
+static int do_setattr(u64 attr, void *value, size_t size)
{
struct task_smack *tsp = smack_cred(current_cred());
struct cred *new;
if (value == NULL || size == 0 || size >= SMK_LONGLABEL)
return -EINVAL;
- if (strcmp(name, "current") != 0)
- return -EINVAL;
+ if (attr != LSM_ATTR_CURRENT)
+ return -EOPNOTSUPP;
skp = smk_import_entry(value, size);
if (IS_ERR(skp))
return size;
}
+/**
+ * smack_setselfattr - Set a Smack process attribute
+ * @attr: which attribute to set
+ * @ctx: buffer containing the data
+ * @size: size of @ctx
+ * @flags: unused
+ *
+ * Set the Smack attribute of the current task from the data supplied
+ * in @ctx.
+ *
+ * Returns 0 on success, an error code otherwise.
+ */
+static int smack_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
+ size_t size, u32 flags)
+{
+ int rc;
+
+ rc = do_setattr(attr, ctx->ctx, ctx->ctx_len);
+ if (rc > 0)
+ return 0;
+ return rc;
+}
+
+/**
+ * smack_setprocattr - Smack process attribute setting
+ * @name: the name of the attribute in /proc/.../attr
+ * @value: the value to set
+ * @size: the size of the value
+ *
+ * Sets the Smack value of the task. Only setting self
+ * is permitted and only with privilege
+ *
+ * Returns the length of the smack label or an error code
+ */
+static int smack_setprocattr(const char *name, void *value, size_t size)
+{
+ int attr = lsm_name_to_attr(name);
+
+ if (attr != LSM_ATTR_UNDEF)
+ return do_setattr(attr, value, size);
+ return -EINVAL;
+}
+
/**
* smack_unix_stream_connect - Smack access on UDS
* @sock: one sock
.lbs_xattr_count = SMACK_INODE_INIT_XATTRS,
};
+static const struct lsm_id smack_lsmid = {
+ .name = "smack",
+ .id = LSM_ID_SMACK,
+};
+
static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
+ LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
LSM_HOOK_INIT(file_lock, smack_file_lock),
LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
LSM_HOOK_INIT(mmap_file, smack_mmap_file),
LSM_HOOK_INIT(d_instantiate, smack_d_instantiate),
+ LSM_HOOK_INIT(getselfattr, smack_getselfattr),
+ LSM_HOOK_INIT(setselfattr, smack_setselfattr),
LSM_HOOK_INIT(getprocattr, smack_getprocattr),
LSM_HOOK_INIT(setprocattr, smack_setprocattr),
/*
* Register with LSM
*/
- security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
+ security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid);
smack_enabled = 1;
pr_info("Smack: Initializing.\n");