Merge tag 'header_cleanup-2024-01-10' of https://evilpiepirate.org/git/bcachefs
author Linus Torvalds <[email protected]>
Thu, 11 Jan 2024 00:43:55 +0000 (16:43 -0800)
committer Linus Torvalds <[email protected]>
Thu, 11 Jan 2024 00:43:55 +0000 (16:43 -0800)
Pull header cleanups from Kent Overstreet:
 "The goal is to get sched.h down to a type only header, so the main
  thing happening in this patchset is splitting out various _types.h
  headers and dependency fixups, as well as moving some things out of
  sched.h to better locations.

  This is prep work for the memory allocation profiling patchset which
  adds new sched.h interdependencies"
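
For readers unfamiliar with the pattern, the split these commits apply looks
roughly like the sketch below. The <linux/foo.h> and <linux/foo_types.h>
names are hypothetical, used only for illustration: the type definitions move
into a lightweight header with minimal includes, so that a header like
sched.h, which only embeds the types, no longer has to pull in the full API
and its transitive dependencies.

  /* include/linux/foo_types.h - types only, minimal includes */
  #ifndef _LINUX_FOO_TYPES_H
  #define _LINUX_FOO_TYPES_H

  struct foo {
          unsigned long   state;
  };

  #endif /* _LINUX_FOO_TYPES_H */

  /* include/linux/foo.h - full API; pulls in the types header */
  #ifndef _LINUX_FOO_H
  #define _LINUX_FOO_H

  #include <linux/foo_types.h>

  void foo_init(struct foo *f);

  #endif /* _LINUX_FOO_H */

A header that embeds a struct foo by value includes only <linux/foo_types.h>;
code that actually calls foo_init() includes <linux/foo.h>.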

* tag 'header_cleanup-2024-01-10' of https://evilpiepirate.org/git/bcachefs: (51 commits)
  Kill sched.h dependency on rcupdate.h
  kill unnecessary thread_info.h include
  Kill unnecessary kernel.h include
  preempt.h: Kill dependency on list.h
  rseq: Split out rseq.h from sched.h
  LoongArch: signal.c: add header file to fix build error
  restart_block: Trim includes
  lockdep: move held_lock to lockdep_types.h
  sem: Split out sem_types.h
  uidgid: Split out uidgid_types.h
  seccomp: Split out seccomp_types.h
  refcount: Split out refcount_types.h
  uapi/linux/resource.h: fix include
  x86/signal: kill dependency on time.h
  syscall_user_dispatch.h: split out *_types.h
  mm_types_task.h: Trim dependencies
  Split out irqflags_types.h
  ipc: Kill bogus dependency on spinlock.h
  shm: Slim down dependencies
  workqueue: Split out workqueue_types.h
  ...

23 files changed:
arch/x86/include/asm/fpu/types.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
fs/exec.c
include/linux/lockdep_types.h
include/linux/mutex.h
include/linux/sched.h
include/linux/sched/signal.h
include/linux/sched/task.h
include/linux/spinlock.h
include/linux/uidgid.h
include/linux/workqueue.h
init/init_task.c
kernel/async.c
kernel/exit.c
kernel/fork.c
kernel/sched/core.c
mm/filemap.c
mm/khugepaged.c
mm/shmem.c
mm/swapfile.c
security/selinux/hooks.c
security/smack/smack_lsm.c

index f1fadc318a88ff895b09c088ed4f057b93f5169b,3dad7cf25505aa4cb2601e55b12d1b71868a5c84..ace9aa3b78a3055e547edeab9f3378be6067386b
@@@ -5,6 -5,8 +5,8 @@@
  #ifndef _ASM_X86_FPU_H
  #define _ASM_X86_FPU_H
  
+ #include <asm/page_types.h>
  /*
   * The legacy x87 FPU state format, as saved by FSAVE and
   * restored by the FRSTOR instructions:
@@@ -415,7 -417,7 +417,7 @@@ struct fpu_state_perm 
         *
         * This master permission field is only to be used when
         * task.fpu.fpstate based checks fail to validate whether the task
 -       * is allowed to expand it's xfeatures set which requires to
 +       * is allowed to expand its xfeatures set which requires to
         * allocate a larger sized fpstate buffer.
         *
         * Do not access this field directly.  Use the provided helper
index 8bcf7584e7dd885b9d7d8fee9e3088318e905d7f,49020002663b36bf7dd1b06eacce1a54bf3a5140..d4eb9e1d61b8ef8a3fc3a2510b0615ea93c11cb8
@@@ -6,6 -6,10 +6,10 @@@
  
  #include <asm/paravirt_types.h>
  
+ #ifndef __ASSEMBLY__
+ struct mm_struct;
+ #endif
  #ifdef CONFIG_PARAVIRT
  #include <asm/pgtable_types.h>
  #include <asm/asm.h>
@@@ -142,7 -146,8 +146,7 @@@ static inline void write_cr0(unsigned l
  static __always_inline unsigned long read_cr2(void)
  {
        return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
 -                              "mov %%cr2, %%rax;",
 -                              ALT_NOT(X86_FEATURE_XENPV));
 +                              "mov %%cr2, %%rax;", ALT_NOT_XEN);
  }
  
  static __always_inline void write_cr2(unsigned long x)
  static inline unsigned long __read_cr3(void)
  {
        return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
 -                            "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
 +                            "mov %%cr3, %%rax;", ALT_NOT_XEN);
  }
  
  static inline void write_cr3(unsigned long x)
  {
 -      PVOP_ALT_VCALL1(mmu.write_cr3, x,
 -                      "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
 +      PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
  }
  
  static inline void __write_cr4(unsigned long x)
@@@ -180,7 -186,7 +184,7 @@@ extern noinstr void pv_native_wbinvd(vo
  
  static __always_inline void wbinvd(void)
  {
 -      PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
 +      PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
  }
  
  static inline u64 paravirt_read_msr(unsigned msr)
@@@ -388,25 -394,27 +392,25 @@@ static inline void paravirt_release_p4d
  static inline pte_t __pte(pteval_t val)
  {
        return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
 -                                        "mov %%rdi, %%rax",
 -                                        ALT_NOT(X86_FEATURE_XENPV)) };
 +                                        "mov %%rdi, %%rax", ALT_NOT_XEN) };
  }
  
  static inline pteval_t pte_val(pte_t pte)
  {
        return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
 -                              "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
 +                              "mov %%rdi, %%rax", ALT_NOT_XEN);
  }
  
  static inline pgd_t __pgd(pgdval_t val)
  {
        return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
 -                                        "mov %%rdi, %%rax",
 -                                        ALT_NOT(X86_FEATURE_XENPV)) };
 +                                        "mov %%rdi, %%rax", ALT_NOT_XEN) };
  }
  
  static inline pgdval_t pgd_val(pgd_t pgd)
  {
        return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
 -                              "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
 +                              "mov %%rdi, %%rax", ALT_NOT_XEN);
  }
  
  #define  __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@@ -440,13 -448,14 +444,13 @@@ static inline void set_pmd(pmd_t *pmdp
  static inline pmd_t __pmd(pmdval_t val)
  {
        return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
 -                                        "mov %%rdi, %%rax",
 -                                        ALT_NOT(X86_FEATURE_XENPV)) };
 +                                        "mov %%rdi, %%rax", ALT_NOT_XEN) };
  }
  
  static inline pmdval_t pmd_val(pmd_t pmd)
  {
        return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
 -                              "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
 +                              "mov %%rdi, %%rax", ALT_NOT_XEN);
  }
  
  static inline void set_pud(pud_t *pudp, pud_t pud)
@@@ -459,7 -468,7 +463,7 @@@ static inline pud_t __pud(pudval_t val
        pudval_t ret;
  
        ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
 -                             "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
 +                             "mov %%rdi, %%rax", ALT_NOT_XEN);
  
        return (pud_t) { ret };
  }
  static inline pudval_t pud_val(pud_t pud)
  {
        return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
 -                              "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
 +                              "mov %%rdi, %%rax", ALT_NOT_XEN);
  }
  
  static inline void pud_clear(pud_t *pudp)
@@@ -487,7 -496,8 +491,7 @@@ static inline void set_p4d(p4d_t *p4dp
  static inline p4d_t __p4d(p4dval_t val)
  {
        p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
 -                                      "mov %%rdi, %%rax",
 -                                      ALT_NOT(X86_FEATURE_XENPV));
 +                                      "mov %%rdi, %%rax", ALT_NOT_XEN);
  
        return (p4d_t) { ret };
  }
  static inline p4dval_t p4d_val(p4d_t p4d)
  {
        return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
 -                              "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
 +                              "mov %%rdi, %%rax", ALT_NOT_XEN);
  }
  
  static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
@@@ -681,17 -691,17 +685,17 @@@ bool __raw_callee_save___native_vcpu_is
  static __always_inline unsigned long arch_local_save_flags(void)
  {
        return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
 -                              ALT_NOT(X86_FEATURE_XENPV));
 +                              ALT_NOT_XEN);
  }
  
  static __always_inline void arch_local_irq_disable(void)
  {
 -      PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
 +      PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
  }
  
  static __always_inline void arch_local_irq_enable(void)
  {
 -      PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
 +      PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
  }
  
  static __always_inline unsigned long arch_local_irq_save(void)
  #undef PVOP_VCALL4
  #undef PVOP_CALL4
  
 -#define DEFINE_PARAVIRT_ASM(func, instr, sec)         \
 -      asm (".pushsection " #sec ", \"ax\"\n"          \
 -           ".global " #func "\n\t"                    \
 -           ".type " #func ", @function\n\t"           \
 -           ASM_FUNC_ALIGN "\n"                        \
 -           #func ":\n\t"                              \
 -           ASM_ENDBR                                  \
 -           instr "\n\t"                               \
 -           ASM_RET                                    \
 -           ".size " #func ", . - " #func "\n\t"       \
 -           ".popsection")
 -
  extern void default_banner(void);
  void native_pv_lock_init(void) __init;
  
  #else  /* __ASSEMBLY__ */
  
 -#define _PVSITE(ptype, ops, word, algn)               \
 -771:;                                         \
 -      ops;                                    \
 -772:;                                         \
 -      .pushsection .parainstructions,"a";     \
 -       .align algn;                           \
 -       word 771b;                             \
 -       .byte ptype;                           \
 -       .byte 772b-771b;                       \
 -       _ASM_ALIGN;                            \
 -      .popsection
 -
 -
  #ifdef CONFIG_X86_64
  #ifdef CONFIG_PARAVIRT_XXL
 +#ifdef CONFIG_DEBUG_ENTRY
  
 -#define PARA_PATCH(off)               ((off) / 8)
 -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
  #define PARA_INDIRECT(addr)   *addr(%rip)
  
 -#ifdef CONFIG_DEBUG_ENTRY
  .macro PARA_IRQ_save_fl
 -      PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
 -                ANNOTATE_RETPOLINE_SAFE;
 -                call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
 +      ANNOTATE_RETPOLINE_SAFE;
 +      call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
  .endm
  
 -#define SAVE_FLAGS    ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
 -                                  ALT_NOT(X86_FEATURE_XENPV)
 +#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;",                 \
 +                               "ALT_CALL_INSTR;", ALT_CALL_ALWAYS,    \
 +                               "pushf; pop %rax;", ALT_NOT_XEN
  #endif
  #endif /* CONFIG_PARAVIRT_XXL */
  #endif        /* CONFIG_X86_64 */
index d8e85d2cf8d56e4d591a1747e8d54f5b67611174,3cfcd5db083b35185dafa96b29f8f10b493a6d17..8d4fbe1be489549ad33c968c2132bdbaf739b871
@@@ -2,9 -2,20 +2,10 @@@
  #ifndef _ASM_X86_PARAVIRT_TYPES_H
  #define _ASM_X86_PARAVIRT_TYPES_H
  
 -#ifndef __ASSEMBLY__
 -#include <linux/types.h>
 -
 -/* These all sit in the .parainstructions section to tell us what to patch. */
 -struct paravirt_patch_site {
 -      u8 *instr;              /* original instructions */
 -      u8 type;                /* type of this instruction */
 -      u8 len;                 /* length of original instruction */
 -};
 -#endif
 -
  #ifdef CONFIG_PARAVIRT
  
  #ifndef __ASSEMBLY__
++#include <linux/types.h>
  
  #include <asm/desc_defs.h>
  #include <asm/pgtable_types.h>
@@@ -241,11 -252,43 +242,11 @@@ struct paravirt_patch_template 
  extern struct pv_info pv_info;
  extern struct paravirt_patch_template pv_ops;
  
 -#define PARAVIRT_PATCH(x)                                     \
 -      (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
 -
 -#define paravirt_type(op)                             \
 -      [paravirt_typenum] "i" (PARAVIRT_PATCH(op)),    \
 -      [paravirt_opptr] "m" (pv_ops.op)
 -/*
 - * Generate some code, and mark it as patchable by the
 - * apply_paravirt() alternate instruction patcher.
 - */
 -#define _paravirt_alt(insn_string, type)              \
 -      "771:\n\t" insn_string "\n" "772:\n"            \
 -      ".pushsection .parainstructions,\"a\"\n"        \
 -      _ASM_ALIGN "\n"                                 \
 -      _ASM_PTR " 771b\n"                              \
 -      "  .byte " type "\n"                            \
 -      "  .byte 772b-771b\n"                           \
 -      _ASM_ALIGN "\n"                                 \
 -      ".popsection\n"
 -
 -/* Generate patchable code, with the default asm parameters. */
 -#define paravirt_alt(insn_string)                                     \
 -      _paravirt_alt(insn_string, "%c[paravirt_typenum]")
 -
 -/* Simple instruction patching code. */
 -#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
 -
 -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
 +#define paravirt_ptr(op)      [paravirt_opptr] "m" (pv_ops.op)
  
  int paravirt_disable_iospace(void);
  
 -/*
 - * This generates an indirect call based on the operation type number.
 - * The type number, computed in PARAVIRT_PATCH, is derived from the
 - * offset into the paravirt_patch_template structure, and can therefore be
 - * freely converted back into a structure offset.
 - */
 +/* This generates an indirect call based on the operation type number. */
  #define PARAVIRT_CALL                                 \
        ANNOTATE_RETPOLINE_SAFE                         \
        "call *%[paravirt_opptr];"
   * However, x86_64 also has to clobber all caller saved registers, which
   * unfortunately, are quite a bit (r8 - r11)
   *
 - * The call instruction itself is marked by placing its start address
 - * and size into the .parainstructions section, so that
 - * apply_paravirt() in arch/i386/kernel/alternative.c can do the
 - * appropriate patching under the control of the backend pv_init_ops
 - * implementation.
 - *
   * Unfortunately there's no way to get gcc to generate the args setup
   * for the call, and then allow the call itself to be generated by an
   * inline asm.  Because of this, we must do the complete arg setup and
                __mask & __eax;                                         \
        })
  
 -
 +/*
 + * Use alternative patching for paravirt calls:
 + * - For replacing an indirect call with a direct one, use the "normal"
 + *   ALTERNATIVE() macro with the indirect call as the initial code sequence,
 + *   which will be replaced with the related direct call by using the
 + *   ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
 + * - In case the replacement is either a direct call or a short code sequence
 + *   depending on a feature bit, the ALTERNATIVE_2() macro is being used.
 + *   The indirect call is the initial code sequence again, while the special
 + *   code sequence is selected with the specified feature bit. In case the
 + *   feature is not active, the direct call is used as above via the
 + *   ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
 + */
  #define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...)    \
        ({                                                              \
                PVOP_CALL_ARGS;                                         \
                PVOP_TEST_NULL(op);                                     \
 -              asm volatile(paravirt_alt(PARAVIRT_CALL)                \
 +              asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \
 +                              ALT_CALL_ALWAYS)                        \
                             : call_clbr, ASM_CALL_CONSTRAINT           \
 -                           : paravirt_type(op),                       \
 +                           : paravirt_ptr(op),                        \
                               ##__VA_ARGS__                            \
                             : "memory", "cc" extra_clbr);              \
                ret;                                                    \
        ({                                                              \
                PVOP_CALL_ARGS;                                         \
                PVOP_TEST_NULL(op);                                     \
 -              asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL),   \
 -                                       alt, cond)                     \
 +              asm volatile(ALTERNATIVE_2(PARAVIRT_CALL,               \
 +                               ALT_CALL_INSTR, ALT_CALL_ALWAYS,       \
 +                               alt, cond)                             \
                             : call_clbr, ASM_CALL_CONSTRAINT           \
 -                           : paravirt_type(op),                       \
 +                           : paravirt_ptr(op),                        \
                               ##__VA_ARGS__                            \
                             : "memory", "cc" extra_clbr);              \
                ret;                                                    \
        __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),    \
                     PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
  
 -void _paravirt_nop(void);
 -void paravirt_BUG(void);
  unsigned long paravirt_ret0(void);
  #ifdef CONFIG_PARAVIRT_XXL
  u64 _paravirt_ident_64(u64);
@@@ -518,11 -555,11 +519,11 @@@ void pv_native_irq_enable(void)
  unsigned long pv_native_read_cr2(void);
  #endif
  
 -#define paravirt_nop  ((void *)_paravirt_nop)
 -
 -extern struct paravirt_patch_site __parainstructions[],
 -      __parainstructions_end[];
 +#define paravirt_nop  ((void *)nop_func)
  
  #endif        /* __ASSEMBLY__ */
 +
 +#define ALT_NOT_XEN   ALT_NOT(X86_FEATURE_XENPV)
 +
  #endif  /* CONFIG_PARAVIRT */
  #endif        /* _ASM_X86_PARAVIRT_TYPES_H */
diff --combined fs/exec.c
index ee43597cb45311bec9e3bc03269fe62b799a2f57,41773af7e3dca1fdba360b545fbf50925eaff496..88ce7d9ceff5884b03573a6fb6b0227decf4fe85
+++ b/fs/exec.c
@@@ -66,6 -66,7 +66,7 @@@
  #include <linux/coredump.h>
  #include <linux/time_namespace.h>
  #include <linux/user_events.h>
+ #include <linux/rseq.h>
  
  #include <linux/uaccess.h>
  #include <asm/mmu_context.h>
@@@ -1578,10 -1579,11 +1579,10 @@@ static void check_unsafe_exec(struct li
         * will be able to manipulate the current directory, etc.
         * It would be nice to force an unshare instead...
         */
 -      t = p;
        n_fs = 1;
        spin_lock(&p->fs->lock);
        rcu_read_lock();
 -      while_each_thread(p, t) {
 +      for_other_threads(p, t) {
                if (t->fs == p->fs)
                        n_fs++;
        }
index 857d785e89e6a94f4b0cd70b5a8489e83533cb1d,9c533c8d701e7870be1be42f8bf1ed16fa8bf5e1..70d30d40ea4a9e1e0acb2c250981716eeb8687d7
@@@ -127,12 -127,12 +127,12 @@@ struct lock_class 
        unsigned long                   usage_mask;
        const struct lock_trace         *usage_traces[LOCK_TRACE_STATES];
  
 +      const char                      *name;
        /*
         * Generation counter, when doing certain classes of graph walking,
         * to ensure that we check one node only once:
         */
        int                             name_version;
 -      const char                      *name;
  
        u8                              wait_type_inner;
        u8                              wait_type_outer;
@@@ -198,6 -198,63 +198,63 @@@ struct lockdep_map 
  
  struct pin_cookie { unsigned int val; };
  
+ #define MAX_LOCKDEP_KEYS_BITS         13
+ #define MAX_LOCKDEP_KEYS              (1UL << MAX_LOCKDEP_KEYS_BITS)
+ #define INITIAL_CHAIN_KEY             -1
+ struct held_lock {
+       /*
+        * One-way hash of the dependency chain up to this point. We
+        * hash the hashes step by step as the dependency chain grows.
+        *
+        * We use it for dependency-caching and we skip detection
+        * passes and dependency-updates if there is a cache-hit, so
+        * it is absolutely critical for 100% coverage of the validator
+        * to have a unique key value for every unique dependency path
+        * that can occur in the system, to make a unique hash value
+        * as likely as possible - hence the 64-bit width.
+        *
+        * The task struct holds the current hash value (initialized
+        * with zero), here we store the previous hash value:
+        */
+       u64                             prev_chain_key;
+       unsigned long                   acquire_ip;
+       struct lockdep_map              *instance;
+       struct lockdep_map              *nest_lock;
+ #ifdef CONFIG_LOCK_STAT
+       u64                             waittime_stamp;
+       u64                             holdtime_stamp;
+ #endif
+       /*
+        * class_idx is zero-indexed; it points to the element in
+        * lock_classes this held lock instance belongs to. class_idx is in
+        * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+        */
+       unsigned int                    class_idx:MAX_LOCKDEP_KEYS_BITS;
+       /*
+        * The lock-stack is unified in that the lock chains of interrupt
+        * contexts nest ontop of process context chains, but we 'separate'
+        * the hashes by starting with 0 if we cross into an interrupt
+        * context, and we also keep do not add cross-context lock
+        * dependencies - the lock usage graph walking covers that area
+        * anyway, and we'd just unnecessarily increase the number of
+        * dependencies otherwise. [Note: hardirq and softirq contexts
+        * are separated from each other too.]
+        *
+        * The following field is used to detect when we cross into an
+        * interrupt context:
+        */
+       unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */
+       unsigned int trylock:1;                                         /* 16 bits */
+       unsigned int read:2;        /* see lock_acquire() comment */
+       unsigned int check:1;       /* see lock_acquire() comment */
+       unsigned int hardirqs_off:1;
+       unsigned int sync:1;
+       unsigned int references:11;                                     /* 32 bits */
+       unsigned int pin_count;
+ };
  #else /* !CONFIG_LOCKDEP */
  
  /*
diff --combined include/linux/mutex.h
index 95d11308f995d01c5972a2c21cf99d2bf9a23955,0dfba5df652435a679422065255637593183a9c1..7e208d46ba5b838bb391fe821dae1e10cd77204e
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/osq_lock.h>
  #include <linux/debug_locks.h>
  #include <linux/cleanup.h>
+ #include <linux/mutex_types.h>
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  # define __DEP_MAP_MUTEX_INITIALIZER(lockname)                        \
  
  #ifndef CONFIG_PREEMPT_RT
  
- /*
-  * Simple, straightforward mutexes with strict semantics:
-  *
-  * - only one task can hold the mutex at a time
-  * - only the owner can unlock the mutex
-  * - multiple unlocks are not permitted
-  * - recursive locking is not permitted
-  * - a mutex object must be initialized via the API
-  * - a mutex object must not be initialized via memset or copying
-  * - task may not exit with mutex held
-  * - memory areas where held locks reside must not be freed
-  * - held mutexes must not be reinitialized
-  * - mutexes may not be used in hardware or software interrupt
-  *   contexts such as tasklets and timers
-  *
-  * These semantics are fully enforced when DEBUG_MUTEXES is
-  * enabled. Furthermore, besides enforcing the above rules, the mutex
-  * debugging code also implements a number of additional features
-  * that make lock debugging easier and faster:
-  *
-  * - uses symbolic names of mutexes, whenever they are printed in debug output
-  * - point-of-acquire tracking, symbolic lookup of function names
-  * - list of all locks held in the system, printout of them
-  * - owner tracking
-  * - detects self-recursing locks and prints out all relevant info
-  * - detects multi-task circular deadlocks and prints out all affected
-  *   locks and tasks (and only those tasks)
-  */
- struct mutex {
-       atomic_long_t           owner;
-       raw_spinlock_t          wait_lock;
- #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-       struct optimistic_spin_queue osq; /* Spinner MCS lock */
- #endif
-       struct list_head        wait_list;
- #ifdef CONFIG_DEBUG_MUTEXES
-       void                    *magic;
- #endif
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      dep_map;
- #endif
- };
  #ifdef CONFIG_DEBUG_MUTEXES
  
  #define __DEBUG_MUTEX_INITIALIZER(lockname)                           \
@@@ -131,14 -89,6 +89,6 @@@ extern bool mutex_is_locked(struct mute
  /*
   * Preempt-RT variant based on rtmutexes.
   */
- #include <linux/rtmutex.h>
- struct mutex {
-       struct rt_mutex_base    rtmutex;
- #ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      dep_map;
- #endif
- };
  
  #define __MUTEX_INITIALIZER(mutexname)                                        \
  {                                                                     \
@@@ -221,7 -171,6 +171,7 @@@ extern void mutex_unlock(struct mutex *
  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
  
  DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
 -DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T))
 +DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
 +DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)
  
  #endif /* __LINUX_MUTEX_H */
diff --combined include/linux/sched.h
index d3097f0682d76dab1bed3eecfd4c2477bb3e2b6f,814bfdafbc1caeff756a517cce606e0757227ef0..9a66147915b2702588d33c9c443b65a27a0cc7fb
  #include <uapi/linux/sched.h>
  
  #include <asm/current.h>
- #include <linux/pid.h>
- #include <linux/sem.h>
+ #include <asm/processor.h>
+ #include <linux/thread_info.h>
+ #include <linux/preempt.h>
+ #include <linux/cpumask.h>
+ #include <linux/cache.h>
+ #include <linux/irqflags_types.h>
+ #include <linux/smp_types.h>
+ #include <linux/pid_types.h>
+ #include <linux/sem_types.h>
  #include <linux/shm.h>
  #include <linux/kmsan_types.h>
- #include <linux/mutex.h>
- #include <linux/plist.h>
- #include <linux/hrtimer.h>
- #include <linux/irqflags.h>
- #include <linux/seccomp.h>
- #include <linux/nodemask.h>
- #include <linux/rcupdate.h>
- #include <linux/refcount.h>
+ #include <linux/mutex_types.h>
+ #include <linux/plist_types.h>
+ #include <linux/hrtimer_types.h>
+ #include <linux/timer_types.h>
+ #include <linux/seccomp_types.h>
+ #include <linux/nodemask_types.h>
+ #include <linux/refcount_types.h>
  #include <linux/resource.h>
  #include <linux/latencytop.h>
  #include <linux/sched/prio.h>
  #include <linux/sched/types.h>
  #include <linux/signal_types.h>
- #include <linux/syscall_user_dispatch.h>
+ #include <linux/syscall_user_dispatch_types.h>
  #include <linux/mm_types_task.h>
  #include <linux/task_io_accounting.h>
- #include <linux/posix-timers.h>
- #include <linux/rseq.h>
- #include <linux/seqlock.h>
+ #include <linux/posix-timers_types.h>
+ #include <linux/restart_block.h>
+ #include <uapi/linux/rseq.h>
+ #include <linux/seqlock_types.h>
  #include <linux/kcsan.h>
  #include <linux/rv.h>
  #include <linux/livepatch_sched.h>
+ #include <linux/uidgid_types.h>
  #include <asm/kmap_size.h>
  
  /* task_struct member predeclarations (sorted alphabetically): */
@@@ -63,13 -71,11 +71,13 @@@ struct robust_list_head
  struct root_domain;
  struct rq;
  struct sched_attr;
 +struct sched_dl_entity;
  struct seq_file;
  struct sighand_struct;
  struct signal_struct;
  struct task_delay_info;
  struct task_group;
 +struct task_struct;
  struct user_event_mm;
  
  /*
@@@ -415,6 -421,42 +423,6 @@@ struct load_weight 
        u32                             inv_weight;
  };
  
 -/**
 - * struct util_est - Estimation utilization of FAIR tasks
 - * @enqueued: instantaneous estimated utilization of a task/cpu
 - * @ewma:     the Exponential Weighted Moving Average (EWMA)
 - *            utilization of a task
 - *
 - * Support data structure to track an Exponential Weighted Moving Average
 - * (EWMA) of a FAIR task's utilization. New samples are added to the moving
 - * average each time a task completes an activation. Sample's weight is chosen
 - * so that the EWMA will be relatively insensitive to transient changes to the
 - * task's workload.
 - *
 - * The enqueued attribute has a slightly different meaning for tasks and cpus:
 - * - task:   the task's util_avg at last task dequeue time
 - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
 - * Thus, the util_est.enqueued of a task represents the contribution on the
 - * estimated utilization of the CPU where that task is currently enqueued.
 - *
 - * Only for tasks we track a moving average of the past instantaneous
 - * estimated utilization. This allows to absorb sporadic drops in utilization
 - * of an otherwise almost periodic task.
 - *
 - * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 - * updates. When a task is dequeued, its util_est should not be updated if its
 - * util_avg has not been updated in the meantime.
 - * This information is mapped into the MSB bit of util_est.enqueued at dequeue
 - * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
 - * for a task) it is safe to use MSB.
 - */
 -struct util_est {
 -      unsigned int                    enqueued;
 -      unsigned int                    ewma;
 -#define UTIL_EST_WEIGHT_SHIFT         2
 -#define UTIL_AVG_UNCHANGED            0x80000000
 -} __attribute__((__aligned__(sizeof(u64))));
 -
  /*
   * The load/runnable/util_avg accumulates an infinite geometric series
   * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@@ -469,20 -511,9 +477,20 @@@ struct sched_avg 
        unsigned long                   load_avg;
        unsigned long                   runnable_avg;
        unsigned long                   util_avg;
 -      struct util_est                 util_est;
 +      unsigned int                    util_est;
  } ____cacheline_aligned;
  
 +/*
 + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
 + * updates. When a task is dequeued, its util_est should not be updated if its
 + * util_avg has not been updated in the meantime.
 + * This information is mapped into the MSB bit of util_est at dequeue time.
 + * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
 + * it is safe to use MSB.
 + */
 +#define UTIL_EST_WEIGHT_SHIFT         2
 +#define UTIL_AVG_UNCHANGED            0x80000000
 +
  struct sched_statistics {
  #ifdef CONFIG_SCHEDSTATS
        u64                             wait_start;
        u64                             block_max;
        s64                             sum_block_runtime;
  
 -      u64                             exec_max;
 +      s64                             exec_max;
        u64                             slice_max;
  
        u64                             nr_migrations_cold;
@@@ -530,7 -561,7 +538,7 @@@ struct sched_entity 
        struct load_weight              load;
        struct rb_node                  run_node;
        u64                             deadline;
 -      u64                             min_deadline;
 +      u64                             min_vruntime;
  
        struct list_head                group_node;
        unsigned int                    on_rq;
@@@ -584,9 -615,6 +592,9 @@@ struct sched_rt_entity 
  #endif
  } __randomize_layout;
  
 +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
 +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);
 +
  struct sched_dl_entity {
        struct rb_node                  rb_node;
  
        unsigned int                    dl_yielded        : 1;
        unsigned int                    dl_non_contending : 1;
        unsigned int                    dl_overrun        : 1;
 +      unsigned int                    dl_server         : 1;
  
        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * timer is needed to decrease the active utilization at the correct
         * time.
         */
 -      struct hrtimer inactive_timer;
 +      struct hrtimer                  inactive_timer;
 +
 +      /*
 +       * Bits for DL-server functionality. Also see the comment near
 +       * dl_server_update().
 +       *
 +       * @rq the runqueue this server is for
 +       *
 +       * @server_has_tasks() returns true if @server_pick return a
 +       * runnable task.
 +       */
 +      struct rq                       *rq;
 +      dl_server_has_tasks_f           server_has_tasks;
 +      dl_server_pick_f                server_pick;
  
  #ifdef CONFIG_RT_MUTEXES
        /*
@@@ -789,7 -803,6 +797,7 @@@ struct task_struct 
        struct sched_entity             se;
        struct sched_rt_entity          rt;
        struct sched_dl_entity          dl;
 +      struct sched_dl_entity          *dl_server;
        const struct sched_class        *sched_class;
  
  #ifdef CONFIG_SCHED_CORE
         */
  };
  
- static inline struct pid *task_pid(struct task_struct *task)
- {
-       return task->thread_pid;
- }
- /*
-  * the helpers to get the task's different pids as they are seen
-  * from various namespaces
-  *
-  * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
-  * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
-  *                     current.
-  * task_xid_nr_ns()  : id seen from the ns specified;
-  *
-  * see also pid_nr() etc in include/linux/pid.h
-  */
- pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
- static inline pid_t task_pid_nr(struct task_struct *tsk)
- {
-       return tsk->pid;
- }
- static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
- }
- static inline pid_t task_pid_vnr(struct task_struct *tsk)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
- }
- static inline pid_t task_tgid_nr(struct task_struct *tsk)
- {
-       return tsk->tgid;
- }
- /**
-  * pid_alive - check that a task structure is not stale
-  * @p: Task structure to be checked.
-  *
-  * Test if a process is not yet dead (at most zombie state)
-  * If pid_alive fails, then pointers within the task structure
-  * can be stale and must not be dereferenced.
-  *
-  * Return: 1 if the process is alive. 0 otherwise.
-  */
- static inline int pid_alive(const struct task_struct *p)
- {
-       return p->thread_pid != NULL;
- }
- static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
- }
- static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
- }
- static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
- }
- static inline pid_t task_session_vnr(struct task_struct *tsk)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
- }
- static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
- }
- static inline pid_t task_tgid_vnr(struct task_struct *tsk)
- {
-       return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
- }
- static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
- {
-       pid_t pid = 0;
-       rcu_read_lock();
-       if (pid_alive(tsk))
-               pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
-       rcu_read_unlock();
-       return pid;
- }
- static inline pid_t task_ppid_nr(const struct task_struct *tsk)
- {
-       return task_ppid_nr_ns(tsk, &init_pid_ns);
- }
- /* Obsolete, do not use: */
- static inline pid_t task_pgrp_nr(struct task_struct *tsk)
- {
-       return task_pgrp_nr_ns(tsk, &init_pid_ns);
- }
  #define TASK_REPORT_IDLE      (TASK_REPORT + 1)
  #define TASK_REPORT_MAX               (TASK_REPORT_IDLE << 1)
  
@@@ -1707,20 -1612,6 +1607,6 @@@ static inline char task_state_to_char(s
        return task_index_to_char(task_state_index(tsk));
  }
  
- /**
-  * is_global_init - check if a task structure is init. Since init
-  * is free to have sub-threads we need to check tgid.
-  * @tsk: Task structure to be checked.
-  *
-  * Check if a task structure is the first user space task the kernel created.
-  *
-  * Return: 1 if the task structure is init. 0 otherwise.
-  */
- static inline int is_global_init(struct task_struct *tsk)
- {
-       return task_tgid_nr(tsk) == 1;
- }
  extern struct pid *cad_pid;
  
  /*
@@@ -1950,7 -1841,9 +1836,7 @@@ extern void ia64_set_curr_task(int cpu
  void yield(void);
  
  union thread_union {
 -#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
        struct task_struct task;
 -#endif
  #ifndef CONFIG_THREAD_INFO_IN_TASK
        struct thread_info thread_info;
  #endif
@@@ -2170,15 -2063,6 +2056,6 @@@ extern int __cond_resched_rwlock_write(
        __cond_resched_rwlock_write(lock);                                      \
  })
  
- static inline void cond_resched_rcu(void)
- {
- #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
-       rcu_read_unlock();
-       cond_resched();
-       rcu_read_lock();
- #endif
- }
  #ifdef CONFIG_PREEMPT_DYNAMIC
  
  extern bool preempt_model_none(void);
@@@ -2220,37 -2104,6 +2097,6 @@@ static inline bool preempt_model_preemp
        return preempt_model_full() || preempt_model_rt();
  }
  
- /*
-  * Does a critical section need to be broken due to another
-  * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
-  * but a general need for low latency)
-  */
- static inline int spin_needbreak(spinlock_t *lock)
- {
- #ifdef CONFIG_PREEMPTION
-       return spin_is_contended(lock);
- #else
-       return 0;
- #endif
- }
- /*
-  * Check if a rwlock is contended.
-  * Returns non-zero if there is another task waiting on the rwlock.
-  * Returns zero if the lock is not contended or the system / underlying
-  * rwlock implementation does not support contention detection.
-  * Technically does not depend on CONFIG_PREEMPTION, but a general need
-  * for low latency.
-  */
- static inline int rwlock_needbreak(rwlock_t *lock)
- {
- #ifdef CONFIG_PREEMPTION
-       return rwlock_is_contended(lock);
- #else
-       return 0;
- #endif
- }
  static __always_inline bool need_resched(void)
  {
        return unlikely(tif_need_resched());
@@@ -2285,6 -2138,8 +2131,8 @@@ extern bool sched_task_on_rq(struct tas
  extern unsigned long get_wchan(struct task_struct *p);
  extern struct task_struct *cpu_curr_snapshot(int cpu);
  
+ #include <linux/spinlock.h>
  /*
   * In order to reduce various lock holder preemption latencies provide an
   * interface to see if a vCPU is currently running or not.
@@@ -2321,129 -2176,6 +2169,6 @@@ static inline bool owner_on_cpu(struct 
  unsigned long sched_cpu_util(int cpu);
  #endif /* CONFIG_SMP */
  
- #ifdef CONFIG_RSEQ
- /*
-  * Map the event mask on the user-space ABI enum rseq_cs_flags
-  * for direct mask checks.
-  */
- enum rseq_event_mask_bits {
-       RSEQ_EVENT_PREEMPT_BIT  = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
-       RSEQ_EVENT_SIGNAL_BIT   = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
-       RSEQ_EVENT_MIGRATE_BIT  = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
- };
- enum rseq_event_mask {
-       RSEQ_EVENT_PREEMPT      = (1U << RSEQ_EVENT_PREEMPT_BIT),
-       RSEQ_EVENT_SIGNAL       = (1U << RSEQ_EVENT_SIGNAL_BIT),
-       RSEQ_EVENT_MIGRATE      = (1U << RSEQ_EVENT_MIGRATE_BIT),
- };
- static inline void rseq_set_notify_resume(struct task_struct *t)
- {
-       if (t->rseq)
-               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
- }
- void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
- static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-                                            struct pt_regs *regs)
- {
-       if (current->rseq)
-               __rseq_handle_notify_resume(ksig, regs);
- }
- static inline void rseq_signal_deliver(struct ksignal *ksig,
-                                      struct pt_regs *regs)
- {
-       preempt_disable();
-       __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
-       preempt_enable();
-       rseq_handle_notify_resume(ksig, regs);
- }
- /* rseq_preempt() requires preemption to be disabled. */
- static inline void rseq_preempt(struct task_struct *t)
- {
-       __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
-       rseq_set_notify_resume(t);
- }
- /* rseq_migrate() requires preemption to be disabled. */
- static inline void rseq_migrate(struct task_struct *t)
- {
-       __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
-       rseq_set_notify_resume(t);
- }
- /*
-  * If parent process has a registered restartable sequences area, the
-  * child inherits. Unregister rseq for a clone with CLONE_VM set.
-  */
- static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
- {
-       if (clone_flags & CLONE_VM) {
-               t->rseq = NULL;
-               t->rseq_len = 0;
-               t->rseq_sig = 0;
-               t->rseq_event_mask = 0;
-       } else {
-               t->rseq = current->rseq;
-               t->rseq_len = current->rseq_len;
-               t->rseq_sig = current->rseq_sig;
-               t->rseq_event_mask = current->rseq_event_mask;
-       }
- }
- static inline void rseq_execve(struct task_struct *t)
- {
-       t->rseq = NULL;
-       t->rseq_len = 0;
-       t->rseq_sig = 0;
-       t->rseq_event_mask = 0;
- }
- #else
- static inline void rseq_set_notify_resume(struct task_struct *t)
- {
- }
- static inline void rseq_handle_notify_resume(struct ksignal *ksig,
-                                            struct pt_regs *regs)
- {
- }
- static inline void rseq_signal_deliver(struct ksignal *ksig,
-                                      struct pt_regs *regs)
- {
- }
- static inline void rseq_preempt(struct task_struct *t)
- {
- }
- static inline void rseq_migrate(struct task_struct *t)
- {
- }
- static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
- {
- }
- static inline void rseq_execve(struct task_struct *t)
- {
- }
- #endif
- #ifdef CONFIG_DEBUG_RSEQ
- void rseq_syscall(struct pt_regs *regs);
- #else
- static inline void rseq_syscall(struct pt_regs *regs)
- {
- }
- #endif
  #ifdef CONFIG_SCHED_CORE
  extern void sched_core_free(struct task_struct *tsk);
  extern void sched_core_fork(struct task_struct *p);
index 015c0e3a3e1d14f65db5eaf2b1869a8098a7875a,b847d8fa75a9792686d9b86bd4588405a04fb211..4b7664c56208f9db855c527cd6daff672454d106
@@@ -9,6 -9,7 +9,7 @@@
  #include <linux/sched/task.h>
  #include <linux/cred.h>
  #include <linux/refcount.h>
+ #include <linux/pid.h>
  #include <linux/posix-timers.h>
  #include <linux/mm_types.h>
  #include <asm/ptrace.h>
@@@ -432,6 -433,7 +433,6 @@@ static inline bool fault_signal_pending
   * This is required every time the blocked sigset_t changes.
   * callers must hold sighand->siglock.
   */
 -extern void recalc_sigpending_and_wake(struct task_struct *t);
  extern void recalc_sigpending(void);
  extern void calculate_sigpending(void);
  
@@@ -645,9 -647,6 +646,9 @@@ extern bool current_is_single_threaded(
  #define while_each_thread(g, t) \
        while ((t = next_thread(t)) != g)
  
 +#define for_other_threads(p, t)       \
 +      for (t = p; (t = next_thread(t)) != p; )
 +
  #define __for_each_thread(signal, t)  \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
                lockdep_is_held(&tasklist_lock))
index 4f3dca3535568b8c9bfd614da359785300cfcb3d,538cdfbe895f9ba41eee5e33da354312cd298a36..d362aacf9f897343f8cfababed5f21fdf430bfb7
@@@ -7,6 -7,8 +7,8 @@@
   * functionality:
   */
  
+ #include <linux/rcupdate.h>
+ #include <linux/refcount.h>
  #include <linux/sched.h>
  #include <linux/uaccess.h>
  
@@@ -226,6 -228,4 +228,6 @@@ static inline void task_unlock(struct t
        spin_unlock(&p->alloc_lock);
  }
  
 +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))
 +
  #endif /* _LINUX_SCHED_TASK_H */
diff --combined include/linux/spinlock.h
index 90bc853cafb6aeedd433d3016f17a86086df817e,0c71f06454d9e100fff6a79c047164e0f403ba19..eaac8b0da25b8aef964a311eee34d0313c549838
@@@ -449,6 -449,37 +449,37 @@@ static __always_inline int spin_is_cont
        return raw_spin_is_contended(&lock->rlock);
  }
  
+ /*
+  * Does a critical section need to be broken due to another
+  * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
+  * but a general need for low latency)
+  */
+ static inline int spin_needbreak(spinlock_t *lock)
+ {
+ #ifdef CONFIG_PREEMPTION
+       return spin_is_contended(lock);
+ #else
+       return 0;
+ #endif
+ }
+ /*
+  * Check if a rwlock is contended.
+  * Returns non-zero if there is another task waiting on the rwlock.
+  * Returns zero if the lock is not contended or the system / underlying
+  * rwlock implementation does not support contention detection.
+  * Technically does not depend on CONFIG_PREEMPTION, but a general need
+  * for low latency.
+  */
+ static inline int rwlock_needbreak(rwlock_t *lock)
+ {
+ #ifdef CONFIG_PREEMPTION
+       return rwlock_is_contended(lock);
+ #else
+       return 0;
+ #endif
+ }
  #define assert_spin_locked(lock)      assert_raw_spin_locked(&(lock)->rlock)
  
  #else  /* !CONFIG_PREEMPT_RT */
@@@ -507,8 -538,6 +538,8 @@@ DEFINE_LOCK_GUARD_1(raw_spinlock, raw_s
                    raw_spin_lock(_T->lock),
                    raw_spin_unlock(_T->lock))
  
 +DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))
 +
  DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
                    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
                    raw_spin_unlock(_T->lock))
@@@ -517,62 -546,23 +548,62 @@@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, r
                    raw_spin_lock_irq(_T->lock),
                    raw_spin_unlock_irq(_T->lock))
  
 +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))
 +
  DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
                    raw_spin_lock_irqsave(_T->lock, _T->flags),
                    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)
  
 +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
 +                       raw_spin_trylock_irqsave(_T->lock, _T->flags))
 +
  DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
                    spin_lock(_T->lock),
                    spin_unlock(_T->lock))
  
 +DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))
 +
  DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
                    spin_lock_irq(_T->lock),
                    spin_unlock_irq(_T->lock))
  
 +DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
 +                       spin_trylock_irq(_T->lock))
 +
  DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
                    spin_lock_irqsave(_T->lock, _T->flags),
                    spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)
  
 +DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
 +                       spin_trylock_irqsave(_T->lock, _T->flags))
 +
 +DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
 +                  read_lock(_T->lock),
 +                  read_unlock(_T->lock))
 +
 +DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
 +                  read_lock_irq(_T->lock),
 +                  read_unlock_irq(_T->lock))
 +
 +DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
 +                  read_lock_irqsave(_T->lock, _T->flags),
 +                  read_unlock_irqrestore(_T->lock, _T->flags),
 +                  unsigned long flags)
 +
 +DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
 +                  write_lock(_T->lock),
 +                  write_unlock(_T->lock))
 +
 +DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
 +                  write_lock_irq(_T->lock),
 +                  write_unlock_irq(_T->lock))
 +
 +DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
 +                  write_lock_irqsave(_T->lock, _T->flags),
 +                  write_unlock_irqrestore(_T->lock, _T->flags),
 +                  unsigned long flags)
 +
  #undef __LINUX_INSIDE_SPINLOCK_H
  #endif /* __LINUX_SPINLOCK_H */
diff --combined include/linux/uidgid.h
index 415a7ca2b8829ce9ec2077a34d1ffc915d5eb559,ba20b62f13e1d6f2fd1ba41f424b8ffe01f10dc9..f85ec5613721fe1b4fc6c51b80c9a32cc617c19b
   * to detect when we overlook these differences.
   *
   */
- #include <linux/types.h>
+ #include <linux/uidgid_types.h>
  #include <linux/highuid.h>
  
  struct user_namespace;
  extern struct user_namespace init_user_ns;
 +struct uid_gid_map;
  
- typedef struct {
-       uid_t val;
- } kuid_t;
- typedef struct {
-       gid_t val;
- } kgid_t;
  #define KUIDT_INIT(value) (kuid_t){ value }
  #define KGIDT_INIT(value) (kgid_t){ value }
  
@@@ -139,9 -129,6 +130,9 @@@ static inline bool kgid_has_mapping(str
        return from_kgid(ns, gid) != (gid_t) -1;
  }
  
 +u32 map_id_down(struct uid_gid_map *map, u32 id);
 +u32 map_id_up(struct uid_gid_map *map, u32 id);
 +
  #else
  
  static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
@@@ -190,15 -177,6 +181,15 @@@ static inline bool kgid_has_mapping(str
        return gid_valid(gid);
  }
  
 +static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
 +{
 +      return id;
 +}
 +
 +static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
 +{
 +      return id;
 +}
  #endif /* CONFIG_USER_NS */
  
  #endif /* _LINUX_UIDGID_H */
index b0b9604b76b88a0f2b256564282f5fa4286fdb3a,f1bb2e35301f9493e07521ae4b29a46968149f94..2cc0a9606175fa6f653db5eccadf0071027d6c2f
  #include <linux/atomic.h>
  #include <linux/cpumask.h>
  #include <linux/rcupdate.h>
- struct workqueue_struct;
- struct work_struct;
- typedef void (*work_func_t)(struct work_struct *work);
- void delayed_work_timer_fn(struct timer_list *t);
+ #include <linux/workqueue_types.h>
  
  /*
   * The first word is the work queue pointer and the flags rolled into
@@@ -95,15 -90,6 +90,6 @@@ enum 
  #define WORK_STRUCT_FLAG_MASK    ((1ul << WORK_STRUCT_FLAG_BITS) - 1)
  #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
  
- struct work_struct {
-       atomic_long_t data;
-       struct list_head entry;
-       work_func_t func;
- #ifdef CONFIG_LOCKDEP
-       struct lockdep_map lockdep_map;
- #endif
- };
  #define WORK_DATA_INIT()      ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
  #define WORK_DATA_STATIC_INIT()       \
        ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))
@@@ -491,7 -477,7 +477,7 @@@ struct workqueue_attrs *alloc_workqueue
  void free_workqueue_attrs(struct workqueue_attrs *attrs);
  int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs);
 -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask);
 +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);
  
  extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
                        struct work_struct *work);
diff --combined init/init_task.c
index 6f6485d554df1d7237aa01f129b13d2dad55035c,56220898a256b132fe217f79a319e47dcd218f12..7ecb458eb3da60eb73123f4b2072910f194c4bb5
@@@ -12,6 -12,7 +12,7 @@@
  #include <linux/audit.h>
  #include <linux/numa.h>
  #include <linux/scs.h>
+ #include <linux/plist.h>
  
  #include <linux/uaccess.h>
  
@@@ -51,7 -52,8 +52,7 @@@ static struct sighand_struct init_sigha
  };
  
  #ifdef CONFIG_SHADOW_CALL_STACK
 -unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)]
 -              __init_task_data = {
 +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
        [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC
  };
  #endif
   * Set up the first task table, touch at your own risk!. Base=0,
   * limit=0x1fffff (=2MB)
   */
 -struct task_struct init_task
 -#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
 -      __init_task_data
 -#endif
 -      __aligned(L1_CACHE_BYTES)
 -= {
 +struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
  #ifdef CONFIG_THREAD_INFO_IN_TASK
        .thread_info    = INIT_THREAD_INFO(init_task),
        .stack_refcount = REFCOUNT_INIT(1),
diff --combined kernel/async.c
index 673bba6bdf3a0b0c0041e4bb7d63e61b8a63d0d7,79f6a3034b1f9016b0dc70f4d856f823177c0ea6..97f224a5257b4e49ed959580ae1566a46e09f3fe
@@@ -46,11 -46,12 +46,12 @@@ asynchronous and synchronous parts of t
  
  #include <linux/async.h>
  #include <linux/atomic.h>
- #include <linux/ktime.h>
  #include <linux/export.h>
- #include <linux/wait.h>
+ #include <linux/ktime.h>
+ #include <linux/pid.h>
  #include <linux/sched.h>
  #include <linux/slab.h>
+ #include <linux/wait.h>
  #include <linux/workqueue.h>
  
  #include "workqueue_internal.h"
@@@ -145,39 -146,6 +146,39 @@@ static void async_run_entry_fn(struct w
        wake_up(&async_done);
  }
  
 +static async_cookie_t __async_schedule_node_domain(async_func_t func,
 +                                                 void *data, int node,
 +                                                 struct async_domain *domain,
 +                                                 struct async_entry *entry)
 +{
 +      async_cookie_t newcookie;
 +      unsigned long flags;
 +
 +      INIT_LIST_HEAD(&entry->domain_list);
 +      INIT_LIST_HEAD(&entry->global_list);
 +      INIT_WORK(&entry->work, async_run_entry_fn);
 +      entry->func = func;
 +      entry->data = data;
 +      entry->domain = domain;
 +
 +      spin_lock_irqsave(&async_lock, flags);
 +
 +      /* allocate cookie and queue */
 +      newcookie = entry->cookie = next_cookie++;
 +
 +      list_add_tail(&entry->domain_list, &domain->pending);
 +      if (domain->registered)
 +              list_add_tail(&entry->global_list, &async_global_pending);
 +
 +      atomic_inc(&entry_count);
 +      spin_unlock_irqrestore(&async_lock, flags);
 +
 +      /* schedule for execution */
 +      queue_work_node(node, system_unbound_wq, &entry->work);
 +
 +      return newcookie;
 +}
 +
  /**
   * async_schedule_node_domain - NUMA specific version of async_schedule_domain
   * @func: function to execute asynchronously
@@@ -219,8 -187,29 +220,8 @@@ async_cookie_t async_schedule_node_doma
                func(data, newcookie);
                return newcookie;
        }
 -      INIT_LIST_HEAD(&entry->domain_list);
 -      INIT_LIST_HEAD(&entry->global_list);
 -      INIT_WORK(&entry->work, async_run_entry_fn);
 -      entry->func = func;
 -      entry->data = data;
 -      entry->domain = domain;
 -
 -      spin_lock_irqsave(&async_lock, flags);
 -
 -      /* allocate cookie and queue */
 -      newcookie = entry->cookie = next_cookie++;
 -
 -      list_add_tail(&entry->domain_list, &domain->pending);
 -      if (domain->registered)
 -              list_add_tail(&entry->global_list, &async_global_pending);
 -
 -      atomic_inc(&entry_count);
 -      spin_unlock_irqrestore(&async_lock, flags);
 -
 -      /* schedule for execution */
 -      queue_work_node(node, system_unbound_wq, &entry->work);
  
 -      return newcookie;
 +      return __async_schedule_node_domain(func, data, node, domain, entry);
  }
  EXPORT_SYMBOL_GPL(async_schedule_node_domain);
  
@@@ -243,35 -232,6 +244,35 @@@ async_cookie_t async_schedule_node(asyn
  }
  EXPORT_SYMBOL_GPL(async_schedule_node);
  
 +/**
 + * async_schedule_dev_nocall - A simplified variant of async_schedule_dev()
 + * @func: function to execute asynchronously
 + * @dev: device argument to be passed to function
 + *
 + * @dev is used as both the argument for the function and to provide NUMA
 + * context for where to run the function.
 + *
 + * If the asynchronous execution of @func is scheduled successfully, return
 + * true. Otherwise, do nothing and return false, unlike async_schedule_dev()
 + * that will run the function synchronously then.
 + */
 +bool async_schedule_dev_nocall(async_func_t func, struct device *dev)
 +{
 +      struct async_entry *entry;
 +
 +      entry = kzalloc(sizeof(struct async_entry), GFP_KERNEL);
 +
 +      /* Give up if there is no memory or too much work. */
 +      if (!entry || atomic_read(&entry_count) > MAX_WORK) {
 +              kfree(entry);
 +              return false;
 +      }
 +
 +      __async_schedule_node_domain(func, dev, dev_to_node(dev),
 +                                   &async_dfl_domain, entry);
 +      return true;
 +}
 +
  /**
   * async_synchronize_full - synchronize all asynchronous function calls
   *
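
The helper above leaves the fallback to its caller: when async_schedule_dev_nocall() returns false, nothing has been queued and nothing has run. A minimal usage sketch under that contract; the callback and wrapper names (my_resume_fn, my_resume) are hypothetical, only async_schedule_dev_nocall() itself comes from this patch.

#include <linux/async.h>
#include <linux/device.h>

/* hypothetical async callback; the signature follows async_func_t */
static void my_resume_fn(void *data, async_cookie_t cookie)
{
        struct device *dev = data;

        dev_dbg(dev, "resumed asynchronously (cookie %llu)\n",
                (unsigned long long)cookie);
}

static void my_resume(struct device *dev)
{
        /*
         * Try to run the callback on the device's NUMA node in the default
         * async domain; if no entry could be allocated (or too much async
         * work is already pending), run it synchronously right here.
         */
        if (!async_schedule_dev_nocall(my_resume_fn, dev))
                my_resume_fn(dev, 0);
}
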
diff --combined kernel/exit.c
index aedc0832c9f4ded6578233a611ee362e0dad77ff,2ef33047371bcfe6fd9de007266333fa3d00c9c5..3988a02efaef06444654a415ce298d378ab925ec
  #include <linux/rethook.h>
  #include <linux/sysfs.h>
  #include <linux/user_events.h>
  #include <linux/uaccess.h>
+ #include <uapi/linux/wait.h>
  #include <asm/unistd.h>
  #include <asm/mmu_context.h>
  
@@@ -824,6 -826,8 +826,6 @@@ void __noreturn do_exit(long code
        ptrace_event(PTRACE_EVENT_EXIT, code);
        user_events_exit(tsk);
  
 -      validate_creds_for_do_exit(tsk);
 -
        io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */
  
        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);
  
 -      validate_creds_for_do_exit(tsk);
        exit_task_stack_account(tsk);
  
        check_stack_usage();
diff --combined kernel/fork.c
index b32e323adbbf9fea352eb7d8fd483be652c53e0e,53816393995bb94bdf876ba5a8ba8f34a5db76bb..c981fa6171c1aebee1d4e69bc493e218b5d1bcac
@@@ -53,6 -53,7 +53,7 @@@
  #include <linux/seccomp.h>
  #include <linux/swap.h>
  #include <linux/syscalls.h>
+ #include <linux/syscall_user_dispatch.h>
  #include <linux/jiffies.h>
  #include <linux/futex.h>
  #include <linux/compat.h>
  #include <linux/stackprotector.h>
  #include <linux/user_events.h>
  #include <linux/iommu.h>
+ #include <linux/rseq.h>
  
  #include <asm/pgalloc.h>
  #include <linux/uaccess.h>
@@@ -165,6 -167,7 +167,6 @@@ void __weak arch_release_task_struct(st
  {
  }
  
 -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  static struct kmem_cache *task_struct_cachep;
  
  static inline struct task_struct *alloc_task_struct_node(int node)
@@@ -176,6 -179,9 +178,6 @@@ static inline void free_task_struct(str
  {
        kmem_cache_free(task_struct_cachep, tsk);
  }
 -#endif
 -
 -#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
  
  /*
   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@@ -408,6 -414,24 +410,6 @@@ void thread_stack_cache_init(void
  }
  
  # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
 -#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
 -
 -static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 -{
 -      unsigned long *stack;
 -
 -      stack = arch_alloc_thread_stack_node(tsk, node);
 -      tsk->stack = stack;
 -      return stack ? 0 : -ENOMEM;
 -}
 -
 -static void free_thread_stack(struct task_struct *tsk)
 -{
 -      arch_free_thread_stack(tsk);
 -      tsk->stack = NULL;
 -}
 -
 -#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
  
  /* SLAB cache for signal_struct structures (tsk->signal) */
  static struct kmem_cache *signal_cachep;
@@@ -628,6 -652,7 +630,6 @@@ static __latent_entropy int dup_mmap(st
        int retval;
        unsigned long charge = 0;
        LIST_HEAD(uf);
 -      VMA_ITERATOR(old_vmi, oldmm, 0);
        VMA_ITERATOR(vmi, mm, 0);
  
        uprobe_start_dup_mmap();
                goto out;
        khugepaged_fork(mm, oldmm);
  
 -      retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
 -      if (retval)
 +      /* Use __mt_dup() to efficiently build an identical maple tree. */
 +      retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
 +      if (unlikely(retval))
                goto out;
  
        mt_clear_in_rcu(vmi.mas.tree);
 -      for_each_vma(old_vmi, mpnt) {
 +      for_each_vma(vmi, mpnt) {
                struct file *file;
  
                vma_start_write(mpnt);
                if (mpnt->vm_flags & VM_DONTCOPY) {
 +                      retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
 +                                                  mpnt->vm_end, GFP_KERNEL);
 +                      if (retval)
 +                              goto loop_out;
 +
                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                if (is_vm_hugetlb_page(tmp))
                        hugetlb_dup_vma_private(tmp);
  
 -              /* Link the vma into the MT */
 -              if (vma_iter_bulk_store(&vmi, tmp))
 -                      goto fail_nomem_vmi_store;
 +              /*
 +               * Link the vma into the MT. After using __mt_dup(), memory
 +               * allocation is not necessary here, so it cannot fail.
 +               */
 +              vma_iter_bulk_store(&vmi, tmp);
  
                mm->map_count++;
                if (!(tmp->vm_flags & VM_WIPEONFORK))
                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);
  
 -              if (retval)
 +              if (retval) {
 +                      mpnt = vma_next(&vmi);
                        goto loop_out;
 +              }
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
  loop_out:
        vma_iter_free(&vmi);
 -      if (!retval)
 +      if (!retval) {
                mt_set_in_rcu(vmi.mas.tree);
 +      } else if (mpnt) {
 +              /*
 +               * The entire maple tree has already been duplicated. If the
 +               * mmap duplication fails, mark the failure point with
 +               * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
 +               * stop releasing VMAs that have not been duplicated after this
 +               * point.
 +               */
 +              mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
 +              mas_store(&vmi.mas, XA_ZERO_ENTRY);
 +      }
  out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
@@@ -776,6 -780,8 +778,6 @@@ fail_uprobe_end
        uprobe_end_dup_mmap();
        return retval;
  
 -fail_nomem_vmi_store:
 -      unlink_anon_vmas(tmp);
  fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
  fail_nomem_policy:
@@@ -1017,6 -1023,7 +1019,6 @@@ static void set_max_threads(unsigned in
  int arch_task_struct_size __read_mostly;
  #endif
  
 -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
  {
        /* Fetch thread_struct whitelist for the architecture. */
        else
                *offset += offsetof(struct task_struct, thread);
  }
 -#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
  
  void __init fork_init(void)
  {
        int i;
 -#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  #ifndef ARCH_MIN_TASKALIGN
  #define ARCH_MIN_TASKALIGN    0
  #endif
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);
 -#endif
  
        /* do the arch specific task caches init */
        arch_task_cache_init();
@@@ -1580,7 -1590,7 +1582,7 @@@ static void complete_vfork_done(struct 
  static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
  {
 -      unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
 +      unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
        int killed;
  
        cgroup_enter_frozen();
@@@ -2920,7 -2930,7 +2922,7 @@@ pid_t kernel_clone(struct kernel_clone_
                get_task_struct(p);
        }
  
 -      if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
 +      if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
                /* lock the task to synchronize with memcg migration */
                task_lock(p);
                lru_gen_add_mm(p->mm);
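
The XA_ZERO_ENTRY marker stored by dup_mmap() above only matters to whoever later walks the half-duplicated tree. A hedged sketch of that consumer side, using the maple-tree iteration helpers (MA_STATE(), mas_for_each(), xa_is_zero()); the function name is invented and the real handling lives in exit_mmap(), which may differ in detail.

#include <linux/maple_tree.h>
#include <linux/mm.h>
#include <linux/xarray.h>

static void release_duplicated_vmas(struct mm_struct *mm)
{
        MA_STATE(mas, &mm->mm_mt, 0, 0);
        struct vm_area_struct *vma;

        mas_for_each(&mas, vma, ULONG_MAX) {
                /*
                 * dup_mmap() marks the point of failure with XA_ZERO_ENTRY;
                 * entries after it were copied by __mt_dup() but never turned
                 * into real, duplicated VMAs, so stop releasing here.
                 */
                if (xa_is_zero(vma))
                        break;
                /* ... unmap and free @vma as exit_mmap() normally would ... */
        }
}
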
diff --combined kernel/sched/core.c
index db4be4921e7f0eeb8e19b4544d54b20f48e47c8b,d04cf3c47899b85f1f6d75190f08078a90dba19e..9116bcc903467fe0d5854e3deb06b8d334cf85eb
@@@ -57,6 -57,7 +57,7 @@@
  #include <linux/profile.h>
  #include <linux/psi.h>
  #include <linux/rcuwait_api.h>
+ #include <linux/rseq.h>
  #include <linux/sched/wake_q.h>
  #include <linux/scs.h>
  #include <linux/slab.h>
@@@ -1131,28 -1132,6 +1132,28 @@@ static void wake_up_idle_cpu(int cpu
        if (cpu == smp_processor_id())
                return;
  
 +      /*
 +       * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
 +       * part of the idle loop. This forces an exit from the idle loop
 +       * and a round trip to schedule(). Now this could be optimized
 +       * because a simple new idle loop iteration is enough to
 +       * re-evaluate the next tick. Provided some re-ordering of tick
 +       * nohz functions that would need to follow TIF_NR_POLLING
 +       * clearing:
 +       *
 +       * - On most archs, a simple fetch_or on ti::flags with a
 +       *   "0" value would be enough to know if an IPI needs to be sent.
 +       *
 +       * - x86 needs to perform a last need_resched() check between
 +       *   monitor and mwait which doesn't take timers into account.
 +       *   There a dedicated TIF_TIMER flag would be required to
 +       *   fetch_or here and be checked along with TIF_NEED_RESCHED
 +       *   before mwait().
 +       *
 +       * However, remote timer enqueue is not such a frequent event
 +       * and testing of the above solutions didn't appear to report
 +       *   much benefit.
 +       */
        if (set_nr_and_not_polling(rq->idle))
                smp_send_reschedule(cpu);
        else
@@@ -2146,14 -2125,12 +2147,14 @@@ void activate_task(struct rq *rq, struc
  
        enqueue_task(rq, p, flags);
  
 -      p->on_rq = TASK_ON_RQ_QUEUED;
 +      WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
 +      ASSERT_EXCLUSIVE_WRITER(p->on_rq);
  }
  
  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  {
 -      p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
 +      WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
 +      ASSERT_EXCLUSIVE_WRITER(p->on_rq);
  
        dequeue_task(rq, p, flags);
  }
@@@ -3819,8 -3796,6 +3820,8 @@@ ttwu_do_activate(struct rq *rq, struct 
                rq->idle_stamp = 0;
        }
  #endif
 +
 +      p->dl_server = NULL;
  }
  
  /*
@@@ -4535,7 -4510,10 +4536,7 @@@ static void __sched_fork(unsigned long 
        memset(&p->stats, 0, sizeof(p->stats));
  #endif
  
 -      RB_CLEAR_NODE(&p->dl.rb_node);
 -      init_dl_task_timer(&p->dl);
 -      init_dl_inactive_task_timer(&p->dl);
 -      __dl_clear_params(p);
 +      init_dl_entity(&p->dl);
  
        INIT_LIST_HEAD(&p->rt.run_list);
        p->rt.timeout           = 0;
@@@ -6027,27 -6005,12 +6028,27 @@@ __pick_next_task(struct rq *rq, struct 
                        p = pick_next_task_idle(rq);
                }
  
 +              /*
 +               * This is the fast path; it cannot be a DL server pick;
 +               * therefore even if @p == @prev, ->dl_server must be NULL.
 +               */
 +              if (p->dl_server)
 +                      p->dl_server = NULL;
 +
                return p;
        }
  
  restart:
        put_prev_task_balance(rq, prev, rf);
  
 +      /*
 +       * We've updated @prev and no longer need the server link; clear it.
 +       * Must be done before ->pick_next_task() because that can (re)set
 +       * ->dl_server.
 +       */
 +      if (prev->dl_server)
 +              prev->dl_server = NULL;
 +
        for_each_class(class) {
                p = class->pick_next_task(rq);
                if (p)
@@@ -7467,13 -7430,18 +7468,13 @@@ int sched_core_idle_cpu(int cpu
   * required to meet deadlines.
   */
  unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 -                               enum cpu_util_type type,
 -                               struct task_struct *p)
 +                               unsigned long *min,
 +                               unsigned long *max)
  {
 -      unsigned long dl_util, util, irq, max;
 +      unsigned long util, irq, scale;
        struct rq *rq = cpu_rq(cpu);
  
 -      max = arch_scale_cpu_capacity(cpu);
 -
 -      if (!uclamp_is_used() &&
 -          type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
 -              return max;
 -      }
 +      scale = arch_scale_cpu_capacity(cpu);
  
        /*
         * Early check to see if IRQ/steal time saturates the CPU, can be
         * update_irq_load_avg().
         */
        irq = cpu_util_irq(rq);
 -      if (unlikely(irq >= max))
 -              return max;
 +      if (unlikely(irq >= scale)) {
 +              if (min)
 +                      *min = scale;
 +              if (max)
 +                      *max = scale;
 +              return scale;
 +      }
 +
 +      if (min) {
 +              /*
 +               * The minimum utilization returns the highest level between:
 +               * - the computed DL bandwidth needed with the IRQ pressure which
 +               *   steals time from the deadline task.
 +               * - The minimum performance requirement for CFS and/or RT.
 +               */
 +              *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
 +
 +              /*
 +               * When an RT task is runnable and uclamp is not used, we must
 +               * ensure that the task will run at maximum compute capacity.
 +               */
 +              if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
 +                      *min = max(*min, scale);
 +      }
  
        /*
  * Because the time spent on RT/DL tasks is visible as 'lost' time to
         * CFS tasks and we use the same metric to track the effective
         * utilization (PELT windows are synchronized) we can directly add them
         * to obtain the CPU's actual utilization.
 -       *
 -       * CFS and RT utilization can be boosted or capped, depending on
 -       * utilization clamp constraints requested by currently RUNNABLE
 -       * tasks.
 -       * When there are no CFS RUNNABLE tasks, clamps are released and
 -       * frequency will be gracefully reduced with the utilization decay.
         */
        util = util_cfs + cpu_util_rt(rq);
 -      if (type == FREQUENCY_UTIL)
 -              util = uclamp_rq_util_with(rq, util, p);
 -
 -      dl_util = cpu_util_dl(rq);
 +      util += cpu_util_dl(rq);
  
        /*
 -       * For frequency selection we do not make cpu_util_dl() a permanent part
 -       * of this sum because we want to use cpu_bw_dl() later on, but we need
 -       * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
 -       * that we select f_max when there is no idle time.
 -       *
 -       * NOTE: numerical errors or stop class might cause us to not quite hit
 -       * saturation when we should -- something for later.
 +       * The maximum hint is a soft bandwidth requirement, which can be lower
 +       * than the actual utilization because of uclamp_max requirements.
         */
 -      if (util + dl_util >= max)
 -              return max;
 +      if (max)
 +              *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
  
 -      /*
 -       * OTOH, for energy computation we need the estimated running time, so
 -       * include util_dl and ignore dl_bw.
 -       */
 -      if (type == ENERGY_UTIL)
 -              util += dl_util;
 +      if (util >= scale)
 +              return scale;
  
        /*
         * There is still idle time; further improve the number by using the
         *   U' = irq + --------- * U
         *                 max
         */
 -      util = scale_irq_capacity(util, irq, max);
 +      util = scale_irq_capacity(util, irq, scale);
        util += irq;
  
 -      /*
 -       * Bandwidth required by DEADLINE must always be granted while, for
 -       * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
 -       * to gracefully reduce the frequency when no tasks show up for longer
 -       * periods of time.
 -       *
 -       * Ideally we would like to set bw_dl as min/guaranteed freq and util +
 -       * bw_dl as requested freq. However, cpufreq is not yet ready for such
 -       * an interface. So, we only do the latter for now.
 -       */
 -      if (type == FREQUENCY_UTIL)
 -              util += cpu_bw_dl(rq);
 -
 -      return min(max, util);
 +      return min(scale, util);
  }
  
  unsigned long sched_cpu_util(int cpu)
  {
 -      return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
 +      return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
  }
  #endif /* CONFIG_SMP */
  
diff --combined mm/filemap.c
index c8dafe70d4ccedca9f80ec04d6d9319309cc5ad9,1219ffc04a26133abeea103e98bd4c210f49a653..ea49677c63385af4a82981511384f63fc21e7c60
@@@ -45,6 -45,7 +45,7 @@@
  #include <linux/migrate.h>
  #include <linux/pipe_fs_i.h>
  #include <linux/splice.h>
+ #include <linux/rcupdate_wait.h>
  #include <asm/pgalloc.h>
  #include <asm/tlbflush.h>
  #include "internal.h"
   *    ->i_pages lock          (try_to_unmap_one)
   *    ->lruvec->lru_lock      (follow_page->mark_page_accessed)
   *    ->lruvec->lru_lock      (check_pte_range->isolate_lru_page)
 - *    ->private_lock          (page_remove_rmap->set_page_dirty)
 - *    ->i_pages lock          (page_remove_rmap->set_page_dirty)
 - *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
 - *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
 - *    ->memcg->move_lock      (page_remove_rmap->folio_memcg_lock)
 + *    ->private_lock          (folio_remove_rmap_pte->set_page_dirty)
 + *    ->i_pages lock          (folio_remove_rmap_pte->set_page_dirty)
 + *    bdi.wb->list_lock               (folio_remove_rmap_pte->set_page_dirty)
 + *    ->inode->i_lock         (folio_remove_rmap_pte->set_page_dirty)
 + *    ->memcg->move_lock      (folio_remove_rmap_pte->folio_memcg_lock)
   *    bdi.wb->list_lock               (zap_pte_range->set_page_dirty)
   *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->block_dirty_folio)
@@@ -1623,7 -1624,7 +1624,7 @@@ EXPORT_SYMBOL_GPL(__folio_lock_killable
  static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
  {
        struct wait_queue_head *q = folio_waitqueue(folio);
 -      int ret = 0;
 +      int ret;
  
        wait->folio = folio;
        wait->bit_nr = PG_locked;
@@@ -2173,7 -2174,7 +2174,7 @@@ update_start
  
        if (nr) {
                folio = fbatch->folios[nr - 1];
 -              *start = folio->index + folio_nr_pages(folio);
 +              *start = folio_next_index(folio);
        }
  out:
        rcu_read_unlock();
@@@ -2607,15 -2608,6 +2608,15 @@@ ssize_t filemap_read(struct kiocb *iocb
                        goto put_folios;
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
  
 +              /*
 +               * Pairs with a barrier in
 +               * block_write_end()->mark_buffer_dirty() or other page
 +               * dirtying routines like iomap_write_end() to ensure
 +               * changes to page contents are visible before we see
 +               * increased inode size.
 +               */
 +              smp_rmb();
 +
                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
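
The smp_rmb() added to filemap_read() above is one half of a pairing: the dirtying side has to make the new page contents visible before the larger inode size is. A deliberately simplified sketch of that pairing with both barriers written out explicitly; in the real paths the write barrier is supplied by mark_buffer_dirty() or the equivalent iomap routine, and the function names here are invented.

#include <linux/fs.h>

/* writer side: publish the data strictly before the new size */
static void publish_append(struct inode *inode, loff_t new_size)
{
        /* ... new bytes have been copied into the page cache ... */
        smp_wmb();                      /* done for us by the dirtying path */
        i_size_write(inode, new_size);
}

/* reader side: sample the size, then it is safe to copy that many bytes */
static loff_t readable_size(struct inode *inode)
{
        loff_t isize = i_size_read(inode);

        smp_rmb();                      /* the barrier added in filemap_read() */
        return isize;
}
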
diff --combined mm/khugepaged.c
index 3defe6713ef1c4f839fd8d9cb965a3e9523613fb,47a20a4ece09896403b9d28006147a9d8703a57e..2b219acb528e25fd7f16b9f58d85a81048355bd4
@@@ -17,6 -17,7 +17,7 @@@
  #include <linux/userfaultfd_k.h>
  #include <linux/page_idle.h>
  #include <linux/page_table_check.h>
+ #include <linux/rcupdate_wait.h>
  #include <linux/swapops.h>
  #include <linux/shmem_fs.h>
  #include <linux/ksm.h>
@@@ -446,8 -447,7 +447,8 @@@ void khugepaged_enter_vma(struct vm_are
  {
        if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
            hugepage_flags_enabled()) {
 -              if (hugepage_vma_check(vma, vm_flags, false, false, true))
 +              if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
 +                                          PMD_ORDER))
                        __khugepaged_enter(vma->vm_mm);
        }
  }
@@@ -494,6 -494,11 +495,6 @@@ static void release_pte_folio(struct fo
        folio_putback_lru(folio);
  }
  
 -static void release_pte_page(struct page *page)
 -{
 -      release_pte_folio(page_folio(page));
 -}
 -
  static void release_pte_pages(pte_t *pte, pte_t *_pte,
                struct list_head *compound_pagelist)
  {
@@@ -682,7 -687,6 +683,7 @@@ static void __collapse_huge_page_copy_s
                                                spinlock_t *ptl,
                                                struct list_head *compound_pagelist)
  {
 +      struct folio *src_folio;
        struct page *src_page;
        struct page *tmp;
        pte_t *_pte;
                        }
                } else {
                        src_page = pte_page(pteval);
 -                      if (!PageCompound(src_page))
 -                              release_pte_page(src_page);
 +                      src_folio = page_folio(src_page);
 +                      if (!folio_test_large(src_folio))
 +                              release_pte_folio(src_folio);
                        /*
                         * ptl mostly unnecessary, but preempt has to
                         * be disabled to update the per-cpu stats
 -                       * inside page_remove_rmap().
 +                       * inside folio_remove_rmap_pte().
                         */
                        spin_lock(ptl);
                        ptep_clear(vma->vm_mm, address, _pte);
 -                      page_remove_rmap(src_page, vma, false);
 +                      folio_remove_rmap_pte(src_folio, src_page, vma);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
@@@ -920,16 -923,16 +921,16 @@@ static int hugepage_vma_revalidate(stru
        if (!vma)
                return SCAN_VMA_NULL;
  
 -      if (!transhuge_vma_suitable(vma, address))
 +      if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
                return SCAN_ADDRESS_RANGE;
 -      if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
 -                              cc->is_khugepaged))
 +      if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
 +                                   cc->is_khugepaged, PMD_ORDER))
                return SCAN_VMA_CHECK;
        /*
         * Anon VMA expected, the address may be unmapped then
         * remapped to file after khugepaged re-acquired the mmap_lock.
         *
 -       * hugepage_vma_check may return true for qualified file
 +       * thp_vma_allowable_order may return true for qualified file
         * vmas.
         */
        if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
@@@ -1087,7 -1090,6 +1088,7 @@@ static int collapse_huge_page(struct mm
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pgtable_t pgtable;
 +      struct folio *folio;
        struct page *hpage;
        spinlock_t *pmd_ptl, *pte_ptl;
        int result = SCAN_FAIL;
         * Prevent all access to pagetables with the exception of
         * gup_fast later handled by the ptep_clear_flush and the VM
         * handled by the anon_vma lock + PG_lock.
 +       *
 +       * UFFDIO_MOVE is prevented to race as well thanks to the
 +       * mmap_lock.
         */
        mmap_write_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (unlikely(result != SCAN_SUCCEED))
                goto out_up_write;
  
 +      folio = page_folio(hpage);
        /*
 -       * spin_lock() below is not the equivalent of smp_wmb(), but
 -       * the smp_wmb() inside __SetPageUptodate() can be reused to
 -       * avoid the copy_huge_page writes to become visible after
 -       * the set_pmd_at() write.
 +       * The smp_wmb() inside __folio_mark_uptodate() ensures the
 +       * copy_huge_page writes become visible before the set_pmd_at()
 +       * write.
         */
 -      __SetPageUptodate(hpage);
 +      __folio_mark_uptodate(folio);
        pgtable = pmd_pgtable(_pmd);
  
        _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
  
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
 -      page_add_new_anon_rmap(hpage, vma, address);
 -      lru_cache_add_inactive_or_unevictable(hpage, vma);
 +      folio_add_new_anon_rmap(folio, vma, address);
 +      folio_add_lru_vma(folio, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
@@@ -1505,8 -1504,7 +1506,8 @@@ int collapse_pte_mapped_thp(struct mm_s
         * and map it by a PMD, regardless of sysfs THP settings. As such, let's
         * analogously elide sysfs THP settings here.
         */
 -      if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
 +      if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
 +                                   PMD_ORDER))
                return SCAN_VMA_CHECK;
  
        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
                 * PTE dirty? Shmem page is already dirty; file is read-only.
                 */
                ptep_clear(mm, addr, pte);
 -              page_remove_rmap(page, vma, false);
 +              folio_remove_rmap_pte(folio, page, vma);
                nr_ptes++;
        }
  
@@@ -2122,23 -2120,23 +2123,23 @@@ immap_locked
                xas_lock_irq(&xas);
        }
  
 -      nr = thp_nr_pages(hpage);
 +      folio = page_folio(hpage);
 +      nr = folio_nr_pages(folio);
        if (is_shmem)
 -              __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
 +              __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
        else
 -              __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
 +              __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
  
        if (nr_none) {
 -              __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
 +              __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
                /* nr_none is always 0 for non-shmem. */
 -              __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
 +              __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
        }
  
        /*
         * Mark hpage as uptodate before inserting it into the page cache so
         * that it isn't mistaken for a fallocated but unwritten page.
         */
 -      folio = page_folio(hpage);
        folio_mark_uptodate(folio);
        folio_ref_add(folio, HPAGE_PMD_NR - 1);
  
  
        /* Join all the small entries into a single multi-index entry. */
        xas_set_order(&xas, start, HPAGE_PMD_ORDER);
 -      xas_store(&xas, hpage);
 +      xas_store(&xas, folio);
        WARN_ON_ONCE(xas_error(&xas));
        xas_unlock_irq(&xas);
  
        retract_page_tables(mapping, start);
        if (cc && !cc->is_khugepaged)
                result = SCAN_PTE_MAPPED_HUGEPAGE;
 -      unlock_page(hpage);
 +      folio_unlock(folio);
  
        /*
         * The collapse has succeeded, so free the old pages.
@@@ -2371,8 -2369,7 +2372,8 @@@ static unsigned int khugepaged_scan_mm_
                        progress++;
                        break;
                }
 -              if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
 +              if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
 +                                           true, PMD_ORDER)) {
  skip:
                        progress++;
                        continue;
@@@ -2496,7 -2493,7 +2497,7 @@@ static void khugepaged_do_scan(struct c
        while (true) {
                cond_resched();
  
 -              if (unlikely(kthread_should_stop() || try_to_freeze()))
 +              if (unlikely(kthread_should_stop()))
                        break;
  
                spin_lock(&khugepaged_mm_lock);
@@@ -2709,8 -2706,7 +2710,8 @@@ int madvise_collapse(struct vm_area_str
  
        *prev = vma;
  
 -      if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
 +      if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
 +                                   PMD_ORDER))
                return -EINVAL;
  
        cc = kmalloc(sizeof(*cc), GFP_KERNEL);
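
Every former hugepage_vma_check() call above now goes through thp_vma_allowable_order() with an explicit PMD_ORDER, and the revalidation path additionally checks thp_vma_suitable_order(). A condensed sketch of that combined check, passing the same boolean arguments khugepaged uses in the hunks above; the wrapper function itself is illustrative only.

#include <linux/huge_mm.h>
#include <linux/mm.h>

/* would @vma at @addr be eligible for collapse into a PMD-sized THP? */
static bool can_collapse_pmd(struct vm_area_struct *vma, unsigned long addr)
{
        /* range must be aligned and large enough for a PMD mapping */
        if (!thp_vma_suitable_order(vma, addr, PMD_ORDER))
                return false;

        /* same boolean arguments as the khugepaged call sites above */
        return thp_vma_allowable_order(vma, vma->vm_flags, false, false, true,
                                       PMD_ORDER);
}
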
diff --combined mm/shmem.c
index 928aa230493261ef5abfe2ad21217d8b94c6bd12,98f6ca7bdae13e6c6a8d98892053d528217f17cd..d7c84ff621860b85090cf61d9b2970357da01b76
@@@ -79,6 -79,7 +79,7 @@@ static struct vfsmount *shm_mnt __ro_af
  #include <linux/rmap.h>
  #include <linux/uuid.h>
  #include <linux/quotaops.h>
+ #include <linux/rcupdate_wait.h>
  
  #include <linux/uaccess.h>
  
@@@ -1080,24 -1081,7 +1081,24 @@@ whole_folios
                                }
                                VM_BUG_ON_FOLIO(folio_test_writeback(folio),
                                                folio);
 -                              truncate_inode_folio(mapping, folio);
 +
 +                              if (!folio_test_large(folio)) {
 +                                      truncate_inode_folio(mapping, folio);
 +                              } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
 +                                      /*
 +                                       * If we split a page, reset the loop so
 +                                       * that we pick up the new sub pages.
 +                                       * Otherwise the THP was entirely
 +                                       * dropped or the target range was
 +                                       * zeroed, so just continue the loop as
 +                                       * is.
 +                                       */
 +                                      if (!folio_test_large(folio)) {
 +                                              folio_unlock(folio);
 +                                              index = start;
 +                                              break;
 +                                      }
 +                              }
                        }
                        folio_unlock(folio);
                }
@@@ -1514,7 -1498,8 +1515,7 @@@ static int shmem_writepage(struct page 
  
                mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(folio_mapped(folio));
 -              swap_writepage(&folio->page, wbc);
 -              return 0;
 +              return swap_writepage(&folio->page, wbc);
        }
  
        mutex_unlock(&shmem_swaplist_mutex);
@@@ -1569,13 -1554,15 +1570,13 @@@ static struct folio *shmem_swapin_clust
  {
        struct mempolicy *mpol;
        pgoff_t ilx;
 -      struct page *page;
 +      struct folio *folio;
  
        mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
 -      page = swap_cluster_readahead(swap, gfp, mpol, ilx);
 +      folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
        mpol_cond_put(mpol);
  
 -      if (!page)
 -              return NULL;
 -      return page_folio(page);
 +      return folio;
  }
  
  /*
@@@ -4459,8 -4446,8 +4460,8 @@@ static void __init shmem_destroy_inodec
  }
  
  /* Keep the page in page cache instead of truncating it */
 -static int shmem_error_remove_page(struct address_space *mapping,
 -                                 struct page *page)
 +static int shmem_error_remove_folio(struct address_space *mapping,
 +                                 struct folio *folio)
  {
        return 0;
  }
@@@ -4475,7 -4462,7 +4476,7 @@@ const struct address_space_operations s
  #ifdef CONFIG_MIGRATION
        .migrate_folio  = migrate_folio,
  #endif
 -      .error_remove_page = shmem_error_remove_page,
 +      .error_remove_folio = shmem_error_remove_folio,
  };
  EXPORT_SYMBOL(shmem_aops);
  
diff --combined mm/swapfile.c
index 3eec686484ef5aaed972d3f8643ffd280890184c,25019af07181e873b6ac06988631da02b8793e82..556ff7347d5f04402b61cc5bd9d0d123a36dc1d5
@@@ -42,6 -42,7 +42,7 @@@
  #include <linux/completion.h>
  #include <linux/suspend.h>
  #include <linux/zswap.h>
+ #include <linux/plist.h>
  
  #include <asm/tlbflush.h>
  #include <linux/swapops.h>
@@@ -227,14 -228,14 +228,14 @@@ offset_to_swap_extent(struct swap_info_
        BUG();
  }
  
 -sector_t swap_page_sector(struct page *page)
 +sector_t swap_folio_sector(struct folio *folio)
  {
 -      struct swap_info_struct *sis = page_swap_info(page);
 +      struct swap_info_struct *sis = swp_swap_info(folio->swap);
        struct swap_extent *se;
        sector_t sector;
        pgoff_t offset;
  
 -      offset = __page_file_index(page);
 +      offset = swp_offset(folio->swap);
        se = offset_to_swap_extent(sis, offset);
        sector = se->start_block + (offset - se->start_page);
        return sector << (PAGE_SHIFT - 9);
@@@ -1495,9 -1496,9 +1496,9 @@@ int swp_swapcount(swp_entry_t entry
  
        do {
                page = list_next_entry(page, lru);
 -              map = kmap_atomic(page);
 +              map = kmap_local_page(page);
                tmp_count = map[offset];
 -              kunmap_atomic(map);
 +              kunmap_local(map);
  
                count += (tmp_count & ~COUNT_CONTINUED) * n;
                n *= (SWAP_CONT_MAX + 1);
@@@ -1741,24 -1742,18 +1742,24 @@@ static inline int pte_same_as_swp(pte_
  static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct folio *folio)
  {
 -      struct page *page = folio_file_page(folio, swp_offset(entry));
 -      struct page *swapcache;
 +      struct page *page;
 +      struct folio *swapcache;
        spinlock_t *ptl;
        pte_t *pte, new_pte, old_pte;
 -      bool hwpoisoned = PageHWPoison(page);
 +      bool hwpoisoned = false;
        int ret = 1;
  
 -      swapcache = page;
 -      page = ksm_might_need_to_copy(page, vma, addr);
 -      if (unlikely(!page))
 +      swapcache = folio;
 +      folio = ksm_might_need_to_copy(folio, vma, addr);
 +      if (unlikely(!folio))
                return -ENOMEM;
 -      else if (unlikely(PTR_ERR(page) == -EHWPOISON))
 +      else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
 +              hwpoisoned = true;
 +              folio = swapcache;
 +      }
 +
 +      page = folio_file_page(folio, swp_offset(entry));
 +      if (PageHWPoison(page))
                hwpoisoned = true;
  
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  
        old_pte = ptep_get(pte);
  
 -      if (unlikely(hwpoisoned || !PageUptodate(page))) {
 +      if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
                swp_entry_t swp_entry;
  
                dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
                if (hwpoisoned) {
 -                      swp_entry = make_hwpoison_entry(swapcache);
 -                      page = swapcache;
 +                      swp_entry = make_hwpoison_entry(page);
                } else {
                        swp_entry = make_poisoned_swp_entry();
                }
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
 -      arch_swap_restore(entry, page_folio(page));
 -
 -      /* See do_swap_page() */
 -      BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
 -      BUG_ON(PageAnon(page) && PageAnonExclusive(page));
 +      arch_swap_restore(entry, folio);
  
        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
 -      get_page(page);
 -      if (page == swapcache) {
 +      folio_get(folio);
 +      if (folio == swapcache) {
                rmap_t rmap_flags = RMAP_NONE;
  
                /*
 -               * See do_swap_page(): PageWriteback() would be problematic.
 -               * However, we do a wait_on_page_writeback() just before this
 -               * call and have the page locked.
 +               * See do_swap_page(): writeback would be problematic.
 +               * However, we do a folio_wait_writeback() just before this
 +               * call and have the folio locked.
                 */
 -              VM_BUG_ON_PAGE(PageWriteback(page), page);
 +              VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
                if (pte_swp_exclusive(old_pte))
                        rmap_flags |= RMAP_EXCLUSIVE;
  
 -              page_add_anon_rmap(page, vma, addr, rmap_flags);
 +              folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
        } else { /* ksm created a completely new copy */
 -              page_add_new_anon_rmap(page, vma, addr);
 -              lru_cache_add_inactive_or_unevictable(page, vma);
 +              folio_add_new_anon_rmap(folio, vma, addr);
 +              folio_add_lru_vma(folio, vma);
        }
        new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
        if (pte_swp_soft_dirty(old_pte))
@@@ -1822,9 -1822,9 +1823,9 @@@ setpte
  out:
        if (pte)
                pte_unmap_unlock(pte, ptl);
 -      if (page != swapcache) {
 -              unlock_page(page);
 -              put_page(page);
 +      if (folio != swapcache) {
 +              folio_unlock(folio);
 +              folio_put(folio);
        }
        return ret;
  }
@@@ -2225,7 -2225,7 +2226,7 @@@ EXPORT_SYMBOL_GPL(add_swap_extent)
  /*
   * A `swap extent' is a simple thing which maps a contiguous range of pages
   * onto a contiguous range of disk blocks.  A rbtree of swap extents is
 - * built at swapon time and is then used at swap_writepage/swap_readpage
 + * built at swapon time and is then used at swap_writepage/swap_read_folio
   * time for locating where on disk a page belongs.
   *
   * If the swapfile is an S_ISBLK block device, a single extent is installed.
@@@ -3369,12 -3369,18 +3370,12 @@@ struct swap_info_struct *swp_swap_info(
        return swap_type_to_swap_info(swp_type(entry));
  }
  
 -struct swap_info_struct *page_swap_info(struct page *page)
 -{
 -      swp_entry_t entry = page_swap_entry(page);
 -      return swp_swap_info(entry);
 -}
 -
  /*
   * out-of-line methods to avoid include hell.
   */
  struct address_space *swapcache_mapping(struct folio *folio)
  {
 -      return page_swap_info(&folio->page)->swap_file->f_mapping;
 +      return swp_swap_info(folio->swap)->swap_file->f_mapping;
  }
  EXPORT_SYMBOL_GPL(swapcache_mapping);
  
@@@ -3472,9 -3478,9 +3473,9 @@@ int add_swap_count_continuation(swp_ent
                if (!(count & COUNT_CONTINUED))
                        goto out_unlock_cont;
  
 -              map = kmap_atomic(list_page) + offset;
 +              map = kmap_local_page(list_page) + offset;
                count = *map;
 -              kunmap_atomic(map);
 +              kunmap_local(map);
  
                /*
                 * If this continuation count now has some space in it,
@@@ -3524,7 -3530,7 +3525,7 @@@ static bool swap_count_continued(struc
        spin_lock(&si->cont_lock);
        offset &= ~PAGE_MASK;
        page = list_next_entry(head, lru);
 -      map = kmap_atomic(page) + offset;
 +      map = kmap_local_page(page) + offset;
  
        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
                goto init_map;          /* jump over SWAP_CONT_MAX checks */
                 * Think of how you add 1 to 999
                 */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
 -                      kunmap_atomic(map);
 +                      kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
 -                      map = kmap_atomic(page) + offset;
 +                      map = kmap_local_page(page) + offset;
                }
                if (*map == SWAP_CONT_MAX) {
 -                      kunmap_atomic(map);
 +                      kunmap_local(map);
                        page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;    /* add count continuation */
                                goto out;
                        }
 -                      map = kmap_atomic(page) + offset;
 +                      map = kmap_local_page(page) + offset;
  init_map:             *map = 0;               /* we didn't zero the page */
                }
                *map += 1;
 -              kunmap_atomic(map);
 +              kunmap_local(map);
                while ((page = list_prev_entry(page, lru)) != head) {
 -                      map = kmap_atomic(page) + offset;
 +                      map = kmap_local_page(page) + offset;
                        *map = COUNT_CONTINUED;
 -                      kunmap_atomic(map);
 +                      kunmap_local(map);
                }
                ret = true;                     /* incremented */
  
                 */
                BUG_ON(count != COUNT_CONTINUED);
                while (*map == COUNT_CONTINUED) {
 -                      kunmap_atomic(map);
 +                      kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
 -                      map = kmap_atomic(page) + offset;
 +                      map = kmap_local_page(page) + offset;
                }
                BUG_ON(*map == 0);
                *map -= 1;
                if (*map == 0)
                        count = 0;
 -              kunmap_atomic(map);
 +              kunmap_local(map);
                while ((page = list_prev_entry(page, lru)) != head) {
 -                      map = kmap_atomic(page) + offset;
 +                      map = kmap_local_page(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
 -                      kunmap_atomic(map);
 +                      kunmap_local(map);
                }
                ret = count == COUNT_CONTINUED;
        }
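
The kmap_atomic()/kunmap_atomic() pairs above become kmap_local_page()/kunmap_local(), which map the page for the current context without kmap_atomic()'s side effect of disabling preemption and page faults. A minimal sketch of the pattern as used in the hunks above; the helper and the counter it bumps are invented for illustration.

#include <linux/highmem.h>
#include <linux/mm.h>

/* bump a one-byte counter stored at @offset inside @page */
static void bump_count(struct page *page, unsigned int offset)
{
        unsigned char *map;

        map = kmap_local_page(page) + offset;
        *map += 1;
        /* as in swap_count_continued(), any address inside the mapping works */
        kunmap_local(map);
}
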
diff --combined security/selinux/hooks.c
index 5e5fd5be6d93aa118d8a6eda3b5a78d3efa0d705,b9ccc98421e949ade9492bfa95ed9ff0b7797902..1e4b1f940caf51620ff97664ce8f8fce53ae0c53
@@@ -85,6 -85,7 +85,7 @@@
  #include <linux/export.h>
  #include <linux/msg.h>
  #include <linux/shm.h>
+ #include <uapi/linux/shm.h>
  #include <linux/bpf.h>
  #include <linux/kernfs.h>
  #include <linux/stringhash.h> /* for hashlen_string() */
@@@ -92,7 -93,6 +93,7 @@@
  #include <linux/fsnotify.h>
  #include <linux/fanotify.h>
  #include <linux/io_uring.h>
 +#include <uapi/linux/lsm.h>
  
  #include "avc.h"
  #include "objsec.h"
@@@ -1661,6 -1661,8 +1662,6 @@@ static int inode_has_perm(const struct 
        struct inode_security_struct *isec;
        u32 sid;
  
 -      validate_creds(cred);
 -
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
  
@@@ -2314,19 -2316,6 +2315,19 @@@ static int selinux_bprm_creds_for_exec(
        new_tsec->keycreate_sid = 0;
        new_tsec->sockcreate_sid = 0;
  
 +      /*
 +       * Before policy is loaded, label any task outside kernel space
 +       * as SECINITSID_INIT, so that any userspace tasks surviving from
 +       * early boot end up with a label different from SECINITSID_KERNEL
 +       * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL).
 +       */
 +      if (!selinux_initialized()) {
 +              new_tsec->sid = SECINITSID_INIT;
 +              /* also clear the exec_sid just in case */
 +              new_tsec->exec_sid = 0;
 +              return 0;
 +      }
 +
        if (old_tsec->exec_sid) {
                new_tsec->sid = old_tsec->exec_sid;
                /* Reset exec SID on execve. */
@@@ -3068,6 -3057,8 +3069,6 @@@ static int selinux_inode_follow_link(st
        struct inode_security_struct *isec;
        u32 sid;
  
 -      validate_creds(cred);
 -
        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;
        sid = cred_sid(cred);
@@@ -3111,6 -3102,8 +3112,6 @@@ static int selinux_inode_permission(str
        if (!mask)
                return 0;
  
 -      validate_creds(cred);
 -
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
  
@@@ -3739,33 -3732,6 +3740,33 @@@ static int selinux_file_ioctl(struct fi
        return error;
  }
  
 +static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
 +                            unsigned long arg)
 +{
 +      /*
 +       * If we are in a 64-bit kernel running 32-bit userspace, we need to
 +       * make sure we don't compare 32-bit flags to 64-bit flags.
 +       */
 +      switch (cmd) {
 +      case FS_IOC32_GETFLAGS:
 +              cmd = FS_IOC_GETFLAGS;
 +              break;
 +      case FS_IOC32_SETFLAGS:
 +              cmd = FS_IOC_SETFLAGS;
 +              break;
 +      case FS_IOC32_GETVERSION:
 +              cmd = FS_IOC_GETVERSION;
 +              break;
 +      case FS_IOC32_SETVERSION:
 +              cmd = FS_IOC_SETVERSION;
 +              break;
 +      default:
 +              break;
 +      }
 +
 +      return selinux_file_ioctl(file, cmd, arg);
 +}
 +
  static int default_noexec __ro_after_init;
  
  static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
@@@ -4588,21 -4554,6 +4589,21 @@@ static int sock_has_perm(struct sock *s
        if (sksec->sid == SECINITSID_KERNEL)
                return 0;
  
 +      /*
 +       * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that
 +       * inherited the kernel context from early boot used to be skipped
 +       * here, so preserve that behavior unless the capability is set.
 +       *
 +       * By setting the capability the policy signals that it is ready
 +       * for this quirk to be fixed. Note that sockets created by a kernel
 +       * thread or a usermode helper executed without a transition will
 +       * still be skipped in this check regardless of the policycap
 +       * setting.
 +       */
 +      if (!selinux_policycap_userspace_initial_context() &&
 +          sksec->sid == SECINITSID_INIT)
 +              return 0;
 +
        ad_net_init_from_sk(&ad, &net, sk);
  
        return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms,
@@@ -4717,13 -4668,6 +4718,13 @@@ static int selinux_socket_bind(struct s
                                return -EINVAL;
                        addr4 = (struct sockaddr_in *)address;
                        if (family_sa == AF_UNSPEC) {
 +                              if (family == PF_INET6) {
 +                                      /* Length check from inet6_bind_sk() */
 +                                      if (addrlen < SIN6_LEN_RFC2133)
 +                                              return -EINVAL;
 +                                      /* Family check from __inet6_bind() */
 +                                      goto err_af;
 +                              }
                                /* see __inet_bind(), we only want to allow
                                 * AF_UNSPEC if the address is INADDR_ANY
                                 */
@@@ -6341,8 -6285,8 +6342,8 @@@ static void selinux_d_instantiate(struc
                inode_doinit_with_dentry(inode, dentry);
  }
  
 -static int selinux_getprocattr(struct task_struct *p,
 -                             const char *name, char **value)
 +static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p,
 +                             char **value)
  {
        const struct task_security_struct *__tsec;
        u32 sid;
                        goto bad;
        }
  
 -      if (!strcmp(name, "current"))
 +      switch (attr) {
 +      case LSM_ATTR_CURRENT:
                sid = __tsec->sid;
 -      else if (!strcmp(name, "prev"))
 +              break;
 +      case LSM_ATTR_PREV:
                sid = __tsec->osid;
 -      else if (!strcmp(name, "exec"))
 +              break;
 +      case LSM_ATTR_EXEC:
                sid = __tsec->exec_sid;
 -      else if (!strcmp(name, "fscreate"))
 +              break;
 +      case LSM_ATTR_FSCREATE:
                sid = __tsec->create_sid;
 -      else if (!strcmp(name, "keycreate"))
 +              break;
 +      case LSM_ATTR_KEYCREATE:
                sid = __tsec->keycreate_sid;
 -      else if (!strcmp(name, "sockcreate"))
 +              break;
 +      case LSM_ATTR_SOCKCREATE:
                sid = __tsec->sockcreate_sid;
 -      else {
 -              error = -EINVAL;
 +              break;
 +      default:
 +              error = -EOPNOTSUPP;
                goto bad;
        }
        rcu_read_unlock();
@@@ -6397,7 -6334,7 +6398,7 @@@ bad
        return error;
  }
  
 -static int selinux_setprocattr(const char *name, void *value, size_t size)
 +static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
  {
        struct task_security_struct *tsec;
        struct cred *new;
        /*
         * Basic control over ability to set these attributes at all.
         */
 -      if (!strcmp(name, "exec"))
 +      switch (attr) {
 +      case LSM_ATTR_EXEC:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETEXEC, NULL);
 -      else if (!strcmp(name, "fscreate"))
 +              break;
 +      case LSM_ATTR_FSCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETFSCREATE, NULL);
 -      else if (!strcmp(name, "keycreate"))
 +              break;
 +      case LSM_ATTR_KEYCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETKEYCREATE, NULL);
 -      else if (!strcmp(name, "sockcreate"))
 +              break;
 +      case LSM_ATTR_SOCKCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETSOCKCREATE, NULL);
 -      else if (!strcmp(name, "current"))
 +              break;
 +      case LSM_ATTR_CURRENT:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETCURRENT, NULL);
 -      else
 -              error = -EINVAL;
 +              break;
 +      default:
 +              error = -EOPNOTSUPP;
 +              break;
 +      }
        if (error)
                return error;
  
                }
                error = security_context_to_sid(value, size,
                                                &sid, GFP_KERNEL);
 -              if (error == -EINVAL && !strcmp(name, "fscreate")) {
 +              if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) {
                        if (!has_cap_mac_admin(true)) {
                                struct audit_buffer *ab;
                                size_t audit_size;
  
 -                              /* We strip a nul only if it is at the end, otherwise the
 -                               * context contains a nul and we should audit that */
 +                              /* We strip a nul only if it is at the end,
 +                               * otherwise the context contains a nul and
 +                               * we should audit that */
                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                if (!ab)
                                        return error;
                                audit_log_format(ab, "op=fscreate invalid_context=");
 -                              audit_log_n_untrustedstring(ab, value, audit_size);
 +                              audit_log_n_untrustedstring(ab, value,
 +                                                          audit_size);
                                audit_log_end(ab);
  
                                return error;
           checks and may_create for the file creation checks. The
           operation will then fail if the context is not permitted. */
        tsec = selinux_cred(new);
 -      if (!strcmp(name, "exec")) {
 +      if (attr == LSM_ATTR_EXEC) {
                tsec->exec_sid = sid;
 -      } else if (!strcmp(name, "fscreate")) {
 +      } else if (attr == LSM_ATTR_FSCREATE) {
                tsec->create_sid = sid;
 -      } else if (!strcmp(name, "keycreate")) {
 +      } else if (attr == LSM_ATTR_KEYCREATE) {
                if (sid) {
                        error = avc_has_perm(mysid, sid,
                                             SECCLASS_KEY, KEY__CREATE, NULL);
                                goto abort_change;
                }
                tsec->keycreate_sid = sid;
 -      } else if (!strcmp(name, "sockcreate")) {
 +      } else if (attr == LSM_ATTR_SOCKCREATE) {
                tsec->sockcreate_sid = sid;
 -      } else if (!strcmp(name, "current")) {
 +      } else if (attr == LSM_ATTR_CURRENT) {
                error = -EINVAL;
                if (sid == 0)
                        goto abort_change;
  
 -              /* Only allow single threaded processes to change context */
                if (!current_is_single_threaded()) {
                        error = security_bounded_transition(tsec->sid, sid);
                        if (error)
@@@ -6541,69 -6469,6 +6542,69 @@@ abort_change
        return error;
  }
  
 +/**
 + * selinux_getselfattr - Get SELinux current task attributes
 + * @attr: the requested attribute
 + * @ctx: buffer to receive the result
 + * @size: buffer size (input), buffer size used (output)
 + * @flags: unused
 + *
 + * Fill the passed user space @ctx with the details of the requested
 + * attribute.
 + *
 + * Returns the number of attributes on success, an error code otherwise.
 + * There will only ever be one attribute.
 + */
 +static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
 +                             size_t *size, u32 flags)
 +{
 +      int rc;
 +      char *val;
 +      int val_len;
 +
 +      val_len = selinux_lsm_getattr(attr, current, &val);
 +      if (val_len < 0)
 +              return val_len;
 +      rc = lsm_fill_user_ctx(ctx, size, val, val_len, LSM_ID_SELINUX, 0);
 +      kfree(val);
 +      return (!rc ? 1 : rc);
 +}
 +
 +static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
 +                             size_t size, u32 flags)
 +{
 +      int rc;
 +
 +      rc = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len);
 +      if (rc > 0)
 +              return 0;
 +      return rc;
 +}
 +
 +static int selinux_getprocattr(struct task_struct *p,
 +                             const char *name, char **value)
 +{
 +      unsigned int attr = lsm_name_to_attr(name);
 +      int rc;
 +
 +      if (attr) {
 +              rc = selinux_lsm_getattr(attr, p, value);
 +              if (rc != -EOPNOTSUPP)
 +                      return rc;
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +static int selinux_setprocattr(const char *name, void *value, size_t size)
 +{
 +      int attr = lsm_name_to_attr(name);
 +
 +      if (attr)
 +              return selinux_lsm_setattr(attr, value, size);
 +      return -EINVAL;
 +}
 +
  static int selinux_ismaclabel(const char *name)
  {
        return (strcmp(name, XATTR_SELINUX_SUFFIX) == 0);
@@@ -7086,11 -6951,6 +7087,11 @@@ static int selinux_uring_cmd(struct io_
  }
  #endif /* CONFIG_IO_URING */
  
 +static const struct lsm_id selinux_lsmid = {
 +      .name = "selinux",
 +      .id = LSM_ID_SELINUX,
 +};
 +
  /*
   * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
   * 1. any hooks that don't belong to (2.) or (3.) below,
@@@ -7177,7 -7037,6 +7178,7 @@@ static struct security_hook_list selinu
        LSM_HOOK_INIT(file_permission, selinux_file_permission),
        LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
        LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
 +      LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
        LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
        LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
        LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
  
        LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),
  
 +      LSM_HOOK_INIT(getselfattr, selinux_getselfattr),
 +      LSM_HOOK_INIT(setselfattr, selinux_setselfattr),
        LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
        LSM_HOOK_INIT(setprocattr, selinux_setprocattr),
  
@@@ -7414,8 -7271,7 +7415,8 @@@ static __init int selinux_init(void
  
        hashtab_cache_init();
  
 -      security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks), "selinux");
 +      security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks),
 +                         &selinux_lsmid);
  
        if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC netcache callback\n");
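
The getselfattr/setselfattr hooks and the struct lsm_id registration above follow a shape any LSM can reuse. A hedged, minimal sketch of that wiring for a hypothetical module: everything named example_* (and the placeholder ID macro) is invented here, while lsm_fill_user_ctx(), LSM_ATTR_CURRENT, LSM_HOOK_INIT() and security_add_hooks() are the interfaces used in the hunks above.

#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <uapi/linux/lsm.h>

/* placeholder: a real module would register its own LSM_ID_* value */
#define EXAMPLE_LSM_ID  LSM_ID_SELINUX

static int example_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
                               size_t *size, u32 flags)
{
        static const char label[] = "example_t";
        int rc;

        if (attr != LSM_ATTR_CURRENT)
                return -EOPNOTSUPP;

        rc = lsm_fill_user_ctx(ctx, size, (void *)label, sizeof(label),
                               EXAMPLE_LSM_ID, 0);
        return rc ? rc : 1;             /* one attribute reported on success */
}

static const struct lsm_id example_lsmid = {
        .name = "example",
        .id = EXAMPLE_LSM_ID,
};

static struct security_hook_list example_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(getselfattr, example_getselfattr),
};

/* called from this hypothetical LSM's DEFINE_LSM() init routine */
static __init int example_init(void)
{
        security_add_hooks(example_hooks, ARRAY_SIZE(example_hooks),
                           &example_lsmid);
        return 0;
}
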
diff --combined security/smack/smack_lsm.c
index c126f6a16de4fbd0d3c05ba93529c50e23093263,7a5600834f1648f30d408323eb4d5c6cc6a576b6..eb465bf74a3c96af081b977c010b0945acf1ba9b
  #include <linux/personality.h>
  #include <linux/msg.h>
  #include <linux/shm.h>
+ #include <uapi/linux/shm.h>
  #include <linux/binfmts.h>
  #include <linux/parser.h>
  #include <linux/fs_context.h>
  #include <linux/fs_parser.h>
  #include <linux/watch_queue.h>
  #include <linux/io_uring.h>
 +#include <uapi/linux/lsm.h>
  #include "smack.h"
  
  #define TRANS_TRUE    "TRUE"
@@@ -3626,35 -3626,6 +3627,35 @@@ static void smack_d_instantiate(struct 
        return;
  }
  
 +/**
 + * smack_getselfattr - Smack current process attribute
 + * @attr: which attribute to fetch
 + * @ctx: buffer to receive the result
 + * @size: available size in, actual size out
 + * @flags: unused
 + *
 + * Fill the passed user space @ctx with the details of the requested
 + * attribute.
 + *
 + * Returns the number of attributes on success, an error code otherwise.
 + * There will only ever be one attribute.
 + */
 +static int smack_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
 +                           size_t *size, u32 flags)
 +{
 +      int rc;
 +      struct smack_known *skp;
 +
 +      if (attr != LSM_ATTR_CURRENT)
 +              return -EOPNOTSUPP;
 +
 +      skp = smk_of_current();
 +      rc = lsm_fill_user_ctx(ctx, size,
 +                             skp->smk_known, strlen(skp->smk_known) + 1,
 +                             LSM_ID_SMACK, 0);
 +      return (!rc ? 1 : rc);
 +}
 +
  /**
   * smack_getprocattr - Smack process attribute access
   * @p: the object task
@@@ -3684,8 -3655,8 +3685,8 @@@ static int smack_getprocattr(struct tas
  }
  
  /**
 - * smack_setprocattr - Smack process attribute setting
 - * @name: the name of the attribute in /proc/.../attr
 + * do_setattr - Smack process attribute setting
 + * @attr: the ID of the attribute
   * @value: the value to set
   * @size: the size of the value
   *
   *
   * Returns the length of the smack label or an error code
   */
 -static int smack_setprocattr(const char *name, void *value, size_t size)
 +static int do_setattr(u64 attr, void *value, size_t size)
  {
        struct task_smack *tsp = smack_cred(current_cred());
        struct cred *new;
        if (value == NULL || size == 0 || size >= SMK_LONGLABEL)
                return -EINVAL;
  
 -      if (strcmp(name, "current") != 0)
 -              return -EINVAL;
 +      if (attr != LSM_ATTR_CURRENT)
 +              return -EOPNOTSUPP;
  
        skp = smk_import_entry(value, size);
        if (IS_ERR(skp))
        return size;
  }
  
 +/**
 + * smack_setselfattr - Set a Smack process attribute
 + * @attr: which attribute to set
 + * @ctx: buffer containing the data
 + * @size: size of @ctx
 + * @flags: unused
 + *
 + * Set the Smack process attribute for the current task from the value
 + * supplied in @ctx.
 + *
 + * Returns 0 on success, an error code otherwise.
 + */
 +static int smack_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
 +                           size_t size, u32 flags)
 +{
 +      int rc;
 +
 +      rc = do_setattr(attr, ctx->ctx, ctx->ctx_len);
 +      if (rc > 0)
 +              return 0;
 +      return rc;
 +}
 +
 +/**
 + * smack_setprocattr - Smack process attribute setting
 + * @name: the name of the attribute in /proc/.../attr
 + * @value: the value to set
 + * @size: the size of the value
 + *
 + * Sets the Smack value of the task. Only setting self
 + * is permitted and only with privilege
 + *
 + * Returns the length of the smack label or an error code
 + */
 +static int smack_setprocattr(const char *name, void *value, size_t size)
 +{
 +      int attr = lsm_name_to_attr(name);
 +
 +      if (attr != LSM_ATTR_UNDEF)
 +              return do_setattr(attr, value, size);
 +      return -EINVAL;
 +}
 +
  /**
   * smack_unix_stream_connect - Smack access on UDS
   * @sock: one sock
@@@ -5006,11 -4934,6 +5007,11 @@@ struct lsm_blob_sizes smack_blob_sizes 
        .lbs_xattr_count = SMACK_INODE_INIT_XATTRS,
  };
  
 +static const struct lsm_id smack_lsmid = {
 +      .name = "smack",
 +      .id = LSM_ID_SMACK,
 +};
 +
  static struct security_hook_list smack_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(ptrace_access_check, smack_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, smack_ptrace_traceme),
  
        LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
        LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
 +      LSM_HOOK_INIT(file_ioctl_compat, smack_file_ioctl),
        LSM_HOOK_INIT(file_lock, smack_file_lock),
        LSM_HOOK_INIT(file_fcntl, smack_file_fcntl),
        LSM_HOOK_INIT(mmap_file, smack_mmap_file),
  
        LSM_HOOK_INIT(d_instantiate, smack_d_instantiate),
  
 +      LSM_HOOK_INIT(getselfattr, smack_getselfattr),
 +      LSM_HOOK_INIT(setselfattr, smack_setselfattr),
        LSM_HOOK_INIT(getprocattr, smack_getprocattr),
        LSM_HOOK_INIT(setprocattr, smack_setprocattr),
  
@@@ -5221,7 -5141,7 +5222,7 @@@ static __init int smack_init(void
        /*
         * Register with LSM
         */
 -      security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), "smack");
 +      security_add_hooks(smack_hooks, ARRAY_SIZE(smack_hooks), &smack_lsmid);
        smack_enabled = 1;
  
        pr_info("Smack:  Initializing.\n");