arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards
Format: <io>,<irq>,<nodeID>
+ arm64.no32bit_el0 [ARM64] Unconditionally disable the execution of
+ 32-bit applications.
+
arm64.nobti [ARM64] Unconditionally disable Branch Target
Identification support
Format: 0 | 1
Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
+ init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if
+ it was mlock'ed and not explicitly munlock'ed
+ afterwards.
+ Format: 0 | 1
+ Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON.
+
init_pkru= [X86] Specify the default memory protection keys rights
register contents for all processes. 0x55555554 by
default (disallow access to all but pkey 0). Can
no_x2apic_optout
BIOS x2APIC opt-out request will be ignored
nopost disable Interrupt Posting
+ posted_msi
+ enable MSIs delivered as posted interrupts
iomem= Disable strict checking of access to MMIO memory
strict regions from userspace.
arch-independent options, each of which is an
aggregation of existing arch-specific options.
+ Note, "mitigations" is supported if and only if the
+ kernel was built with CPU_MITIGATIONS=y.
+
off
Disable all optional CPU mitigations. This
improves system performance, but it may also
Format: [state][,regs][,debounce][,die]
nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
- Format: [panic,][nopanic,][num]
+ Format: [panic,][nopanic,][rNNN,][num]
Valid num: 0 or 1
0 - turn hardlockup detector in nmi_watchdog off
1 - turn hardlockup detector in nmi_watchdog on
+ rNNN - configure the watchdog with raw perf event 0xNNN
+
 When 'panic' is specified, panic when an NMI watchdog
 timeout occurs; use 'nopanic' to override a kernel built
 with CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y and not panic.
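 As an illustration (the event number 0x1c3 is arbitrary,
 chosen only for this example), booting with
 "nmi_watchdog=panic,r1c3" drives the hardlockup detector
 from raw perf event 0x1c3 and panics the system on a
 watchdog timeout.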
page_alloc.shuffle=
[KNL] Boolean flag to control whether the page allocator
- should randomize its free lists. The randomization may
- be automatically enabled if the kernel detects it is
- running on a platform with a direct-mapped memory-side
- cache, and this parameter can be used to
- override/disable that behavior. The state of the flag
- can be read from sysfs at:
+ should randomize its free lists. This parameter can be
+ used to enable/disable page randomization. The state of
+ the flag can be read from sysfs at:
/sys/module/page_alloc/parameters/shuffle.
+ This parameter is only available if CONFIG_SHUFFLE_PAGE_ALLOCATOR=y.
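+ For example, on a kernel built with
+ CONFIG_SHUFFLE_PAGE_ALLOCATOR=y the current state reads
+ back as a boolean (Y or N):
+   # cat /sys/module/page_alloc/parameters/shuffle
+   Y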
page_owner= [KNL,EARLY] Boot-time page_owner enabling option.
Storage of the information about who allocated
norid [S390] ignore the RID field and force use of
one PCI domain per PCI function
- pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
+ pcie_aspm= [PCIE] Forcibly enable or ignore PCIe Active State Power
Management.
- off Disable ASPM.
+ off Don't touch ASPM configuration at all. Leave any
+ configuration done by firmware unchanged.
force Enable ASPM even on devices that claim not to support it.
WARNING: Forcing ASPM on may cause system lockups.
prot_virt= [S390] enable hosting protected virtual machines
isolated from the hypervisor (if hardware supports
- that).
+ that). If enabled, the default kernel base address
+ might be overridden even when Kernel Address Space
+ Layout Randomization is disabled.
Format: <bool>
psi= [KNL] Enable or disable pressure stall information
delay, memory pressure or callback list growing too
big.
+ rcutree.rcu_normal_wake_from_gp= [KNL]
+ Reduces the latency of synchronize_rcu() calls. This
+ approach maintains its own tracking of synchronize_rcu()
+ callers, so it does not interact with regular callbacks
+ because it does not use a call_rcu[_hurry]() path. Note
+ that this applies only to normal grace periods.
+
+ How to enable it:
+
+ echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp
+ or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1"
+
+ Default is 0.
+
rcuscale.gp_async= [KNL]
Measure performance of asynchronous
grace-period primitives such as call_rcu().
but is useful for debugging and performance tuning.
sched_thermal_decay_shift=
+ [Deprecated]
[KNL, SMP] Set a decay shift for scheduler thermal
pressure signal. Thermal pressure signal follows the
default decay period of other scheduler pelt
- "tpm"
- "tee"
- "caam"
+ - "dcp"
If not specified then it defaults to iterating through
the trust source list starting with TPM and assigns the
first trust source as a backend which is initialized
If not specified, "default" is used. In this case,
the RNG's choice is left to each individual trust source.
+ trusted.dcp_use_otp_key
+ This is intended to be used in combination with
+ trusted.source=dcp and will select the DCP OTP key
+ instead of the DCP UNIQUE key for blob encryption.
+
+ trusted.dcp_skip_zk_test
+ This is intended to be used in combination with
+ trusted.source=dcp and will disable the check whether the
+ blob key is all zeros. This is helpful for situations where
+ having this key zeroed is acceptable, e.g. in testing
+ scenarios.
+
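+ For example, a command line selecting the DCP trust source
+ and sealing with the OTP key would combine the two options
+ (shown for illustration only):
+   trusted.source=dcp trusted.dcp_use_otp_key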
tsc= Disable clocksource stability checks for TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
This can be changed after boot by writing to the
matching /sys/module/workqueue/parameters file. All
workqueues with the "default" affinity scope will be
- updated accordignly.
+ updated accordingly.
workqueue.debug_force_rr_cpu
Workqueue used to implicitly guarantee that work
memory, and other data can't be written using
xmon commands.
off xmon is disabled.
-
if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
elf_ex->e_type == ET_DYN && !interpreter) {
mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
+ } else {
+ /* Otherwise leave a gap between .bss and brk. */
+ mm->brk = mm->start_brk = mm->brk + PAGE_SIZE;
}
mm->brk = mm->start_brk = arch_randomize_brk(mm);
fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
}
-#define MAX_FILE_NOTE_SIZE (4*1024*1024)
/*
* Format of NT_FILE note:
*
names_ofs = (2 + 3 * count) * sizeof(data[0]);
alloc:
- if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+ /* paranoia check */
+ if (size >= core_file_note_size_limit) {
+ pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n",
+ size);
return -EINVAL;
+ }
size = round_up(size, PAGE_SIZE);
/*
* "size" can be 0 here legitimately.
threads = t->next;
WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus);
for (i = 1; i < info->thread_notes; ++i)
- kfree(t->notes[i].data);
+ kvfree(t->notes[i].data);
kfree(t);
}
kfree(info->psinfo.data);
dir->i_ino, (folio->index << PAGE_SHIFT) + offs,
(unsigned long)le64_to_cpu(p->inode));
fail:
- folio_set_error(folio);
return false;
}
#define S_SHIFT 12
static unsigned char
-nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
[S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
}
+/**
+ * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3
+ * @srcp1: the first input
+ * @srcp2: the second input
+ * @srcp3: the third input
+ *
+ * Return: >= nr_cpu_ids if no cpu is set in all three masks.
+ */
+static inline
+unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
+ const struct cpumask *srcp2,
+ const struct cpumask *srcp3)
+{
+ return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+ cpumask_bits(srcp3), small_cpumask_bits);
+}
+
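A minimal usage sketch for the new helper (a hypothetical call site; pick_cpu_for() and its fallback policy are invented for illustration):

	#include <linux/cpumask.h>
	#include <linux/sched.h>
	#include <linux/sched/isolation.h>

	/* First CPU that is online, allowed for @p, and a housekeeping CPU. */
	static unsigned int pick_cpu_for(struct task_struct *p)
	{
		unsigned int cpu;

		cpu = cpumask_first_and_and(cpu_online_mask, p->cpus_ptr,
					    housekeeping_cpumask(HK_TYPE_DOMAIN));
		if (cpu >= nr_cpu_ids)		/* intersection is empty */
			cpu = cpumask_first(cpu_online_mask);
		return cpu;
	}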
/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
return i;
}
+/**
+ * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, but not this one.
+ * @mask1: the first input cpumask
+ * @mask2: the second input cpumask
+ * @cpu: the cpu to ignore
+ *
+ * Return: >= nr_cpu_ids if no cpus set.
+ */
+static inline
+unsigned int cpumask_any_and_but(const struct cpumask *mask1,
+ const struct cpumask *mask2,
+ unsigned int cpu)
+{
+ unsigned int i;
+
+ cpumask_check(cpu);
+ i = cpumask_first_and(mask1, mask2);
+ if (i != cpu)
+ return i;
+
+ return cpumask_next_and(cpu, mask1, mask2);
+}
+
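A usage sketch (hypothetical helper; the names are invented): pick a CPU present in both masks while excluding a specific one, e.g. when migrating work off a CPU that is going offline:

	#include <linux/cpumask.h>

	/*
	 * Return a CPU that is both in @mask and online, but not @dying_cpu,
	 * or >= nr_cpu_ids if no such CPU exists.
	 */
	static unsigned int pick_migration_target(const struct cpumask *mask,
						  unsigned int dying_cpu)
	{
		return cpumask_any_and_but(mask, cpu_online_mask, dying_cpu);
	}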
/**
* cpumask_nth - get the Nth cpu in a cpumask
* @srcp: the cpumask pointer
*/
static inline unsigned int cpumask_size(void)
{
- return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long);
+ return bitmap_size(large_cpumask_bits);
}
/*
void init_cpu_possible(const struct cpumask *src);
void init_cpu_online(const struct cpumask *src);
- static inline void reset_cpu_possible_mask(void)
- {
- bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS);
- }
-
static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
/* If set, we are using file mode kexec syscall */
unsigned int file_mode:1;
#ifdef CONFIG_CRASH_HOTPLUG
- /* If set, allow changes to elfcorehdr of kexec_load'd image */
- unsigned int update_elfcorehdr:1;
+ /* If set, it is safe to update kexec segments that are
+ * excluded from SHA calculation.
+ */
+ unsigned int hotplug_support:1;
#endif
#ifdef ARCH_HAS_KIMAGE_ARCH
/* List of defined/legal kexec flags */
#ifndef CONFIG_KEXEC_JUMP
-#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR)
+#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT)
#else
-#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR)
+#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \
+ KEXEC_CRASH_HOTPLUG_SUPPORT)
#endif
/* List of defined/legal kexec file flags */
extern bool kexec_file_dbg_print;
- #define kexec_dprintk(fmt, ...) \
- printk("%s" fmt, \
- kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG, \
- ##__VA_ARGS__)
+ #define kexec_dprintk(fmt, arg...) \
+ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0)
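Note that the rewritten macro changes behavior as well as form: messages are now emitted only when kexec_file_dbg_print is set, always at info level, whereas the old macro printed unconditionally and merely switched between KERN_INFO and KERN_DEBUG.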
#else /* !CONFIG_KEXEC_CORE */
struct pt_regs;
continue;
}
xbc_array_for_each_value(vnode, val) {
+ /*
+ * For prettier and more readable /proc/cmdline, only
+ * quote the value when necessary, i.e. when it contains
+ * whitespace.
+ */
q = strpbrk(val, " \t\r\n") ? "\"" : "";
ret = snprintf(buf, rest(buf, end), "%s=%s%s%s ",
xbc_namebuf, q, val, q);
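For instance (illustrative key and values), a bootconfig entry init.env = "FOO BAR" is appended to /proc/cmdline as init.env="FOO BAR", while init.flag = on contains no whitespace and stays unquoted as init.flag=on.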
if (extra_command_line)
xlen = strlen(extra_command_line);
- if (extra_init_args)
+ if (extra_init_args) {
+ extra_init_args = strim(extra_init_args); /* strip leading/trailing whitespace */
ilen = strlen(extra_init_args) + 4; /* for " -- " */
+ }
- len = xlen + strlen(boot_command_line) + 1;
+ len = xlen + strlen(boot_command_line) + ilen + 1;
- saved_command_line = memblock_alloc(len + ilen, SMP_CACHE_BYTES);
+ saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES);
if (!saved_command_line)
- panic("%s: Failed to allocate %zu bytes\n", __func__, len + ilen);
+ panic("%s: Failed to allocate %zu bytes\n", __func__, len);
+
+ len = xlen + strlen(command_line) + 1;
static_command_line = memblock_alloc(len, SMP_CACHE_BYTES);
if (!static_command_line)
memblock_free(unknown_options, len);
}
+ static void __init early_numa_node_init(void)
+ {
+ #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+ #ifndef cpu_to_node
+ int cpu;
+
+ /* early_cpu_to_node() should be ready by this point. */
+ for_each_possible_cpu(cpu)
+ set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
+ #endif
+ #endif
+ }
+
asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector
void start_kernel(void)
{
setup_nr_cpu_ids();
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
+ early_numa_node_init();
boot_cpu_hotplug_init();
pr_notice("Kernel command line: %s\n", saved_command_line);
* insecure pages which are W+X.
*/
flush_module_init_free_work();
+ jump_label_init_ro();
mark_rodata_ro();
debug_checkwx();
rodata_test();
.extra2 = SYSCTL_INT_MAX,
},
#endif
- {}
};
static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
}
static void ipc_set_ownership(struct ctl_table_header *head,
- struct ctl_table *table,
kuid_t *uid, kgid_t *gid)
{
struct ipc_namespace *ns =
*gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
}
-static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table)
{
int mode = table->mode;
kuid_t ns_root_uid;
kgid_t ns_root_gid;
- ipc_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+ ipc_set_ownership(head, &ns_root_uid, &ns_root_gid);
if (uid_eq(current_euid(), ns_root_uid))
mode >>= 6;
void retire_ipc_sysctls(struct ipc_namespace *ns)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = ns->ipc_sysctls->ctl_table_arg;
unregister_sysctl_table(ns->ipc_sysctls);
.extra1 = &msg_maxsize_limit_min,
.extra2 = &msg_maxsize_limit_max,
},
- {}
};
static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
}
static void mq_set_ownership(struct ctl_table_header *head,
- struct ctl_table *table,
kuid_t *uid, kgid_t *gid)
{
struct ipc_namespace *ns =
*gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
}
-static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table)
+static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table)
{
int mode = table->mode;
kuid_t ns_root_uid;
kgid_t ns_root_gid;
- mq_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+ mq_set_ownership(head, &ns_root_uid, &ns_root_gid);
if (uid_eq(current_euid(), ns_root_uid))
mode >>= 6;
void retire_mq_sysctls(struct ipc_namespace *ns)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = ns->mq_sysctls->ctl_table_arg;
unregister_sysctl_table(ns->mq_sysctls);
*/
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/buildid.h>
#include <linux/init.h>
#include <linux/utsname.h>
/*
 * This routine is utilized when the crash_hotplug sysfs node is read.
- * It reflects the kernel's ability/permission to update the crash
- * elfcorehdr directly.
+ * It reflects the kernel's ability/permission to update the kdump
+ * image directly.
*/
-int crash_check_update_elfcorehdr(void)
+int crash_check_hotplug_support(void)
{
int rc = 0;
return 0;
}
if (kexec_crash_image) {
- if (kexec_crash_image->file_mode)
- rc = 1;
- else
- rc = kexec_crash_image->update_elfcorehdr;
+ rc = kexec_crash_image->hotplug_support;
}
/* Release lock now that update complete */
kexec_unlock();
* list of segments it checks (since the elfcorehdr changes and thus
* would require an update to purgatory itself to update the digest).
*/
-static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg)
{
struct kimage *image;
image = kexec_crash_image;
- /* Check that updating elfcorehdr is permitted */
- if (!(image->file_mode || image->update_elfcorehdr))
+ /* Check that kexec segments update is permitted */
+ if (!image->hotplug_support)
goto out;
if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
image->hp_action = hp_action;
/* Now invoke arch-specific update handler */
- arch_crash_handle_hotplug_event(image);
+ arch_crash_handle_hotplug_event(image, arg);
/* No longer handling a hotplug event */
image->hp_action = KEXEC_CRASH_HP_NONE;
crash_hotplug_unlock();
}
-static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg)
{
switch (val) {
case MEM_ONLINE:
crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
- KEXEC_CRASH_HP_INVALID_CPU);
+ KEXEC_CRASH_HP_INVALID_CPU, arg);
break;
case MEM_OFFLINE:
crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
- KEXEC_CRASH_HP_INVALID_CPU);
+ KEXEC_CRASH_HP_INVALID_CPU, arg);
break;
}
return NOTIFY_OK;
static int crash_cpuhp_online(unsigned int cpu)
{
- crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL);
return 0;
}
static int crash_cpuhp_offline(unsigned int cpu)
{
- crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+ crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL);
return 0;
}
#define pr_fmt(fmt) "watchdog: " fmt
-#include <linux/mm.h>
#include <linux/cpu.h>
-#include <linux/nmi.h>
#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/kernel_stat.h>
+#include <linux/kvm_para.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
-#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
-#include <linux/kvm_para.h>
static DEFINE_MUTEX(watchdog_mutex);
# define WATCHDOG_HARDLOCKUP_DEFAULT 0
#endif
+#define NUM_SAMPLE_PERIODS 5
+
unsigned long __read_mostly watchdog_enabled;
int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __init hardlockup_panic_setup(char *str)
{
+ next:
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
else if (!strncmp(str, "nopanic", 7))
watchdog_hardlockup_user_enabled = 0;
else if (!strncmp(str, "1", 1))
watchdog_hardlockup_user_enabled = 1;
+ else if (!strncmp(str, "r", 1))
+ hardlockup_config_perf_event(str + 1);
+ while (*(str++)) {
+ if (*str == ',') {
+ str++;
+ goto next;
+ }
+ }
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
static void __lockup_detector_cleanup(void);
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
+enum stats_per_group {
+ STATS_SYSTEM,
+ STATS_SOFTIRQ,
+ STATS_HARDIRQ,
+ STATS_IDLE,
+ NUM_STATS_PER_GROUP,
+};
+
+static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = {
+ CPUTIME_SYSTEM,
+ CPUTIME_SOFTIRQ,
+ CPUTIME_IRQ,
+ CPUTIME_IDLE,
+};
+
+static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]);
+static DEFINE_PER_CPU(u8, cpustat_tail);
+
+/*
+ * We don't need nanosecond resolution. A granularity of 16ms is
+ * sufficient for our precision, allowing us to use u16 to store
+ * cpustats, which will roll over roughly every 1000 seconds.
+ * 2^24 ~= 16 * 10^6
+ */
+static u16 get_16bit_precision(u64 data_ns)
+{
+ return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */
+}
+
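A note on update_cpustat() below: the u16 delta arithmetic stays correct across the ~1000 s counter rollover, because unsigned subtraction wraps modulo 2^16; each delta is then stored as a percentage of sample_period in the cpustat_util ring buffer.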
+static void update_cpustat(void)
+{
+ int i;
+ u8 util;
+ u16 old_stat, new_stat;
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u8 tail = __this_cpu_read(cpustat_tail);
+ u16 sample_period_16 = get_16bit_precision(sample_period);
+
+ kcpustat_cpu_fetch(&kcpustat, smp_processor_id());
+
+ for (i = 0; i < NUM_STATS_PER_GROUP; i++) {
+ old_stat = __this_cpu_read(cpustat_old[i]);
+ new_stat = get_16bit_precision(cpustat[tracked_stats[i]]);
+ util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16);
+ __this_cpu_write(cpustat_util[tail][i], util);
+ __this_cpu_write(cpustat_old[i], new_stat);
+ }
+
+ __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS);
+}
+
+static void print_cpustat(void)
+{
+ int i, group;
+ u8 tail = __this_cpu_read(cpustat_tail);
+ u64 sample_period_second = sample_period;
+
+ do_div(sample_period_second, NSEC_PER_SEC);
+
+ /*
+ * Outputting the "watchdog" prefix on every line would be redundant
+ * and not concise, and the original alarm information is sufficient
+ * for locating the report in the logs, hence printk() is used here
+ * instead of pr_crit().
+ */
+ printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n",
+ smp_processor_id(), sample_period_second);
+
+ for (i = 0; i < NUM_SAMPLE_PERIODS; i++) {
+ group = (tail + i) % NUM_SAMPLE_PERIODS;
+ printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t"
+ "%3u%% hardirq,\t%3u%% idle\n", i + 1,
+ __this_cpu_read(cpustat_util[group][STATS_SYSTEM]),
+ __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]),
+ __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]),
+ __this_cpu_read(cpustat_util[group][STATS_IDLE]));
+ }
+}
+
+#define HARDIRQ_PERCENT_THRESH 50
+#define NUM_HARDIRQ_REPORT 5
+struct irq_counts {
+ int irq;
+ u32 counts;
+};
+
+static DEFINE_PER_CPU(bool, snapshot_taken);
+
+/*
+ * Tabulate the most frequent interrupts. The array stays sorted in
+ * descending order, so comparing the caller's @counts at every rank
+ * is enough: after the first swap, each displaced (smaller) entry
+ * keeps being swapped down one slot.
+ */
+static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank)
+{
+ int i;
+ struct irq_counts new_count = {irq, counts};
+
+ for (i = 0; i < rank; i++) {
+ if (counts > irq_counts[i].counts)
+ swap(new_count, irq_counts[i]);
+ }
+}
+
+/*
+ * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period,
+ * then the cause of softlockup might be interrupt storm. In this case, it
+ * would be useful to start interrupt counting.
+ */
+static bool need_counting_irqs(void)
+{
+ u8 util;
+ int tail = __this_cpu_read(cpustat_tail);
+
+	/* The most recently completed sample sits one slot behind cpustat_tail. */
+	tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS;
+ util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]);
+ return util > HARDIRQ_PERCENT_THRESH;
+}
+
+static void start_counting_irqs(void)
+{
+ if (!__this_cpu_read(snapshot_taken)) {
+ kstat_snapshot_irqs();
+ __this_cpu_write(snapshot_taken, true);
+ }
+}
+
+static void stop_counting_irqs(void)
+{
+ __this_cpu_write(snapshot_taken, false);
+}
+
+static void print_irq_counts(void)
+{
+ unsigned int i, count;
+ struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = {
+ {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}
+ };
+
+ if (__this_cpu_read(snapshot_taken)) {
+ for_each_active_irq(i) {
+ count = kstat_get_irq_since_snapshot(i);
+ tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT);
+ }
+
+ /*
+ * Outputting the "watchdog" prefix on every line would be redundant
+ * and not concise, and the original alarm information is sufficient
+ * for locating the report in the logs, hence printk() is used here
+ * instead of pr_crit().
+ */
+ printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n",
+ smp_processor_id(), HARDIRQ_PERCENT_THRESH);
+
+ for (i = 0; i < NUM_HARDIRQ_REPORT; i++) {
+ if (irq_counts_sorted[i].irq == -1)
+ break;
+
+ printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n",
+ i + 1, irq_counts_sorted[i].counts,
+ irq_counts_sorted[i].irq);
+ }
+
+ /*
+ * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last
+ * sample_period, then we suspect the interrupt storm might be subsiding.
+ */
+ if (!need_counting_irqs())
+ stop_counting_irqs();
+ }
+}
+
+static void report_cpu_status(void)
+{
+ print_cpustat();
+ print_irq_counts();
+}
+#else
+static inline void update_cpustat(void) { }
+static inline void report_cpu_status(void) { }
+static inline bool need_counting_irqs(void) { return false; }
+static inline void start_counting_irqs(void) { }
+static inline void stop_counting_irqs(void) { }
+#endif
+
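Put together, a softlockup report from these helpers looks roughly like this (illustrative CPU number, period, and counts):

	CPU#3 Utilization every 4s during lockup:
		#1:   0% system,	  0% softirq,	100% hardirq,	  0% idle
		...
		#5:   0% system,	  0% softirq,	100% hardirq,	  0% idle
	CPU#3 Detect HardIRQ Time exceeds 50%. Most frequent HardIRQs:
		#1: 330985	irq#7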
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
* lockups can have false positives under extreme conditions. So we generally
* and hard thresholds) to increment before the
* hardlockup detector generates a warning
*/
- sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS);
watchdog_update_hrtimer_threshold(sample_period);
}
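With the default watchdog_thresh of 10 s, get_softlockup_thresh() returns 20 s, so sample_period becomes 4 s and the five cpustat_util buckets together span exactly one softlockup window.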
unsigned long now)
{
if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
+ /*
+ * If period_ts has not been updated during a sample_period, then
+ * in the subsequent few sample_periods, period_ts might also not
+ * be updated, which could indicate a potential softlockup. In
+ * this case, if we suspect the cause of the potential softlockup
+ * might be interrupt storm, then we need to count the interrupts
+ * to find which interrupt is storming.
+ */
+ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
+ need_counting_irqs())
+ start_counting_irqs();
+
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
static int softlockup_fn(void *data)
{
update_touch_ts();
+ stop_counting_irqs();
complete(this_cpu_ptr(&softlockup_completion));
return 0;
*/
period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts));
+ update_cpustat();
+
/* Reset the interval when touched by known problematic code. */
if (period_ts == SOFTLOCKUP_DELAY_REPORT) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ report_cpu_status();
print_modules();
print_irqtrace_events(current);
if (regs)
},
#endif /* CONFIG_SMP */
#endif
- {}
};
static struct ctl_table watchdog_hardlockup_sysctl[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- {}
};
static void __init watchdog_sysctl_init(void)