the GPE dispatcher.
This facility can be used to prevent such uncontrolled
GPE flooding.
- Format: <byte>
+ Format: <byte> or <bitmap-list>
acpi_no_auto_serialize [HW,ACPI]
Disable auto-serialization of AML methods
allowed anymore to lift isolation
requirements as needed. This option
does not override iommu=pt
+ force_enable - Force enable the IOMMU on platforms known
+ to be buggy with IOMMU enabled. Use this
+ option with care.
amd_iommu_dump= [HW,X86-64]
Enable AMD IOMMU driver option to dump the ACPI table
ccw_timeout_log [S390]
See Documentation/s390/common_io.rst for details.
- cgroup_disable= [KNL] Disable a particular controller
- Format: {name of the controller(s) to disable}
+ cgroup_disable= [KNL] Disable a particular controller or optional feature
+ Format: {name of the controller(s) or feature(s) to disable}
The effects of cgroup_disable=foo are:
- foo isn't auto-mounted if you mount all cgroups in
a single hierarchy
- foo isn't visible as an individually mountable
subsystem
+ - if foo is an optional feature then the feature is
+ disabled and corresponding cgroup files are not
+ created
{Currently only the "memory" controller deals with this and
cuts the overhead; the others just disable the usage. So
only cgroup_disable=memory is actually worthwhile.}
+ Specifying "pressure" disables the per-cgroup pressure
+ stall information accounting feature.
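For illustration only (not part of the patch), a boot line such as

	cgroup_disable=pressure

disables per-cgroup pressure stall information accounting without disabling any controller.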
cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
Format: { { controller | "all" | "named" }
loops can be debugged more effectively on production
systems.
+ clocksource.max_cswd_read_retries= [KNL]
+ Number of clocksource_watchdog() retries due to
+ external delays before the clock will be marked
+ unstable. Defaults to three retries, that is,
+ four attempts to read the clock under test.
+
+ clocksource.verify_n_cpus= [KNL]
+ Limit the number of CPUs checked for clocksources
+ marked with CLOCK_SOURCE_VERIFY_PERCPU that
+ are marked unstable due to excessive skew.
+ A negative value says to check all CPUs, while
+ zero says not to check any. Values larger than
+ nr_cpu_ids are silently truncated to nr_cpu_ids.
+ The actual CPUs are chosen randomly, with
+ no replacement if the same CPU is chosen twice.
+
+ clocksource-wdtest.holdoff= [KNL]
+ Set the time in seconds that the clocksource
+ watchdog test waits before commencing its tests.
+ Defaults to zero when built as a module and to
+ 10 seconds when built into the kernel.
+
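For illustration only (not part of the patch), a boot line exercising the two new clocksource knobs might look like

	clocksource.max_cswd_read_retries=5 clocksource.verify_n_cpus=8

which allows five retries (six read attempts) per watchdog check and limits per-CPU verification to eight randomly chosen CPUs.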
clearcpuid=BITNUM[,BITNUM...] [X86]
Disable CPUID feature X for the kernel. See
arch/x86/include/asm/cpufeatures.h for the valid bit
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]
+ hugetlb_free_vmemmap=
+ [KNL] Requires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+ enabled.
+ Allows heavy hugetlb users to free up some more
+ memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+ Format: { on | off (default) }
+
+ on: enable the feature
+ off: disable the feature
+
+ When built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y,
+ the default is on.
+
+ This is not compatible with memory_hotplug.memmap_on_memory.
+ If both parameters are enabled, hugetlb_free_vmemmap takes
+ precedence over memory_hotplug.memmap_on_memory.
+
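For illustration only (not part of the patch), and using the existing hugepagesz=/hugepages= parameters purely for context, a boot line enabling the feature might look like

	hugetlb_free_vmemmap=on hugepagesz=2M hugepages=512

which frees part of the vmemmap backing each of the 512 pre-allocated 2MB hugetlb pages.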
hung_task_panic=
[KNL] Should the hung task detector generate panics.
Format: 0 | 1
forcing Dual Address Cycle for PCI cards supporting
greater than 32-bit addressing.
- iommu.strict= [ARM64] Configure TLB invalidation behaviour
+ iommu.strict= [ARM64, X86] Configure TLB invalidation behaviour
Format: { "0" | "1" }
0 - Lazy mode.
Request that DMA unmap operations use deferred
1 - Strict mode (default).
DMA unmap operations invalidate IOMMU hardware TLBs
synchronously.
+ Note: on x86, the default behaviour depends on the
+ equivalent driver-specific parameters, but a strict
+ mode explicitly specified by either method takes
+ precedence.
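For illustration only (not part of the patch):

	iommu.strict=0

requests lazy invalidation, but per the note above a strict mode specified through an equivalent driver-specific parameter on x86 would still take precedence.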
iommu.passthrough=
[ARM64, X86] Configure DMA to bypass the IOMMU by default.
Note that even when enabled, there are a few cases where
the feature is not effective.
+ This is not compatible with hugetlb_free_vmemmap. If
+ both parameters are enabled, hugetlb_free_vmemmap takes
+ precedence over memory_hotplug.memmap_on_memory.
+
memtest= [KNL,X86,ARM,PPC,RISCV] Enable memtest
Format: <integer>
default : 0 <disable>
noclflush [BUGS=X86] Don't use the CLFLUSH instruction
- nodelayacct [KNL] Disable per-task delay accounting
+ delayacct [KNL] Enable per-task delay accounting
nodsp [SH] Disable hardware DSP at boot time.
nr_uarts= [SERIAL] maximum number of UARTs to be registered.
+ numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA; only
+ set up a single NUMA node spanning all memory.
+
numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
NUMA balancing.
Allowed values are enable and disable
off: turn off poisoning (default)
on: turn on poisoning
+ page_reporting.page_reporting_order=
+ [KNL] Minimal page reporting order
+ Format: <integer>
+ Adjust the minimal page reporting order. Page
+ reporting is disabled when it exceeds (MAX_ORDER-1).
+
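For illustration only (not part of the patch):

	page_reporting.page_reporting_order=5

raises the minimal reported order to 5; a value exceeding (MAX_ORDER-1) disables page reporting entirely, as noted above.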
panic= [KNL] Kernel behaviour on panic: delay <timeout>
timeout > 0: seconds before rebooting
timeout = 0: wait forever
whole algorithm to behave better in low memory
condition.
+ rcutree.rcu_delay_page_cache_fill_msec= [KNL]
+ Set the page-cache refill delay (in milliseconds)
+ in response to low-memory conditions. Permitted
+ values are in the range 0:100000.
+
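For illustration only (not part of the patch):

	rcutree.rcu_delay_page_cache_fill_msec=10000

extends the refill delay to ten seconds, within the permitted 0:100000 range.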
rcutree.jiffies_till_first_fqs= [KNL]
Set delay from grace-period initialization to
first attempt to force quiescent states.
Reserves a hole at the top of the kernel virtual
address space.
- reservelow= [X86]
- Format: nn[K]
- Set the amount of memory to reserve for BIOS at
- the bottom of the address space.
-
reset_devices [KNL] Force drivers to reset the underlying device
during initialization.
exception. Default behavior is by #AC if
both features are enabled in hardware.
+ ratelimit:N -
+ Set a system-wide rate limit of N bus locks
+ per second for bus lock detection.
+ 0 < N <= 1000.
+
+ N/A for split lock detection.
+
+
If an #AC exception is hit in the kernel or in
firmware (i.e. not while executing in user mode)
the kernel will oops in either "warn" or "fatal"
Note, echoing 1 into this file without the
tracepoint_printk kernel cmdline option has no effect.
+ The tp_printk_stop_on_boot option (see below) can also be
+ used to stop the printing of events to the console at
+ late_initcall_sync().
+
** CAUTION **
Having tracepoints sent to printk() and activating high
frequency tracepoints such as irq or sched, can cause
the system to live lock.
+ tp_printk_stop_on_boot [FTRACE]
+ When tp_printk (above) is set, it can cause a lot of noise
+ on the console. It may be useful to limit the printing of
+ events to boot-up only, as user space may make the system
+ inoperable.
+
+ This command line option will stop the printing of events
+ to console at the late_initcall_sync() time frame.
+
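For illustration only (not part of the patch), combining the new option with tp_printk and the existing trace_event= parameter:

	tp_printk tp_printk_stop_on_boot trace_event=sched:sched_switch

prints sched_switch events to the console during boot and stops doing so at late_initcall_sync().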
traceoff_on_warning
[FTRACE] enable this option to disable tracing when a
warning is hit. This turns off "tracing_on". Tracing can
#include <linux/profile.h>
#include <linux/kfence.h>
#include <linux/rcupdate.h>
+ #include <linux/srcu.h>
#include <linux/moduleparam.h>
#include <linux/kallsyms.h>
#include <linux/writeback.h>
return new_cmdline;
}
-static u32 boot_config_checksum(unsigned char *p, u32 size)
-{
- u32 ret = 0;
-
- while (size--)
- ret += *p++;
-
- return ret;
-}
-
static int __init bootconfig_params(char *param, char *val,
const char *unused, void *arg)
{
return;
}
- if (boot_config_checksum((unsigned char *)data, size) != csum) {
+ if (xbc_calc_checksum(data, size) != csum) {
pr_err("bootconfig checksum failed\n");
return;
}
*/
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ tsk->flags |= PF_NO_SETAFFINITY;
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
rcu_read_unlock();
rest_init();
}
+static void __init print_unknown_bootoptions(void)
+{
+ char *unknown_options;
+ char *end;
+ const char *const *p;
+ size_t len;
+
+ if (panic_later || (!argv_init[1] && !envp_init[2]))
+ return;
+
+ /*
+ * Determine how many options we have to print out, plus a space
+ * before each
+ */
+ len = 1; /* null terminator */
+ for (p = &argv_init[1]; *p; p++) {
+ len++;
+ len += strlen(*p);
+ }
+ for (p = &envp_init[2]; *p; p++) {
+ len++;
+ len += strlen(*p);
+ }
+
+ unknown_options = memblock_alloc(len, SMP_CACHE_BYTES);
+ if (!unknown_options) {
+ pr_err("%s: Failed to allocate %zu bytes\n",
+ __func__, len);
+ return;
+ }
+ end = unknown_options;
+
+ for (p = &argv_init[1]; *p; p++)
+ end += sprintf(end, " %s", *p);
+ for (p = &envp_init[2]; *p; p++)
+ end += sprintf(end, " %s", *p);
+
+ pr_notice("Unknown command line parameters:%s\n", unknown_options);
+ memblock_free(__pa(unknown_options), len);
+}
+
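As a sketch of the effect of the helper above (the option name "foo" is made up for this illustration): booting with an unrecognized option such as foo=bar leaves it in envp_init[] and produces a console line like

	Unknown command line parameters: foo=bar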
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
char *command_line;
static_command_line, __start___param,
__stop___param - __start___param,
-1, -1, NULL, &unknown_bootoption);
+ print_unknown_bootoptions();
if (!IS_ERR_OR_NULL(after_dashes))
parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
NULL, set_init_arg);
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
- /*
- * Disable preemption - early bootup scheduling is extremely
- * fragile until we cpu_idle() for the first time.
- */
- preempt_disable();
+
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
local_irq_disable();
tick_init();
rcu_init_nohz();
init_timers();
+ srcu_init();
hrtimers_init();
softirq_init();
timekeeping_init();
{
int ret;
+ /*
+ * Wait until kthreadd is all set-up.
+ */
+ wait_for_completion(&kthreadd_done);
+
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
static noinline void __init kernel_init_freeable(void)
{
- /*
- * Wait until kthreadd is all set-up.
- */
- wait_for_completion(&kthreadd_done);
-
/* Now the scheduler is fully set up and can do blocking allocations */
gfp_allowed_mask = __GFP_BITS_MASK;
*/
set_mems_allowed(node_states[N_MEMORY]);
- cad_pid = task_pid(current);
+ cad_pid = get_pid(task_pid(current));
smp_prepare_cpus(setup_max_cpus);
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
*/
- if (p->state == TASK_RUNNING && p != current)
+ if (p != current && task_is_running(p))
return;
for (i = 0; i < depth; i++) {
printk(" #%d: ", i);
}
/* used from NMI context -- must be lockless */
-static __always_inline struct lock_class *
+static noinstr struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
struct lock_class *class;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ instrumentation_begin();
debug_locks_off();
printk(KERN_ERR
"BUG: looking up invalid subclass: %u\n", subclass);
printk(KERN_ERR
"turning off the locking correctness validator.\n");
dump_stack();
+ instrumentation_end();
return NULL;
}
}
/*
- * printk the shortest lock dependencies from @start to @end in reverse order:
+ * Dependency path printing:
+ *
+ * After BFS we get a lock dependency path (linked via ->parent of lock_list),
+ * printing out each lock in the dependency path will help on understanding how
+ * the deadlock could happen. Here are some details about dependency path
+ * printing:
+ *
+ * 1) A lock_list can be either forwards or backwards for a lock dependency,
+ * for a lock dependency A -> B, there are two lock_lists:
+ *
+ * a) lock_list in the ->locks_after list of A, whose ->class is B and
+ * ->links_to is A. In this case, we can say the lock_list is
+ * "A -> B" (forwards case).
+ *
+ * b) lock_list in the ->locks_before list of B, whose ->class is A
+ * and ->links_to is B. In this case, we can say the lock_list is
+ * "B <- A" (backwards case).
+ *
+ * The ->trace of both a) and b) point to the call trace where B was
+ * acquired with A held.
+ *
+ * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
+ * represent a certain lock dependency, it only provides an initial entry
+ * for BFS. For example, BFS may introduce a "helper" lock_list whose
+ * ->class is A, as a result BFS will search all dependencies starting with
+ * A, e.g. A -> B or A -> C.
+ *
+ * The notation of a forwards helper lock_list is like "-> A", which means
+ * we should search the forwards dependencies starting with "A", e.g A -> B
+ * or A -> C.
+ *
+ * The notation of a backwards helper lock_list is like "<- B", which means
+ * we should search the backwards dependencies ending with "B", e.g.
+ * B <- A or B <- C.
+ */
+
+/*
+ * printk the shortest lock dependencies from @root to @leaf in reverse order.
+ *
+ * We have a lock dependency path as follows:
+ *
+ * @root @leaf
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | <--------- | lock_list | ... | lock_list | <--------- | lock_list |
+ * | -> L1 | | L1 -> L2 | ... |Ln-2 -> Ln-1| | Ln-1 -> Ln|
+ *
+ * , so it's natural that we start from @leaf and print every ->class and
+ * ->trace until we reach the @root.
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
} while (entry && (depth >= 0));
}
+/*
+ * printk the shortest lock dependencies from @leaf to @root.
+ *
+ * We have a lock dependency path (from a backwards search) as follows:
+ *
+ * @leaf @root
+ * | |
+ * V V
+ * ->parent ->parent
+ * | lock_list | ---------> | lock_list | ... | lock_list | ---------> | lock_list |
+ * | L2 <- L1 | | L3 <- L2 | ... | Ln <- Ln-1 | | <- Ln |
+ *
+ * , so when we iterate from @leaf to @root, we actually print the lock
+ * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
+ *
+ * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
+ * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
+ * trace of L1 in the dependency path, which is alright, because most of the
+ * time we can figure out where L1 is held from the call trace of L2.
+ */
+static void __used
+print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
+ struct lock_list *root)
+{
+ struct lock_list *entry = leaf;
+ const struct lock_trace *trace = NULL;
+ int depth;
+
+ /* Compute depth from the tree generated by BFS. */
+ depth = get_lock_depth(leaf);
+
+ do {
+ print_lock_class_header(entry->class, depth);
+ if (trace) {
+ printk("%*s ... acquired at:\n", depth, "");
+ print_lock_trace(trace, 2);
+ printk("\n");
+ }
+
+ /*
+ * Record the pointer to the trace for the next lock_list
+ * entry, see the comments for the function.
+ */
+ trace = entry->trace;
+
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && (depth >= 0));
+}
+
static void
print_irq_lock_scenario(struct lock_list *safe_entry,
struct lock_list *unsafe_entry,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- prev_root->trace = save_trace();
- if (!prev_root->trace)
- return;
- print_shortest_lock_dependencies(backwards_entry, prev_root);
+ print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
* Step 3: we found a bad match! Now retrieve a lock from the backward
* list whose usage mask matches the exclusive usage mask from the
* lock found on the forward list.
+ *
+ * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
+ * the following case:
+ *
+ * When trying to add A -> B to the graph, we find that there is a
+ * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
+ * that B -> ... -> M. However M is **softirq-safe**, so if we use the
+ * exact inverted bits of M's usage_mask, we will find another lock N
+ * that is **softirq-unsafe** and N -> ... -> A, however N -> .. -> M
+ * will not cause an inversion deadlock.
*/
- backward_mask = original_mask(target_entry1->class->usage_mask);
+ backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
if (bfs_error(ret)) {
* <target> or not. If it can, <src> -> <target> dependency is already
* in the graph.
*
- * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
* any error appears in the bfs search.
*/
static noinline enum bfs_result
u8 curr_inner;
int depth;
- if (!curr->lockdep_depth || !next_inner || next->trylock)
+ if (!next_inner || next->trylock)
return 0;
if (!next_outer)
{
unsigned long flags;
- trace_lock_acquired(lock, ip);
+ trace_lock_contended(lock, ip);
if (unlikely(!lock_stat || !lockdep_enabled()))
return;
{
unsigned long flags;
- trace_lock_contended(lock, ip);
+ trace_lock_acquired(lock, ip);
if (unlikely(!lock_stat || !lockdep_enabled()))
return;
void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
+ int dl = READ_ONCE(debug_locks);
/* Note: the following can be executed concurrently, so be careful. */
pr_warn("\n");
pr_warn("-----------------------------\n");
pr_warn("%s:%d %s!\n", file, line, s);
pr_warn("\nother info that might help us debug this:\n\n");
- pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+ pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
!rcu_lockdep_current_cpu_online()
? "RCU used illegally from offline CPU!\n"
: "",
- rcu_scheduler_active, debug_locks);
+ rcu_scheduler_active, dl,
+ dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
/*
* If a CPU is in the RCU-free window in idle (ie: in the section
return rcu_torture_writer_state_names[i];
}
- #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT)
- # define rcu_can_boost() 1
- #else
- # define rcu_can_boost() 0
- #endif
-
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
+ int (*readlock_held)(void);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
void (*fqs)(void);
void (*stats)(void);
void (*gp_kthread_dbg)(void);
+ bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
int irq_capable;
int can_boost;
* Definitions for rcu torture testing.
*/
+ static int torture_readlock_not_held(void)
+ {
+ return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+ }
+
static int rcu_torture_read_lock(void) __acquires(RCU)
{
rcu_read_lock();
}
static struct rcu_torture_ops rcu_ops = {
- .ttype = RCU_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .get_gp_seq = rcu_get_gp_seq,
- .gp_diff = rcu_seq_diff,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .get_gp_state = get_state_synchronize_rcu,
- .start_gp_poll = start_poll_synchronize_rcu,
- .poll_gp_state = poll_state_synchronize_rcu,
- .cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .gp_kthread_dbg = show_rcu_gp_kthreads,
- .stall_dur = rcu_jiffies_till_stall_check,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .extendables = RCUTORTURE_MAX_EXTEND,
- .name = "rcu"
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .get_gp_state = get_state_synchronize_rcu,
+ .start_gp_poll = start_poll_synchronize_rcu,
+ .poll_gp_state = poll_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
+ .call = call_rcu,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
+ .check_boost_failed = rcu_check_boost_fail,
+ .stall_dur = rcu_jiffies_till_stall_check,
+ .irq_capable = 1,
+ .can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "rcu"
};
/*
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
srcu_read_unlock(srcu_ctlp, idx);
}
+ static int torture_srcu_read_lock_held(void)
+ {
+ return srcu_read_lock_held(srcu_ctlp);
+ }
+
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.readlock = srcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
.readlock = tasks_tracing_torture_read_lock,
.read_delay = srcu_read_delay, /* just reuse srcu's version. */
.readunlock = tasks_tracing_torture_read_unlock,
+ .readlock_held = rcu_read_lock_trace_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
return cur_ops->gp_diff(new, old);
}
- static bool __maybe_unused torturing_tasks(void)
- {
- return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
- }
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete. If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
*/
- struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
- };
-
- static void rcu_torture_boost_cb(struct rcu_head *head)
- {
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
-
- /* Ensure RCU-core accesses precede clearing ->inflight */
- smp_store_release(&rbip->inflight, 0);
- }
-
static int old_rt_runtime = -1;
static void rcu_torture_disable_rt_throttle(void)
old_rt_runtime = -1;
}
- static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
{
+ int cpu;
static int dbg_done;
-
- if (end - start > test_boost_duration * HZ - HZ / 2) {
+ unsigned long end = jiffies;
+ bool gp_done;
+ unsigned long j;
+ static unsigned long last_persist;
+ unsigned long lp;
+ unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+
+ if (end - *start > mininterval) {
+ // Recheck after checking time to avoid false positives.
+ smp_mb(); // Time check before grace-period check.
+ if (cur_ops->poll_gp_state(gp_state))
+ return false; // passed, though perhaps just barely
+ if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+ // At most one persisted message per boost test.
+ j = jiffies;
+ lp = READ_ONCE(last_persist);
+ if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ return false; // passed on a technicality
+ }
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
- if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg)
+ if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+ pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+ current->rt_priority, gp_state, end - *start);
cur_ops->gp_kthread_dbg();
+ // Recheck after print to flag grace period ending during splat.
+ gp_done = cur_ops->poll_gp_state(gp_state);
+ pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+ gp_done ? "ended already" : "still pending");
- return true; /* failed */
+ }
+
+ return true; // failed
+ } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+ *start = jiffies;
}
- return false; /* passed */
+ return false; // passed
}
static int rcu_torture_boost(void *arg)
{
- unsigned long call_rcu_time;
unsigned long endtime;
+ unsigned long gp_state;
+ unsigned long gp_state_time;
unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sched_set_fifo_low(current);
- init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
bool failed = false; // Test failed already in this test interval
- bool firsttime = true;
+ bool gp_initiated = false;
- /* Increment n_rcu_torture_boosts once per boost-test */
- while (!kthread_should_stop()) {
- if (mutex_trylock(&boost_mutex)) {
- n_rcu_torture_boosts++;
- mutex_unlock(&boost_mutex);
- break;
- }
- schedule_timeout_uninterruptible(1);
- }
if (kthread_should_stop())
goto checkwait;
goto checkwait;
}
- /* Do one boost-test interval. */
+ // Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
while (time_before(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!smp_load_acquire(&rbi.inflight)) {
- /* RCU core before ->inflight = 1. */
- smp_store_release(&rbi.inflight, 1);
- cur_ops->call(&rbi.rcu, rcu_torture_boost_cb);
- /* Check if the boost test failed */
- if (!firsttime && !failed)
- failed = rcu_torture_boost_failed(call_rcu_time, jiffies);
- call_rcu_time = jiffies;
- firsttime = false;
+ // Has current GP gone too long?
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+ // If we don't have a grace period in flight, start one.
+ if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+ gp_state = cur_ops->start_gp_poll();
+ gp_initiated = true;
+ gp_state_time = jiffies;
}
- if (stutter_wait("rcu_torture_boost"))
+ if (stutter_wait("rcu_torture_boost")) {
sched_set_fifo_low(current);
+ // If the grace period already ended,
+ // we don't know when that happened, so
+ // start over.
+ if (cur_ops->poll_gp_state(gp_state))
+ gp_initiated = false;
+ }
if (torture_must_stop())
goto checkwait;
}
- /*
- * If boost never happened, then inflight will always be 1, in
- * this case the boost check would never happen in the above
- * loop so do another one here.
- */
- if (!firsttime && !failed && smp_load_acquire(&rbi.inflight))
- rcu_torture_boost_failed(call_rcu_time, jiffies);
+ // In case the grace period extended beyond the end of the loop.
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ rcu_torture_boost_failed(gp_state, &gp_state_time);
/*
* Set the start time of the next test interval.
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
+ while (oldstarttime == boost_starttime && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
+ if (oldstarttime == boost_starttime) {
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ }
mutex_unlock(&boost_mutex);
break;
}
sched_set_fifo_low(current);
} while (!torture_must_stop());
- while (smp_load_acquire(&rbi.inflight))
- schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks.
-
/* Clean up and exit. */
- while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+ while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
}
- destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
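A minimal sketch (not taken from the patch) of the polled grace-period API that the reworked boost test is built around; the helper name wait_one_polled_gp() is made up for this illustration:

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Start one grace period and poll until it completes. */
static void wait_one_polled_gp(void)
{
	unsigned long gp_state = start_poll_synchronize_rcu();

	/* Polling lets the caller time the grace period, as the boost test does. */
	while (!poll_state_synchronize_rcu(gp_state))
		schedule_timeout_uninterruptible(1);
}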
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp) ||
- rcu_read_lock_trace_held() ||
- torturing_tasks());
+ !cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state, gp_seq, flags,
- wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? ~0U : wtp->__state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
if (!splatted && wtp) {
sched_show_task(wtp);
torture_shutdown_absorb("rcu_torture_stats");
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_stats");
-
- {
- struct rcu_head *rhp;
- struct kmem_cache *kcp;
- static int z;
-
- kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
- rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
- pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
- pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
- mem_dump_obj(ZERO_SIZE_PTR);
- pr_alert("mem_dump_obj(NULL):");
- mem_dump_obj(NULL);
- pr_alert("mem_dump_obj(%px):", &rhp);
- mem_dump_obj(&rhp);
- pr_alert("mem_dump_obj(%px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(%px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- pr_alert("mem_dump_obj(%px):", &z);
- mem_dump_obj(&z);
- kmem_cache_free(kcp, rhp);
- kmem_cache_destroy(kcp);
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
- pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
- pr_alert("mem_dump_obj(kmalloc %px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- kfree(rhp);
- rhp = vmalloc(4096);
- pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
- pr_alert("mem_dump_obj(vmalloc %px):", rhp);
- mem_dump_obj(rhp);
- pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
- mem_dump_obj(&rhp->func);
- vfree(rhp);
- }
-
return 0;
}
+ /* Test mem_dump_obj() and friends. */
+ static void rcu_torture_mem_dump_obj(void)
+ {
+ struct rcu_head *rhp;
+ struct kmem_cache *kcp;
+ static int z;
+
+ kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+ pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+ mem_dump_obj(ZERO_SIZE_PTR);
+ pr_alert("mem_dump_obj(NULL):");
+ mem_dump_obj(NULL);
+ pr_alert("mem_dump_obj(%px):", &rhp);
+ mem_dump_obj(&rhp);
+ pr_alert("mem_dump_obj(%px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(%px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ pr_alert("mem_dump_obj(%px):", &z);
+ mem_dump_obj(&z);
+ kmem_cache_free(kcp, rhp);
+ kmem_cache_destroy(kcp);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ kfree(rhp);
+ rhp = vmalloc(4096);
+ pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ vfree(rhp);
+ }
+
static void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
return false;
- if (!cur_ops->call)
+ if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
return false;
prio = rcu_get_gp_kthreads_prio();
return false;
if (prio < 2) {
- if (boost_warn_once == 1)
+ if (boost_warn_once == 1)
return false;
pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
+ rcu_torture_mem_dump_obj();
+
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
if (err_segs_recorded) {
if (firsterr < 0)
goto unwind;
rcutor_hp = firsterr;
+
+ // Testing RCU priority boosting requires rcutorture to do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ for_each_online_cpu(cpu) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ }
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
+ // A page shrinker can ask for pages to be freed to make them
+ // available for other parts of the system. This usually happens
+ // under low memory conditions, and in that case we should also
+ // defer page-cache filling for a short time period.
+ //
+ // The default value is 5 seconds, which is long enough to reduce
+ // interference with the shrinker while it asks other systems to
+ // drain their caches.
+ static int rcu_delay_page_cache_fill_msec = 5000;
+ module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
* the need for long delays to increase some race probabilities with the
* need for fast grace periods to increase other race probabilities.
*/
- #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
+ #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
* Compute the mask of online CPUs for the specified rcu_node structure.
{
rcu_qs();
rcu_preempt_deferred_qs(current);
+ rcu_tasks_qs(current, false);
}
/*
rcu_nmi_exit();
}
- /**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- * towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
- void rcu_irq_exit_preempt(void)
- {
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
- "RCU in extended quiescent state!");
- }
-
#ifdef CONFIG_PROVE_RCU
/**
* rcu_irq_exit_check_preempt - Validate that scheduling is possible
*/
void noinstr rcu_user_exit(void)
{
- rcu_eqs_exit(1);
+ rcu_eqs_exit(true);
}
/**
#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
/*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
* After all, the CPU might be in deep idle state, and thus executing no
/*
* Clean up after the old grace period.
*/
- static void rcu_gp_cleanup(void)
+ static noinline void rcu_gp_cleanup(void)
{
int cpu;
bool needgp = false;
/*
* Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
+ * period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
- * purpose of providing a providing the needed quiescent state.
+ * purpose of providing the needed quiescent state.
*/
void rcu_sched_clock_irq(int user)
{
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
return 0;
}
- early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested. In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections. This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
* @work_in_progress: Indicates that page_cache_work is running
* @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
bool initialized;
int count;
- struct work_struct page_cache_work;
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
atomic_t work_in_progress;
struct hrtimer hrtimer;
if (!krcp->nr_bkv_objs)
return NULL;
- krcp->nr_bkv_objs--;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
return (struct kvfree_rcu_bulk_data *)
llist_del_first(&krcp->bkvcache);
}
return false;
llist_add((struct llist_node *) bnode, &krcp->bkvcache);
- krcp->nr_bkv_objs++;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
return true;
+ }
+
+ static int
+ drain_page_cache(struct kfree_rcu_cpu *krcp)
+ {
+ unsigned long flags;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
+
+ return freed;
}
/*
* This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bhead_free or ->head_free.
+ * It frees all the objects queued on ->bkvhead_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
krwp->head_free = NULL;
raw_spin_unlock_irqrestore(&krcp->lock, flags);
- // Handle two first channels.
+ // Handle the first two channels.
for (i = 0; i < FREE_N_CHANNELS; i++) {
for (; bkvhead[i]; bkvhead[i] = bnext) {
bnext = bkvhead[i]->next;
}
/*
- * Emergency case only. It can happen under low memory
- * condition when an allocation gets failed, so the "bulk"
- * path can not be temporary maintained.
+ * This is used when the "bulk" path cannot be used for the
+ * double-argument variant of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
*/
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
}
/*
- * Schedule the kfree batch RCU work to run in workqueue context after a GP.
- *
- * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
- * timeout has been reached.
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
*/
- static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+ static void kfree_rcu_monitor(struct work_struct *work)
{
- struct kfree_rcu_cpu_work *krwp;
- bool repeat = false;
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
+ unsigned long flags;
int i, j;
- lockdep_assert_held(&krcp->lock);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Attempt to start a new batch.
for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- /*
- * Try to detach bkvhead or head and attach it over any
- * available corresponding free channel. It can be that
- * a previous RCU batch is in progress, it means that
- * immediately to queue another one is not possible so
- * return false to tell caller to retry.
- */
+ // Try to detach the bkvhead or head and attach it to a
+ // corresponding free channel, if one is available. A
+ // previous RCU batch may still be in progress, in which
+ // case another one cannot be queued immediately, so the
+ // monitor work is rearmed instead.
if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
(krcp->head && !krwp->head_free)) {
- // Channel 1 corresponds to SLAB ptrs.
- // Channel 2 corresponds to vmalloc ptrs.
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
if (!krwp->bkvhead_free[j]) {
krwp->bkvhead_free[j] = krcp->bkvhead[j];
}
}
- // Channel 3 corresponds to emergency path.
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
if (!krwp->head_free) {
krwp->head_free = krcp->head;
krcp->head = NULL;
WRITE_ONCE(krcp->count, 0);
- /*
- * One work is per one batch, so there are three
- * "free channels", the batch can handle. It can
- * be that the work is in the pending state when
- * channels have been detached following by each
- * other.
- */
+ // There is one work per batch, so each batch can
+ // handle up to three "free channels". The work may
+ // already be pending if the channels were detached
+ // one after another.
queue_rcu_work(system_wq, &krwp->rcu_work);
}
-
- // Repeat if any "free" corresponding channel is still busy.
- if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
- repeat = true;
}
- return !repeat;
- }
-
- static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
- unsigned long flags)
- {
- // Attempt to start a new batch.
- krcp->monitor_todo = false;
- if (queue_kfree_rcu_work(krcp)) {
- // Success! Our job is done here.
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- return;
- }
+ // If there is nothing left to detach, the job is done
+ // here. If at least one channel is still busy because
+ // previous batches are still in progress, rearm the
+ // work to try again later.
+ if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
+ krcp->monitor_todo = false;
+ else
+ schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- // Previous RCU batch still in progress, try again later.
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
- /*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
- */
- static void kfree_rcu_monitor(struct work_struct *work)
- {
- unsigned long flags;
- struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
- monitor_work.work);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- }
-
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
struct kfree_rcu_cpu *krcp =
container_of(t, struct kfree_rcu_cpu, hrtimer);
- queue_work(system_highpri_wq, &krcp->page_cache_work);
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
return HRTIMER_NORESTART;
}
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
- page_cache_work);
+ page_cache_work.work);
unsigned long flags;
+ int nr_pages;
bool pushed;
int i;
- for (i = 0; i < rcu_min_cached_objs; i++) {
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = 0; i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
}
atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
}
static void
{
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(system_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
}
}
}
/*
- * Queue a request for lazy invocation of appropriate free routine after a
- * grace period. Please note there are three paths are maintained, two are the
- * main ones that use array of pointers interface and third one is emergency
- * one, that is used only when the main path can not be maintained temporary,
- * due to memory pressure.
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
*
* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count += READ_ONCE(krcp->count);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
}
return count;
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu, freed = 0;
- unsigned long flags;
for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
count = krcp->count;
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
sc->nr_to_scan -= count;
freed += count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if (!krcp->head || krcp->monitor_todo) {
+ if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
+ krcp->monitor_todo) {
raw_spin_unlock_irqrestore(&krcp->lock, flags);
continue;
}
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void synchronize_rcu(void)
{
/**
* poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call from
* which oldstate was obtained, return @true, otherwise return @false.
- * If @false is returned, it is the caller's responsibilty to invoke this
+ * If @false is returned, it is the caller's responsibility to invoke this
* function later on until it does return @true. Alternatively, the caller
* can explicitly wait for a grace period, for example, by passing @oldstate
* to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
* (many hours even on 32-bit systems) should check them occasionally
* and either refresh them or set a flag indicating that the grace period
* has completed.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
*/
bool poll_state_synchronize_rcu(unsigned long oldstate)
{
/**
* cond_synchronize_rcu - Conditionally wait for an RCU grace period
*
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
*
* If a full RCU grace period has elapsed since the earlier call to
* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
* counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
* so waiting for one additional grace period should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
{
check_cpu_stall(rdp);
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
+ if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
/*
* Propagate ->qsinitmask bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
- * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_prepare_kthreads(cpu);
+ rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
wake_up_process(t);
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
+ rcu_spawn_core_kthreads();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
- static void __init rcu_init_geometry(void)
+ void rcu_init_geometry(void)
{
ulong d;
int i;
+ static unsigned long old_nr_cpu_ids;
int rcu_capacity[RCU_NUM_LVLS];
+ static bool initialized;
+
+ if (initialized) {
+ /*
+ * Warn if setup_nr_cpu_ids() had not yet been invoked,
+ * unless nr_cpu_ids == NR_CPUS, in which case who cares?
+ */
+ WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+ return;
+ }
+
+ old_nr_cpu_ids = nr_cpu_ids;
+ initialized = true;
/*
* Initialize any unspecified boot parameters.
int cpu;
int i;
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
+
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
}
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
rcutree_online_cpu(cpu);
}
- /* Create workqueue for expedited GPs and for Tree SRCU. */
+ /* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
- srcu_init();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
return false;
}
- static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
- {
- return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
- }
#else
static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
return false;
}
- static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
- {
- return false;
- }
-
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
rcu_lockdep_is_held_nocb(rdp) ||
(rdp == this_cpu_ptr(&rcu_data) &&
!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
- rcu_current_is_nocb_kthread(rdp) ||
- rcu_running_nocb_timer(rdp)),
+ rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+ rnp->n_boosts++;
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
- int rnp_index = rnp - rcu_get_root();
unsigned long flags;
+ int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return;
-
- if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
+ if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
return;
rcu_state.boost = 1;
- if (rnp->boost_kthread_task != NULL)
- return;
-
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
- rcu_spawn_one_boost_kthread(rnp);
- }
-
- static void rcu_prepare_kthreads(int cpu)
- {
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_boost_kthread(rnp);
+ if (rcu_rnp_online_cpus(rnp))
+ rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
{
}
- static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
- static void __init rcu_spawn_boost_kthreads(void)
+ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
- static void rcu_prepare_kthreads(int cpu)
+ static void __init rcu_spawn_boost_kthreads(void)
{
}
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
cpumask_setall(rcu_nocb_mask);
- else
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
+ }
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);
return false;
}
- /*
- * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
- * and this function releases it.
- */
- static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
- __releases(rdp->nocb_lock)
+ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
{
bool needwake = false;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- lockdep_assert_held(&rdp->nocb_lock);
if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("AlreadyAwake"));
return false;
}
- if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp->nocb_timer);
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
}
- rcu_nocb_unlock_irqrestore(rdp, flags);
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
needwake = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
}
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake)
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
return needwake;
}
+ /*
+ * Kick the GP kthread for this NOCB group.
+ */
+ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+ {
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+ }
+
/*
* Arrange to wake the GP kthread for this NOCB group at some future
* time when it is safe to do so.
static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
const char *reason)
{
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
- return;
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- if (rdp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case
+ * of callback storm, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
}
}
/*
- * Awaken the no-CBs grace-period kthead if needed, either due to it
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
* legitimately being asleep or due to overload conditions.
*
* If warranted, also wake up the kthread servicing this CPU's queues.
rdp->qlen_last_fqs_check = len;
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- wake_nocb_gp(rdp, false, flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
TPS("WakeEmptyIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
}
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
smp_mb(); /* Enqueue before timer_pending(). */
if ((rdp->nocb_cb_sleep ||
!rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_bypass_timer))
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
TPS("WakeOvfIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
} else {
rcu_nocb_unlock_irqrestore(rdp, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
return;
}
- /* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
- static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
- {
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
- rcu_nocb_lock_irqsave(rdp, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- __call_rcu_nocb_wake(rdp, true, flags);
- }
-
/*
* Check if we ignore this rdp.
*
bypass = true;
}
rnp = rdp->mynode;
- if (bypass) { // Avoid race with first bypass CB.
- WRITE_ONCE(my_rdp->nocb_defer_wakeup,
- RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
+
// Advance callbacks if helpful and low contention.
needwake_gp = false;
if (!rcu_segcblist_restempty(&rdp->cblist,
my_rdp->nocb_gp_bypass = bypass;
my_rdp->nocb_gp_gp = needwait_gp;
my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
if (bypass && !rcu_nocb_poll) {
// At least one child with non-empty ->nocb_bypass, so set
// timer in order to avoid stranding its callbacks.
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
}
if (rcu_nocb_poll) {
/* Polling, so trace if first poll in the series. */
}
if (!rcu_nocb_poll) {
raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (bypass)
- del_timer(&my_rdp->nocb_bypass_timer);
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
}
}
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
- static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
{
- return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
- static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
{
- unsigned long flags;
int ndw;
int ret;
- rcu_nocb_lock_irqsave(rdp, flags);
- if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
return false;
}
- ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
return ret;
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
{
+ unsigned long flags;
struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
- do_nocb_deferred_wakeup_common(rdp);
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
}
/*
*/
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
- if (rcu_nocb_need_deferred_wakeup(rdp))
- return do_nocb_deferred_wakeup_common(rdp);
- return false;
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
}
void rcu_nocb_flush_deferred_wakeup(void)
swait_event_exclusive(rdp->nocb_state_wq,
!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
SEGCBLIST_KTHREAD_GP));
- rcu_nocb_lock_irqsave(rdp, flags);
- /* Make sure nocb timer won't stay around */
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- del_timer_sync(&rdp->nocb_timer);
-
/*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
- * and IRQs disabled but let's be paranoid.
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
*/
rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+ * lock is released but how about being paranoid for once?
+ */
rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
/*
* With SEGCBLIST_SOFTIRQ_ONLY, we can't use
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int ret = 0;
- if (rdp == rdp->nocb_gp_rdp) {
- pr_info("Can't deoffload an rdp GP leader (yet)\n");
- return -EINVAL;
- }
mutex_lock(&rcu_state.barrier_mutex);
cpus_read_lock();
if (rcu_rdp_is_offloaded(rdp)) {
* SEGCBLIST_SOFTIRQ_ONLY mode.
*/
raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- /* Re-enable nocb timer */
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+
/*
* We didn't take the nocb lock while working on the
* rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
raw_spin_lock_init(&rdp->nocb_bypass_lock);
raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
rcu_cblist_init(&rdp->nocb_bypass);
}
#ifdef CONFIG_SMP
static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
{
- return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
}
#else // #ifdef CONFIG_SMP
static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
{
struct rcu_node *rnp = rdp->mynode;
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
rdp->cpu,
"kK"[!!rdp->nocb_gp_kthread],
"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
"dD"[!!rdp->nocb_defer_wakeup],
"tT"[timer_pending(&rdp->nocb_timer)],
- "bB"[timer_pending(&rdp->nocb_bypass_timer)],
"sS"[!!rdp->nocb_gp_sleep],
".W"[swait_active(&rdp->nocb_gp_wq)],
".W"[swait_active(&rnp->nocb_gp_wq[0])],
char bufr[20];
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
- bool wastimer;
bool wassleep;
if (rdp->nocb_gp_rdp == rdp)
return;
waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wastimer = timer_pending(&rdp->nocb_bypass_timer);
wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
- return; /* Nothing untowards. */
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
- pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
"lL"[waslocked],
"dD"[!!rdp->nocb_defer_wakeup],
- "tT"[wastimer],
"sS"[!!rdp->nocb_gp_sleep],
".W"[wassleep]);
}
{
}
- static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
{
return false;
}
* tasks blocked within RCU read-side critical sections.
*/
static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
if (rcu_is_gp_kthread_starving(&j)) {
cpu = gpk ? task_cpu(gpk) : -1;
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, cpu);
+ gpk ? gpk->__state : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
gpk && !READ_ONCE(gpk->on_rq)) {
cpu = task_cpu(gpk);
- pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
rcu_state.name, (jiffies - jiffies_fqs),
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
- gpk->state);
+ gpk->__state);
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
}
// RCU forward-progress mechanisms, including of callback invocation.
+ /*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiescent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+ {
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (READ_ONCE(rnp->qsmask)) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+ }
+ EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
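For readers unfamiliar with the two calling modes described in the header comment above, here is a minimal, hypothetical sketch of an rcutorture-style caller. It is not part of the patch: the helper name and the forward declaration are assumptions, since the real declaration of rcu_check_boost_fail() lives in RCU's internal headers.

#include <linux/printk.h>
#include <linux/types.h>

/* Assumed visible here; normally provided by RCU-internal headers. */
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);

static void check_boost_outcome(unsigned long gp_state)
{
	int cpu;

	/* Lockless quick check, cheap enough for high-rate polling. */
	if (rcu_check_boost_fail(gp_state, NULL)) {
		pr_info("RCU priority boosting itself appears to be at fault\n");
		return;
	}

	/* Detailed check: acquires each rcu_node ->lock and names a holdout CPU. */
	if (!rcu_check_boost_fail(gp_state, &cpu)) {
		if (cpu >= 0)
			pr_info("CPU %d has not yet reported a quiescent state\n", cpu);
		else
			pr_info("No holdout CPU and nothing to boost (*cpup == -1)\n");
	}
}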
+
/*
* Show the state of the grace-period kthreads.
*/
unsigned long j;
unsigned long ja;
unsigned long jr;
+ unsigned long js;
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
j = jiffies;
ja = j - data_race(rcu_state.gp_activity);
jr = j - data_race(rcu_state.gp_req_activity);
+ js = j - data_race(rcu_state.gp_start);
jw = j - data_race(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#x delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->__state : 0x1ffff,
- ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+ rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
(long)data_race(rcu_state.gp_seq),
(long)data_race(rcu_get_root()->gp_seq_needed),
+ data_race(rcu_state.gp_max),
data_race(rcu_state.gp_flags));
rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
- READ_ONCE(rnp->gp_seq_needed)))
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+ !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
+ !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
- (long)data_race(rnp->gp_seq_needed));
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+ rnp->grplo, rnp->grphi,
+ (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
+ data_race(rnp->qsmask),
+ ".b"[!!data_race(rnp->boost_kthread_task)],
+ ".B"[!!data_race(rnp->boost_tasks)],
+ ".E"[!!data_race(rnp->exp_tasks)],
+ ".G"[!!data_race(rnp->gp_tasks)],
+ data_race(rnp->n_boosts));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
}
EXPORT_SYMBOL(try_to_del_timer_sync);
- bool timer_curr_running(struct timer_list *timer)
- {
- int i;
-
- for (i = 0; i < NR_BASES; i++) {
- struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
-
- if (base->running_timer == timer)
- return true;
- }
-
- return false;
- }
-
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
printk(KERN_ERR "schedule_timeout: wrong timeout "
"value %lx\n", timeout);
dump_stack();
- current->state = TASK_RUNNING;
+ __set_current_state(TASK_RUNNING);
goto out;
}
}
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- ret = mempolicy_nodemask_intersects(tsk, mask);
+ ret = mempolicy_in_oom_domain(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
continue;
}
/*
- * No kthead_use_mm() user needs to read from the userspace so
+ * No kthread_use_mm() user needs to read from the userspace so
* we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
extern void print_tracking(struct kmem_cache *s, void *object);
+long validate_slab_cache(struct kmem_cache *s);
#else
static inline void print_tracking(struct kmem_cache *s, void *object)
{
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page);
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+ enum node_stat_item idx, int nr);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
return true;
}
-static inline void mod_objcg_state(struct obj_cgroup *objcg,
- struct pglist_data *pgdat,
- enum node_stat_item idx, int nr)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = obj_cgroup_memcg(objcg);
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- mod_memcg_lruvec_state(lruvec, idx, nr);
- rcu_read_unlock();
-}
-
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct obj_cgroup *objcg,
gfp_t flags, size_t size,
if (!memcg_kmem_enabled() || !objcg)
return;
- flags &= ~__GFP_ACCOUNT;
for (i = 0; i < size; i++) {
if (likely(p[i])) {
page = virt_to_head_page(p[i]);
return false;
}
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+void debugfs_slab_release(struct kmem_cache *);
+#else
+static inline void debugfs_slab_release(struct kmem_cache *s) { }
+#endif
+
#ifdef CONFIG_PRINTK
#define KS_ADDRS_COUNT 16
struct kmem_obj_info {
struct kmem_cache *kp_slab_cache;
void *kp_ret;
void *kp_stack[KS_ADDRS_COUNT];
+ void *kp_free_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
#endif
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
const char *cache_name;
int err;
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * If no slub_debug was enabled globally, the static key is not yet
+ * enabled by setup_slub_debug(). Enable it if the cache is being
+ * created with any of the debugging flags passed explicitly.
+ */
+ if (flags & SLAB_DEBUG_FLAGS)
+ static_branch_enable(&slub_debug_enabled);
+#endif
+
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
if (err) {
if (flags & SLAB_PANIC)
- panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
- name, err);
+ panic("%s: Failed to create slab '%s'. Error %d\n",
+ __func__, name, err);
else {
- pr_warn("kmem_cache_create(%s) failed with error %d\n",
- name, err);
+ pr_warn("%s(%s) failed with error %d\n",
+ __func__, name, err);
dump_stack();
}
return NULL;
rcu_barrier();
list_for_each_entry_safe(s, s2, &to_destroy, list) {
+ debugfs_slab_release(s);
kfence_shutdown_cache(s);
#ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_release(s);
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
kfence_shutdown_cache(s);
+ debugfs_slab_release(s);
#ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_unlink(s);
sysfs_slab_release(s);
err = shutdown_cache(s);
if (err) {
- pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
- s->name);
+ pr_err("%s %s: Slab cache still has objects\n",
+ __func__, s->name);
dump_stack();
}
out_unlock:
* depends on the type of object and on how much debugging is enabled.
* For a slab-cache object, the fact that it is a slab object is printed,
* and, if available, the slab name, return address, and stack trace from
- * the allocation of that object.
+ * the allocation and last free path of that object.
*
* This function will splat if passed a pointer to a non-slab object.
* If you are not sure what type of object you have, you should instead
break;
pr_info(" %pS\n", kp.kp_stack[i]);
}
+
+ if (kp.kp_free_stack[0])
+ pr_cont(" Free path:\n");
+
+ for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
+ if (!kp.kp_free_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_free_stack[i]);
+ }
+
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
#endif
}
#ifdef CONFIG_ZONE_DMA
-#define INIT_KMALLOC_INFO(__size, __short_size) \
-{ \
- .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
- .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
- .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \
- .size = __size, \
-}
+#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
#else
+#define KMALLOC_DMA_NAME(sz)
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
+#else
+#define KMALLOC_CGROUP_NAME(sz)
+#endif
+
#define INIT_KMALLOC_INFO(__size, __short_size) \
{ \
.name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \
+ KMALLOC_CGROUP_NAME(__short_size) \
+ KMALLOC_DMA_NAME(__short_size) \
.size = __size, \
}
-#endif
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
- * kmalloc-67108864.
+ * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
+ * kmalloc-32M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
INIT_KMALLOC_INFO(4194304, 4M),
INIT_KMALLOC_INFO(8388608, 8M),
INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M),
- INIT_KMALLOC_INFO(67108864, 64M)
+ INIT_KMALLOC_INFO(33554432, 32M)
};
/*
static void __init
new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
{
- if (type == KMALLOC_RECLAIM)
+ if (type == KMALLOC_RECLAIM) {
flags |= SLAB_RECLAIM_ACCOUNT;
+ } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
+ if (cgroup_memory_nokmem) {
+ kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
+ return;
+ }
+ flags |= SLAB_ACCOUNT;
+ }
kmalloc_caches[type][idx] = create_kmalloc_cache(
kmalloc_info[idx].name[type],
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
+
+ /*
+ * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
+ * KMALLOC_NORMAL caches.
+ */
+ if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
+ kmalloc_caches[type][idx]->refcount = -1;
}
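For context, a hedged sketch of what the new KMALLOC_CGROUP caches are for: an allocation carrying __GFP_ACCOUNT is expected to be served from one of the "kmalloc-cg-<size>" caches named above, and to fall back to the plain kmalloc caches when kernel memory accounting is disabled, as the hunk shows. The cache selection itself happens in kmalloc_type(), which is outside this hunk, and the helper name below is purely illustrative.

#include <linux/slab.h>

static void *alloc_accounted_buffer(void)
{
	/* 64 bytes, charged to the current task's memory cgroup. */
	return kmalloc(64, GFP_KERNEL | __GFP_ACCOUNT);
}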
/*
int i;
enum kmalloc_cache_type type;
+ /*
+ * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM is defined
+ */
for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
+#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
#include <linux/random.h>
+#include <kunit/test.h>
+#include <linux/debugfs.h>
#include <trace/events/kmem.h>
#include "internal.h"
*/
#ifdef CONFIG_SLUB_DEBUG
+
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
-#endif
+
+static inline bool __slub_debug_enabled(void)
+{
+ return static_branch_unlikely(&slub_debug_enabled);
+}
+
+#else /* CONFIG_SLUB_DEBUG */
+
+static inline bool __slub_debug_enabled(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_SLUB_DEBUG */
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
* - Variable sizing of the per node arrays
*/
-/* Enable to test recovery from slab corruption on boot */
-#undef SLUB_RESILIENCY_TEST
-
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG
{ return 0; }
#endif
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *s) { }
+#endif
+
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
+ object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
+#if IS_ENABLED(CONFIG_KUNIT)
+static bool slab_add_kunit_errors(void)
+{
+ struct kunit_resource *resource;
+
+ if (likely(!current->kunit_test))
+ return false;
+
+ resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
+ if (!resource)
+ return false;
+
+ (*(int *)resource->data)++;
+ kunit_put_resource(resource);
+ return true;
+}
+#else
+static inline bool slab_add_kunit_errors(void) { return false; }
+#endif
+
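The slab_add_kunit_errors() hook above only bumps a KUnit named resource called "slab_errors"; a test has to register that resource and assert on it. Below is a minimal sketch of such a consumer in the spirit of the companion slub_kunit test. The cache name, test names, and the greater-than-zero assertion are illustrative assumptions, and the sketch presumes the validate_slab_cache() declaration (added to mm/slab.h by this series) is visible.

#include <kunit/test.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>

/* Assumed visible here; declared in mm/slab.h by this series. */
long validate_slab_cache(struct kmem_cache *s);

static struct kunit_resource resource;
static int slab_errors;

static int test_init(struct kunit *test)
{
	slab_errors = 0;
	/* slab_add_kunit_errors() looks this resource up by name and bumps it. */
	return kunit_add_named_resource(test, NULL, NULL, &resource,
					"slab_errors", &slab_errors);
}

static void test_clobber_redzone(struct kunit *test)
{
	struct kmem_cache *s = kmem_cache_create("TestSlub_RZ", 64, 0,
						 SLAB_RED_ZONE, NULL);
	u8 *p = kmem_cache_alloc(s, GFP_KERNEL);

	p[64] = 0x12;			/* Scribble into the right redzone. */
	validate_slab_cache(s);		/* Reports corruption via slab_errors. */
	KUNIT_EXPECT_GT(test, slab_errors, 0);

	kmem_cache_free(s, p);
	kmem_cache_destroy(s);
}

static struct kunit_case test_cases[] = {
	KUNIT_CASE(test_clobber_redzone),
	{}
};

static struct kunit_suite test_suite = {
	.name = "slub_kunit_sketch",
	.init = test_init,
	.test_cases = test_cases,
};
kunit_test_suite(test_suite);

MODULE_LICENSE("GPL");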
/*
* Determine a map of object in use on a page.
*
pr_err("=============================================================================\n");
pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
-
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
va_end(args);
}
+__printf(2, 3)
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
va_list args;
+ if (slab_add_kunit_errors())
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
s->red_left_pad);
else if (p > addr + 16)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section(KERN_ERR, "Object ", p,
+ print_section(KERN_ERR, "Object ", p,
min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
off = get_info_end(s);
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section(KERN_ERR, "Padding ", p + off,
+ print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
dump_stack();
void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
+ if (slab_add_kunit_errors())
+ return;
+
slab_bug(s, "%s", reason);
print_trailer(s, page, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
va_list args;
char buf[100];
+ if (slab_add_kunit_errors())
+ return;
+
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
slab_bug(s, "%s", buf);
print_page_info(page);
dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
- slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
+ slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
memset(from, data, to - from);
}
while (end > fault && end[-1] == value)
end--;
+ if (slab_add_kunit_errors())
+ goto skip_bug_print;
+
slab_bug(s, "%s overwritten", what);
pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault - addr,
fault[0], value);
print_trailer(s, page, object);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+skip_bug_print:
restore_bytes(s, what, value, fault, end);
return 0;
}
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "Poison",
+ !check_bytes_and_report(s, page, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
page->objects, max_objects);
page->objects = max_objects;
- slab_fix(s, "Number of objects adjusted.");
+ slab_fix(s, "Number of objects adjusted");
}
if (page->inuse != page->objects - nr) {
slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
page->inuse, page->objects - nr);
page->inuse = page->objects - nr;
- slab_fix(s, "Object count adjusted.");
+ slab_fix(s, "Object count adjusted");
}
return search == NULL;
}
out:
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
+ else
+ static_branch_disable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
- unsigned int freepointer_area;
unsigned int order;
/*
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
- /*
- * This is the area of the object where a freepointer can be
- * safely written. If redzoning adds more to the inuse size, we
- * can't use that portion for writing the freepointer, so
- * s->offset must be limited within this for the general case.
- */
- freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
/*
* With that we have determined the number of bytes in actual use
- * by the object. This is the potential offset to the free pointer.
+ * by the object and redzoning.
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
*/
s->offset = size;
size += sizeof(void *);
- } else if (freepointer_area > sizeof(void *)) {
+ } else {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
- s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * If no slub_debug was enabled globally, the static key is not yet
- * enabled by setup_slub_debug(). Enable it if the cache is being
- * created with any of the debugging flags passed explicitly.
- */
- if (flags & SLAB_DEBUG_FLAGS)
- static_branch_enable(&slub_debug_enabled);
-#endif
s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
!(s->flags & SLAB_STORE_USER))
return;
#ifdef CONFIG_SLUB_DEBUG
+ objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
#ifdef CONFIG_STACKTRACE
if (!kpp->kp_stack[i])
break;
}
+
+ trackp = get_track(s, objp, TRACK_FREE);
+ for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+ kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
+ if (!kpp->kp_free_stack[i])
+ break;
+ }
#endif
#endif
}
if (debug_guardpage_minorder())
slub_max_order = 0;
+ /* Print slub debugging pointers without hashing */
+ if (__slub_debug_enabled())
+ no_hash_pointers_enable(NULL);
+
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
if (err)
__kmem_cache_release(s);
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
return err;
}
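Since debugfs_slab_add() is now called for any cache created with SLAB_STORE_USER, a cache like the hypothetical one below ends up with alloc_traces and free_traces files under /sys/kernel/debug/slab/<name>/ (given CONFIG_SLUB_DEBUG, CONFIG_DEBUG_FS, and a mounted debugfs); the old sysfs alloc_calls/free_calls files are removed further down. The module-init function and cache name are assumptions for illustration.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;

static int __init demo_cache_init(void)
{
	/* Traces appear as /sys/kernel/debug/slab/demo_traced/{alloc,free}_traces. */
	demo_cache = kmem_cache_create("demo_traced", 256, 0,
				       SLAB_STORE_USER, NULL);
	return demo_cache ? 0 : -ENOMEM;
}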
validate_slab(s, page);
count++;
}
- if (count != n->nr_partial)
+ if (count != n->nr_partial) {
pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
s->name, count, n->nr_partial);
+ slab_add_kunit_errors();
+ }
if (!(s->flags & SLAB_STORE_USER))
goto out;
validate_slab(s, page);
count++;
}
- if (count != atomic_long_read(&n->nr_slabs))
+ if (count != atomic_long_read(&n->nr_slabs)) {
pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
s->name, count, atomic_long_read(&n->nr_slabs));
+ slab_add_kunit_errors();
+ }
out:
spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
-static long validate_slab_cache(struct kmem_cache *s)
+long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
return count;
}
+EXPORT_SYMBOL(validate_slab_cache);
+
+#ifdef CONFIG_DEBUG_FS
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
struct location *loc;
};
+static struct dentry *slab_debugfs_root;
+
static void free_loc_track(struct loc_track *t)
{
if (t->max)
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}
-
-static int list_locations(struct kmem_cache *s, char *buf,
- enum track_item alloc)
-{
- int len = 0;
- unsigned long i;
- struct loc_track t = { 0, 0, NULL };
- int node;
- struct kmem_cache_node *n;
-
- if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_KERNEL)) {
- return sysfs_emit(buf, "Out of memory\n");
- }
- /* Push back cpu slabs */
- flush_all(s);
-
- for_each_kmem_cache_node(s, node, n) {
- unsigned long flags;
- struct page *page;
-
- if (!atomic_long_read(&n->nr_slabs))
- continue;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc);
- list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc);
- spin_unlock_irqrestore(&n->list_lock, flags);
- }
-
- for (i = 0; i < t.count; i++) {
- struct location *l = &t.loc[i];
-
- len += sysfs_emit_at(buf, len, "%7ld ", l->count);
-
- if (l->addr)
- len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr);
- else
- len += sysfs_emit_at(buf, len, "<not-available>");
-
- if (l->sum_time != l->min_time)
- len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld",
- l->min_time,
- (long)div_u64(l->sum_time,
- l->count),
- l->max_time);
- else
- len += sysfs_emit_at(buf, len, " age=%ld", l->min_time);
-
- if (l->min_pid != l->max_pid)
- len += sysfs_emit_at(buf, len, " pid=%ld-%ld",
- l->min_pid, l->max_pid);
- else
- len += sysfs_emit_at(buf, len, " pid=%ld",
- l->min_pid);
-
- if (num_online_cpus() > 1 &&
- !cpumask_empty(to_cpumask(l->cpus)))
- len += sysfs_emit_at(buf, len, " cpus=%*pbl",
- cpumask_pr_args(to_cpumask(l->cpus)));
-
- if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
- len += sysfs_emit_at(buf, len, " nodes=%*pbl",
- nodemask_pr_args(&l->nodes));
-
- len += sysfs_emit_at(buf, len, "\n");
- }
-
- free_loc_track(&t);
- if (!t.count)
- len += sysfs_emit_at(buf, len, "No data\n");
-
- return len;
-}
+#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
-#ifdef SLUB_RESILIENCY_TEST
-static void __init resiliency_test(void)
-{
- u8 *p;
- int type = KMALLOC_NORMAL;
-
- BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
-
- pr_err("SLUB resiliency testing\n");
- pr_err("-----------------------\n");
- pr_err("A. Corruption after allocation\n");
-
- p = kzalloc(16, GFP_KERNEL);
- p[16] = 0x12;
- pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
- p + 16);
-
- validate_slab_cache(kmalloc_caches[type][4]);
-
- /* Hmmm... The next two are dangerous */
- p = kzalloc(32, GFP_KERNEL);
- p[32 + sizeof(void *)] = 0x34;
- pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
-
- validate_slab_cache(kmalloc_caches[type][5]);
- p = kzalloc(64, GFP_KERNEL);
- p += 64 + (get_cycles() & 0xff) * sizeof(void *);
- *p = 0x56;
- pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[type][6]);
-
- pr_err("\nB. Corruption after free\n");
- p = kzalloc(128, GFP_KERNEL);
- kfree(p);
- *p = 0x78;
- pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][7]);
-
- p = kzalloc(256, GFP_KERNEL);
- kfree(p);
- p[50] = 0x9a;
- pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][8]);
-
- p = kzalloc(512, GFP_KERNEL);
- kfree(p);
- p[512] = 0xab;
- pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[type][9]);
-}
-#else
-#ifdef CONFIG_SYSFS
-static void resiliency_test(void) {};
-#endif
-#endif /* SLUB_RESILIENCY_TEST */
-
#ifdef CONFIG_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
}
SLAB_ATTR(validate);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
#endif /* CONFIG_SLUB_DEBUG */
#ifdef CONFIG_FAILSLAB
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &alloc_calls_attr.attr,
- &free_calls_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
}
mutex_unlock(&slab_mutex);
- resiliency_test();
return 0;
}
__initcall(slab_sysfs_init);
#endif /* CONFIG_SYSFS */
+#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int slab_debugfs_show(struct seq_file *seq, void *v)
+{
+
+ struct location *l;
+ unsigned int idx = *(unsigned int *)v;
+ struct loc_track *t = seq->private;
+
+ if (idx < t->count) {
+ l = &t->loc[idx];
+
+ seq_printf(seq, "%7ld ", l->count);
+
+ if (l->addr)
+ seq_printf(seq, "%pS", (void *)l->addr);
+ else
+ seq_puts(seq, "<not-available>");
+
+ if (l->sum_time != l->min_time) {
+ seq_printf(seq, " age=%ld/%llu/%ld",
+ l->min_time, div_u64(l->sum_time, l->count),
+ l->max_time);
+ } else
+ seq_printf(seq, " age=%ld", l->min_time);
+
+ if (l->min_pid != l->max_pid)
+ seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
+ else
+ seq_printf(seq, " pid=%ld",
+ l->min_pid);
+
+ if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
+ seq_printf(seq, " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
+ seq_printf(seq, " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
+
+ seq_puts(seq, "\n");
+ }
+
+ if (!idx && !t->count)
+ seq_puts(seq, "No data\n");
+
+ return 0;
+}
+
+static void slab_debugfs_stop(struct seq_file *seq, void *v)
+{
+}
+
+static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+ struct loc_track *t = seq->private;
+
+ v = ppos;
+ ++*ppos;
+ if (*ppos <= t->count)
+ return v;
+
+ return NULL;
+}
+
+static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
+{
+ return ppos;
+}
+
+static const struct seq_operations slab_debugfs_sops = {
+ .start = slab_debugfs_start,
+ .next = slab_debugfs_next,
+ .stop = slab_debugfs_stop,
+ .show = slab_debugfs_show,
+};
+
+static int slab_debug_trace_open(struct inode *inode, struct file *filep)
+{
+
+ struct kmem_cache_node *n;
+ enum track_item alloc;
+ int node;
+ struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
+ sizeof(struct loc_track));
+ struct kmem_cache *s = file_inode(filep)->i_private;
+
+ if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
+ alloc = TRACK_ALLOC;
+ else
+ alloc = TRACK_FREE;
+
+ if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL))
+ return -ENOMEM;
+
+ /* Push back cpu slabs */
+ flush_all(s);
+
+ for_each_kmem_cache_node(s, node, n) {
+ unsigned long flags;
+ struct page *page;
+
+ if (!atomic_long_read(&n->nr_slabs))
+ continue;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry(page, &n->partial, slab_list)
+ process_slab(t, s, page, alloc);
+ list_for_each_entry(page, &n->full, slab_list)
+ process_slab(t, s, page, alloc);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ return 0;
+}
+
+static int slab_debug_trace_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct loc_track *t = seq->private;
+
+ free_loc_track(t);
+ return seq_release_private(inode, file);
+}
+
+static const struct file_operations slab_debugfs_fops = {
+ .open = slab_debug_trace_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = slab_debug_trace_release,
+};
+
+static void debugfs_slab_add(struct kmem_cache *s)
+{
+ struct dentry *slab_cache_dir;
+
+ if (unlikely(!slab_debugfs_root))
+ return;
+
+ slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
+
+ debugfs_create_file("alloc_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+
+ debugfs_create_file("free_traces", 0400,
+ slab_cache_dir, s, &slab_debugfs_fops);
+}
+
+void debugfs_slab_release(struct kmem_cache *s)
+{
+ debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
+}
+
+static int __init slab_debugfs_init(void)
+{
+ struct kmem_cache *s;
+
+ slab_debugfs_root = debugfs_create_dir("slab", NULL);
+
+ list_for_each_entry(s, &slab_caches, list)
+ if (s->flags & SLAB_STORE_USER)
+ debugfs_slab_add(s);
+
+ return 0;
+
+}
+__initcall(slab_debugfs_init);
+#endif
/*
* The /proc/slabinfo ABI
*/
* depends on the type of object and on how much debugging is enabled.
* For example, for a slab-cache object, the slab name is printed, and,
* if available, the return address and stack trace from the allocation
- * of that object.
+ * and last free path of that object.
*/
void mem_dump_obj(void *object)
{
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif
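A quick, hypothetical illustration of what the extended reporting buys: when the cache is tracking its users (for example, booting with slub_debug=U), mem_dump_obj() on a pointer now shows the last free path in addition to the allocation stack, which is mainly useful when chasing use-after-free suspects. The helper below is a sketch, not part of the patch.

#include <linux/mm.h>
#include <linux/slab.h>

static void demo_mem_dump_obj(void)
{
	void *p = kmalloc(128, GFP_KERNEL);

	mem_dump_obj(p);	/* Slab name, return address, allocation stack. */
	kfree(p);
	mem_dump_obj(p);	/* With this series: also the last free path. */
}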
+
+/*
+ * A driver might set a page logically offline -- PageOffline() -- and
+ * turn the page inaccessible in the hypervisor; after that, access to page
+ * content can be fatal.
+ *
+ * Some special PFN walkers -- e.g., /proc/kcore -- read content of random
+ * pages after checking PageOffline(); however, these PFN walkers can race
+ * with drivers that set PageOffline().
+ *
+ * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
+ * synchronize with such drivers, achieving that a page cannot be set
+ * PageOffline() while frozen.
+ *
+ * page_offline_begin()/page_offline_end() is used by drivers that care about
+ * such races when setting a page PageOffline().
+ */
+static DECLARE_RWSEM(page_offline_rwsem);
+
+void page_offline_freeze(void)
+{
+ down_read(&page_offline_rwsem);
+}
+
+void page_offline_thaw(void)
+{
+ up_read(&page_offline_rwsem);
+}
+
+void page_offline_begin(void)
+{
+ down_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_begin);
+
+void page_offline_end(void)
+{
+ up_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_end);
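
To make the intended pairing concrete, here is a minimal sketch, not part of the patch, of the two sides described in the comment above: a PFN walker excludes concurrent PageOffline() transitions with page_offline_freeze()/page_offline_thaw(), while a driver brackets the transition with page_offline_begin()/page_offline_end(). The helper names are hypothetical, and the walker assumes the page sits in the direct map.

#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/string.h>

/* PFN-walker side (a /proc/kcore-style reader). */
static bool read_page_if_online(struct page *page, void *buf)
{
	bool ok = false;

	page_offline_freeze();		/* No page can become PageOffline() now. */
	if (!PageOffline(page)) {
		memcpy(buf, page_to_virt(page), PAGE_SIZE);
		ok = true;
	}
	page_offline_thaw();
	return ok;
}

/* Driver side: mark a page logically offline before making it inaccessible. */
static void driver_offline_page(struct page *page)
{
	page_offline_begin();		/* Waits for any frozen walkers to finish. */
	__SetPageOffline(page);
	/* ...then tell the hypervisor the page may be unplugged/protected... */
	page_offline_end();
}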