Merge branch 'core-rcu-2021.07.04' of git://git.kernel.org/pub/scm/linux/kernel/git...
author	Linus Torvalds <[email protected]>
	Sun, 4 Jul 2021 19:58:33 +0000 (12:58 -0700)
committer	Linus Torvalds <[email protected]>
	Sun, 4 Jul 2021 19:58:33 +0000 (12:58 -0700)
Pull RCU updates from Paul McKenney:

 - Bitmap parsing support for "all" as an alias for all bits

 - Documentation updates

 - Miscellaneous fixes, including some that overlap into mm and lockdep

 - kvfree_rcu() updates

 - mem_dump_obj() updates, with acks from one of the slab-allocator
   maintainers

 - RCU NOCB CPU updates, including limited deoffloading

 - SRCU updates

 - Tasks-RCU updates

 - Torture-test updates

* 'core-rcu-2021.07.04' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu: (78 commits)
  tasks-rcu: Make show_rcu_tasks_gp_kthreads() be static inline
  rcu-tasks: Make ksoftirqd provide RCU Tasks quiescent states
  rcu: Add missing __releases() annotation
  rcu: Remove obsolete rcu_read_unlock() deadlock commentary
  rcu: Improve comments describing RCU read-side critical sections
  rcu: Create an unrcu_pointer() to remove __rcu from a pointer
  srcu: Early test SRCU polling start
  rcu: Fix various typos in comments
  rcu/nocb: Unify timers
  rcu/nocb: Prepare for fine-grained deferred wakeup
  rcu/nocb: Only cancel nocb timer if not polling
  rcu/nocb: Delete bypass_timer upon nocb_gp wakeup
  rcu/nocb: Cancel nocb_timer upon nocb_gp wakeup
  rcu/nocb: Allow de-offloading rdp leader
  rcu/nocb: Directly call __wake_nocb_gp() from bypass timer
  rcu: Don't penalize priority boosting when there is nothing to boost
  rcu: Point to documentation of ordering guarantees
  rcu: Make rcu_gp_cleanup() be noinline for tracing
  rcu: Restrict RCU_STRICT_GRACE_PERIOD to at most four CPUs
  rcu: Make show_rcu_gp_kthreads() dump rcu_node structures blocking GP
  ...

13 files changed:
Documentation/admin-guide/kernel-parameters.txt
init/main.c
kernel/locking/lockdep.c
kernel/rcu/rcutorture.c
kernel/rcu/tree.c
kernel/rcu/tree_plugin.h
kernel/rcu/tree_stall.h
kernel/time/timer.c
mm/oom_kill.c
mm/slab.h
mm/slab_common.c
mm/slub.c
mm/util.c

diff --combined Documentation/admin-guide/kernel-parameters.txt
index d6717b74769414ad70c38fb2756c00e0bf265105,4405fd32e8ab07004abedf45c74d45c9f5b0ae67..b4ee111987620c73c2fbd769a81dd6039168d7b4
                        the GPE dispatcher.
                        This facility can be used to prevent such uncontrolled
                        GPE floodings.
 -                      Format: <byte>
 +                      Format: <byte> or <bitmap-list>
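
The new <bitmap-list> format, like the "all" alias called out in the pull
summary above, is handled by the kernel's bitmap-list parser.  A minimal
sketch of that API, assuming the alias is accepted by bitmap_parselist()
(parse_gpe_list() below is only an illustrative wrapper, not an in-tree
function):

	#include <linux/bitmap.h>

	/* Parse a list such as "2-5,8" or the new "all" alias. */
	static int parse_gpe_list(const char *buf, unsigned long *mask, int nbits)
	{
		/* "all" is expected to set every bit in [0, nbits). */
		return bitmap_parselist(buf, mask, nbits);
	}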
  
        acpi_no_auto_serialize  [HW,ACPI]
                        Disable auto-serialization of AML methods
                                          allowed anymore to lift isolation
                                          requirements as needed. This option
                                          does not override iommu=pt
 +                      force_enable - Force enable the IOMMU on platforms known
 +                                     to be buggy with IOMMU enabled. Use this
 +                                     option with care.
  
        amd_iommu_dump= [HW,X86-64]
                        Enable AMD IOMMU driver option to dump the ACPI table
        ccw_timeout_log [S390]
                        See Documentation/s390/common_io.rst for details.
  
 -      cgroup_disable= [KNL] Disable a particular controller
 -                      Format: {name of the controller(s) to disable}
 +      cgroup_disable= [KNL] Disable a particular controller or optional feature
 +                      Format: {name of the controller(s) or feature(s) to disable}
                        The effects of cgroup_disable=foo are:
                        - foo isn't auto-mounted if you mount all cgroups in
                          a single hierarchy
                        - foo isn't visible as an individually mountable
                          subsystem
 +                      - if foo is an optional feature then the feature is
 +                        disabled and corresponding cgroup files are not
 +                        created
                        {Currently only "memory" controller deal with this and
                        cut the overhead, others just disable the usage. So
                        only cgroup_disable=memory is actually worthy}
 +                      Specifying "pressure" disables per-cgroup pressure
 +                      stall information accounting feature
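
As a usage illustration of the new "optional feature" form, booting with
cgroup_disable=pressure keeps every controller available but skips the
per-cgroup pressure stall information (PSI) accounting, so the corresponding
cgroup files are not created.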
  
        cgroup_no_v1=   [KNL] Disable cgroup controllers and named hierarchies in v1
                        Format: { { controller | "all" | "named" }
                        loops can be debugged more effectively on production
                        systems.
  
 +      clocksource.max_cswd_read_retries= [KNL]
 +                      Number of clocksource_watchdog() retries due to
 +                      external delays before the clock will be marked
 +                      unstable.  Defaults to three retries, that is,
 +                      four attempts to read the clock under test.
 +
 +      clocksource.verify_n_cpus= [KNL]
 +                      Limit the number of CPUs checked for clocksources
 +                      marked with CLOCK_SOURCE_VERIFY_PERCPU that
 +                      are marked unstable due to excessive skew.
 +                      A negative value says to check all CPUs, while
 +                      zero says not to check any.  Values larger than
 +                      nr_cpu_ids are silently truncated to nr_cpu_ids.
 +                      The actual CPUs are chosen randomly, with
 +                      no replacement if the same CPU is chosen twice.
 +
 +      clocksource-wdtest.holdoff= [KNL]
 +                      Set the time in seconds that the clocksource
 +                      watchdog test waits before commencing its tests.
 +                      Defaults to zero when built as a module and to
 +                      10 seconds when built into the kernel.
 +
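
An illustrative command line combining the new clocksource watchdog knobs
(the values here are arbitrary examples, not recommendations):

	clocksource.max_cswd_read_retries=5 clocksource.verify_n_cpus=8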
        clearcpuid=BITNUM[,BITNUM...] [X86]
                        Disable CPUID feature X for the kernel. See
                        arch/x86/include/asm/cpufeatures.h for the valid bit
                        Documentation/admin-guide/mm/hugetlbpage.rst.
                        Format: size[KMG]
  
 +      hugetlb_free_vmemmap=
 +                      [KNL] Requires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 +                      enabled.
 +                      Allows heavy hugetlb users to free up some more
 +                      memory (6 * PAGE_SIZE for each 2MB hugetlb page).
 +                      Format: { on | off (default) }
 +
 +                      on:  enable the feature
 +                      off: disable the feature
 +
 +                      Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y,
 +                      the default is on.
 +
 +                      This is not compatible with memory_hotplug.memmap_on_memory.
 +                      If both parameters are enabled, hugetlb_free_vmemmap takes
 +                      precedence over memory_hotplug.memmap_on_memory.
 +
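
To put the "6 * PAGE_SIZE" figure in perspective (assuming 4 KiB base pages
and a 64-byte struct page): a 2MB hugetlb page spans 512 base pages, whose
struct pages occupy 512 * 64 B = 32 KiB, that is, 8 vmemmap pages; freeing 6
of those 8 saves 24 KiB for every 2MB hugetlb page, roughly 1.2% of the
memory backed by such pages.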
        hung_task_panic=
                        [KNL] Should the hung task detector generate panics.
                        Format: 0 | 1
                          forcing Dual Address Cycle for PCI cards supporting
                          greater than 32-bit addressing.
  
 -      iommu.strict=   [ARM64] Configure TLB invalidation behaviour
 +      iommu.strict=   [ARM64, X86] Configure TLB invalidation behaviour
                        Format: { "0" | "1" }
                        0 - Lazy mode.
                          Request that DMA unmap operations use deferred
                        1 - Strict mode (default).
                          DMA unmap operations invalidate IOMMU hardware TLBs
                          synchronously.
 +                      Note: on x86, the default behaviour depends on the
 +                      equivalent driver-specific parameters, but a strict
 +                      mode explicitly specified by either method takes
 +                      precedence.
  
        iommu.passthrough=
                        [ARM64, X86] Configure DMA to bypass the IOMMU by default.
                        Note that even when enabled, there are a few cases where
                        the feature is not effective.
  
 +                      This is not compatible with hugetlb_free_vmemmap. If
 +                      both parameters are enabled, hugetlb_free_vmemmap takes
 +                      precedence over memory_hotplug.memmap_on_memory.
 +
        memtest=        [KNL,X86,ARM,PPC,RISCV] Enable memtest
                        Format: <integer>
                        default : 0 <disable>
  
        noclflush       [BUGS=X86] Don't use the CLFLUSH instruction
  
 -      nodelayacct     [KNL] Disable per-task delay accounting
 +      delayacct       [KNL] Enable per-task delay accounting
  
        nodsp           [SH] Disable hardware DSP at boot time.
  
  
        nr_uarts=       [SERIAL] maximum number of UARTs to be registered.
  
 +      numa=off        [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA; only
 +                      set up a single NUMA node spanning all memory.
 +
        numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
                        NUMA balancing.
                        Allowed values are enable and disable
                        off: turn off poisoning (default)
                        on: turn on poisoning
  
 +      page_reporting.page_reporting_order=
 +                      [KNL] Minimal page reporting order
 +                      Format: <integer>
 +                      Adjust the minimal page reporting order. The page
 +                      reporting is disabled when it exceeds (MAX_ORDER-1).
 +
        panic=          [KNL] Kernel behaviour on panic: delay <timeout>
                        timeout > 0: seconds before rebooting
                        timeout = 0: wait forever
                        whole algorithm to behave better in low memory
                        condition.
  
+       rcutree.rcu_delay_page_cache_fill_msec= [KNL]
+                       Set the page-cache refill delay (in milliseconds)
+                       in response to low-memory conditions.  The range
+                       of permitted values is 0:100000.
        rcutree.jiffies_till_first_fqs= [KNL]
                        Set delay from grace-period initialization to
                        first attempt to force quiescent states.
                        Reserves a hole at the top of the kernel virtual
                        address space.
  
 -      reservelow=     [X86]
 -                      Format: nn[K]
 -                      Set the amount of memory to reserve for BIOS at
 -                      the bottom of the address space.
 -
        reset_devices   [KNL] Force drivers to reset the underlying device
                        during initialization.
  
                                  exception. Default behavior is by #AC if
                                  both features are enabled in hardware.
  
 +                      ratelimit:N -
 +                                Set system wide rate limit to N bus locks
 +                                per second for bus lock detection.
 +                                0 < N <= 1000.
 +
 +                                N/A for split lock detection.
 +
 +
                        If an #AC exception is hit in the kernel or in
                        firmware (i.e. not while executing in user mode)
                        the kernel will oops in either "warn" or "fatal"
                        Note, echoing 1 into this file without the
                        tracepoint_printk kernel cmdline option has no effect.
  
 +                      The tp_printk_stop_on_boot (see below) can also be used
 +                      to stop the printing of events to console at
 +                      late_initcall_sync.
 +
                        ** CAUTION **
  
                        Having tracepoints sent to printk() and activating high
                        frequency tracepoints such as irq or sched, can cause
                        the system to live lock.
  
 +      tp_printk_stop_on_boot[FTRACE]
 +                      When tp_printk (above) is set, it can cause a lot of noise
 +                      on the console. It may be useful to only include the
 +                      printing of events during boot up, as user space may
 +                      make the system inoperable.
 +
 +                      This command line option will stop the printing of events
 +                      to console at the late_initcall_sync() time frame.
 +
        traceoff_on_warning
                        [FTRACE] enable this option to disable tracing when a
                        warning is hit. This turns off "tracing_on". Tracing can
diff --combined init/main.c
index f622b712dc9a29a94bf88f69af03f0eef32a2861,7b6f49c4d38831495bb4a5eacf66113e457a897c..af521b30a3b84230a5acd9e10fbb4516b533fdb6
@@@ -42,6 -42,7 +42,7 @@@
  #include <linux/profile.h>
  #include <linux/kfence.h>
  #include <linux/rcupdate.h>
+ #include <linux/srcu.h>
  #include <linux/moduleparam.h>
  #include <linux/kallsyms.h>
  #include <linux/writeback.h>
@@@ -386,6 -387,16 +387,6 @@@ static char * __init xbc_make_cmdline(c
        return new_cmdline;
  }
  
 -static u32 boot_config_checksum(unsigned char *p, u32 size)
 -{
 -      u32 ret = 0;
 -
 -      while (size--)
 -              ret += *p++;
 -
 -      return ret;
 -}
 -
  static int __init bootconfig_params(char *param, char *val,
                                    const char *unused, void *arg)
  {
@@@ -429,7 -440,7 +430,7 @@@ static void __init setup_boot_config(vo
                return;
        }
  
 -      if (boot_config_checksum((unsigned char *)data, size) != csum) {
 +      if (xbc_calc_checksum(data, size) != csum) {
                pr_err("bootconfig checksum failed\n");
                return;
        }
@@@ -682,7 -693,6 +683,7 @@@ noinline void __ref rest_init(void
         */
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 +      tsk->flags |= PF_NO_SETAFFINITY;
        set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
        rcu_read_unlock();
  
@@@ -863,47 -873,6 +864,47 @@@ void __init __weak arch_call_rest_init(
        rest_init();
  }
  
 +static void __init print_unknown_bootoptions(void)
 +{
 +      char *unknown_options;
 +      char *end;
 +      const char *const *p;
 +      size_t len;
 +
 +      if (panic_later || (!argv_init[1] && !envp_init[2]))
 +              return;
 +
 +      /*
 +       * Determine how many options we have to print out, plus a space
 +       * before each
 +       */
 +      len = 1; /* null terminator */
 +      for (p = &argv_init[1]; *p; p++) {
 +              len++;
 +              len += strlen(*p);
 +      }
 +      for (p = &envp_init[2]; *p; p++) {
 +              len++;
 +              len += strlen(*p);
 +      }
 +
 +      unknown_options = memblock_alloc(len, SMP_CACHE_BYTES);
 +      if (!unknown_options) {
 +              pr_err("%s: Failed to allocate %zu bytes\n",
 +                      __func__, len);
 +              return;
 +      }
 +      end = unknown_options;
 +
 +      for (p = &argv_init[1]; *p; p++)
 +              end += sprintf(end, " %s", *p);
 +      for (p = &envp_init[2]; *p; p++)
 +              end += sprintf(end, " %s", *p);
 +
 +      pr_notice("Unknown command line parameters:%s\n", unknown_options);
 +      memblock_free(__pa(unknown_options), len);
 +}
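
/*
 * Given the pr_notice() format in print_unknown_bootoptions() above, booting
 * with an option the kernel does not recognize (say, a made-up "foo=bar")
 * should produce a console line roughly like:
 *
 *   Unknown command line parameters: foo=bar
 *
 * ("foo=bar" is only an illustrative placeholder.)
 */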
 +
  asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
  {
        char *command_line;
                                  static_command_line, __start___param,
                                  __stop___param - __start___param,
                                  -1, -1, NULL, &unknown_bootoption);
 +      print_unknown_bootoptions();
        if (!IS_ERR_OR_NULL(after_dashes))
                parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
                           NULL, set_init_arg);
         * time - but meanwhile we still have a functioning scheduler.
         */
        sched_init();
 -      /*
 -       * Disable preemption - early bootup scheduling is extremely
 -       * fragile until we cpu_idle() for the first time.
 -       */
 -      preempt_disable();
 +
        if (WARN(!irqs_disabled(),
                 "Interrupts were enabled *very* early, fixing it\n"))
                local_irq_disable();
        tick_init();
        rcu_init_nohz();
        init_timers();
+       srcu_init();
        hrtimers_init();
        softirq_init();
        timekeeping_init();
@@@ -1473,11 -1446,6 +1475,11 @@@ static int __ref kernel_init(void *unus
  {
        int ret;
  
 +      /*
 +       * Wait until kthreadd is all set-up.
 +       */
 +      wait_for_completion(&kthreadd_done);
 +
        kernel_init_freeable();
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();
@@@ -1558,6 -1526,11 +1560,6 @@@ void __init console_on_rootfs(void
  
  static noinline void __init kernel_init_freeable(void)
  {
 -      /*
 -       * Wait until kthreadd is all set-up.
 -       */
 -      wait_for_completion(&kthreadd_done);
 -
        /* Now the scheduler is fully set up and can do blocking allocations */
        gfp_allowed_mask = __GFP_BITS_MASK;
  
         */
        set_mems_allowed(node_states[N_MEMORY]);
  
 -      cad_pid = task_pid(current);
 +      cad_pid = get_pid(task_pid(current));
  
        smp_prepare_cpus(setup_max_cpus);
  
diff --combined kernel/locking/lockdep.c
index e97d0800143771aafef41ae2709b59afd6d1a088,d6c3c987009d9ff30b8b36b91fd19353d16820a0..bf1c00c881e4871c3b448e2a9c3b852f703ded94
@@@ -760,7 -760,7 +760,7 @@@ static void lockdep_print_held_locks(st
         * It's not reliable to print a task's held locks if it's not sleeping
         * and it's not the current task.
         */
 -      if (p->state == TASK_RUNNING && p != current)
 +      if (p != current && task_is_running(p))
                return;
        for (i = 0; i < depth; i++) {
                printk(" #%d: ", i);
@@@ -843,7 -843,7 +843,7 @@@ static int count_matching_names(struct 
  }
  
  /* used from NMI context -- must be lockless */
 -static __always_inline struct lock_class *
 +static noinstr struct lock_class *
  look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
  {
        struct lockdep_subclass_key *key;
        struct lock_class *class;
  
        if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
 +              instrumentation_begin();
                debug_locks_off();
                printk(KERN_ERR
                        "BUG: looking up invalid subclass: %u\n", subclass);
                printk(KERN_ERR
                        "turning off the locking correctness validator.\n");
                dump_stack();
 +              instrumentation_end();
                return NULL;
        }
  
@@@ -2306,56 -2304,7 +2306,56 @@@ static void print_lock_class_header(str
  }
  
  /*
 - * printk the shortest lock dependencies from @start to @end in reverse order:
 + * Dependency path printing:
 + *
 + * After BFS we get a lock dependency path (linked via ->parent of lock_list),
 + * printing out each lock in the dependency path will help on understanding how
 + * the deadlock could happen. Here are some details about dependency path
 + * printing:
 + *
 + * 1) A lock_list can be either forwards or backwards for a lock dependency,
 + *    for a lock dependency A -> B, there are two lock_lists:
 + *
 + *    a)      lock_list in the ->locks_after list of A, whose ->class is B and
 + *            ->links_to is A. In this case, we can say the lock_list is
 + *            "A -> B" (forwards case).
 + *
 + *    b)      lock_list in the ->locks_before list of B, whose ->class is A
 + *            and ->links_to is B. In this case, we can say the lock_list is
 + *            "B <- A" (backwards case).
 + *
 + *    The ->trace of both a) and b) point to the call trace where B was
 + *    acquired with A held.
 + *
 + * 2) A "helper" lock_list is introduced during BFS, this lock_list doesn't
 + *    represent a certain lock dependency, it only provides an initial entry
 + *    for BFS. For example, BFS may introduce a "helper" lock_list whose
 + *    ->class is A, as a result BFS will search all dependencies starting with
 + *    A, e.g. A -> B or A -> C.
 + *
 + *    The notation of a forwards helper lock_list is like "-> A", which means
 + * we should search the forwards dependencies starting with "A", e.g. A -> B
 + *    or A -> C.
 + *
 + * The notation of a backwards helper lock_list is like "<- B", which means
 + *    we should search the backwards dependencies ending with "B", e.g.
 + *    B <- A or B <- C.
 + */
 +
 +/*
 + * printk the shortest lock dependencies from @root to @leaf in reverse order.
 + *
 + * We have a lock dependency path as follows:
 + *
 + *    @root                                                                 @leaf
 + *      |                                                                     |
 + *      V                                                                     V
 + *              ->parent                                   ->parent
 + * | lock_list | <--------- | lock_list | ... | lock_list  | <--------- | lock_list |
 + * |    -> L1  |            | L1 -> L2  | ... |Ln-2 -> Ln-1|            | Ln-1 -> Ln|
 + *
 + * , so it's natural that we start from @leaf and print every ->class and
 + * ->trace until we reach the @root.
   */
  static void __used
  print_shortest_lock_dependencies(struct lock_list *leaf,
        } while (entry && (depth >= 0));
  }
  
 +/*
 + * printk the shortest lock dependencies from @leaf to @root.
 + *
 + * We have a lock dependency path (from a backwards search) as follows:
 + *
 + *    @leaf                                                                 @root
 + *      |                                                                     |
 + *      V                                                                     V
 + *              ->parent                                   ->parent
 + * | lock_list | ---------> | lock_list | ... | lock_list  | ---------> | lock_list |
 + * | L2 <- L1  |            | L3 <- L2  | ... | Ln <- Ln-1 |            |    <- Ln  |
 + *
 + * , so when we iterate from @leaf to @root, we actually print the lock
 + * dependency path L1 -> L2 -> .. -> Ln in the non-reverse order.
 + *
 + * Another thing to notice here is that ->class of L2 <- L1 is L1, while the
 + * ->trace of L2 <- L1 is the call trace of L2, in fact we don't have the call
 + * trace of L1 in the dependency path, which is alright, because most of the
 + * time we can figure out where L1 is held from the call trace of L2.
 + */
 +static void __used
 +print_shortest_lock_dependencies_backwards(struct lock_list *leaf,
 +                                         struct lock_list *root)
 +{
 +      struct lock_list *entry = leaf;
 +      const struct lock_trace *trace = NULL;
 +      int depth;
 +
 +      /*compute depth from generated tree by BFS*/
 +      depth = get_lock_depth(leaf);
 +
 +      do {
 +              print_lock_class_header(entry->class, depth);
 +              if (trace) {
 +                      printk("%*s ... acquired at:\n", depth, "");
 +                      print_lock_trace(trace, 2);
 +                      printk("\n");
 +              }
 +
 +              /*
 +               * Record the pointer to the trace for the next lock_list
 +               * entry, see the comments for the function.
 +               */
 +              trace = entry->trace;
 +
 +              if (depth == 0 && (entry != root)) {
 +                      printk("lockdep:%s bad path found in chain graph\n", __func__);
 +                      break;
 +              }
 +
 +              entry = get_lock_parent(entry);
 +              depth--;
 +      } while (entry && (depth >= 0));
 +}
 +
  static void
  print_irq_lock_scenario(struct lock_list *safe_entry,
                        struct lock_list *unsafe_entry,
@@@ -2552,7 -2446,10 +2552,7 @@@ print_bad_irq_dependency(struct task_st
        lockdep_print_held_locks(curr);
  
        pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
 -      prev_root->trace = save_trace();
 -      if (!prev_root->trace)
 -              return;
 -      print_shortest_lock_dependencies(backwards_entry, prev_root);
 +      print_shortest_lock_dependencies_backwards(backwards_entry, prev_root);
  
        pr_warn("\nthe dependencies between the lock to be acquired");
        pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
@@@ -2770,18 -2667,8 +2770,18 @@@ static int check_irq_usage(struct task_
         * Step 3: we found a bad match! Now retrieve a lock from the backward
         * list whose usage mask matches the exclusive usage mask from the
         * lock found on the forward list.
 +       *
 +       * Note, we should only keep the LOCKF_ENABLED_IRQ_ALL bits, considering
 +       * the following case:
 +       *
 +       * When trying to add A -> B to the graph, we find that there is a
 +       * hardirq-safe L, that L -> ... -> A, and another hardirq-unsafe M,
 +       * that B -> ... -> M.  However, M is **softirq-safe**: if we used the
 +       * exact inverted bits of M's usage_mask, we would find another lock N
 +       * that is **softirq-unsafe** with N -> ... -> A, even though
 +       * N -> .. -> M will not cause an inversion deadlock.
         */
 -      backward_mask = original_mask(target_entry1->class->usage_mask);
 +      backward_mask = original_mask(target_entry1->class->usage_mask & LOCKF_ENABLED_IRQ_ALL);
  
        ret = find_usage_backwards(&this, backward_mask, &target_entry);
        if (bfs_error(ret)) {
@@@ -2831,7 -2718,7 +2831,7 @@@ static inline bool usage_skip(struct lo
   * <target> or not. If it can, <src> -> <target> dependency is already
   * in the graph.
   *
 - * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if
 + * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not, return BFS_E* if
   * any error appears in the bfs search.
   */
  static noinline enum bfs_result
@@@ -4690,7 -4577,7 +4690,7 @@@ static int check_wait_context(struct ta
        u8 curr_inner;
        int depth;
  
 -      if (!curr->lockdep_depth || !next_inner || next->trylock)
 +      if (!next_inner || next->trylock)
                return 0;
  
        if (!next_outer)
@@@ -5849,7 -5736,7 +5849,7 @@@ void lock_contended(struct lockdep_map 
  {
        unsigned long flags;
  
 -      trace_lock_acquired(lock, ip);
 +      trace_lock_contended(lock, ip);
  
        if (unlikely(!lock_stat || !lockdep_enabled()))
                return;
@@@ -5867,7 -5754,7 +5867,7 @@@ void lock_acquired(struct lockdep_map *
  {
        unsigned long flags;
  
 -      trace_lock_contended(lock, ip);
 +      trace_lock_acquired(lock, ip);
  
        if (unlikely(!lock_stat || !lockdep_enabled()))
                return;
@@@ -6506,6 -6393,7 +6506,7 @@@ asmlinkage __visible void lockdep_sys_e
  void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
  {
        struct task_struct *curr = current;
+       int dl = READ_ONCE(debug_locks);
  
        /* Note: the following can be executed concurrently, so be careful. */
        pr_warn("\n");
        pr_warn("-----------------------------\n");
        pr_warn("%s:%d %s!\n", file, line, s);
        pr_warn("\nother info that might help us debug this:\n\n");
-       pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+       pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n%s",
               !rcu_lockdep_current_cpu_online()
                        ? "RCU used illegally from offline CPU!\n"
                        : "",
-              rcu_scheduler_active, debug_locks);
+              rcu_scheduler_active, dl,
+              dl ? "" : "Possible false positive due to lockdep disabling via debug_locks = 0\n");
  
        /*
         * If a CPU is in the RCU-free window in idle (ie: in the section
diff --combined kernel/rcu/rcutorture.c
index 194b9c145c402eb7e55229d53b8480edbce3124d,ec69273898af96bedeb7fb3d1352e84a81e2b4bc..40ef5417d95451bd4aab3f20c4c7f1de011c43b6
@@@ -245,12 -245,6 +245,6 @@@ static const char *rcu_torture_writer_s
        return rcu_torture_writer_state_names[i];
  }
  
- #if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT)
- # define rcu_can_boost() 1
- #else
- # define rcu_can_boost() 0
- #endif
  #ifdef CONFIG_RCU_TRACE
  static u64 notrace rcu_trace_clock_local(void)
  {
@@@ -331,6 -325,7 +325,7 @@@ struct rcu_torture_ops 
        void (*read_delay)(struct torture_random_state *rrsp,
                           struct rt_read_seg *rtrsp);
        void (*readunlock)(int idx);
+       int (*readlock_held)(void);
        unsigned long (*get_gp_seq)(void);
        unsigned long (*gp_diff)(unsigned long new, unsigned long old);
        void (*deferred_free)(struct rcu_torture *p);
        void (*fqs)(void);
        void (*stats)(void);
        void (*gp_kthread_dbg)(void);
+       bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
        int (*stall_dur)(void);
        int irq_capable;
        int can_boost;
@@@ -359,6 -355,11 +355,11 @@@ static struct rcu_torture_ops *cur_ops
   * Definitions for rcu torture testing.
   */
  
+ static int torture_readlock_not_held(void)
+ {
+       return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+ }
  static int rcu_torture_read_lock(void) __acquires(RCU)
  {
        rcu_read_lock();
@@@ -483,30 -484,32 +484,32 @@@ static void rcu_sync_torture_init(void
  }
  
  static struct rcu_torture_ops rcu_ops = {
-       .ttype          = RCU_FLAVOR,
-       .init           = rcu_sync_torture_init,
-       .readlock       = rcu_torture_read_lock,
-       .read_delay     = rcu_read_delay,
-       .readunlock     = rcu_torture_read_unlock,
-       .get_gp_seq     = rcu_get_gp_seq,
-       .gp_diff        = rcu_seq_diff,
-       .deferred_free  = rcu_torture_deferred_free,
-       .sync           = synchronize_rcu,
-       .exp_sync       = synchronize_rcu_expedited,
-       .get_gp_state   = get_state_synchronize_rcu,
-       .start_gp_poll  = start_poll_synchronize_rcu,
-       .poll_gp_state  = poll_state_synchronize_rcu,
-       .cond_sync      = cond_synchronize_rcu,
-       .call           = call_rcu,
-       .cb_barrier     = rcu_barrier,
-       .fqs            = rcu_force_quiescent_state,
-       .stats          = NULL,
-       .gp_kthread_dbg = show_rcu_gp_kthreads,
-       .stall_dur      = rcu_jiffies_till_stall_check,
-       .irq_capable    = 1,
-       .can_boost      = rcu_can_boost(),
-       .extendables    = RCUTORTURE_MAX_EXTEND,
-       .name           = "rcu"
+       .ttype                  = RCU_FLAVOR,
+       .init                   = rcu_sync_torture_init,
+       .readlock               = rcu_torture_read_lock,
+       .read_delay             = rcu_read_delay,
+       .readunlock             = rcu_torture_read_unlock,
+       .readlock_held          = torture_readlock_not_held,
+       .get_gp_seq             = rcu_get_gp_seq,
+       .gp_diff                = rcu_seq_diff,
+       .deferred_free          = rcu_torture_deferred_free,
+       .sync                   = synchronize_rcu,
+       .exp_sync               = synchronize_rcu_expedited,
+       .get_gp_state           = get_state_synchronize_rcu,
+       .start_gp_poll          = start_poll_synchronize_rcu,
+       .poll_gp_state          = poll_state_synchronize_rcu,
+       .cond_sync              = cond_synchronize_rcu,
+       .call                   = call_rcu,
+       .cb_barrier             = rcu_barrier,
+       .fqs                    = rcu_force_quiescent_state,
+       .stats                  = NULL,
+       .gp_kthread_dbg         = show_rcu_gp_kthreads,
+       .check_boost_failed     = rcu_check_boost_fail,
+       .stall_dur              = rcu_jiffies_till_stall_check,
+       .irq_capable            = 1,
+       .can_boost              = IS_ENABLED(CONFIG_RCU_BOOST),
+       .extendables            = RCUTORTURE_MAX_EXTEND,
+       .name                   = "rcu"
  };
  
  /*
@@@ -540,6 -543,7 +543,7 @@@ static struct rcu_torture_ops rcu_buste
        .readlock       = rcu_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = rcu_torture_read_unlock,
+       .readlock_held  = torture_readlock_not_held,
        .get_gp_seq     = rcu_no_completed,
        .deferred_free  = rcu_busted_torture_deferred_free,
        .sync           = synchronize_rcu_busted,
@@@ -589,6 -593,11 +593,11 @@@ static void srcu_torture_read_unlock(in
        srcu_read_unlock(srcu_ctlp, idx);
  }
  
+ static int torture_srcu_read_lock_held(void)
+ {
+       return srcu_read_lock_held(srcu_ctlp);
+ }
  static unsigned long srcu_torture_completed(void)
  {
        return srcu_batches_completed(srcu_ctlp);
@@@ -646,6 -655,7 +655,7 @@@ static struct rcu_torture_ops srcu_ops 
        .readlock       = srcu_torture_read_lock,
        .read_delay     = srcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+       .readlock_held  = torture_srcu_read_lock_held,
        .get_gp_seq     = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@@ -681,6 -691,7 +691,7 @@@ static struct rcu_torture_ops srcud_op
        .readlock       = srcu_torture_read_lock,
        .read_delay     = srcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+       .readlock_held  = torture_srcu_read_lock_held,
        .get_gp_seq     = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@@ -700,6 -711,7 +711,7 @@@ static struct rcu_torture_ops busted_sr
        .readlock       = srcu_torture_read_lock,
        .read_delay     = rcu_read_delay,
        .readunlock     = srcu_torture_read_unlock,
+       .readlock_held  = torture_srcu_read_lock_held,
        .get_gp_seq     = srcu_torture_completed,
        .deferred_free  = srcu_torture_deferred_free,
        .sync           = srcu_torture_synchronize,
@@@ -787,6 -799,7 +799,7 @@@ static struct rcu_torture_ops trivial_o
        .readlock       = rcu_torture_read_lock_trivial,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = rcu_torture_read_unlock_trivial,
+       .readlock_held  = torture_readlock_not_held,
        .get_gp_seq     = rcu_no_completed,
        .sync           = synchronize_rcu_trivial,
        .exp_sync       = synchronize_rcu_trivial,
@@@ -850,6 -863,7 +863,7 @@@ static struct rcu_torture_ops tasks_tra
        .readlock       = tasks_tracing_torture_read_lock,
        .read_delay     = srcu_read_delay,  /* just reuse srcu's version. */
        .readunlock     = tasks_tracing_torture_read_unlock,
+       .readlock_held  = rcu_read_lock_trace_held,
        .get_gp_seq     = rcu_no_completed,
        .deferred_free  = rcu_tasks_tracing_torture_deferred_free,
        .sync           = synchronize_rcu_tasks_trace,
@@@ -871,32 -885,13 +885,13 @@@ static unsigned long rcutorture_seq_dif
        return cur_ops->gp_diff(new, old);
  }
  
- static bool __maybe_unused torturing_tasks(void)
- {
-       return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
- }
  /*
   * RCU torture priority-boost testing.  Runs one real-time thread per
-  * CPU for moderate bursts, repeatedly registering RCU callbacks and
-  * spinning waiting for them to be invoked.  If a given callback takes
-  * too long to be invoked, we assume that priority inversion has occurred.
+  * CPU for moderate bursts, repeatedly starting grace periods and waiting
+  * for them to complete.  If a given grace period takes too long, we assume
+  * that priority inversion has occurred.
   */
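
/*
 * A minimal sketch of the polled grace-period API that the rewritten boost
 * test drives through cur_ops->start_gp_poll() and cur_ops->poll_gp_state()
 * (mapped to start_poll_synchronize_rcu() and poll_state_synchronize_rcu()
 * in rcu_ops above).  Illustrative only; this helper is not part of the
 * patch.
 */
static void example_poll_for_grace_period(void)
{
	unsigned long cookie;

	cookie = start_poll_synchronize_rcu();	/* Snapshot or start a GP. */
	while (!poll_state_synchronize_rcu(cookie))
		schedule_timeout_uninterruptible(1);	/* GP not yet complete. */
}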
  
- struct rcu_boost_inflight {
-       struct rcu_head rcu;
-       int inflight;
- };
- static void rcu_torture_boost_cb(struct rcu_head *head)
- {
-       struct rcu_boost_inflight *rbip =
-               container_of(head, struct rcu_boost_inflight, rcu);
-       /* Ensure RCU-core accesses precede clearing ->inflight */
-       smp_store_release(&rbip->inflight, 0);
- }
  static int old_rt_runtime = -1;
  
  static void rcu_torture_disable_rt_throttle(void)
@@@ -923,49 -918,68 +918,68 @@@ static void rcu_torture_enable_rt_throt
        old_rt_runtime = -1;
  }
  
- static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
  {
+       int cpu;
        static int dbg_done;
-       if (end - start > test_boost_duration * HZ - HZ / 2) {
+       unsigned long end = jiffies;
+       bool gp_done;
+       unsigned long j;
+       static unsigned long last_persist;
+       unsigned long lp;
+       unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+       if (end - *start > mininterval) {
+               // Recheck after checking time to avoid false positives.
+               smp_mb(); // Time check before grace-period check.
+               if (cur_ops->poll_gp_state(gp_state))
+                       return false; // passed, though perhaps just barely
+               if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+                       // At most one persisted message per boost test.
+                       j = jiffies;
+                       lp = READ_ONCE(last_persist);
+                       if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+                               pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+                       return false; // passed on a technicality
+               }
                VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
                n_rcu_torture_boost_failure++;
-               if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg)
+               if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+                       pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+                               current->rt_priority, gp_state, end - *start);
                        cur_ops->gp_kthread_dbg();
+                       // Recheck after print to flag grace period ending during splat.
+                       gp_done = cur_ops->poll_gp_state(gp_state);
+                       pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+                               gp_done ? "ended already" : "still pending");
  
-               return true; /* failed */
+               }
+               return true; // failed
+       } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+               *start = jiffies;
        }
  
-       return false; /* passed */
+       return false; // passed
  }
  
  static int rcu_torture_boost(void *arg)
  {
-       unsigned long call_rcu_time;
        unsigned long endtime;
+       unsigned long gp_state;
+       unsigned long gp_state_time;
        unsigned long oldstarttime;
-       struct rcu_boost_inflight rbi = { .inflight = 0 };
  
        VERBOSE_TOROUT_STRING("rcu_torture_boost started");
  
        /* Set real-time priority. */
        sched_set_fifo_low(current);
  
-       init_rcu_head_on_stack(&rbi.rcu);
        /* Each pass through the following loop does one boost-test cycle. */
        do {
                bool failed = false; // Test failed already in this test interval
-               bool firsttime = true;
+               bool gp_initiated = false;
  
-               /* Increment n_rcu_torture_boosts once per boost-test */
-               while (!kthread_should_stop()) {
-                       if (mutex_trylock(&boost_mutex)) {
-                               n_rcu_torture_boosts++;
-                               mutex_unlock(&boost_mutex);
-                               break;
-                       }
-                       schedule_timeout_uninterruptible(1);
-               }
                if (kthread_should_stop())
                        goto checkwait;
  
                                goto checkwait;
                }
  
-               /* Do one boost-test interval. */
+               // Do one boost-test interval.
                endtime = oldstarttime + test_boost_duration * HZ;
                while (time_before(jiffies, endtime)) {
-                       /* If we don't have a callback in flight, post one. */
-                       if (!smp_load_acquire(&rbi.inflight)) {
-                               /* RCU core before ->inflight = 1. */
-                               smp_store_release(&rbi.inflight, 1);
-                               cur_ops->call(&rbi.rcu, rcu_torture_boost_cb);
-                               /* Check if the boost test failed */
-                               if (!firsttime && !failed)
-                                       failed = rcu_torture_boost_failed(call_rcu_time, jiffies);
-                               call_rcu_time = jiffies;
-                               firsttime = false;
+                       // Has current GP gone too long?
+                       if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+                               failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+                       // If we don't have a grace period in flight, start one.
+                       if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+                               gp_state = cur_ops->start_gp_poll();
+                               gp_initiated = true;
+                               gp_state_time = jiffies;
                        }
-                       if (stutter_wait("rcu_torture_boost"))
+                       if (stutter_wait("rcu_torture_boost")) {
                                sched_set_fifo_low(current);
+                               // If the grace period already ended,
+                               // we don't know when that happened, so
+                               // start over.
+                               if (cur_ops->poll_gp_state(gp_state))
+                                       gp_initiated = false;
+                       }
                        if (torture_must_stop())
                                goto checkwait;
                }
  
-               /*
-                * If boost never happened, then inflight will always be 1, in
-                * this case the boost check would never happen in the above
-                * loop so do another one here.
-                */
-               if (!firsttime && !failed && smp_load_acquire(&rbi.inflight))
-                       rcu_torture_boost_failed(call_rcu_time, jiffies);
+               // In case the grace period extended beyond the end of the loop.
+               if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+                       rcu_torture_boost_failed(gp_state, &gp_state_time);
  
                /*
                 * Set the start time of the next test interval.
                 * interval.  Besides, we are running at RT priority,
                 * so delays should be relatively rare.
                 */
-               while (oldstarttime == boost_starttime &&
-                      !kthread_should_stop()) {
+               while (oldstarttime == boost_starttime && !kthread_should_stop()) {
                        if (mutex_trylock(&boost_mutex)) {
-                               boost_starttime = jiffies +
-                                                 test_boost_interval * HZ;
+                               if (oldstarttime == boost_starttime) {
+                                       boost_starttime = jiffies + test_boost_interval * HZ;
+                                       n_rcu_torture_boosts++;
+                               }
                                mutex_unlock(&boost_mutex);
                                break;
                        }
@@@ -1030,15 -1045,11 +1045,11 @@@ checkwait:   if (stutter_wait("rcu_tortur
                        sched_set_fifo_low(current);
        } while (!torture_must_stop());
  
-       while (smp_load_acquire(&rbi.inflight))
-               schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks.
        /* Clean up and exit. */
-       while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+       while (!kthread_should_stop()) {
                torture_shutdown_absorb("rcu_torture_boost");
                schedule_timeout_uninterruptible(1);
        }
-       destroy_rcu_head_on_stack(&rbi.rcu);
        torture_kthread_stopping("rcu_torture_boost");
        return 0;
  }
@@@ -1553,11 -1564,7 +1564,7 @@@ static bool rcu_torture_one_read(struc
        started = cur_ops->get_gp_seq();
        ts = rcu_trace_clock_local();
        p = rcu_dereference_check(rcu_torture_current,
-                                 rcu_read_lock_bh_held() ||
-                                 rcu_read_lock_sched_held() ||
-                                 srcu_read_lock_held(srcu_ctlp) ||
-                                 rcu_read_lock_trace_held() ||
-                                 torturing_tasks());
+                                 !cur_ops->readlock_held || cur_ops->readlock_held());
        if (p == NULL) {
                /* Wait for rcu_torture_writer to get underway */
                rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
@@@ -1831,10 -1838,10 +1838,10 @@@ rcu_torture_stats_print(void
                srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
                                        &flags, &gp_seq);
                wtp = READ_ONCE(writer_task);
 -              pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n",
 +              pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
                         rcu_torture_writer_state_getname(),
                         rcu_torture_writer_state, gp_seq, flags,
 -                       wtp == NULL ? ~0UL : wtp->state,
 +                       wtp == NULL ? ~0U : wtp->__state,
                         wtp == NULL ? -1 : (int)task_cpu(wtp));
                if (!splatted && wtp) {
                        sched_show_task(wtp);
@@@ -1861,48 -1868,49 +1868,49 @@@ rcu_torture_stats(void *arg
                torture_shutdown_absorb("rcu_torture_stats");
        } while (!torture_must_stop());
        torture_kthread_stopping("rcu_torture_stats");
-       {
-               struct rcu_head *rhp;
-               struct kmem_cache *kcp;
-               static int z;
-               kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
-               rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
-               pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
-               pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
-               mem_dump_obj(ZERO_SIZE_PTR);
-               pr_alert("mem_dump_obj(NULL):");
-               mem_dump_obj(NULL);
-               pr_alert("mem_dump_obj(%px):", &rhp);
-               mem_dump_obj(&rhp);
-               pr_alert("mem_dump_obj(%px):", rhp);
-               mem_dump_obj(rhp);
-               pr_alert("mem_dump_obj(%px):", &rhp->func);
-               mem_dump_obj(&rhp->func);
-               pr_alert("mem_dump_obj(%px):", &z);
-               mem_dump_obj(&z);
-               kmem_cache_free(kcp, rhp);
-               kmem_cache_destroy(kcp);
-               rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
-               pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
-               pr_alert("mem_dump_obj(kmalloc %px):", rhp);
-               mem_dump_obj(rhp);
-               pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
-               mem_dump_obj(&rhp->func);
-               kfree(rhp);
-               rhp = vmalloc(4096);
-               pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
-               pr_alert("mem_dump_obj(vmalloc %px):", rhp);
-               mem_dump_obj(rhp);
-               pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
-               mem_dump_obj(&rhp->func);
-               vfree(rhp);
-       }
        return 0;
  }
  
+ /* Test mem_dump_obj() and friends.  */
+ static void rcu_torture_mem_dump_obj(void)
+ {
+       struct rcu_head *rhp;
+       struct kmem_cache *kcp;
+       static int z;
+       kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+       rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+       pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+       pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+       mem_dump_obj(ZERO_SIZE_PTR);
+       pr_alert("mem_dump_obj(NULL):");
+       mem_dump_obj(NULL);
+       pr_alert("mem_dump_obj(%px):", &rhp);
+       mem_dump_obj(&rhp);
+       pr_alert("mem_dump_obj(%px):", rhp);
+       mem_dump_obj(rhp);
+       pr_alert("mem_dump_obj(%px):", &rhp->func);
+       mem_dump_obj(&rhp->func);
+       pr_alert("mem_dump_obj(%px):", &z);
+       mem_dump_obj(&z);
+       kmem_cache_free(kcp, rhp);
+       kmem_cache_destroy(kcp);
+       rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+       pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+       pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+       mem_dump_obj(rhp);
+       pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+       mem_dump_obj(&rhp->func);
+       kfree(rhp);
+       rhp = vmalloc(4096);
+       pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+       pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+       mem_dump_obj(rhp);
+       pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+       mem_dump_obj(&rhp->func);
+       vfree(rhp);
+ }
  static void
  rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
  {
@@@ -2634,7 -2642,7 +2642,7 @@@ static bool rcu_torture_can_boost(void
  
        if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
                return false;
-       if (!cur_ops->call)
+       if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
                return false;
  
        prio = rcu_get_gp_kthreads_prio();
                return false;
  
        if (prio < 2) {
-               if (boost_warn_once  == 1)
+               if (boost_warn_once == 1)
                        return false;
  
                pr_alert("%s: WARN: RCU kthread priority too low to test boosting.  Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
@@@ -2818,6 -2826,8 +2826,8 @@@ rcu_torture_cleanup(void
        if (cur_ops->cleanup != NULL)
                cur_ops->cleanup();
  
+       rcu_torture_mem_dump_obj();
        rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
  
        if (err_segs_recorded) {
@@@ -3120,6 -3130,21 +3130,21 @@@ rcu_torture_init(void
                if (firsterr < 0)
                        goto unwind;
                rcutor_hp = firsterr;
+               // Testing RCU priority boosting requires rcutorture to do
+               // some serious abuse.  Counter this by running ksoftirqd
+               // at higher priority.
+               if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+                       for_each_online_cpu(cpu) {
+                               struct sched_param sp;
+                               struct task_struct *t;
+                               t = per_cpu(ksoftirqd, cpu);
+                               WARN_ON_ONCE(!t);
+                               sp.sched_priority = 2;
+                               sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+                       }
+               }
        }
        shutdown_jiffies = jiffies + shutdown_secs * HZ;
        firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
diff --combined kernel/rcu/tree.c
index f12056beb916c3c80345a3c6f319d205878ef2a7,28f1093027b9cc94d1d2d53a8f20d9bb2267226e..51f24ecd94b2688fa8c475d4a1797d0e1c24b211
@@@ -32,8 -32,6 +32,8 @@@
  #include <linux/export.h>
  #include <linux/completion.h>
  #include <linux/moduleparam.h>
 +#include <linux/panic.h>
 +#include <linux/panic_notifier.h>
  #include <linux/percpu.h>
  #include <linux/notifier.h>
  #include <linux/cpu.h>
@@@ -188,6 -186,17 +188,17 @@@ module_param(rcu_unlock_delay, int, 044
  static int rcu_min_cached_objs = 5;
  module_param(rcu_min_cached_objs, int, 0444);
  
+ // A page shrinker can ask for pages to be freed to make them
+ // available for other parts of the system. This usually happens
+ // under low memory conditions, and in that case we should also
+ // defer page-cache filling for a short time period.
+ //
+ // The default value is 5 seconds, which is long enough to reduce
+ // interference with the shrinker while it asks other systems to
+ // drain their caches.
+ static int rcu_delay_page_cache_fill_msec = 5000;
+ module_param(rcu_delay_page_cache_fill_msec, int, 0444);
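
/*
 * A minimal sketch (not the in-tree code) of how a page-cache refill path
 * could honor the shrinker-requested backoff, combining the new
 * rcu_delay_page_cache_fill_msec knob with the backoff_page_cache_fill flag
 * and the delayed page_cache_work added to struct kfree_rcu_cpu later in
 * this patch.
 */
static void example_schedule_page_cache_refill(struct kfree_rcu_cpu *krcp)
{
	unsigned long delay = 0;

	if (atomic_read(&krcp->backoff_page_cache_fill))
		delay = msecs_to_jiffies(rcu_delay_page_cache_fill_msec);
	queue_delayed_work(system_wq, &krcp->page_cache_work, delay);
}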
  /* Retrieve RCU kthreads priority for rcutorture */
  int rcu_get_gp_kthreads_prio(void)
  {
@@@ -204,7 -213,7 +215,7 @@@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_p
   * the need for long delays to increase some race probabilities with the
   * need for fast grace periods to increase other race probabilities.
   */
- #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
+ #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
  
  /*
   * Compute the mask of online CPUs for the specified rcu_node structure.
@@@ -244,6 -253,7 +255,7 @@@ void rcu_softirq_qs(void
  {
        rcu_qs();
        rcu_preempt_deferred_qs(current);
+       rcu_tasks_qs(current, false);
  }
  
  /*
@@@ -835,28 -845,6 +847,6 @@@ void noinstr rcu_irq_exit(void
        rcu_nmi_exit();
  }
  
- /**
-  * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
-  *                      towards in kernel preemption
-  *
-  * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
-  * from RCU point of view. Invoked from return from interrupt before kernel
-  * preemption.
-  */
- void rcu_irq_exit_preempt(void)
- {
-       lockdep_assert_irqs_disabled();
-       rcu_nmi_exit();
-       RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
-                        "RCU dynticks_nesting counter underflow/zero!");
-       RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
-                        DYNTICK_IRQ_NONIDLE,
-                        "Bad RCU  dynticks_nmi_nesting counter\n");
-       RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
-                        "RCU in extended quiescent state!");
- }
  #ifdef CONFIG_PROVE_RCU
  /**
   * rcu_irq_exit_check_preempt - Validate that scheduling is possible
@@@ -961,7 -949,7 +951,7 @@@ EXPORT_SYMBOL_GPL(rcu_idle_exit)
   */
  void noinstr rcu_user_exit(void)
  {
-       rcu_eqs_exit(1);
+       rcu_eqs_exit(true);
  }
  
  /**
@@@ -1227,7 -1215,7 +1217,7 @@@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_c
  #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
  
  /*
-  * We are reporting a quiescent state on behalf of some other CPU, so
+  * When trying to report a quiescent state on behalf of some other CPU,
   * it is our responsibility to check for and handle potential overflow
   * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
   * After all, the CPU might be in deep idle state, and thus executing no
@@@ -2050,7 -2038,7 +2040,7 @@@ static void rcu_gp_fqs_loop(void
  /*
   * Clean up after the old grace period.
   */
- static void rcu_gp_cleanup(void)
+ static noinline void rcu_gp_cleanup(void)
  {
        int cpu;
        bool needgp = false;
@@@ -2491,7 -2479,7 +2481,7 @@@ int rcutree_dead_cpu(unsigned int cpu
  
  /*
   * Invoke any RCU callbacks that have made it to the end of their grace
-  * period.  Thottle as specified by rdp->blimit.
+  * period.  Throttle as specified by rdp->blimit.
   */
  static void rcu_do_batch(struct rcu_data *rdp)
  {
   * state, for example, user mode or idle loop.  It also schedules RCU
   * core processing.  If the current grace period has gone on too long,
   * it will ask the scheduler to manufacture a context switch for the sole
-  * purpose of providing a providing the needed quiescent state.
+  * purpose of providing the needed quiescent state.
   */
  void rcu_sched_clock_irq(int user)
  {
@@@ -2913,7 -2901,6 +2903,6 @@@ static int __init rcu_spawn_core_kthrea
                  "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
        return 0;
  }
- early_initcall(rcu_spawn_core_kthreads);
  
  /*
   * Handle any core-RCU processing required by a call_rcu() invocation.
@@@ -3084,12 -3071,14 +3073,14 @@@ __call_rcu(struct rcu_head *head, rcu_c
   * period elapses, in other words after all pre-existing RCU read-side
   * critical sections have completed.  However, the callback function
   * might well execute concurrently with RCU read-side critical sections
-  * that started after call_rcu() was invoked.  RCU read-side critical
-  * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
-  * may be nested.  In addition, regions of code across which interrupts,
-  * preemption, or softirqs have been disabled also serve as RCU read-side
-  * critical sections.  This includes hardware interrupt handlers, softirq
-  * handlers, and NMI handlers.
+  * that started after call_rcu() was invoked.
+  *
+  * RCU read-side critical sections are delimited by rcu_read_lock()
+  * and rcu_read_unlock(), and may be nested.  In addition, but only in
+  * v5.0 and later, regions of code across which interrupts, preemption,
+  * or softirqs have been disabled also serve as RCU read-side critical
+  * sections.  This includes hardware interrupt handlers, softirq handlers,
+  * and NMI handlers.
   *
   * Note that all CPUs must agree that the grace period extended beyond
   * all pre-existing RCU read-side critical section.  On systems with more
   * between the call to call_rcu() and the invocation of "func()" -- even
   * if CPU A and CPU B are the same CPU (but again only if the system has
   * more than one CPU).
+  *
+  * Implementation of these memory-ordering guarantees is described here:
+  * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
   */
  void call_rcu(struct rcu_head *head, rcu_callback_t func)
  {
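/*
 * A minimal call_rcu() usage sketch (hypothetical "struct foo", not part of
 * this patch): embed an rcu_head in the protected structure and free the
 * structure from the callback once the grace period has elapsed.
 */
struct foo {
        struct list_head list;
        struct rcu_head rcu;
        int data;
};

static void foo_reclaim(struct rcu_head *rhp)
{
        struct foo *fp = container_of(rhp, struct foo, rcu);

        kfree(fp);
}

static void foo_remove(struct foo *fp)
{
        list_del_rcu(&fp->list);        /* unlink under the update-side lock */
        call_rcu(&fp->rcu, foo_reclaim);
}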
@@@ -3173,6 -3165,7 +3167,7 @@@ struct kfree_rcu_cpu_work 
   *    Even though it is lockless an access has to be protected by the
   *    per-cpu lock.
   * @page_cache_work: A work to refill the cache when it is empty
+  * @backoff_page_cache_fill: Delay cache refills
   * @work_in_progress: Indicates that page_cache_work is running
   * @hrtimer: A hrtimer for scheduling a page_cache_work
   * @nr_bkv_objs: number of allocated objects at @bkvcache.
@@@ -3192,7 -3185,8 +3187,8 @@@ struct kfree_rcu_cpu 
        bool initialized;
        int count;
  
-       struct work_struct page_cache_work;
+       struct delayed_work page_cache_work;
+       atomic_t backoff_page_cache_fill;
        atomic_t work_in_progress;
        struct hrtimer hrtimer;
  
@@@ -3239,7 -3233,7 +3235,7 @@@ get_cached_bnode(struct kfree_rcu_cpu *
        if (!krcp->nr_bkv_objs)
                return NULL;
  
-       krcp->nr_bkv_objs--;
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
        return (struct kvfree_rcu_bulk_data *)
                llist_del_first(&krcp->bkvcache);
  }
@@@ -3253,14 -3247,33 +3249,33 @@@ put_cached_bnode(struct kfree_rcu_cpu *
                return false;
  
        llist_add((struct llist_node *) bnode, &krcp->bkvcache);
-       krcp->nr_bkv_objs++;
+       WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
        return true;
+ }
+ static int
+ drain_page_cache(struct kfree_rcu_cpu *krcp)
+ {
+       unsigned long flags;
+       struct llist_node *page_list, *pos, *n;
+       int freed = 0;
  
+       raw_spin_lock_irqsave(&krcp->lock, flags);
+       page_list = llist_del_all(&krcp->bkvcache);
+       WRITE_ONCE(krcp->nr_bkv_objs, 0);
+       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+       llist_for_each_safe(pos, n, page_list) {
+               free_page((unsigned long)pos);
+               freed++;
+       }
+       return freed;
  }
  
  /*
   * This function is invoked in workqueue context after a grace period.
-  * It frees all the objects queued on ->bhead_free or ->head_free.
+  * It frees all the objects queued on ->bkvhead_free or ->head_free.
   */
  static void kfree_rcu_work(struct work_struct *work)
  {
        krwp->head_free = NULL;
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
  
-       // Handle two first channels.
+       // Handle the first two channels.
        for (i = 0; i < FREE_N_CHANNELS; i++) {
                for (; bkvhead[i]; bkvhead[i] = bnext) {
                        bnext = bkvhead[i]->next;
        }
  
        /*
-        * Emergency case only. It can happen under low memory
-        * condition when an allocation gets failed, so the "bulk"
-        * path can not be temporary maintained.
+        * This is used when the "bulk" path can not be used for the
+        * double-argument of kvfree_rcu().  This happens when the
+        * page-cache is empty, which means that objects are instead
+        * queued on a linked list through their rcu_head structures.
+        * This list is named "Channel 3".
         */
        for (; head; head = next) {
                unsigned long offset = (unsigned long)head->func;
  }
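/*
 * A minimal kvfree_rcu() usage sketch (hypothetical "struct bar", not part
 * of this patch): the double-argument form names the embedded rcu_head
 * field, which is exactly what the "Channel 3" fallback above relies on
 * when no cached page is available for the bulk path.
 */
struct bar {
        struct rcu_head rh;
        int data;
};

static void bar_release(struct bar *bp)
{
        kvfree_rcu(bp, rh);     /* bp was allocated with kmalloc() or kvmalloc() */
}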
  
  /*
-  * Schedule the kfree batch RCU work to run in workqueue context after a GP.
-  *
-  * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
-  * timeout has been reached.
+  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
   */
- static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+ static void kfree_rcu_monitor(struct work_struct *work)
  {
-       struct kfree_rcu_cpu_work *krwp;
-       bool repeat = false;
+       struct kfree_rcu_cpu *krcp = container_of(work,
+               struct kfree_rcu_cpu, monitor_work.work);
+       unsigned long flags;
        int i, j;
  
-       lockdep_assert_held(&krcp->lock);
+       raw_spin_lock_irqsave(&krcp->lock, flags);
  
+       // Attempt to start a new batch.
        for (i = 0; i < KFREE_N_BATCHES; i++) {
-               krwp = &(krcp->krw_arr[i]);
+               struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
  
-               /*
-                * Try to detach bkvhead or head and attach it over any
-                * available corresponding free channel. It can be that
-                * a previous RCU batch is in progress, it means that
-                * immediately to queue another one is not possible so
-                * return false to tell caller to retry.
-                */
+               // Try to detach bkvhead or head and attach it over any
+               // available corresponding free channel. It can be that
+               // a previous RCU batch is still in progress, in which
+               // case another one cannot be queued immediately, so the
+               // monitor work is rearmed instead.
                if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
                        (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
                                (krcp->head && !krwp->head_free)) {
-                       // Channel 1 corresponds to SLAB ptrs.
-                       // Channel 2 corresponds to vmalloc ptrs.
+                       // Channel 1 corresponds to the SLAB-pointer bulk path.
+                       // Channel 2 corresponds to vmalloc-pointer bulk path.
                        for (j = 0; j < FREE_N_CHANNELS; j++) {
                                if (!krwp->bkvhead_free[j]) {
                                        krwp->bkvhead_free[j] = krcp->bkvhead[j];
                                }
                        }
  
-                       // Channel 3 corresponds to emergency path.
+                       // Channel 3 corresponds to both SLAB and vmalloc
+                       // objects queued on the linked list.
                        if (!krwp->head_free) {
                                krwp->head_free = krcp->head;
                                krcp->head = NULL;
  
                        WRITE_ONCE(krcp->count, 0);
  
-                       /*
-                        * One work is per one batch, so there are three
-                        * "free channels", the batch can handle. It can
-                        * be that the work is in the pending state when
-                        * channels have been detached following by each
-                        * other.
-                        */
+                       // There is one work item per batch, so there are
+                       // three "free channels" that the batch can handle.
+                       // The work can already be pending when the channels
+                       // have been detached one after the other.
                        queue_rcu_work(system_wq, &krwp->rcu_work);
                }
-               // Repeat if any "free" corresponding channel is still busy.
-               if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
-                       repeat = true;
        }
  
-       return !repeat;
- }
- static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
-                                         unsigned long flags)
- {
-       // Attempt to start a new batch.
-       krcp->monitor_todo = false;
-       if (queue_kfree_rcu_work(krcp)) {
-               // Success! Our job is done here.
-               raw_spin_unlock_irqrestore(&krcp->lock, flags);
-               return;
-       }
+       // If there is nothing to detach, our job is done here. If at
+       // least one of the channels is still busy, rearm the work to
+       // repeat the attempt, because previous batches are still in
+       // progress.
+       if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
+               krcp->monitor_todo = false;
+       else
+               schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
  
-       // Previous RCU batch still in progress, try again later.
-       krcp->monitor_todo = true;
-       schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
  }
  
- /*
-  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
-  * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
-  */
- static void kfree_rcu_monitor(struct work_struct *work)
- {
-       unsigned long flags;
-       struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
-                                                monitor_work.work);
-       raw_spin_lock_irqsave(&krcp->lock, flags);
-       if (krcp->monitor_todo)
-               kfree_rcu_drain_unlock(krcp, flags);
-       else
-               raw_spin_unlock_irqrestore(&krcp->lock, flags);
- }
  static enum hrtimer_restart
  schedule_page_work_fn(struct hrtimer *t)
  {
        struct kfree_rcu_cpu *krcp =
                container_of(t, struct kfree_rcu_cpu, hrtimer);
  
-       queue_work(system_highpri_wq, &krcp->page_cache_work);
+       queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
        return HRTIMER_NORESTART;
  }
  
@@@ -3457,12 -3440,16 +3442,16 @@@ static void fill_page_cache_func(struc
        struct kvfree_rcu_bulk_data *bnode;
        struct kfree_rcu_cpu *krcp =
                container_of(work, struct kfree_rcu_cpu,
-                       page_cache_work);
+                       page_cache_work.work);
        unsigned long flags;
+       int nr_pages;
        bool pushed;
        int i;
  
-       for (i = 0; i < rcu_min_cached_objs; i++) {
+       nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+               1 : rcu_min_cached_objs;
+       for (i = 0; i < nr_pages; i++) {
                bnode = (struct kvfree_rcu_bulk_data *)
                        __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
  
        }
  
        atomic_set(&krcp->work_in_progress, 0);
+       atomic_set(&krcp->backoff_page_cache_fill, 0);
  }
  
  static void
@@@ -3486,10 -3474,15 +3476,15 @@@ run_page_cache_worker(struct kfree_rcu_
  {
        if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
                        !atomic_xchg(&krcp->work_in_progress, 1)) {
-               hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
-                       HRTIMER_MODE_REL);
-               krcp->hrtimer.function = schedule_page_work_fn;
-               hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+               if (atomic_read(&krcp->backoff_page_cache_fill)) {
+                       queue_delayed_work(system_wq,
+                               &krcp->page_cache_work,
+                                       msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+               } else {
+                       hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+                       krcp->hrtimer.function = schedule_page_work_fn;
+                       hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+               }
        }
  }
  
@@@ -3554,11 -3547,11 +3549,11 @@@ add_ptr_to_bulk_krc_lock(struct kfree_r
  }
  
  /*
-  * Queue a request for lazy invocation of appropriate free routine after a
-  * grace period. Please note there are three paths are maintained, two are the
-  * main ones that use array of pointers interface and third one is emergency
-  * one, that is used only when the main path can not be maintained temporary,
-  * due to memory pressure.
+  * Queue a request for lazy invocation of the appropriate free routine
+  * after a grace period.  Please note that three paths are maintained,
+  * two for the common case using arrays of pointers and a third one that
+  * is used only when the main paths cannot be used, for example, due to
+  * memory pressure.
   *
   * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
   * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
@@@ -3647,6 -3640,8 +3642,8 @@@ kfree_rcu_shrink_count(struct shrinker 
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  
                count += READ_ONCE(krcp->count);
+               count += READ_ONCE(krcp->nr_bkv_objs);
+               atomic_set(&krcp->backoff_page_cache_fill, 1);
        }
  
        return count;
@@@ -3656,18 -3651,14 +3653,14 @@@ static unsigned lon
  kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
  {
        int cpu, freed = 0;
-       unsigned long flags;
  
        for_each_possible_cpu(cpu) {
                int count;
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  
                count = krcp->count;
-               raw_spin_lock_irqsave(&krcp->lock, flags);
-               if (krcp->monitor_todo)
-                       kfree_rcu_drain_unlock(krcp, flags);
-               else
-                       raw_spin_unlock_irqrestore(&krcp->lock, flags);
+               count += drain_page_cache(krcp);
+               kfree_rcu_monitor(&krcp->monitor_work.work);
  
                sc->nr_to_scan -= count;
                freed += count;
@@@ -3695,7 -3686,8 +3688,8 @@@ void __init kfree_rcu_scheduler_running
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  
                raw_spin_lock_irqsave(&krcp->lock, flags);
-               if (!krcp->head || krcp->monitor_todo) {
+               if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
+                               krcp->monitor_todo) {
                        raw_spin_unlock_irqrestore(&krcp->lock, flags);
                        continue;
                }
@@@ -3752,10 -3744,12 +3746,12 @@@ static int rcu_blocking_is_gp(void
   * read-side critical sections have completed.  Note, however, that
   * upon return from synchronize_rcu(), the caller might well be executing
   * concurrently with new RCU read-side critical sections that began while
-  * synchronize_rcu() was waiting.  RCU read-side critical sections are
-  * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
-  * In addition, regions of code across which interrupts, preemption, or
-  * softirqs have been disabled also serve as RCU read-side critical
+  * synchronize_rcu() was waiting.
+  *
+  * RCU read-side critical sections are delimited by rcu_read_lock()
+  * and rcu_read_unlock(), and may be nested.  In addition, but only in
+  * v5.0 and later, regions of code across which interrupts, preemption,
+  * or softirqs have been disabled also serve as RCU read-side critical
   * sections.  This includes hardware interrupt handlers, softirq handlers,
   * and NMI handlers.
   *
   * to have executed a full memory barrier during the execution of
   * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
   * again only if the system has more than one CPU).
+  *
+  * Implementation of these memory-ordering guarantees is described here:
+  * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
   */
  void synchronize_rcu(void)
  {
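/*
 * A minimal synchronize_rcu() usage sketch reusing the hypothetical
 * struct foo from the call_rcu() example above (not part of this patch):
 * when blocking is acceptable, the updater can wait for readers itself
 * and then free the object directly.
 */
static DEFINE_SPINLOCK(demo_lock);

static void foo_remove_sync(struct foo *fp)
{
        spin_lock(&demo_lock);
        list_del_rcu(&fp->list);
        spin_unlock(&demo_lock);
        synchronize_rcu();      /* all pre-existing readers have finished */
        kfree(fp);
}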
@@@ -3846,11 -3843,11 +3845,11 @@@ EXPORT_SYMBOL_GPL(start_poll_synchroniz
  /**
   * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
   *
-  * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+  * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
   *
   * If a full RCU grace period has elapsed since the earlier call from
   * which oldstate was obtained, return @true, otherwise return @false.
-  * If @false is returned, it is the caller's responsibilty to invoke this
+  * If @false is returned, it is the caller's responsibility to invoke this
   * function later on until it does return @true.  Alternatively, the caller
   * can explicitly wait for a grace period, for example, by passing @oldstate
   * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
   * (many hours even on 32-bit systems) should check them occasionally
   * and either refresh them or set a flag indicating that the grace period
   * has completed.
+  *
+  * This function provides the same memory-ordering guarantees that
+  * would be provided by a synchronize_rcu() that was invoked at the call
+  * to the function that provided @oldstate, and that returned at the end
+  * of this function.
   */
  bool poll_state_synchronize_rcu(unsigned long oldstate)
  {
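/*
 * A minimal polling sketch (hypothetical helper, not part of this patch):
 * start a grace period without blocking, then poll for its completion.
 */
static void wait_for_gp_by_polling(void)
{
        unsigned long cookie = start_poll_synchronize_rcu();

        while (!poll_state_synchronize_rcu(cookie))
                schedule_timeout_uninterruptible(1);    /* recheck each jiffy */
}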
@@@ -3876,7 -3878,7 +3880,7 @@@ EXPORT_SYMBOL_GPL(poll_state_synchroniz
  /**
   * cond_synchronize_rcu - Conditionally wait for an RCU grace period
   *
-  * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+  * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
   *
   * If a full RCU grace period has elapsed since the earlier call to
   * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
   * counter wrap is harmless.  If the counter wraps, we have waited for
   * more than 2 billion grace periods (and way more on a 64-bit system!),
   * so waiting for one additional grace period should be just fine.
+  *
+  * This function provides the same memory-ordering guarantees that
+  * would be provided by a synchronize_rcu() that was invoked at the call
+  * to the function that provided @oldstate, and that returned at the end
+  * of this function.
   */
  void cond_synchronize_rcu(unsigned long oldstate)
  {
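/*
 * A minimal cond_synchronize_rcu() sketch (hypothetical helper, not part of
 * this patch): snapshot the grace-period state early, do unrelated work,
 * then wait only if no full grace period has elapsed in the meantime.
 */
static void lazy_wait_for_gp(void)
{
        unsigned long oldstate = get_state_synchronize_rcu();

        /* ... lengthy processing that readers may overlap with ... */
        cond_synchronize_rcu(oldstate); /* no-op if a grace period already elapsed */
}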
@@@ -3913,7 -3920,7 +3922,7 @@@ static int rcu_pending(int user
        check_cpu_stall(rdp);
  
        /* Does this CPU need a deferred NOCB wakeup? */
-       if (rcu_nocb_need_deferred_wakeup(rdp))
+       if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
                return 1;
  
        /* Is this a nohz_full CPU in userspace or idle?  (Ignore RCU if so.) */
@@@ -4096,7 -4103,7 +4105,7 @@@ EXPORT_SYMBOL_GPL(rcu_barrier)
  /*
   * Propagate ->qsinitmask bits up the rcu_node tree to account for the
   * first CPU in a given leaf rcu_node structure coming online.  The caller
-  * must hold the corresponding leaf rcu_node ->lock with interrrupts
+  * must hold the corresponding leaf rcu_node ->lock with interrupts
   * disabled.
   */
  static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
@@@ -4191,7 -4198,7 +4200,7 @@@ int rcutree_prepare_cpu(unsigned int cp
        rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
        trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-       rcu_prepare_kthreads(cpu);
+       rcu_spawn_one_boost_kthread(rnp);
        rcu_spawn_cpu_nocb_kthread(cpu);
        WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
  
@@@ -4474,6 -4481,7 +4483,7 @@@ static int __init rcu_spawn_gp_kthread(
        wake_up_process(t);
        rcu_spawn_nocb_kthreads();
        rcu_spawn_boost_kthreads();
+       rcu_spawn_core_kthreads();
        return 0;
  }
  early_initcall(rcu_spawn_gp_kthread);
@@@ -4584,11 -4592,25 +4594,25 @@@ static void __init rcu_init_one(void
   * replace the definitions in tree.h because those are needed to size
   * the ->node array in the rcu_state structure.
   */
- static void __init rcu_init_geometry(void)
+ void rcu_init_geometry(void)
  {
        ulong d;
        int i;
+       static unsigned long old_nr_cpu_ids;
        int rcu_capacity[RCU_NUM_LVLS];
+       static bool initialized;
+       if (initialized) {
+               /*
+                * Warn if setup_nr_cpu_ids() had not yet been invoked,
+                * unless nr_cpu_ids == NR_CPUS, in which case who cares?
+                */
+               WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+               return;
+       }
+       old_nr_cpu_ids = nr_cpu_ids;
+       initialized = true;
  
        /*
         * Initialize any unspecified boot parameters.
@@@ -4689,6 -4711,18 +4713,18 @@@ static void __init kfree_rcu_batch_init
        int cpu;
        int i;
  
+       /* Clamp it to [0:100] seconds interval. */
+       if (rcu_delay_page_cache_fill_msec < 0 ||
+               rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+               rcu_delay_page_cache_fill_msec =
+                       clamp(rcu_delay_page_cache_fill_msec, 0,
+                               (int) (100 * MSEC_PER_SEC));
+               pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+                       rcu_delay_page_cache_fill_msec);
+       }
        for_each_possible_cpu(cpu) {
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
  
                }
  
                INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
-               INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+               INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
                krcp->initialized = true;
        }
        if (register_shrinker(&kfree_rcu_shrinker))
@@@ -4732,12 -4766,11 +4768,11 @@@ void __init rcu_init(void
                rcutree_online_cpu(cpu);
        }
  
-       /* Create workqueue for expedited GPs and for Tree SRCU. */
+       /* Create workqueue for Tree SRCU and for expedited GPs. */
        rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_gp_wq);
        rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_par_gp_wq);
-       srcu_init();
  
        /* Fill in default value for rcutree.qovld boot parameter. */
        /* -After- the rcu_node ->lock fields are initialized! */
diff --combined kernel/rcu/tree_plugin.h
index 4d6962048c30404a7aedbda0b53a4fef333c6a25,334eaf4d561fa8b91eb3b321970bbe82d85252a7..de1dc3bb7f70167e2e5405b73e79e4e31c9f0a2c
@@@ -33,10 -33,6 +33,6 @@@ static inline bool rcu_current_is_nocb_
        return false;
  }
  
- static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
- {
-       return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
- }
  #else
  static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
  {
@@@ -48,11 -44,6 +44,6 @@@ static inline bool rcu_current_is_nocb_
        return false;
  }
  
- static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
- {
-       return false;
- }
  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  
  static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
@@@ -72,8 -63,7 +63,7 @@@
                  rcu_lockdep_is_held_nocb(rdp) ||
                  (rdp == this_cpu_ptr(&rcu_data) &&
                   !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
-                 rcu_current_is_nocb_kthread(rdp) ||
-                 rcu_running_nocb_timer(rdp)),
+                 rcu_current_is_nocb_kthread(rdp)),
                "Unsafe read of RCU_NOCB offloaded state"
        );
  
@@@ -1098,6 -1088,7 +1088,7 @@@ static int rcu_boost(struct rcu_node *r
        /* Lock only for side effect: boosts task t's priority. */
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
+       rnp->n_boosts++;
  
        return READ_ONCE(rnp->exp_tasks) != NULL ||
               READ_ONCE(rnp->boost_tasks) != NULL;
@@@ -1197,22 -1188,16 +1188,16 @@@ static void rcu_preempt_boost_start_gp(
   */
  static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
  {
-       int rnp_index = rnp - rcu_get_root();
        unsigned long flags;
+       int rnp_index = rnp - rcu_get_root();
        struct sched_param sp;
        struct task_struct *t;
  
-       if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
-               return;
-       if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
+       if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
                return;
  
        rcu_state.boost = 1;
  
-       if (rnp->boost_kthread_task != NULL)
-               return;
        t = kthread_create(rcu_boost_kthread, (void *)rnp,
                           "rcub/%d", rnp_index);
        if (WARN_ON_ONCE(IS_ERR(t)))
@@@ -1264,17 -1249,8 +1249,8 @@@ static void __init rcu_spawn_boost_kthr
        struct rcu_node *rnp;
  
        rcu_for_each_leaf_node(rnp)
-               rcu_spawn_one_boost_kthread(rnp);
- }
- static void rcu_prepare_kthreads(int cpu)
- {
-       struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-       struct rcu_node *rnp = rdp->mynode;
-       /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
-       if (rcu_scheduler_fully_active)
-               rcu_spawn_one_boost_kthread(rnp);
+               if (rcu_rnp_online_cpus(rnp))
+                       rcu_spawn_one_boost_kthread(rnp);
  }
  
  #else /* #ifdef CONFIG_RCU_BOOST */
@@@ -1294,15 -1270,15 +1270,15 @@@ static void rcu_preempt_boost_start_gp(
  {
  }
  
- static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
  {
  }
  
- static void __init rcu_spawn_boost_kthreads(void)
+ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
  {
  }
  
- static void rcu_prepare_kthreads(int cpu)
+ static void __init rcu_spawn_boost_kthreads(void)
  {
  }
  
@@@ -1535,13 -1511,10 +1511,10 @@@ static void rcu_cleanup_after_idle(void
  static int __init rcu_nocb_setup(char *str)
  {
        alloc_bootmem_cpumask_var(&rcu_nocb_mask);
-       if (!strcasecmp(str, "all"))            /* legacy: use "0-N" instead */
+       if (cpulist_parse(str, rcu_nocb_mask)) {
+               pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
                cpumask_setall(rcu_nocb_mask);
-       else
-               if (cpulist_parse(str, rcu_nocb_mask)) {
-                       pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
-                       cpumask_setall(rcu_nocb_mask);
-               }
+       }
        return 1;
  }
  __setup("rcu_nocbs=", rcu_nocb_setup);
@@@ -1692,43 -1665,50 +1665,50 @@@ bool rcu_is_nocb_cpu(int cpu
        return false;
  }
  
- /*
-  * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
-  * and this function releases it.
-  */
- static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
-                        unsigned long flags)
-       __releases(rdp->nocb_lock)
+ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+                          struct rcu_data *rdp,
+                          bool force, unsigned long flags)
+       __releases(rdp_gp->nocb_gp_lock)
  {
        bool needwake = false;
-       struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
  
-       lockdep_assert_held(&rdp->nocb_lock);
        if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
-               rcu_nocb_unlock_irqrestore(rdp, flags);
+               raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
                                    TPS("AlreadyAwake"));
                return false;
        }
  
-       if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
-               WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
-               del_timer(&rdp->nocb_timer);
+       if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+               WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+               del_timer(&rdp_gp->nocb_timer);
        }
-       rcu_nocb_unlock_irqrestore(rdp, flags);
-       raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
        if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
                WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
                needwake = true;
-               trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
        }
        raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-       if (needwake)
+       if (needwake) {
+               trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
                wake_up_process(rdp_gp->nocb_gp_kthread);
+       }
  
        return needwake;
  }
  
+ /*
+  * Kick the GP kthread for this NOCB group.
+  */
+ static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+ {
+       unsigned long flags;
+       struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+       raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+       return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+ }
  /*
   * Arrange to wake the GP kthread for this NOCB group at some future
   * time when it is safe to do so.
  static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
                               const char *reason)
  {
-       if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
-               return;
-       if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
-               mod_timer(&rdp->nocb_timer, jiffies + 1);
-       if (rdp->nocb_defer_wakeup < waketype)
-               WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+       unsigned long flags;
+       struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+       raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+       /*
+        * Bypass wakeup overrides previous deferments. In case
+        * of callback storm, no need to wake up too early.
+        */
+       if (waketype == RCU_NOCB_WAKE_BYPASS) {
+               mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+               WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+       } else {
+               if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+                       mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+               if (rdp_gp->nocb_defer_wakeup < waketype)
+                       WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+       }
+       raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
        trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
  }
  
@@@ -1940,7 -1935,7 +1935,7 @@@ static bool rcu_nocb_try_bypass(struct 
  }
  
  /*
-  * Awaken the no-CBs grace-period kthead if needed, either due to it
+  * Awaken the no-CBs grace-period kthread if needed, either due to it
   * legitimately being asleep or due to overload conditions.
   *
   * If warranted, also wake up the kthread servicing this CPUs queues.
@@@ -1968,13 -1963,14 +1963,14 @@@ static void __call_rcu_nocb_wake(struc
                rdp->qlen_last_fqs_check = len;
                if (!irqs_disabled_flags(flags)) {
                        /* ... if queue was empty ... */
-                       wake_nocb_gp(rdp, false, flags);
+                       rcu_nocb_unlock_irqrestore(rdp, flags);
+                       wake_nocb_gp(rdp, false);
                        trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
                                            TPS("WakeEmpty"));
                } else {
+                       rcu_nocb_unlock_irqrestore(rdp, flags);
                        wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
                                           TPS("WakeEmptyIsDeferred"));
-                       rcu_nocb_unlock_irqrestore(rdp, flags);
                }
        } else if (len > rdp->qlen_last_fqs_check + qhimark) {
                /* ... or if many callbacks queued. */
                smp_mb(); /* Enqueue before timer_pending(). */
                if ((rdp->nocb_cb_sleep ||
                     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
-                   !timer_pending(&rdp->nocb_bypass_timer))
+                   !timer_pending(&rdp->nocb_timer)) {
+                       rcu_nocb_unlock_irqrestore(rdp, flags);
                        wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
                                           TPS("WakeOvfIsDeferred"));
-               rcu_nocb_unlock_irqrestore(rdp, flags);
+               } else {
+                       rcu_nocb_unlock_irqrestore(rdp, flags);
+                       trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+               }
        } else {
                rcu_nocb_unlock_irqrestore(rdp, flags);
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
        return;
  }
  
- /* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
- static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
- {
-       unsigned long flags;
-       struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
-       trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
-       rcu_nocb_lock_irqsave(rdp, flags);
-       smp_mb__after_spinlock(); /* Timer expire before wakeup. */
-       __call_rcu_nocb_wake(rdp, true, flags);
- }
  /*
   * Check if we ignore this rdp.
   *
@@@ -2118,11 -2106,7 +2106,7 @@@ static void nocb_gp_wait(struct rcu_dat
                        bypass = true;
                }
                rnp = rdp->mynode;
-               if (bypass) {  // Avoid race with first bypass CB.
-                       WRITE_ONCE(my_rdp->nocb_defer_wakeup,
-                                  RCU_NOCB_WAKE_NOT);
-                       del_timer(&my_rdp->nocb_timer);
-               }
                // Advance callbacks if helpful and low contention.
                needwake_gp = false;
                if (!rcu_segcblist_restempty(&rdp->cblist,
        my_rdp->nocb_gp_bypass = bypass;
        my_rdp->nocb_gp_gp = needwait_gp;
        my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
        if (bypass && !rcu_nocb_poll) {
                // At least one child with non-empty ->nocb_bypass, so set
                // timer in order to avoid stranding its callbacks.
-               raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
-               mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
-               raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+               wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+                                  TPS("WakeBypassIsDeferred"));
        }
        if (rcu_nocb_poll) {
                /* Polling, so trace if first poll in the series. */
        }
        if (!rcu_nocb_poll) {
                raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
-               if (bypass)
-                       del_timer(&my_rdp->nocb_bypass_timer);
+               if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+                       WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+                       del_timer(&my_rdp->nocb_timer);
+               }
                WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
                raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
        }
@@@ -2334,25 -2320,27 +2320,27 @@@ static int rcu_nocb_cb_kthread(void *ar
  }
  
  /* Is a deferred wakeup of rcu_nocb_kthread() required? */
- static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
  {
-       return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
+       return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
  }
  
  /* Do a deferred wakeup of rcu_nocb_kthread(). */
- static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+                                          struct rcu_data *rdp, int level,
+                                          unsigned long flags)
+       __releases(rdp_gp->nocb_gp_lock)
  {
-       unsigned long flags;
        int ndw;
        int ret;
  
-       rcu_nocb_lock_irqsave(rdp, flags);
-       if (!rcu_nocb_need_deferred_wakeup(rdp)) {
-               rcu_nocb_unlock_irqrestore(rdp, flags);
+       if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+               raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
                return false;
        }
-       ndw = READ_ONCE(rdp->nocb_defer_wakeup);
-       ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+       ndw = rdp_gp->nocb_defer_wakeup;
+       ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
        trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
  
        return ret;
  /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
  static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
  {
+       unsigned long flags;
        struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
  
-       do_nocb_deferred_wakeup_common(rdp);
+       WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+       trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+       raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+       smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+       do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
  }
  
  /*
   */
  static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
  {
-       if (rcu_nocb_need_deferred_wakeup(rdp))
-               return do_nocb_deferred_wakeup_common(rdp);
-       return false;
+       unsigned long flags;
+       struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+       if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+               return false;
+       raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+       return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
  }
  
  void rcu_nocb_flush_deferred_wakeup(void)
@@@ -2443,17 -2442,15 +2442,15 @@@ static long rcu_nocb_rdp_deoffload(voi
        swait_event_exclusive(rdp->nocb_state_wq,
                              !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
                                                        SEGCBLIST_KTHREAD_GP));
-       rcu_nocb_lock_irqsave(rdp, flags);
-       /* Make sure nocb timer won't stay around */
-       WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
-       rcu_nocb_unlock_irqrestore(rdp, flags);
-       del_timer_sync(&rdp->nocb_timer);
        /*
-        * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
-        * and IRQs disabled but let's be paranoid.
+        * Lock one last time to acquire latest callback updates from kthreads
+        * so we can later handle callbacks locally without locking.
         */
        rcu_nocb_lock_irqsave(rdp, flags);
+       /*
+        * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+        * lock is released but how about being paranoid for once?
+        */
        rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
        /*
         * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
@@@ -2473,10 -2470,6 +2470,6 @@@ int rcu_nocb_cpu_deoffload(int cpu
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
        int ret = 0;
  
-       if (rdp == rdp->nocb_gp_rdp) {
-               pr_info("Can't deoffload an rdp GP leader (yet)\n");
-               return -EINVAL;
-       }
        mutex_lock(&rcu_state.barrier_mutex);
        cpus_read_lock();
        if (rcu_rdp_is_offloaded(rdp)) {
@@@ -2517,8 -2510,7 +2510,7 @@@ static long rcu_nocb_rdp_offload(void *
         * SEGCBLIST_SOFTIRQ_ONLY mode.
         */
        raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
-       /* Re-enable nocb timer */
-       WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
        /*
         * We didn't take the nocb lock while working on the
         * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
@@@ -2626,7 -2618,6 +2618,6 @@@ static void __init rcu_boot_init_nocb_p
        raw_spin_lock_init(&rdp->nocb_bypass_lock);
        raw_spin_lock_init(&rdp->nocb_gp_lock);
        timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
-       timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
        rcu_cblist_init(&rdp->nocb_bypass);
  }
  
@@@ -2768,7 -2759,7 +2759,7 @@@ EXPORT_SYMBOL_GPL(rcu_bind_current_to_n
  #ifdef CONFIG_SMP
  static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
  {
 -      return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
 +      return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
  }
  #else // #ifdef CONFIG_SMP
  static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
@@@ -2785,13 -2776,12 +2776,12 @@@ static void show_rcu_nocb_gp_state(stru
  {
        struct rcu_node *rnp = rdp->mynode;
  
-       pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+       pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
                rdp->cpu,
                "kK"[!!rdp->nocb_gp_kthread],
                "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
                "dD"[!!rdp->nocb_defer_wakeup],
                "tT"[timer_pending(&rdp->nocb_timer)],
-               "bB"[timer_pending(&rdp->nocb_bypass_timer)],
                "sS"[!!rdp->nocb_gp_sleep],
                ".W"[swait_active(&rdp->nocb_gp_wq)],
                ".W"[swait_active(&rnp->nocb_gp_wq[0])],
@@@ -2812,7 -2802,6 +2802,6 @@@ static void show_rcu_nocb_state(struct 
        char bufr[20];
        struct rcu_segcblist *rsclp = &rdp->cblist;
        bool waslocked;
-       bool wastimer;
        bool wassleep;
  
        if (rdp->nocb_gp_rdp == rdp)
                return;
  
        waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
-       wastimer = timer_pending(&rdp->nocb_bypass_timer);
        wassleep = swait_active(&rdp->nocb_gp_wq);
-       if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
-               return;  /* Nothing untowards. */
+       if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+               return;  /* Nothing untoward. */
  
-       pr_info("   nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
+       pr_info("   nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
                "lL"[waslocked],
                "dD"[!!rdp->nocb_defer_wakeup],
-               "tT"[wastimer],
                "sS"[!!rdp->nocb_gp_sleep],
                ".W"[wassleep]);
  }
@@@ -2922,7 -2909,7 +2909,7 @@@ static void __init rcu_boot_init_nocb_p
  {
  }
  
- static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
  {
        return false;
  }
diff --combined kernel/rcu/tree_stall.h
index acb2288063b53307ec3df4d4a2e2e9164c22f0f7,f4152aa18f48ff93dcd8ecd8bcd4d3af0b10aded..3f937b20814fdc8d6ac5f1407595b84ee159684b
@@@ -314,6 -314,7 +314,7 @@@ static void rcu_print_detail_task_stall
   * tasks blocked within RCU read-side critical sections.
   */
  static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+       __releases(rnp->lock)
  {
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        return 0;
@@@ -460,12 -461,12 +461,12 @@@ static void rcu_check_gp_kthread_starva
  
        if (rcu_is_gp_kthread_starving(&j)) {
                cpu = gpk ? task_cpu(gpk) : -1;
 -              pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
 +              pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
                       rcu_state.name, j,
                       (long)rcu_seq_current(&rcu_state.gp_seq),
                       data_race(rcu_state.gp_flags),
                       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
 -                     gpk ? gpk->state : ~0, cpu);
 +                     gpk ? gpk->__state : ~0, cpu);
                if (gpk) {
                        pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
                        pr_err("RCU grace-period kthread stack dump:\n");
@@@ -503,12 -504,12 +504,12 @@@ static void rcu_check_gp_kthread_expire
            time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
            gpk && !READ_ONCE(gpk->on_rq)) {
                cpu = task_cpu(gpk);
 -              pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n",
 +              pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
                       rcu_state.name, (jiffies - jiffies_fqs),
                       (long)rcu_seq_current(&rcu_state.gp_seq),
                       data_race(rcu_state.gp_flags),
                       gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
 -                     gpk->state);
 +                     gpk->__state);
                pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
                       cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
        }
@@@ -716,6 -717,63 +717,63 @@@ static void check_cpu_stall(struct rcu_
  // RCU forward-progress mechanisms, including of callback invocation.
  
  
+ /*
+  * Check to see if a failure to end RCU priority inversion was due to
+  * a CPU not passing through a quiescent state.  When this happens, there
+  * is nothing that RCU priority boosting can do to help, so we shouldn't
+  * count this as an RCU priority boosting failure.  A return of true says
+  * RCU priority boosting is to blame, and false says otherwise.  If false
+  * is returned, the first of the CPUs to blame is stored through cpup.
+  * If there was no CPU blocking the current grace period, but also nothing
+  * in need of being boosted, *cpup is set to -1.  This can happen in case
+  * of vCPU preemption while the last CPU is reporting its quiescent state,
+  * for example.
+  *
+  * If cpup is NULL, then a lockless quick check is carried out, suitable
+  * for high-rate usage.  On the other hand, if cpup is non-NULL, each
+  * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+  */
+ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+ {
+       bool atb = false;
+       int cpu;
+       unsigned long flags;
+       struct rcu_node *rnp;
+       rcu_for_each_leaf_node(rnp) {
+               if (!cpup) {
+                       if (READ_ONCE(rnp->qsmask)) {
+                               return false;
+                       } else {
+                               if (READ_ONCE(rnp->gp_tasks))
+                                       atb = true;
+                               continue;
+                       }
+               }
+               *cpup = -1;
+               raw_spin_lock_irqsave_rcu_node(rnp, flags);
+               if (rnp->gp_tasks)
+                       atb = true;
+               if (!rnp->qsmask) {
+                       // No CPUs without quiescent states for this rnp.
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                       continue;
+               }
+               // Find the first holdout CPU.
+               for_each_leaf_node_possible_cpu(rnp, cpu) {
+                       if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+                               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                               *cpup = cpu;
+                               return false;
+                       }
+               }
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+       }
+       // Can't blame CPUs, so must blame RCU priority boosting.
+       return atb;
+ }
+ EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
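/*
 * A minimal caller sketch (hypothetical diagnostic path, not part of this
 * patch): distinguish a genuine boosting failure from a CPU that simply
 * has not yet reported a quiescent state.
 */
static void report_boost_result(unsigned long gp_state)
{
        int cpu;

        if (rcu_check_boost_fail(gp_state, &cpu))
                pr_err("RCU priority boosting is to blame (gp_state %lu)\n", gp_state);
        else if (cpu >= 0)
                pr_err("CPU %d has not passed through a quiescent state\n", cpu);
        else
                pr_err("no CPU is blocking the current grace period\n");
}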
  /*
   * Show the state of the grace-period kthreads.
   */
@@@ -726,6 -784,7 +784,7 @@@ void show_rcu_gp_kthreads(void
        unsigned long j;
        unsigned long ja;
        unsigned long jr;
+       unsigned long js;
        unsigned long jw;
        struct rcu_data *rdp;
        struct rcu_node *rnp;
        j = jiffies;
        ja = j - data_race(rcu_state.gp_activity);
        jr = j - data_race(rcu_state.gp_req_activity);
+       js = j - data_race(rcu_state.gp_start);
        jw = j - data_race(rcu_state.gp_wake_time);
-       pr_info("%s: wait state: %s(%d) ->state: %#x delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+       pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
                rcu_state.name, gp_state_getname(rcu_state.gp_state),
-               rcu_state.gp_state, t ? t->__state : 0x1ffff,
-               ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
 -              rcu_state.gp_state, t ? t->state : 0x1ffffL, t ? t->rt_priority : 0xffU,
++              rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+               js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
                (long)data_race(rcu_state.gp_seq),
                (long)data_race(rcu_get_root()->gp_seq_needed),
+               data_race(rcu_state.gp_max),
                data_race(rcu_state.gp_flags));
        rcu_for_each_node_breadth_first(rnp) {
-               if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
-                                READ_ONCE(rnp->gp_seq_needed)))
+               if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+                   !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
+                   !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
                        continue;
-               pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
-                       rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
-                       (long)data_race(rnp->gp_seq_needed));
+               pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+                       rnp->grplo, rnp->grphi,
+                       (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
+                       data_race(rnp->qsmask),
+                       ".b"[!!data_race(rnp->boost_kthread_task)],
+                       ".B"[!!data_race(rnp->boost_tasks)],
+                       ".E"[!!data_race(rnp->exp_tasks)],
+                       ".G"[!!data_race(rnp->gp_tasks)],
+                       data_race(rnp->n_boosts));
                if (!rcu_is_leaf_node(rnp))
                        continue;
                for_each_leaf_node_possible_cpu(rnp, cpu) {
diff --combined kernel/time/timer.c
index 467087d7bdb664fbc0c6373eeb443303b1c1e5ee,84332f01dc571ea11e358fabf27e6c399acdafd0..3fadb58fc9d7b1c7a273b59a48b8dc82c47a1b22
@@@ -1237,20 -1237,6 +1237,6 @@@ int try_to_del_timer_sync(struct timer_
  }
  EXPORT_SYMBOL(try_to_del_timer_sync);
  
- bool timer_curr_running(struct timer_list *timer)
- {
-       int i;
-       for (i = 0; i < NR_BASES; i++) {
-               struct timer_base *base = this_cpu_ptr(&timer_bases[i]);
-               if (base->running_timer == timer)
-                       return true;
-       }
-       return false;
- }
  #ifdef CONFIG_PREEMPT_RT
  static __init void timer_base_init_expiry_lock(struct timer_base *base)
  {
@@@ -1879,7 -1865,7 +1865,7 @@@ signed long __sched schedule_timeout(si
                        printk(KERN_ERR "schedule_timeout: wrong timeout "
                                "value %lx\n", timeout);
                        dump_stack();
 -                      current->state = TASK_RUNNING;
 +                      __set_current_state(TASK_RUNNING);
                        goto out;
                }
        }
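/*
 * A minimal usage sketch (hypothetical helper, not part of this patch):
 * callers set the task state before calling schedule_timeout(), which is
 * why the invalid-timeout path above must restore TASK_RUNNING.
 */
static void demo_sleep_100ms(void)
{
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule_timeout(msecs_to_jiffies(100));        /* sleeps about 100 ms */
}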
diff --combined mm/oom_kill.c
index fcc29e9a30645f1377d9baa550cf840e264166c4,54527de9cd2ded8b2b3195be953427a6205cfed1..c729a4c4a1ace9ae4b97fb9602d3a48d726f177e
@@@ -104,7 -104,7 +104,7 @@@ static bool oom_cpuset_eligible(struct 
                         * mempolicy intersects current, otherwise it may be
                         * needlessly killed.
                         */
 -                      ret = mempolicy_nodemask_intersects(tsk, mask);
 +                      ret = mempolicy_in_oom_domain(tsk, mask);
                } else {
                        /*
                         * This is not a mempolicy constrained oom, so only
@@@ -922,7 -922,7 +922,7 @@@ static void __oom_kill_process(struct t
                        continue;
                }
                /*
-                * No kthead_use_mm() user needs to read from the userspace so
+                * No kthread_use_mm() user needs to read from the userspace so
                 * we are ok to reap it.
                 */
                if (unlikely(p->flags & PF_KTHREAD))
diff --combined mm/slab.h
index 7b60ef2f32c3b6424fe67cd9a0a62ffbd1c54e6e,7189daa0c586c0ff4d8b0565007f4847cf8c2824..67e06637ff2eeda01c178c087a95691f2a7113b5
+++ b/mm/slab.h
@@@ -215,7 -215,6 +215,7 @@@ DECLARE_STATIC_KEY_TRUE(slub_debug_enab
  DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
  #endif
  extern void print_tracking(struct kmem_cache *s, void *object);
 +long validate_slab_cache(struct kmem_cache *s);
  #else
  static inline void print_tracking(struct kmem_cache *s, void *object)
  {
@@@ -240,8 -239,6 +240,8 @@@ static inline bool kmem_cache_debug_fla
  #ifdef CONFIG_MEMCG_KMEM
  int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
                                 gfp_t gfp, bool new_page);
 +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 +                   enum node_stat_item idx, int nr);
  
  static inline void memcg_free_page_obj_cgroups(struct page *page)
  {
@@@ -286,6 -283,20 +286,6 @@@ static inline bool memcg_slab_pre_alloc
        return true;
  }
  
 -static inline void mod_objcg_state(struct obj_cgroup *objcg,
 -                                 struct pglist_data *pgdat,
 -                                 enum node_stat_item idx, int nr)
 -{
 -      struct mem_cgroup *memcg;
 -      struct lruvec *lruvec;
 -
 -      rcu_read_lock();
 -      memcg = obj_cgroup_memcg(objcg);
 -      lruvec = mem_cgroup_lruvec(memcg, pgdat);
 -      mod_memcg_lruvec_state(lruvec, idx, nr);
 -      rcu_read_unlock();
 -}
 -
  static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
                                              struct obj_cgroup *objcg,
                                              gfp_t flags, size_t size,
        if (!memcg_kmem_enabled() || !objcg)
                return;
  
 -      flags &= ~__GFP_ACCOUNT;
        for (i = 0; i < size; i++) {
                if (likely(p[i])) {
                        page = virt_to_head_page(p[i]);
@@@ -618,12 -630,6 +618,12 @@@ static inline bool slab_want_init_on_fr
        return false;
  }
  
 +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
 +void debugfs_slab_release(struct kmem_cache *);
 +#else
 +static inline void debugfs_slab_release(struct kmem_cache *s) { }
 +#endif
 +
  #ifdef CONFIG_PRINTK
  #define KS_ADDRS_COUNT 16
  struct kmem_obj_info {
        struct kmem_cache *kp_slab_cache;
        void *kp_ret;
        void *kp_stack[KS_ADDRS_COUNT];
+       void *kp_free_stack[KS_ADDRS_COUNT];
  };
  void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
  #endif
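
Editor's note: struct kmem_obj_info gains a kp_free_stack[] array here, which the paired mm/slab_common.c change below prints as a "Free path:" section. As a rough illustration only (not part of this commit), the hypothetical debug module below shows how a caller might see that extra output; the cache name "dump_obj_demo" and the module scaffolding are invented, and the free stack only appears when the cache records tracks (SLAB_STORE_USER or slub_debug) and CONFIG_STACKTRACE is set.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>		/* mem_dump_obj() */

static int __init dump_obj_demo_init(void)
{
	struct kmem_cache *s;
	void *p;

	/* SLAB_STORE_USER records the alloc/free tracks read by kmem_obj_info(). */
	s = kmem_cache_create("dump_obj_demo", 128, 0, SLAB_STORE_USER, NULL);
	if (!s)
		return -ENOMEM;

	p = kmem_cache_alloc(s, GFP_KERNEL);
	if (p) {
		kmem_cache_free(s, p);
		/*
		 * p is stale but still identifiable as a slab object; the
		 * dump now ends with the last free path in addition to the
		 * allocation stack.
		 */
		mem_dump_obj(p);
	}

	kmem_cache_destroy(s);
	return 0;
}
module_init(dump_obj_demo_init);
MODULE_LICENSE("GPL");
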
diff --combined mm/slab_common.c
index c126e6f6b5a569b3f9a3dee5b7499c8341b52fff,92e3aa78bb4d96c1a57b95ba5ac35641621df011..1c673c323baf2eed3fbf57d22d9ad70c779b5dd3
@@@ -97,7 -97,8 +97,7 @@@ EXPORT_SYMBOL(kmem_cache_size)
  #ifdef CONFIG_DEBUG_VM
  static int kmem_cache_sanity_check(const char *name, unsigned int size)
  {
 -      if (!name || in_interrupt() || size < sizeof(void *) ||
 -              size > KMALLOC_MAX_SIZE) {
 +      if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
                pr_err("kmem_cache_create(%s) integrity check failed\n", name);
                return -EINVAL;
        }
@@@ -317,16 -318,6 +317,16 @@@ kmem_cache_create_usercopy(const char *
        const char *cache_name;
        int err;
  
 +#ifdef CONFIG_SLUB_DEBUG
 +      /*
 +       * If no slub_debug was enabled globally, the static key is not yet
 +       * enabled by setup_slub_debug(). Enable it if the cache is being
 +       * created with any of the debugging flags passed explicitly.
 +       */
 +      if (flags & SLAB_DEBUG_FLAGS)
 +              static_branch_enable(&slub_debug_enabled);
 +#endif
 +
        mutex_lock(&slab_mutex);
  
        err = kmem_cache_sanity_check(name, size);
@@@ -377,11 -368,11 +377,11 @@@ out_unlock
  
        if (err) {
                if (flags & SLAB_PANIC)
 -                      panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
 -                              name, err);
 +                      panic("%s: Failed to create slab '%s'. Error %d\n",
 +                              __func__, name, err);
                else {
 -                      pr_warn("kmem_cache_create(%s) failed with error %d\n",
 -                              name, err);
 +                      pr_warn("%s(%s) failed with error %d\n",
 +                              __func__, name, err);
                        dump_stack();
                }
                return NULL;
@@@ -448,7 -439,6 +448,7 @@@ static void slab_caches_to_rcu_destroy_
        rcu_barrier();
  
        list_for_each_entry_safe(s, s2, &to_destroy, list) {
 +              debugfs_slab_release(s);
                kfence_shutdown_cache(s);
  #ifdef SLAB_SUPPORTS_SYSFS
                sysfs_slab_release(s);
@@@ -476,7 -466,6 +476,7 @@@ static int shutdown_cache(struct kmem_c
                schedule_work(&slab_caches_to_rcu_destroy_work);
        } else {
                kfence_shutdown_cache(s);
 +              debugfs_slab_release(s);
  #ifdef SLAB_SUPPORTS_SYSFS
                sysfs_slab_unlink(s);
                sysfs_slab_release(s);
@@@ -510,8 -499,8 +510,8 @@@ void kmem_cache_destroy(struct kmem_cac
  
        err = shutdown_cache(s);
        if (err) {
 -              pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
 -                     s->name);
 +              pr_err("%s %s: Slab cache still has objects\n",
 +                     __func__, s->name);
                dump_stack();
        }
  out_unlock:
@@@ -575,7 -564,7 +575,7 @@@ EXPORT_SYMBOL_GPL(kmem_valid_obj)
   * depends on the type of object and on how much debugging is enabled.
   * For a slab-cache object, the fact that it is a slab object is printed,
   * and, if available, the slab name, return address, and stack trace from
-  * the allocation of that object.
+  * the allocation and last free path of that object.
   *
   * This function will splat if passed a pointer to a non-slab object.
   * If you are not sure what type of object you have, you should instead
@@@ -620,6 -609,16 +620,16 @@@ void kmem_dump_obj(void *object
                        break;
                pr_info("    %pS\n", kp.kp_stack[i]);
        }
+       if (kp.kp_free_stack[0])
+               pr_cont(" Free path:\n");
+       for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
+               if (!kp.kp_free_stack[i])
+                       break;
+               pr_info("    %pS\n", kp.kp_free_stack[i]);
+       }
  }
  EXPORT_SYMBOL_GPL(kmem_dump_obj);
  #endif
@@@ -738,30 -737,26 +748,30 @@@ struct kmem_cache *kmalloc_slab(size_t 
  }
  
  #ifdef CONFIG_ZONE_DMA
 -#define INIT_KMALLOC_INFO(__size, __short_size)                       \
 -{                                                             \
 -      .name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,      \
 -      .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,  \
 -      .name[KMALLOC_DMA]     = "dma-kmalloc-" #__short_size,  \
 -      .size = __size,                                         \
 -}
 +#define KMALLOC_DMA_NAME(sz)  .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
  #else
 +#define KMALLOC_DMA_NAME(sz)
 +#endif
 +
 +#ifdef CONFIG_MEMCG_KMEM
 +#define KMALLOC_CGROUP_NAME(sz)       .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
 +#else
 +#define KMALLOC_CGROUP_NAME(sz)
 +#endif
 +
  #define INIT_KMALLOC_INFO(__size, __short_size)                       \
  {                                                             \
        .name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,      \
        .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,  \
 +      KMALLOC_CGROUP_NAME(__short_size)                       \
 +      KMALLOC_DMA_NAME(__short_size)                          \
        .size = __size,                                         \
  }
 -#endif
  
  /*
   * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
 - * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
 - * kmalloc-67108864.
 + * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
 + * kmalloc-32M.
   */
  const struct kmalloc_info_struct kmalloc_info[] __initconst = {
        INIT_KMALLOC_INFO(0, 0),
        INIT_KMALLOC_INFO(4194304, 4M),
        INIT_KMALLOC_INFO(8388608, 8M),
        INIT_KMALLOC_INFO(16777216, 16M),
 -      INIT_KMALLOC_INFO(33554432, 32M),
 -      INIT_KMALLOC_INFO(67108864, 64M)
 +      INIT_KMALLOC_INFO(33554432, 32M)
  };
  
  /*
@@@ -842,27 -838,13 +852,27 @@@ void __init setup_kmalloc_cache_index_t
  static void __init
  new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
  {
 -      if (type == KMALLOC_RECLAIM)
 +      if (type == KMALLOC_RECLAIM) {
                flags |= SLAB_RECLAIM_ACCOUNT;
 +      } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
 +              if (cgroup_memory_nokmem) {
 +                      kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
 +                      return;
 +              }
 +              flags |= SLAB_ACCOUNT;
 +      }
  
        kmalloc_caches[type][idx] = create_kmalloc_cache(
                                        kmalloc_info[idx].name[type],
                                        kmalloc_info[idx].size, flags, 0,
                                        kmalloc_info[idx].size);
 +
 +      /*
 +       * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
 +       * KMALLOC_NORMAL caches.
 +       */
 +      if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
 +              kmalloc_caches[type][idx]->refcount = -1;
  }
  
  /*
@@@ -875,9 -857,6 +885,9 @@@ void __init create_kmalloc_caches(slab_
        int i;
        enum kmalloc_cache_type type;
  
 +      /*
 +       * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
 +       */
        for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
                for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
                        if (!kmalloc_caches[type][i])
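
Editor's note: with the split KMALLOC_DMA_NAME()/KMALLOC_CGROUP_NAME() helpers above, one visible table entry, INIT_KMALLOC_INFO(4194304, 4M), expands roughly as sketched below when both CONFIG_ZONE_DMA and CONFIG_MEMCG_KMEM are enabled (preprocessor output reconstructed for illustration, not code from the commit); with either option disabled the corresponding .name initializer simply disappears, and new_kmalloc_cache() above aliases the cg cache to the normal one when cgroup_memory_nokmem is set.

{
	.name[KMALLOC_NORMAL]  = "kmalloc-4M",
	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-4M",
	.name[KMALLOC_CGROUP]  = "kmalloc-cg-4M",
	.name[KMALLOC_DMA]     = "dma-kmalloc-4M",
	.size = 4194304,
}
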
diff --combined mm/slub.c
index 3bc8b940c933c68f7b65b81c3863f1df13210776,deec894a1345ac49c95dc43cf36a6f58d4b3090f..2ee43ff667a56225d6639e2b4f7edc51cc781869
+++ b/mm/slub.c
@@@ -15,7 -15,6 +15,7 @@@
  #include <linux/module.h>
  #include <linux/bit_spinlock.h>
  #include <linux/interrupt.h>
 +#include <linux/swab.h>
  #include <linux/bitops.h>
  #include <linux/slab.h>
  #include "slab.h"
@@@ -36,9 -35,7 +36,9 @@@
  #include <linux/prefetch.h>
  #include <linux/memcontrol.h>
  #include <linux/random.h>
 +#include <kunit/test.h>
  
 +#include <linux/debugfs.h>
  #include <trace/events/kmem.h>
  
  #include "internal.h"
   */
  
  #ifdef CONFIG_SLUB_DEBUG
 +
  #ifdef CONFIG_SLUB_DEBUG_ON
  DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
  #else
  DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
  #endif
 -#endif
 +
 +static inline bool __slub_debug_enabled(void)
 +{
 +      return static_branch_unlikely(&slub_debug_enabled);
 +}
 +
 +#else         /* CONFIG_SLUB_DEBUG */
 +
 +static inline bool __slub_debug_enabled(void)
 +{
 +      return false;
 +}
 +
 +#endif                /* CONFIG_SLUB_DEBUG */
  
  static inline bool kmem_cache_debug(struct kmem_cache *s)
  {
@@@ -170,6 -153,9 +170,6 @@@ static inline bool kmem_cache_has_cpu_p
   * - Variable sizing of the per node arrays
   */
  
 -/* Enable to test recovery from slab corruption on boot */
 -#undef SLUB_RESILIENCY_TEST
 -
  /* Enable to log cmpxchg failures */
  #undef SLUB_DEBUG_CMPXCHG
  
@@@ -239,12 -225,6 +239,12 @@@ static inline int sysfs_slab_alias(stru
                                                        { return 0; }
  #endif
  
 +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
 +static void debugfs_slab_add(struct kmem_cache *);
 +#else
 +static inline void debugfs_slab_add(struct kmem_cache *s) { }
 +#endif
 +
  static inline void stat(const struct kmem_cache *s, enum stat_item si)
  {
  #ifdef CONFIG_SLUB_STATS
@@@ -321,7 -301,6 +321,7 @@@ static inline void *get_freepointer_saf
        if (!debug_pagealloc_enabled_static())
                return get_freepointer(s, object);
  
 +      object = kasan_reset_tag(object);
        freepointer_addr = (unsigned long)object + s->offset;
        copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
        return freelist_ptr(s, p, freepointer_addr);
@@@ -468,26 -447,6 +468,26 @@@ static inline bool cmpxchg_double_slab(
  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
  static DEFINE_SPINLOCK(object_map_lock);
  
 +#if IS_ENABLED(CONFIG_KUNIT)
 +static bool slab_add_kunit_errors(void)
 +{
 +      struct kunit_resource *resource;
 +
 +      if (likely(!current->kunit_test))
 +              return false;
 +
 +      resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
 +      if (!resource)
 +              return false;
 +
 +      (*(int *)resource->data)++;
 +      kunit_put_resource(resource);
 +      return true;
 +}
 +#else
 +static inline bool slab_add_kunit_errors(void) { return false; }
 +#endif
 +
  /*
   * Determine a map of object in use on a page.
   *
@@@ -708,18 -667,16 +708,18 @@@ static void slab_bug(struct kmem_cache 
        pr_err("=============================================================================\n");
        pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
        pr_err("-----------------------------------------------------------------------------\n\n");
 -
 -      add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
        va_end(args);
  }
  
 +__printf(2, 3)
  static void slab_fix(struct kmem_cache *s, char *fmt, ...)
  {
        struct va_format vaf;
        va_list args;
  
 +      if (slab_add_kunit_errors())
 +              return;
 +
        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
@@@ -754,15 -711,15 +754,15 @@@ static void print_trailer(struct kmem_c
               p, p - addr, get_freepointer(s, p));
  
        if (s->flags & SLAB_RED_ZONE)
 -              print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
 +              print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
                              s->red_left_pad);
        else if (p > addr + 16)
                print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
  
 -      print_section(KERN_ERR, "Object ", p,
 +      print_section(KERN_ERR,         "Object   ", p,
                      min_t(unsigned int, s->object_size, PAGE_SIZE));
        if (s->flags & SLAB_RED_ZONE)
 -              print_section(KERN_ERR, "Redzone ", p + s->object_size,
 +              print_section(KERN_ERR, "Redzone  ", p + s->object_size,
                        s->inuse - s->object_size);
  
        off = get_info_end(s);
  
        if (off != size_from_object(s))
                /* Beginning of the filler is the free pointer */
 -              print_section(KERN_ERR, "Padding ", p + off,
 +              print_section(KERN_ERR, "Padding  ", p + off,
                              size_from_object(s) - off);
  
        dump_stack();
  void object_err(struct kmem_cache *s, struct page *page,
                        u8 *object, char *reason)
  {
 +      if (slab_add_kunit_errors())
 +              return;
 +
        slab_bug(s, "%s", reason);
        print_trailer(s, page, object);
 +      add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  }
  
  static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
        va_list args;
        char buf[100];
  
 +      if (slab_add_kunit_errors())
 +              return;
 +
        va_start(args, fmt);
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        slab_bug(s, "%s", buf);
        print_page_info(page);
        dump_stack();
 +      add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  }
  
  static void init_object(struct kmem_cache *s, void *object, u8 val)
  static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
                                                void *from, void *to)
  {
 -      slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
 +      slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
        memset(from, data, to - from);
  }
  
@@@ -850,17 -799,12 +850,17 @@@ static int check_bytes_and_report(struc
        while (end > fault && end[-1] == value)
                end--;
  
 +      if (slab_add_kunit_errors())
 +              goto skip_bug_print;
 +
        slab_bug(s, "%s overwritten", what);
        pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
                                        fault, end - 1, fault - addr,
                                        fault[0], value);
        print_trailer(s, page, object);
 +      add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  
 +skip_bug_print:
        restore_bytes(s, what, value, fault, end);
        return 0;
  }
@@@ -964,11 -908,11 +964,11 @@@ static int check_object(struct kmem_cac
        u8 *endobject = object + s->object_size;
  
        if (s->flags & SLAB_RED_ZONE) {
 -              if (!check_bytes_and_report(s, page, object, "Redzone",
 +              if (!check_bytes_and_report(s, page, object, "Left Redzone",
                        object - s->red_left_pad, val, s->red_left_pad))
                        return 0;
  
 -              if (!check_bytes_and_report(s, page, object, "Redzone",
 +              if (!check_bytes_and_report(s, page, object, "Right Redzone",
                        endobject, val, s->inuse - s->object_size))
                        return 0;
        } else {
                if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
                        (!check_bytes_and_report(s, page, p, "Poison", p,
                                        POISON_FREE, s->object_size - 1) ||
 -                       !check_bytes_and_report(s, page, p, "Poison",
 +                       !check_bytes_and_report(s, page, p, "End Poison",
                                p + s->object_size - 1, POISON_END, 1)))
                        return 0;
                /*
@@@ -1082,13 -1026,13 +1082,13 @@@ static int on_freelist(struct kmem_cach
                slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
                         page->objects, max_objects);
                page->objects = max_objects;
 -              slab_fix(s, "Number of objects adjusted.");
 +              slab_fix(s, "Number of objects adjusted");
        }
        if (page->inuse != page->objects - nr) {
                slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
                         page->inuse, page->objects - nr);
                page->inuse = page->objects - nr;
 -              slab_fix(s, "Object count adjusted.");
 +              slab_fix(s, "Object count adjusted");
        }
        return search == NULL;
  }
@@@ -1452,8 -1396,6 +1452,8 @@@ static int __init setup_slub_debug(cha
  out:
        if (slub_debug != 0 || slub_debug_string)
                static_branch_enable(&slub_debug_enabled);
 +      else
 +              static_branch_disable(&slub_debug_enabled);
        if ((static_branch_unlikely(&init_on_alloc) ||
             static_branch_unlikely(&init_on_free)) &&
            (slub_debug & SLAB_POISON))
@@@ -3746,6 -3688,7 +3746,6 @@@ static int calculate_sizes(struct kmem_
  {
        slab_flags_t flags = s->flags;
        unsigned int size = s->object_size;
 -      unsigned int freepointer_area;
        unsigned int order;
  
        /*
         * the possible location of the free pointer.
         */
        size = ALIGN(size, sizeof(void *));
 -      /*
 -       * This is the area of the object where a freepointer can be
 -       * safely written. If redzoning adds more to the inuse size, we
 -       * can't use that portion for writing the freepointer, so
 -       * s->offset must be limited within this for the general case.
 -       */
 -      freepointer_area = size;
  
  #ifdef CONFIG_SLUB_DEBUG
        /*
  
        /*
         * With that we have determined the number of bytes in actual use
 -       * by the object. This is the potential offset to the free pointer.
 +       * by the object and redzoning.
         */
        s->inuse = size;
  
 -      if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
 -              s->ctor)) {
 +      if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
 +          ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
 +          s->ctor) {
                /*
                 * Relocate free pointer after the object if it is not
                 * permitted to overwrite the first word of the object on
                 * kmem_cache_free.
                 *
                 * This is the case if we do RCU, have a constructor or
 -               * destructor or are poisoning the objects.
 +               * destructor, are poisoning the objects, or are
 +               * redzoning an object smaller than sizeof(void *).
                 *
                 * The assumption that s->offset >= s->inuse means free
                 * pointer is outside of the object is used in the
                 */
                s->offset = size;
                size += sizeof(void *);
 -      } else if (freepointer_area > sizeof(void *)) {
 +      } else {
                /*
                 * Store freelist pointer near middle of object to keep
                 * it away from the edges of the object to avoid small
                 * sized over/underflows from neighboring allocations.
                 */
 -              s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
 +              s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
        }
  
  #ifdef CONFIG_SLUB_DEBUG
  
  static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
  {
 -#ifdef CONFIG_SLUB_DEBUG
 -      /*
 -       * If no slub_debug was enabled globally, the static key is not yet
 -       * enabled by setup_slub_debug(). Enable it if the cache is being
 -       * created with any of the debugging flags passed explicitly.
 -       */
 -      if (flags & SLAB_DEBUG_FLAGS)
 -              static_branch_enable(&slub_debug_enabled);
 -#endif
        s->flags = kmem_cache_flags(s->size, flags, s->name);
  #ifdef CONFIG_SLAB_FREELIST_HARDENED
        s->random = get_random_long();
@@@ -4045,6 -4002,7 +4045,7 @@@ void kmem_obj_info(struct kmem_obj_inf
            !(s->flags & SLAB_STORE_USER))
                return;
  #ifdef CONFIG_SLUB_DEBUG
+       objp = fixup_red_left(s, objp);
        trackp = get_track(s, objp, TRACK_ALLOC);
        kpp->kp_ret = (void *)trackp->addr;
  #ifdef CONFIG_STACKTRACE
                if (!kpp->kp_stack[i])
                        break;
        }
+       trackp = get_track(s, objp, TRACK_FREE);
+       for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
+               kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
+               if (!kpp->kp_free_stack[i])
+                       break;
+       }
  #endif
  #endif
  }
@@@ -4509,10 -4474,6 +4517,10 @@@ void __init kmem_cache_init(void
        if (debug_guardpage_minorder())
                slub_max_order = 0;
  
 +      /* Print slub debugging pointers without hashing */
 +      if (__slub_debug_enabled())
 +              no_hash_pointers_enable(NULL);
 +
        kmem_cache_node = &boot_kmem_cache_node;
        kmem_cache = &boot_kmem_cache;
  
@@@ -4601,9 -4562,6 +4609,9 @@@ int __kmem_cache_create(struct kmem_cac
        if (err)
                __kmem_cache_release(s);
  
 +      if (s->flags & SLAB_STORE_USER)
 +              debugfs_slab_add(s);
 +
        return err;
  }
  
@@@ -4712,11 -4670,9 +4720,11 @@@ static int validate_slab_node(struct km
                validate_slab(s, page);
                count++;
        }
 -      if (count != n->nr_partial)
 +      if (count != n->nr_partial) {
                pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
                       s->name, count, n->nr_partial);
 +              slab_add_kunit_errors();
 +      }
  
        if (!(s->flags & SLAB_STORE_USER))
                goto out;
                validate_slab(s, page);
                count++;
        }
 -      if (count != atomic_long_read(&n->nr_slabs))
 +      if (count != atomic_long_read(&n->nr_slabs)) {
                pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
                       s->name, count, atomic_long_read(&n->nr_slabs));
 +              slab_add_kunit_errors();
 +      }
  
  out:
        spin_unlock_irqrestore(&n->list_lock, flags);
        return count;
  }
  
 -static long validate_slab_cache(struct kmem_cache *s)
 +long validate_slab_cache(struct kmem_cache *s)
  {
        int node;
        unsigned long count = 0;
  
        return count;
  }
 +EXPORT_SYMBOL(validate_slab_cache);
 +
 +#ifdef CONFIG_DEBUG_FS
  /*
   * Generate lists of code addresses where slabcache objects are allocated
   * and freed.
@@@ -4774,8 -4725,6 +4782,8 @@@ struct loc_track 
        struct location *loc;
  };
  
 +static struct dentry *slab_debugfs_root;
 +
  static void free_loc_track(struct loc_track *t)
  {
        if (t->max)
@@@ -4892,9 -4841,144 +4900,9 @@@ static void process_slab(struct loc_tra
                        add_location(t, s, get_track(s, p, alloc));
        put_map(map);
  }
 -
 -static int list_locations(struct kmem_cache *s, char *buf,
 -                        enum track_item alloc)
 -{
 -      int len = 0;
 -      unsigned long i;
 -      struct loc_track t = { 0, 0, NULL };
 -      int node;
 -      struct kmem_cache_node *n;
 -
 -      if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
 -                           GFP_KERNEL)) {
 -              return sysfs_emit(buf, "Out of memory\n");
 -      }
 -      /* Push back cpu slabs */
 -      flush_all(s);
 -
 -      for_each_kmem_cache_node(s, node, n) {
 -              unsigned long flags;
 -              struct page *page;
 -
 -              if (!atomic_long_read(&n->nr_slabs))
 -                      continue;
 -
 -              spin_lock_irqsave(&n->list_lock, flags);
 -              list_for_each_entry(page, &n->partial, slab_list)
 -                      process_slab(&t, s, page, alloc);
 -              list_for_each_entry(page, &n->full, slab_list)
 -                      process_slab(&t, s, page, alloc);
 -              spin_unlock_irqrestore(&n->list_lock, flags);
 -      }
 -
 -      for (i = 0; i < t.count; i++) {
 -              struct location *l = &t.loc[i];
 -
 -              len += sysfs_emit_at(buf, len, "%7ld ", l->count);
 -
 -              if (l->addr)
 -                      len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr);
 -              else
 -                      len += sysfs_emit_at(buf, len, "<not-available>");
 -
 -              if (l->sum_time != l->min_time)
 -                      len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld",
 -                                           l->min_time,
 -                                           (long)div_u64(l->sum_time,
 -                                                         l->count),
 -                                           l->max_time);
 -              else
 -                      len += sysfs_emit_at(buf, len, " age=%ld", l->min_time);
 -
 -              if (l->min_pid != l->max_pid)
 -                      len += sysfs_emit_at(buf, len, " pid=%ld-%ld",
 -                                           l->min_pid, l->max_pid);
 -              else
 -                      len += sysfs_emit_at(buf, len, " pid=%ld",
 -                                           l->min_pid);
 -
 -              if (num_online_cpus() > 1 &&
 -                  !cpumask_empty(to_cpumask(l->cpus)))
 -                      len += sysfs_emit_at(buf, len, " cpus=%*pbl",
 -                                           cpumask_pr_args(to_cpumask(l->cpus)));
 -
 -              if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
 -                      len += sysfs_emit_at(buf, len, " nodes=%*pbl",
 -                                           nodemask_pr_args(&l->nodes));
 -
 -              len += sysfs_emit_at(buf, len, "\n");
 -      }
 -
 -      free_loc_track(&t);
 -      if (!t.count)
 -              len += sysfs_emit_at(buf, len, "No data\n");
 -
 -      return len;
 -}
 +#endif  /* CONFIG_DEBUG_FS   */
  #endif        /* CONFIG_SLUB_DEBUG */
  
 -#ifdef SLUB_RESILIENCY_TEST
 -static void __init resiliency_test(void)
 -{
 -      u8 *p;
 -      int type = KMALLOC_NORMAL;
 -
 -      BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
 -
 -      pr_err("SLUB resiliency testing\n");
 -      pr_err("-----------------------\n");
 -      pr_err("A. Corruption after allocation\n");
 -
 -      p = kzalloc(16, GFP_KERNEL);
 -      p[16] = 0x12;
 -      pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
 -             p + 16);
 -
 -      validate_slab_cache(kmalloc_caches[type][4]);
 -
 -      /* Hmmm... The next two are dangerous */
 -      p = kzalloc(32, GFP_KERNEL);
 -      p[32 + sizeof(void *)] = 0x34;
 -      pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
 -             p);
 -      pr_err("If allocated object is overwritten then not detectable\n\n");
 -
 -      validate_slab_cache(kmalloc_caches[type][5]);
 -      p = kzalloc(64, GFP_KERNEL);
 -      p += 64 + (get_cycles() & 0xff) * sizeof(void *);
 -      *p = 0x56;
 -      pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
 -             p);
 -      pr_err("If allocated object is overwritten then not detectable\n\n");
 -      validate_slab_cache(kmalloc_caches[type][6]);
 -
 -      pr_err("\nB. Corruption after free\n");
 -      p = kzalloc(128, GFP_KERNEL);
 -      kfree(p);
 -      *p = 0x78;
 -      pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
 -      validate_slab_cache(kmalloc_caches[type][7]);
 -
 -      p = kzalloc(256, GFP_KERNEL);
 -      kfree(p);
 -      p[50] = 0x9a;
 -      pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
 -      validate_slab_cache(kmalloc_caches[type][8]);
 -
 -      p = kzalloc(512, GFP_KERNEL);
 -      kfree(p);
 -      p[512] = 0xab;
 -      pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
 -      validate_slab_cache(kmalloc_caches[type][9]);
 -}
 -#else
 -#ifdef CONFIG_SYSFS
 -static void resiliency_test(void) {};
 -#endif
 -#endif        /* SLUB_RESILIENCY_TEST */
 -
  #ifdef CONFIG_SYSFS
  enum slab_stat_type {
        SL_ALL,                 /* All slabs */
@@@ -5282,6 -5366,21 +5290,6 @@@ static ssize_t validate_store(struct km
  }
  SLAB_ATTR(validate);
  
 -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
 -{
 -      if (!(s->flags & SLAB_STORE_USER))
 -              return -ENOSYS;
 -      return list_locations(s, buf, TRACK_ALLOC);
 -}
 -SLAB_ATTR_RO(alloc_calls);
 -
 -static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
 -{
 -      if (!(s->flags & SLAB_STORE_USER))
 -              return -ENOSYS;
 -      return list_locations(s, buf, TRACK_FREE);
 -}
 -SLAB_ATTR_RO(free_calls);
  #endif /* CONFIG_SLUB_DEBUG */
  
  #ifdef CONFIG_FAILSLAB
@@@ -5445,6 -5544,8 +5453,6 @@@ static struct attribute *slab_attrs[] 
        &poison_attr.attr,
        &store_user_attr.attr,
        &validate_attr.attr,
 -      &alloc_calls_attr.attr,
 -      &free_calls_attr.attr,
  #endif
  #ifdef CONFIG_ZONE_DMA
        &cache_dma_attr.attr,
@@@ -5726,179 -5827,13 +5734,179 @@@ static int __init slab_sysfs_init(void
        }
  
        mutex_unlock(&slab_mutex);
 -      resiliency_test();
        return 0;
  }
  
  __initcall(slab_sysfs_init);
  #endif /* CONFIG_SYSFS */
  
 +#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
 +static int slab_debugfs_show(struct seq_file *seq, void *v)
 +{
 +
 +      struct location *l;
 +      unsigned int idx = *(unsigned int *)v;
 +      struct loc_track *t = seq->private;
 +
 +      if (idx < t->count) {
 +              l = &t->loc[idx];
 +
 +              seq_printf(seq, "%7ld ", l->count);
 +
 +              if (l->addr)
 +                      seq_printf(seq, "%pS", (void *)l->addr);
 +              else
 +                      seq_puts(seq, "<not-available>");
 +
 +              if (l->sum_time != l->min_time) {
 +                      seq_printf(seq, " age=%ld/%llu/%ld",
 +                              l->min_time, div_u64(l->sum_time, l->count),
 +                              l->max_time);
 +              } else
 +                      seq_printf(seq, " age=%ld", l->min_time);
 +
 +              if (l->min_pid != l->max_pid)
 +                      seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
 +              else
 +                      seq_printf(seq, " pid=%ld",
 +                              l->min_pid);
 +
 +              if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
 +                      seq_printf(seq, " cpus=%*pbl",
 +                               cpumask_pr_args(to_cpumask(l->cpus)));
 +
 +              if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
 +                      seq_printf(seq, " nodes=%*pbl",
 +                               nodemask_pr_args(&l->nodes));
 +
 +              seq_puts(seq, "\n");
 +      }
 +
 +      if (!idx && !t->count)
 +              seq_puts(seq, "No data\n");
 +
 +      return 0;
 +}
 +
 +static void slab_debugfs_stop(struct seq_file *seq, void *v)
 +{
 +}
 +
 +static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
 +{
 +      struct loc_track *t = seq->private;
 +
 +      v = ppos;
 +      ++*ppos;
 +      if (*ppos <= t->count)
 +              return v;
 +
 +      return NULL;
 +}
 +
 +static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
 +{
 +      return ppos;
 +}
 +
 +static const struct seq_operations slab_debugfs_sops = {
 +      .start  = slab_debugfs_start,
 +      .next   = slab_debugfs_next,
 +      .stop   = slab_debugfs_stop,
 +      .show   = slab_debugfs_show,
 +};
 +
 +static int slab_debug_trace_open(struct inode *inode, struct file *filep)
 +{
 +
 +      struct kmem_cache_node *n;
 +      enum track_item alloc;
 +      int node;
 +      struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
 +                                              sizeof(struct loc_track));
 +      struct kmem_cache *s = file_inode(filep)->i_private;
 +
 +      if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
 +              alloc = TRACK_ALLOC;
 +      else
 +              alloc = TRACK_FREE;
 +
 +      if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL))
 +              return -ENOMEM;
 +
 +      /* Push back cpu slabs */
 +      flush_all(s);
 +
 +      for_each_kmem_cache_node(s, node, n) {
 +              unsigned long flags;
 +              struct page *page;
 +
 +              if (!atomic_long_read(&n->nr_slabs))
 +                      continue;
 +
 +              spin_lock_irqsave(&n->list_lock, flags);
 +              list_for_each_entry(page, &n->partial, slab_list)
 +                      process_slab(t, s, page, alloc);
 +              list_for_each_entry(page, &n->full, slab_list)
 +                      process_slab(t, s, page, alloc);
 +              spin_unlock_irqrestore(&n->list_lock, flags);
 +      }
 +
 +      return 0;
 +}
 +
 +static int slab_debug_trace_release(struct inode *inode, struct file *file)
 +{
 +      struct seq_file *seq = file->private_data;
 +      struct loc_track *t = seq->private;
 +
 +      free_loc_track(t);
 +      return seq_release_private(inode, file);
 +}
 +
 +static const struct file_operations slab_debugfs_fops = {
 +      .open    = slab_debug_trace_open,
 +      .read    = seq_read,
 +      .llseek  = seq_lseek,
 +      .release = slab_debug_trace_release,
 +};
 +
 +static void debugfs_slab_add(struct kmem_cache *s)
 +{
 +      struct dentry *slab_cache_dir;
 +
 +      if (unlikely(!slab_debugfs_root))
 +              return;
 +
 +      slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
 +
 +      debugfs_create_file("alloc_traces", 0400,
 +              slab_cache_dir, s, &slab_debugfs_fops);
 +
 +      debugfs_create_file("free_traces", 0400,
 +              slab_cache_dir, s, &slab_debugfs_fops);
 +}
 +
 +void debugfs_slab_release(struct kmem_cache *s)
 +{
 +      debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root));
 +}
 +
 +static int __init slab_debugfs_init(void)
 +{
 +      struct kmem_cache *s;
 +
 +      slab_debugfs_root = debugfs_create_dir("slab", NULL);
 +
 +      list_for_each_entry(s, &slab_caches, list)
 +              if (s->flags & SLAB_STORE_USER)
 +                      debugfs_slab_add(s);
 +
 +      return 0;
 +
 +}
 +__initcall(slab_debugfs_init);
 +#endif
  /*
   * The /proc/slabinfo ABI
   */
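
Editor's note: slab_add_kunit_errors() above redirects SLUB debug reports into a KUnit named resource called "slab_errors" instead of tainting the kernel, and validate_slab_cache() is exported so a test can trigger validation on demand. The sketch below is one way a KUnit test could consume that hook; the suite, case and cache names are invented, the validate_slab_cache() prototype is taken from the mm/slab.h hunk above, and CONFIG_SLUB_DEBUG plus CONFIG_KUNIT are assumed.

#include <kunit/test.h>
#include <linux/slab.h>
#include <linux/kasan.h>
#include "../mm/slab.h"			/* validate_slab_cache() prototype */

static struct kunit_resource resource;
static int slab_errors;

static int slub_sketch_init(struct kunit *test)
{
	slab_errors = 0;
	/* The name must match the lookup in slab_add_kunit_errors(). */
	kunit_add_named_resource(test, NULL, NULL, &resource,
				 "slab_errors", &slab_errors);
	return 0;
}

static void sketch_clobber_redzone(struct kunit *test)
{
	struct kmem_cache *s = kmem_cache_create("sketch_rz", 64, 0,
						 SLAB_RED_ZONE, NULL);
	u8 *p = kmem_cache_alloc(s, GFP_KERNEL);

	kasan_disable_current();	/* the write below is deliberately bad */
	p[64] = 0x12;			/* scribble on the right redzone */

	validate_slab_cache(s);
	KUNIT_EXPECT_NE(test, 0, slab_errors);

	kasan_enable_current();
	kmem_cache_free(s, p);
	kmem_cache_destroy(s);
}

static struct kunit_case slub_sketch_cases[] = {
	KUNIT_CASE(sketch_clobber_redzone),
	{}
};

static struct kunit_suite slub_sketch_suite = {
	.name = "slub_debug_sketch",
	.init = slub_sketch_init,
	.test_cases = slub_sketch_cases,
};
kunit_test_suite(slub_sketch_suite);

MODULE_LICENSE("GPL");
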
diff --combined mm/util.c
index a034525e7ba20522fa9af16ac37bea6eb57b827c,0b6dd9d81da797b5c7bf9a86808175db42981e05..99c6cc77de9e2fff60346dd8b2da1b504af8f570
+++ b/mm/util.c
@@@ -983,7 -983,7 +983,7 @@@ int __weak memcmp_pages(struct page *pa
   * depends on the type of object and on how much debugging is enabled.
   * For example, for a slab-cache object, the slab name is printed, and,
   * if available, the return address and stack trace from the allocation
-  * of that object.
+  * and last free path of that object.
   */
  void mem_dump_obj(void *object)
  {
  }
  EXPORT_SYMBOL_GPL(mem_dump_obj);
  #endif
 +
 +/*
 + * A driver might set a page logically offline -- PageOffline() -- and
 + * turn the page inaccessible in the hypervisor; after that, access to page
 + * content can be fatal.
 + *
 + * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 + * pages after checking PageOffline(); however, these PFN walkers can race
 + * with drivers that set PageOffline().
 + *
 + * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 + * synchronize with such drivers, achieving that a page cannot be set
 + * PageOffline() while frozen.
 + *
 + * page_offline_begin()/page_offline_end() is used by drivers that care about
 + * such races when setting a page PageOffline().
 + */
 +static DECLARE_RWSEM(page_offline_rwsem);
 +
 +void page_offline_freeze(void)
 +{
 +      down_read(&page_offline_rwsem);
 +}
 +
 +void page_offline_thaw(void)
 +{
 +      up_read(&page_offline_rwsem);
 +}
 +
 +void page_offline_begin(void)
 +{
 +      down_write(&page_offline_rwsem);
 +}
 +EXPORT_SYMBOL(page_offline_begin);
 +
 +void page_offline_end(void)
 +{
 +      up_write(&page_offline_rwsem);
 +}
 +EXPORT_SYMBOL(page_offline_end);
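
Editor's note: the comment block above states the contract; as a rough sketch (not code from this commit), a PFN walker in the style of /proc/kcore would take the read side around both the PageOffline() test and the copy, so a driver cannot slip a page_offline_begin()/page_offline_end() transition in between. The helper name and buffer handling below are invented for illustration.

#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/memory_hotplug.h>	/* pfn_to_online_page() */
#include <linux/string.h>

/* Copy one page's contents into buf unless the page is logically offline. */
static bool sketch_copy_online_page(unsigned long pfn, void *buf)
{
	struct page *page = pfn_to_online_page(pfn);
	bool copied = false;
	void *kaddr;

	if (!page)
		return false;

	page_offline_freeze();		/* hold off page_offline_begin() writers */
	if (!PageOffline(page)) {
		kaddr = kmap_local_page(page);
		memcpy(buf, kaddr, PAGE_SIZE);
		kunmap_local(kaddr);
		copied = true;
	}
	page_offline_thaw();

	return copied;
}
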