Merge branch 'exp.2022.05.11a' into HEAD

author Paul E. McKenney <[email protected]>

Wed, 11 May 2022 18:49:35 +0000 (11:49 -0700)

committer Paul E. McKenney <[email protected]>

Wed, 11 May 2022 18:49:35 +0000 (11:49 -0700)
author Paul E. McKenney <[email protected]>
Wed, 11 May 2022 18:49:35 +0000 (11:49 -0700)
committer Paul E. McKenney <[email protected]>
Wed, 11 May 2022 18:49:35 +0000 (11:49 -0700)
diff --combined Documentation/admin-guide/kernel-parameters.txt

index 82dd253e5dbdcc2d348a8c034a585f3512d61f66,5e21a3fb57c46d18c553c596e9c2f7fd8e7ae670..f8d9af5d51e5dcfdf7a0266064cc0b8d4fa82ca3
--- 1/Documentation/admin-guide/kernel-parameters.txt
--- 2/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@@ -4893,6 -4893,18 +4893,18 @@@
   
         rcupdate.rcu_cpu_stall_timeout= [KNL]
                         Set timeout for RCU CPU stall warning messages.
+                       The value is in seconds and the maximum allowed
+                       value is 300 seconds.
+ 
+       rcupdate.rcu_exp_cpu_stall_timeout= [KNL]
+                       Set timeout for expedited RCU CPU stall warning
+                       messages.  The value is in milliseconds
+                       and the maximum allowed value is 21000
+                       milliseconds. Please note that this value is
+                       adjusted to an arch timer tick resolution.
+                       Setting this to zero causes the value from
+                       rcupdate.rcu_cpu_stall_timeout to be used (after
+                       conversion from seconds to milliseconds).
   
         rcupdate.rcu_expedited= [KNL]
                         Use expedited grace-period primitives, for
@@@ -4955,34 -4967,10 +4967,34 @@@
                         number avoids disturbing real-time workloads,
                         but lengthens grace periods.
   
+ +      rcupdate.rcu_task_stall_info= [KNL]
+ +                      Set initial timeout in jiffies for RCU task stall
+ +                      informational messages, which give some indication
+ +                      of the problem for those not patient enough to
+ +                      wait for ten minutes.  Informational messages are
+ +                      only printed prior to the stall-warning message
+ +                      for a given grace period. Disable with a value
+ +                      less than or equal to zero.  Defaults to ten
+ +                      seconds.  A change in value does not take effect
+ +                      until the beginning of the next grace period.
+ +
+ +      rcupdate.rcu_task_stall_info_mult= [KNL]
+ +                      Multiplier for time interval between successive
+ +                      RCU task stall informational messages for a given
+ +                      RCU tasks grace period.  This value is clamped
+ +                      to one through ten, inclusive.  It defaults to
+ +                      the value three, so that the first informational
+ +                      message is printed 10 seconds into the grace
+ +                      period, the second at 40 seconds, the third at
+ +                      160 seconds, and then the stall warning at 600
+ +                      seconds would prevent a fourth at 640 seconds.
+ +
         rcupdate.rcu_task_stall_timeout= [KNL]
- -                      Set timeout in jiffies for RCU task stall warning
- -                      messages.  Disable with a value less than or equal
- -                      to zero.
+ +                      Set timeout in jiffies for RCU task stall
+ +                      warning messages.  Disable with a value less
+ +                      than or equal to zero.  Defaults to ten minutes.
+ +                      A change in value does not take effect until
+ +                      the beginning of the next grace period.
   
         rcupdate.rcu_self_test= [KNL]
                         Run the RCU early boot self tests
@@@ -5401,17 -5389,6 +5413,17 @@@
         smart2=         [HW]
                         Format: <io1>[,<io2>[,...,<io8>]]
   
+ +      smp.csd_lock_timeout= [KNL]
+ +                      Specify the period of time in milliseconds
+ +                      that smp_call_function() and friends will wait
+ +                      for a CPU to release the CSD lock.  This is
+ +                      useful when diagnosing bugs involving CPUs
+ +                      disabling interrupts for extended periods
+ +                      of time.  Defaults to 5,000 milliseconds, and
+ +                      setting a value of zero disables this feature.
+ +                      This feature may be more efficiently disabled
+ +                      using the csdlock_debug= kernel parameter.
+ +
         smsc-ircc2.nopnp        [HW] Don't use PNP to discover SMC devices
         smsc-ircc2.ircc_cfg=    [HW] Device configuration I/O port
         smsc-ircc2.ircc_sir=    [HW] SIR base I/O port
@@@ -5643,30 -5620,6 +5655,30 @@@
                         off:    Disable mitigation and remove
                                 performance impact to RDRAND and RDSEED
   
+ +      srcutree.big_cpu_lim [KNL]
+ +                      Specifies the number of CPUs constituting a
+ +                      large system, such that srcu_struct structures
+ +                      should immediately allocate an srcu_node array.
+ +                      This kernel-boot parameter defaults to 128,
+ +                      but takes effect only when the low-order four
+ +                      bits of srcutree.convert_to_big are equal to 3
+ +                      (decide at boot).
+ +
+ +      srcutree.convert_to_big [KNL]
+ +                      Specifies under what conditions an SRCU tree
+ +                      srcu_struct structure will be converted to big
+ +                      form, that is, with an srcu_node tree:
+ +
+ +                                 0:  Never.
+ +                                 1:  At init_srcu_struct() time.
+ +                                 2:  When rcutorture decides to.
+ +                                 3:  Decide at boot time (default).
+ +                              0x1X:  Above plus if high contention.
+ +
+ +                      Either way, the srcu_node tree will be sized based
+ +                      on the actual runtime number of CPUs (nr_cpu_ids)
+ +                      instead of the compile-time CONFIG_NR_CPUS.
+ +
         srcutree.counter_wrap_check [KNL]
                         Specifies how frequently to check for
                         grace-period sequence counter wrap for the
@@@ -5684,14 -5637,6 +5696,14 @@@
                         expediting.  Set to zero to disable automatic
                         expediting.
   
+ +      srcutree.small_contention_lim [KNL]
+ +                      Specifies the number of update-side contention
+ +                      events per jiffy that will be tolerated before
+ +                      initiating a conversion of an srcu_struct
+ +                      structure to big form.  Note that the value of
+ +                      srcutree.convert_to_big must have the 0x10 bit
+ +                      set for contention-based conversions to occur.
+ +
         ssbd=           [ARM64,HW]
                         Speculative Store Bypass Disable control
   
diff --combined kernel/rcu/Kconfig

index 65d45c00fd1b79357c02a40dc651d993a5de6073,fd64a75823cba1ca83cf510cc9c3740974a42ec4..1c630e573548df34a28a9061e87b961c2a879573
--- 1/kernel/rcu/Kconfig
--- 2/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@@ -77,56 -77,31 +77,56 @@@ config TASKS_RCU_GENERI
           This option enables generic infrastructure code supporting
           task-based RCU implementations.  Not for manual selection.
   
+ +config FORCE_TASKS_RCU
+ +      bool "Force selection of TASKS_RCU"
+ +      depends on RCU_EXPERT
+ +      select TASKS_RCU
+ +      default n
+ +      help
+ +        This option force-enables a task-based RCU implementation
+ +        that uses only voluntary context switch (not preemption!),
+ +        idle, and user-mode execution as quiescent states.  Not for
+ +        manual selection in most cases.
+ +
   config TASKS_RCU
- -      def_bool PREEMPTION
+ +      bool
+ +      default n
+ +      select IRQ_WORK
+ +
+ +config FORCE_TASKS_RUDE_RCU
+ +      bool "Force selection of Tasks Rude RCU"
+ +      depends on RCU_EXPERT
+ +      select TASKS_RUDE_RCU
+ +      default n
         help
- -        This option enables a task-based RCU implementation that uses
- -        only voluntary context switch (not preemption!), idle, and
- -        user-mode execution as quiescent states.  Not for manual selection.
+ +        This option force-enables a task-based RCU implementation
+ +        that uses only context switch (including preemption) and
+ +        user-mode execution as quiescent states.  It forces IPIs and
+ +        context switches on all online CPUs, including idle ones,
+ +        so use with caution.  Not for manual selection in most cases.
   
   config TASKS_RUDE_RCU
- -      def_bool 0
+ +      bool
+ +      default n
+ +      select IRQ_WORK
+ +
+ +config FORCE_TASKS_TRACE_RCU
+ +      bool "Force selection of Tasks Trace RCU"
+ +      depends on RCU_EXPERT
+ +      select TASKS_TRACE_RCU
+ +      default n
         help
           This option enables a task-based RCU implementation that uses
- -        only context switch (including preemption) and user-mode
- -        execution as quiescent states.  It forces IPIs and context
- -        switches on all online CPUs, including idle ones, so use
- -        with caution.
+ +        explicit rcu_read_lock_trace() read-side markers, and allows
+ +        these readers to appear in the idle loop as well as on the
+ +        CPU hotplug code paths.  It can force IPIs on online CPUs,
+ +        including idle ones, so use with caution.  Not for manual
+ +        selection in most cases.
   
   config TASKS_TRACE_RCU
- -      def_bool 0
+ +      bool
+ +      default n
         select IRQ_WORK
- -      help
- -        This option enables a task-based RCU implementation that uses
- -        explicit rcu_read_lock_trace() read-side markers, and allows
- -        these readers to appear in the idle loop as well as on the CPU
- -        hotplug code paths.  It can force IPIs on online CPUs, including
- -        idle ones, so use with caution.
   
   config RCU_STALL_COMMON
         def_bool TREE_RCU
@@@ -219,6 -194,20 +219,20 @@@ config RCU_BOOST_DELA
           blocking an expedited RCU grace period is boosted immediately.
   
           Accept the default if unsure.
+ 
+ config RCU_EXP_KTHREAD
+       bool "Perform RCU expedited work in a real-time kthread"
+       depends on RCU_BOOST && RCU_EXPERT
+       default !PREEMPT_RT && NR_CPUS <= 32
+       help
+         Use this option to further reduce the latencies of expedited
+         grace periods at the expense of being more disruptive.
+ 
+         This option is disabled by default on PREEMPT_RT=y kernels which
+         disable expedited grace periods after boot by unconditionally
+         setting rcupdate.rcu_normal_after_boot=1.
+ 
+         Accept the default if unsure.
   
   config RCU_NOCB_CPU
         bool "Offload RCU callback processing from boot-selected CPUs"
@@@ -250,7 -239,7 +264,7 @@@
   
   config TASKS_TRACE_RCU_READ_MB
         bool "Tasks Trace RCU readers use memory barriers in user and idle"
- -      depends on RCU_EXPERT
+ +      depends on RCU_EXPERT && TASKS_TRACE_RCU
         default PREEMPT_RT || NR_CPUS < 8
         help
           Use this option to further reduce the number of IPIs sent
diff --combined kernel/rcu/Kconfig.debug

index 68092e1db64bde2d0253aa1d58cf56827de95e68,0b397b5bf8469a4880a6f7ad61b02c3e948fd1e8..9b64e55d4f6159f44eac3017ded4b6c67a1803a1
--- 1/kernel/rcu/Kconfig.debug
--- 2/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@@ -28,6 -28,9 +28,6 @@@ config RCU_SCALE_TES
         depends on DEBUG_KERNEL
         select TORTURE_TEST
         select SRCU
- -      select TASKS_RCU
- -      select TASKS_RUDE_RCU
- -      select TASKS_TRACE_RCU
         default n
         help
           This option provides a kernel module that runs performance
@@@ -44,6 -47,9 +44,6 @@@ config RCU_TORTURE_TES
         depends on DEBUG_KERNEL
         select TORTURE_TEST
         select SRCU
- -      select TASKS_RCU
- -      select TASKS_RUDE_RCU
- -      select TASKS_TRACE_RCU
         default n
         help
           This option provides a kernel module that runs torture tests
@@@ -60,6 -66,9 +60,6 @@@ config RCU_REF_SCALE_TES
         depends on DEBUG_KERNEL
         select TORTURE_TEST
         select SRCU
- -      select TASKS_RCU
- -      select TASKS_RUDE_RCU
- -      select TASKS_TRACE_RCU
         default n
         help
           This option provides a kernel module that runs performance tests
@@@ -82,6 -91,20 +82,20 @@@ config RCU_CPU_STALL_TIMEOU
           RCU grace period persists, additional CPU stall warnings are
           printed at more widely spaced intervals.
   
+ config RCU_EXP_CPU_STALL_TIMEOUT
+       int "Expedited RCU CPU stall timeout in milliseconds"
+       depends on RCU_STALL_COMMON
+       range 0 21000
+       default 20 if ANDROID
+       default 0 if !ANDROID
+       help
+         If a given expedited RCU grace period extends more than the
+         specified number of milliseconds, a CPU stall warning is printed.
+         If the RCU grace period persists, additional CPU stall warnings
+         are printed at more widely spaced intervals.  A value of zero
+         says to use the RCU_CPU_STALL_TIMEOUT value converted from
+         seconds to milliseconds.
+ 
   config RCU_TRACE
         bool "Enable tracing for RCU"
         depends on DEBUG_KERNEL
diff --combined kernel/rcu/rcu.h

index a985dfc2ce26e37b534510ab83615d44caf1631a,e27bf7d1e3a415051eef16368d21a615979d254d..152492d52715647f953c97c264954cb3496de786
--- 1/kernel/rcu/rcu.h
--- 2/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@@ -210,7 -210,9 +210,9 @@@ static inline bool rcu_stall_is_suppres
   extern int rcu_cpu_stall_ftrace_dump;
   extern int rcu_cpu_stall_suppress;
   extern int rcu_cpu_stall_timeout;
+ extern int rcu_exp_cpu_stall_timeout;
   int rcu_jiffies_till_stall_check(void);
+ int rcu_exp_jiffies_till_stall_check(void);
   
   static inline bool rcu_stall_is_suppressed(void)
   {
@@@ -523,8 -525,6 +525,8 @@@ static inline bool rcu_check_boost_fail
   static inline void show_rcu_gp_kthreads(void) { }
   static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
   static inline void rcu_fwd_progress_check(unsigned long j) { }
+ +static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+ +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
   #else /* #ifdef CONFIG_TINY_RCU */
   bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
   unsigned long rcu_get_gp_seq(void);
@@@ -536,14 -536,19 +538,19 @@@ int rcu_get_gp_kthreads_prio(void)
   void rcu_fwd_progress_check(unsigned long j);
   void rcu_force_quiescent_state(void);
   extern struct workqueue_struct *rcu_gp_wq;
+ #ifdef CONFIG_RCU_EXP_KTHREAD
+ extern struct kthread_worker *rcu_exp_gp_kworker;
+ extern struct kthread_worker *rcu_exp_par_gp_kworker;
+ #else /* !CONFIG_RCU_EXP_KTHREAD */
   extern struct workqueue_struct *rcu_par_gp_wq;
+ #endif /* CONFIG_RCU_EXP_KTHREAD */
+ +void rcu_gp_slow_register(atomic_t *rgssp);
+ +void rcu_gp_slow_unregister(atomic_t *rgssp);
   #endif /* #else #ifdef CONFIG_TINY_RCU */
   
   #ifdef CONFIG_RCU_NOCB_CPU
- -bool rcu_is_nocb_cpu(int cpu);
   void rcu_bind_current_to_nocb(void);
   #else
- -static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
   static inline void rcu_bind_current_to_nocb(void) { }
   #endif
   
diff --combined kernel/rcu/tree.c

index 75a35b7adbfa8e17514f706c946cc881a07beb9d,763e45fdf49b1d2021af9a9349354bc80a08ea77..c25ba442044a6e9452a43f617a93abd4a8b6ab08
--- 1/kernel/rcu/tree.c
--- 2/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@@ -1679,8 -1679,6 +1679,8 @@@ static bool __note_gp_changes(struct rc
         rdp->gp_seq = rnp->gp_seq;  /* Remember new grace-period state. */
         if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
                 WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ +      if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ +              WRITE_ONCE(rdp->last_sched_clock, jiffies);
         WRITE_ONCE(rdp->gpwrap, false);
         rcu_gpnum_ovf(rnp, rdp);
         return ret;
@@@ -1707,37 -1705,11 +1707,37 @@@ static void note_gp_changes(struct rcu_
                 rcu_gp_kthread_wake();
   }
   
+ +static atomic_t *rcu_gp_slow_suppress;
+ +
+ +/* Register a counter to suppress debugging grace-period delays. */
+ +void rcu_gp_slow_register(atomic_t *rgssp)
+ +{
+ +      WARN_ON_ONCE(rcu_gp_slow_suppress);
+ +
+ +      WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
+ +}
+ +EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
+ +
+ +/* Unregister a counter, with NULL for not caring which. */
+ +void rcu_gp_slow_unregister(atomic_t *rgssp)
+ +{
+ +      WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress);
+ +
+ +      WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+ +}
+ +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
+ +
+ +static bool rcu_gp_slow_is_suppressed(void)
+ +{
+ +      atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
+ +
+ +      return rgssp && atomic_read(rgssp);
+ +}
+ +
   static void rcu_gp_slow(int delay)
   {
- -      if (delay > 0 &&
- -          !(rcu_seq_ctr(rcu_state.gp_seq) %
- -            (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ +      if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+ +          !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
                 schedule_timeout_idle(delay);
   }
   
@@@ -2124,29 -2096,14 +2124,29 @@@ static noinline void rcu_gp_cleanup(voi
         /* Advance CBs to reduce false positives below. */
         offloaded = rcu_rdp_is_offloaded(rdp);
         if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+ +
+ +              // We get here if a grace period was needed (“needgp”)
+ +              // and the above call to rcu_accelerate_cbs() did not set
+ +              // the RCU_GP_FLAG_INIT bit in ->gp_state (which records
+ +              // the RCU_GP_FLAG_INIT bit in ->gp_flags (which records
+ +              // of the “offloaded” check is to avoid invoking
+ +              // rcu_accelerate_cbs() on an offloaded CPU because we do not
+ +              // hold the ->nocb_lock needed to safely access an offloaded
+ +              // ->cblist.  We do not want to acquire that lock because
+ +              // it can be heavily contended during callback floods.
+ +
                 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
                 WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
- -              trace_rcu_grace_period(rcu_state.name,
- -                                     rcu_state.gp_seq,
- -                                     TPS("newreq"));
+ +              trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
         } else {
- -              WRITE_ONCE(rcu_state.gp_flags,
- -                         rcu_state.gp_flags & RCU_GP_FLAG_INIT);
+ +
+ +              // We get here either if there is no need for an
+ +              // additional grace period or if rcu_accelerate_cbs() has
+ +              // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. 
+ +              // So all we need to do is to clear all of the other
+ +              // ->gp_flags bits.
+ +
+ +              WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
         }
         raw_spin_unlock_irq_rcu_node(rnp);
   
@@@ -2652,13 -2609,6 +2652,13 @@@ static void rcu_do_batch(struct rcu_dat
    */
   void rcu_sched_clock_irq(int user)
   {
+ +      unsigned long j;
+ +
+ +      if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+ +              j = jiffies;
+ +              WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+ +              __this_cpu_write(rcu_data.last_sched_clock, j);
+ +      }
         trace_rcu_utilization(TPS("Start scheduler-tick"));
         lockdep_assert_irqs_disabled();
         raw_cpu_inc(rcu_data.ticks_this_gp);
@@@ -2674,8 -2624,6 +2674,8 @@@
         rcu_flavor_sched_clock_irq(user);
         if (rcu_pending(user))
                 invoke_rcu_core();
+ +      if (user)
+ +              rcu_tasks_classic_qs(current, false);
         lockdep_assert_irqs_disabled();
   
         trace_rcu_utilization(TPS("End scheduler-tick"));
@@@ -3769,9 -3717,7 +3769,9 @@@ static int rcu_blocking_is_gp(void
   {
         int ret;
   
- -      if (IS_ENABLED(CONFIG_PREEMPTION))
+ +      // Invoking preempt_model_*() too early gets a splat.
+ +      if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
+ +          preempt_model_full() || preempt_model_rt())
                 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
         might_sleep();  /* Check for RCU read-side critical section. */
         preempt_disable();
@@@ -4233,7 -4179,6 +4233,7 @@@ rcu_boot_init_percpu_data(int cpu
         rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
         rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
         rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ +      rdp->last_sched_clock = jiffies;
         rdp->cpu = cpu;
         rcu_boot_init_nocb_percpu_data(rdp);
   }
@@@ -4526,6 -4471,51 +4526,51 @@@ static int rcu_pm_notify(struct notifie
         return NOTIFY_OK;
   }
   
+ #ifdef CONFIG_RCU_EXP_KTHREAD
+ struct kthread_worker *rcu_exp_gp_kworker;
+ struct kthread_worker *rcu_exp_par_gp_kworker;
+ 
+ static void __init rcu_start_exp_gp_kworkers(void)
+ {
+       const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+       const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+       struct sched_param param = { .sched_priority = kthread_prio };
+ 
+       rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+       if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+               pr_err("Failed to create %s!\n", gp_kworker_name);
+               return;
+       }
+ 
+       rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+       if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+               pr_err("Failed to create %s!\n", par_gp_kworker_name);
+               kthread_destroy_worker(rcu_exp_gp_kworker);
+               return;
+       }
+ 
+       sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+       sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+                                  &param);
+ }
+ 
+ static inline void rcu_alloc_par_gp_wq(void)
+ {
+ }
+ #else /* !CONFIG_RCU_EXP_KTHREAD */
+ struct workqueue_struct *rcu_par_gp_wq;
+ 
+ static void __init rcu_start_exp_gp_kworkers(void)
+ {
+ }
+ 
+ static inline void rcu_alloc_par_gp_wq(void)
+ {
+       rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+       WARN_ON(!rcu_par_gp_wq);
+ }
+ #endif /* CONFIG_RCU_EXP_KTHREAD */
+ 
   /*
    * Spawn the kthreads that handle RCU's grace periods.
    */
@@@ -4535,7 -4525,6 +4580,7 @@@ static int __init rcu_spawn_gp_kthread(
         struct rcu_node *rnp;
         struct sched_param sp;
         struct task_struct *t;
+ +      struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
   
         rcu_scheduler_fully_active = 1;
         t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
@@@ -4553,15 -4542,11 +4598,17 @@@
         smp_store_release(&rcu_state.gp_kthread, t);  /* ^^^ */
         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
         wake_up_process(t);
- -      rcu_spawn_nocb_kthreads();
- -      rcu_spawn_boost_kthreads();
+ +      /* This is a pre-SMP initcall, we expect a single CPU */
+ +      WARN_ON(num_online_cpus() > 1);
+ +      /*
+ +       * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+ +       * due to rcu_scheduler_fully_active.
+ +       */
+ +      rcu_spawn_cpu_nocb_kthread(smp_processor_id());
+ +      rcu_spawn_one_boost_kthread(rdp->mynode);
         rcu_spawn_core_kthreads();
+       /* Create kthread worker for expedited GPs */
+       rcu_start_exp_gp_kworkers();
         return 0;
   }
   early_initcall(rcu_spawn_gp_kthread);
@@@ -4807,7 -4792,6 +4854,6 @@@ static void __init rcu_dump_rcu_node_tr
   }
   
   struct workqueue_struct *rcu_gp_wq;
- struct workqueue_struct *rcu_par_gp_wq;
   
   static void __init kfree_rcu_batch_init(void)
   {
@@@ -4844,7 -4828,7 +4890,7 @@@
   
   void __init rcu_init(void)
   {
- -      int cpu;
+ +      int cpu = smp_processor_id();
   
         rcu_early_boot_tests();
   
@@@ -4864,16 -4848,16 +4910,15 @@@
          * or the scheduler are operational.
          */
         pm_notifier(rcu_pm_notify, 0);
- -      for_each_online_cpu(cpu) {
- -              rcutree_prepare_cpu(cpu);
- -              rcu_cpu_starting(cpu);
- -              rcutree_online_cpu(cpu);
- -      }
+ +      WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+ +      rcutree_prepare_cpu(cpu);
+ +      rcu_cpu_starting(cpu);
+ +      rcutree_online_cpu(cpu);
   
         /* Create workqueue for Tree SRCU and for expedited GPs. */
         rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
         WARN_ON(!rcu_gp_wq);
-       rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-       WARN_ON(!rcu_par_gp_wq);
+       rcu_alloc_par_gp_wq();
   
         /* Fill in default value for rcutree.qovld boot parameter. */
         /* -After- the rcu_node ->lock fields are initialized! */
diff --combined kernel/rcu/tree.h

index 8aa5bf74e796a19463cd8289037b102dba09c7d1,b577cdfdc851086563a9da1839f9fc70a28e028a..2ccf5845957df4201a814de0540b8fbabc6e9412
--- 1/kernel/rcu/tree.h
--- 2/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@@ -10,6 -10,7 +10,7 @@@
    */
   
   #include <linux/cache.h>
+ #include <linux/kthread.h>
   #include <linux/spinlock.h>
   #include <linux/rtmutex.h>
   #include <linux/threads.h>
@@@ -23,7 -24,11 +24,11 @@@
   /* Communicate arguments to a workqueue handler. */
   struct rcu_exp_work {
         unsigned long rew_s;
+ #ifdef CONFIG_RCU_EXP_KTHREAD
+       struct kthread_work rew_work;
+ #else
         struct work_struct rew_work;
+ #endif /* CONFIG_RCU_EXP_KTHREAD */
   };
   
   /* RCU's kthread states for tracing. */
@@@ -254,7 -259,6 +259,7 @@@ struct rcu_data 
         unsigned long rcu_onl_gp_seq;   /* ->gp_seq at last online. */
         short rcu_onl_gp_flags;         /* ->gp_flags at last online. */
         unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
+ +      unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
   
         int cpu;
   };
@@@ -365,7 -369,6 +370,7 @@@ struct rcu_state 
         arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
                                                 /* Synchronize offline with */
                                                 /*  GP pre-initialization. */
+ +      int nocb_is_setup;                      /* nocb is setup from boot */
   };
   
   /* Values for rcu_state structure's gp_flags field. */
@@@ -423,6 -426,7 +428,6 @@@ static void rcu_preempt_boost_start_gp(
   static bool rcu_is_callbacks_kthread(void);
   static void rcu_cpu_kthread_setup(unsigned int cpu);
   static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
- -static void __init rcu_spawn_boost_kthreads(void);
   static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
   static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
   static void rcu_preempt_deferred_qs(struct task_struct *t);
@@@ -440,6 -444,7 +445,6 @@@ static int rcu_nocb_need_deferred_wakeu
   static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
   static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
   static void rcu_spawn_cpu_nocb_kthread(int cpu);
- -static void __init rcu_spawn_nocb_kthreads(void);
   static void show_rcu_nocb_state(struct rcu_data *rdp);
   static void rcu_nocb_lock(struct rcu_data *rdp);
   static void rcu_nocb_unlock(struct rcu_data *rdp);
diff --combined kernel/rcu/tree_stall.h

index 268dd79c58e72acd3e954106e03762f54e7f9d6c,009d3f9305cf7b3bd4b8930647042271608363bc..a001e1e7a99269c9968059a00cff25ea496dbc99
--- 1/kernel/rcu/tree_stall.h
--- 2/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@@ -25,6 -25,34 +25,34 @@@ int sysctl_max_rcu_stall_to_panic __rea
   #define RCU_STALL_MIGHT_DIV           8
   #define RCU_STALL_MIGHT_MIN           (2 * HZ)
   
+ int rcu_exp_jiffies_till_stall_check(void)
+ {
+       int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+       int exp_stall_delay_delta = 0;
+       int till_stall_check;
+ 
+       // Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+       if (!cpu_stall_timeout)
+               cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+ 
+       // Limit check must be consistent with the Kconfig limits for
+       // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+       // The minimum clamped value is "2UL", because at least one full
+       // tick has to be guaranteed.
+       till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ);
+ 
+       if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+               WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+ 
+ #ifdef CONFIG_PROVE_RCU
+       /* Add extra ~25% out of till_stall_check. */
+       exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+ #endif
+ 
+       return till_stall_check + exp_stall_delay_delta;
+ }
+ EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+ 
   /* Limit-check stall timeouts specified at boottime and runtime. */
   int rcu_jiffies_till_stall_check(void)
   {
@@@ -565,9 -593,9 +593,9 @@@ static void print_other_cpu_stall(unsig
   
         for_each_possible_cpu(cpu)
                 totqlen += rcu_get_n_cbs_cpu(cpu);
- -      pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ +      pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
                smp_processor_id(), (long)(jiffies - gps),
- -             (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ +             (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
         if (ndetected) {
                 rcu_dump_cpu_stacks();
   
@@@ -626,9 -654,9 +654,9 @@@ static void print_cpu_stall(unsigned lo
         raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
         for_each_possible_cpu(cpu)
                 totqlen += rcu_get_n_cbs_cpu(cpu);
- -      pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ +      pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
                 jiffies - gps,
- -              (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ +              (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
   
         rcu_check_gp_kthread_expired_fqs_timer();
         rcu_check_gp_kthread_starvation();
author	Paul E. McKenney <[email protected]>
	Wed, 11 May 2022 18:49:35 +0000 (11:49 -0700)
committer	Paul E. McKenney <[email protected]>
	Wed, 11 May 2022 18:49:35 +0000 (11:49 -0700)
		1	2
Documentation/admin-guide/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/Kconfig.debug	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/rcu.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/rcu/tree_stall.h	patch \|	diff1 \|	diff2 \|	blob \| history