Git Repo - linux.git/commitdiff
Merge tag 'locking-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <[email protected]>
Mon, 30 Aug 2021 21:26:36 +0000 (14:26 -0700)
committer Linus Torvalds <[email protected]>
Mon, 30 Aug 2021 21:26:36 +0000 (14:26 -0700)
Pull locking and atomics updates from Thomas Gleixner:
 "The regular pile:

   - A few improvements to the mutex code

   - Documentation updates for atomics to clarify the difference between
     cmpxchg() and try_cmpxchg() and to explain the forward progress
     expectations.

   - Simplification of the atomics fallback generator

   - The addition of arch_atomic_long*() variants and generic arch_*()
     bitops based on them.

   - Add the missing might_sleep() invocations to the down*() operations
     of semaphores.

  The PREEMPT_RT locking core:

   - Scheduler updates to support the state-preserving mechanism for
     'sleeping' spin- and rwlocks on RT.

     This mechanism carefully preserves the state of a task that blocks
     on a 'sleeping' spin- or rwlock and takes regular wake-ups targeted
     at the same task into account. The preserved or updated (via a
     regular wakeup) state is restored once the lock has been acquired
     (see the first sketch below).

   - Restructuring of the rtmutex code so it can be utilized and
     extended for the RT specific lock variants.

   - Restructuring of the ww_mutex code to allow sharing of the ww_mutex
     specific functionality for rtmutex based ww_mutexes.

   - Header file disentangling to allow substitution of the regular lock
     implementations with the PREEMPT_RT variants without creating an
     unmaintainable #ifdef mess.

   - Shared base code for the PREEMPT_RT specific rw_semaphore and
     rwlock implementations.

     Contrary to the regular rw_semaphores and rwlocks, the PREEMPT_RT
     implementation is writer-unfair, because it is infeasible to do
     priority inheritance on multiple readers. Experience over the years
     has shown that real-time workloads are not the typical workloads
     which are sensitive to writer starvation.

     The alternative solution, allowing only a single reader, has been
     tried and discarded because it is a major bottleneck, especially
     for mmap_sem. Aside from that, many of the writer-starvation
     critical usage sites have been converted to a writer-side
     mutex/spinlock plus RCU read-side protection over the past decade,
     so the issue is less prominent than it used to be.

   - The actual rtmutex-based lock substitutions for PREEMPT_RT enabled
     kernels, which affect mutex, ww_mutex, rw_semaphore, spinlock_t and
     rwlock_t. The spin/rw_lock*() functions disable migration across
     the critical section to preserve the existing semantics vs. per-CPU
     variables (see the second sketch below).

   - Rework of the futex REQUEUE_PI mechanism to handle early wake-ups
     which interleave with a requeue operation, preventing the situation
     where a task would be blocked on both the rtmutex associated with
     the outer futex and the rtmutex-based hash bucket spinlock.

     While this situation cannot happen on !RT enabled kernels, the
     changes make the underlying concurrency problems easier to
     understand in general. As a result, the difference between !RT and
     RT kernels is reduced to the handling of waiting for the critical
     section: !RT kernels simply spin-wait as before and RT kernels
     utilize rcu_wait().

   - The substitution of local_lock for PREEMPT_RT with a spinlock which
     protects the critical section while staying preemptible. CPU
     locality is established by disabling migration (see the third
     sketch below).

  The underlying concepts of this code have been in use in PREEMPT_RT for
  well over a decade. The code has been refactored several times over the
  years, and this final incarnation has been optimized once again to be
  as non-intrusive as possible, i.e. the RT-specific parts are mostly
  isolated.

  It has been extensively tested in the 5.14-rt patch series and it has
  been verified that !RT kernels are not affected by these changes"
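
For illustration, the state-preservation pattern from the first PREEMPT_RT
bullet is documented in the include/linux/sched.h hunk further down. Below
is a condensed, hedged C sketch of how an RT "sleeping" lock slowpath uses
it; it is not compilable standalone, try_lock() is a placeholder for the
real trylock, and struct rt_mutex_base is assumed from the rtmutex
restructuring in this series:

	/*
	 * Sketch only: mirrors the usage comment added to <linux/sched.h>.
	 * current_save_and_set_rtlock_wait_state() stashes current->__state
	 * in current->saved_state and sets TASK_RTLOCK_WAIT; regular wakeups
	 * arriving while blocked are redirected to saved_state so they are
	 * not lost, and current_restore_rtlock_saved_state() puts the
	 * (possibly updated) state back.
	 */
	static void rtlock_slowpath_sketch(struct rt_mutex_base *lock)
	{
		/* Caller holds lock->wait_lock with interrupts disabled. */
		current_save_and_set_rtlock_wait_state();

		for (;;) {
			if (try_lock(lock))	/* placeholder trylock */
				break;
			raw_spin_unlock_irq(&lock->wait_lock);
			schedule_rtlock();	/* PREEMPT_RT-only, declared below */
			raw_spin_lock_irq(&lock->wait_lock);
			set_current_state(TASK_RTLOCK_WAIT);
		}

		current_restore_rtlock_saved_state();
	}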
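
The migration-disable behaviour of the substituted spin/rw_lock*()
functions can be pictured with the rough sketch below. It models only the
semantics described above (acquire the rtmutex-based sleeping lock, then
pin the task to its CPU for the duration of the critical section); the
*_sketch() helpers are hypothetical, and the exact ordering and RCU
interplay of the kernel implementation are simplified away:

	/* Semantic sketch of spin_lock()/spin_unlock() on PREEMPT_RT. */
	static inline void rt_spin_lock_sketch(spinlock_t *lock)
	{
		rt_lock_acquire_sketch(lock);	/* hypothetical: rtmutex based, may sleep */
		migrate_disable();		/* per-CPU assumptions keep holding */
	}

	static inline void rt_spin_unlock_sketch(spinlock_t *lock)
	{
		migrate_enable();		/* critical section is done */
		rt_lock_release_sketch(lock);	/* hypothetical release */
	}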
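
The local_lock substitution follows the same idea. The sketch below only
illustrates the described semantics (the per-CPU lock becomes a sleeping
spinlock_t, the critical section stays preemptible, and CPU locality comes
from disabling migration); it does not claim to be the exact PREEMPT_RT
macros:

	/* Semantic sketch of local_lock()/local_unlock() on PREEMPT_RT. */
	#define local_lock_rt_sketch(lock)				\
		do {							\
			migrate_disable();				\
			spin_lock(this_cpu_ptr(lock));			\
		} while (0)

	#define local_unlock_rt_sketch(lock)				\
		do {							\
			spin_unlock(this_cpu_ptr(lock));		\
			migrate_enable();				\
		} while (0)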

* tag 'locking-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (92 commits)
  locking/rtmutex: Return success on deadlock for ww_mutex waiters
  locking/rtmutex: Prevent spurious EDEADLK return caused by ww_mutexes
  locking/rtmutex: Dequeue waiter on ww_mutex deadlock
  locking/rtmutex: Dont dereference waiter lockless
  locking/semaphore: Add might_sleep() to down_*() family
  locking/ww_mutex: Initialize waiter.ww_ctx properly
  static_call: Update API documentation
  locking/local_lock: Add PREEMPT_RT support
  locking/spinlock/rt: Prepare for RT local_lock
  locking/rtmutex: Add adaptive spinwait mechanism
  locking/rtmutex: Implement equal priority lock stealing
  preempt: Adjust PREEMPT_LOCK_OFFSET for RT
  locking/rtmutex: Prevent lockdep false positive with PI futexes
  futex: Prevent requeue_pi() lock nesting issue on RT
  futex: Simplify handle_early_requeue_pi_wakeup()
  futex: Reorder sanity checks in futex_requeue()
  futex: Clarify comment in futex_requeue()
  futex: Restructure futex_requeue()
  futex: Correct the number of requeued waiters for PI
  futex: Remove bogus condition for requeue PI
  ...

include/linux/sched.h
kernel/rcu/tree_plugin.h
kernel/sched/core.c

diff --combined include/linux/sched.h
index 7c6a77da8b106ea08ac321f47d53e9709c8af4ac,746dfc06a35cb59cbaf93690500464f3f3becd5c..af7179f8572cdeb5961b3ff4cd61d67dd1e71a42
@@@ -95,7 -95,9 +95,9 @@@ struct task_group
  #define TASK_WAKING                   0x0200
  #define TASK_NOLOAD                   0x0400
  #define TASK_NEW                      0x0800
- #define TASK_STATE_MAX                        0x1000
+ /* RT specific auxilliary flag to mark RT lock waiters */
+ #define TASK_RTLOCK_WAIT              0x1000
+ #define TASK_STATE_MAX                        0x2000
  
  /* Convenience macros for the sake of set_current_state: */
  #define TASK_KILLABLE                 (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
  
  #define task_is_stopped_or_traced(task)       ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  
- #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  /*
   * Special states are those that do not use the normal wait-loop pattern. See
   * the comment with set_special_state().
   */
  #define is_special_task_state(state)                          \
        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
  
- #define __set_current_state(state_value)                      \
-       do {                                                    \
-               WARN_ON_ONCE(is_special_task_state(state_value));\
-               current->task_state_change = _THIS_IP_;         \
-               WRITE_ONCE(current->__state, (state_value));    \
-       } while (0)
- #define set_current_state(state_value)                                \
-       do {                                                    \
-               WARN_ON_ONCE(is_special_task_state(state_value));\
-               current->task_state_change = _THIS_IP_;         \
-               smp_store_mb(current->__state, (state_value));  \
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ # define debug_normal_state_change(state_value)                               \
+       do {                                                            \
+               WARN_ON_ONCE(is_special_task_state(state_value));       \
+               current->task_state_change = _THIS_IP_;                 \
        } while (0)
  
- #define set_special_state(state_value)                                        \
+ # define debug_special_state_change(state_value)                      \
        do {                                                            \
-               unsigned long flags; /* may shadow */                   \
                WARN_ON_ONCE(!is_special_task_state(state_value));      \
-               raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->task_state_change = _THIS_IP_;                 \
-               WRITE_ONCE(current->__state, (state_value));            \
-               raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
        } while (0)
+ # define debug_rtlock_wait_set_state()                                        \
+       do {                                                             \
+               current->saved_state_change = current->task_state_change;\
+               current->task_state_change = _THIS_IP_;                  \
+       } while (0)
+ # define debug_rtlock_wait_restore_state()                            \
+       do {                                                             \
+               current->task_state_change = current->saved_state_change;\
+       } while (0)
  #else
+ # define debug_normal_state_change(cond)      do { } while (0)
+ # define debug_special_state_change(cond)     do { } while (0)
+ # define debug_rtlock_wait_set_state()                do { } while (0)
+ # define debug_rtlock_wait_restore_state()    do { } while (0)
+ #endif
  /*
   * set_current_state() includes a barrier so that the write of current->state
   * is correctly serialised wrt the caller's subsequent test of whether to
   * Also see the comments of try_to_wake_up().
   */
  #define __set_current_state(state_value)                              \
-       WRITE_ONCE(current->__state, (state_value))
+       do {                                                            \
+               debug_normal_state_change((state_value));               \
+               WRITE_ONCE(current->__state, (state_value));            \
+       } while (0)
  
  #define set_current_state(state_value)                                        \
-       smp_store_mb(current->__state, (state_value))
+       do {                                                            \
+               debug_normal_state_change((state_value));               \
+               smp_store_mb(current->__state, (state_value));          \
+       } while (0)
  
  /*
   * set_special_state() should be used for those states when the blocking task
   * can not use the regular condition based wait-loop. In that case we must
-  * serialize against wakeups such that any possible in-flight TASK_RUNNING stores
-  * will not collide with our state change.
+  * serialize against wakeups such that any possible in-flight TASK_RUNNING
+  * stores will not collide with our state change.
   */
  #define set_special_state(state_value)                                        \
        do {                                                            \
                unsigned long flags; /* may shadow */                   \
+                                                                       \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
+               debug_special_state_change((state_value));              \
                WRITE_ONCE(current->__state, (state_value));            \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
        } while (0)
  
- #endif
+ /*
+  * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
+  *
+  * RT's spin/rwlock substitutions are state preserving. The state of the
+  * task when blocking on the lock is saved in task_struct::saved_state and
+  * restored after the lock has been acquired.  These operations are
+  * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
+  * lock related wakeups while the task is blocked on the lock are
+  * redirected to operate on task_struct::saved_state to ensure that these
+  * are not dropped. On restore task_struct::saved_state is set to
+  * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
+  *
+  * The lock operation looks like this:
+  *
+  *    current_save_and_set_rtlock_wait_state();
+  *    for (;;) {
+  *            if (try_lock())
+  *                    break;
+  *            raw_spin_unlock_irq(&lock->wait_lock);
+  *            schedule_rtlock();
+  *            raw_spin_lock_irq(&lock->wait_lock);
+  *            set_current_state(TASK_RTLOCK_WAIT);
+  *    }
+  *    current_restore_rtlock_saved_state();
+  */
+ #define current_save_and_set_rtlock_wait_state()                      \
+       do {                                                            \
+               lockdep_assert_irqs_disabled();                         \
+               raw_spin_lock(&current->pi_lock);                       \
+               current->saved_state = current->__state;                \
+               debug_rtlock_wait_set_state();                          \
+               WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);         \
+               raw_spin_unlock(&current->pi_lock);                     \
+       } while (0);
+ #define current_restore_rtlock_saved_state()                          \
+       do {                                                            \
+               lockdep_assert_irqs_disabled();                         \
+               raw_spin_lock(&current->pi_lock);                       \
+               debug_rtlock_wait_restore_state();                      \
+               WRITE_ONCE(current->__state, current->saved_state);     \
+               current->saved_state = TASK_RUNNING;                    \
+               raw_spin_unlock(&current->pi_lock);                     \
+       } while (0);
  
  #define get_current_state()   READ_ONCE(current->__state)
  
@@@ -230,6 -288,9 +288,9 @@@ extern long schedule_timeout_idle(long 
  asmlinkage void schedule(void);
  extern void schedule_preempt_disabled(void);
  asmlinkage void preempt_schedule_irq(void);
+ #ifdef CONFIG_PREEMPT_RT
+  extern void schedule_rtlock(void);
+ #endif
  
  extern int __must_check io_schedule_prepare(void);
  extern void io_schedule_finish(int token);
@@@ -668,6 -729,11 +729,11 @@@ struct task_struct 
  #endif
        unsigned int                    __state;
  
+ #ifdef CONFIG_PREEMPT_RT
+       /* saved state for "spinlock sleepers" */
+       unsigned int                    saved_state;
+ #endif
        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
        unsigned int                    policy;
        int                             nr_cpus_allowed;
        const cpumask_t                 *cpus_ptr;
 +      cpumask_t                       *user_cpus_ptr;
        cpumask_t                       cpus_mask;
        void                            *migration_pending;
  #ifdef CONFIG_SMP
        /* Used by page_owner=on to detect recursion in page tracking. */
        unsigned                        in_page_owner:1;
  #endif
 +#ifdef CONFIG_EVENTFD
 +      /* Recursion prevention for eventfd_signal() */
 +      unsigned                        in_eventfd_signal:1;
 +#endif
  
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
  
        struct kmap_ctrl                kmap_ctrl;
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                   task_state_change;
+ # ifdef CONFIG_PREEMPT_RT
+       unsigned long                   saved_state_change;
+ # endif
  #endif
        int                             pagefault_disabled;
  #ifdef CONFIG_MMU
@@@ -1710,11 -1774,6 +1779,11 @@@ extern int task_can_attach(struct task_
  #ifdef CONFIG_SMP
  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
 +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
 +extern void release_user_cpus_ptr(struct task_struct *p);
 +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
 +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
 +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
  #else
  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
@@@ -1725,21 -1784,6 +1794,21 @@@ static inline int set_cpus_allowed_ptr(
                return -EINVAL;
        return 0;
  }
 +static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
 +{
 +      if (src->user_cpus_ptr)
 +              return -EINVAL;
 +      return 0;
 +}
 +static inline void release_user_cpus_ptr(struct task_struct *p)
 +{
 +      WARN_ON(p->user_cpus_ptr);
 +}
 +
 +static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 +{
 +      return 0;
 +}
  #endif
  
  extern int yield_to(struct task_struct *p, bool preempt);
diff --combined kernel/rcu/tree_plugin.h
index 7a4876a3a882e2454d5a6084d8f439e33973ad60,0ff5e4fb933e7247d8343f87437cb9a4f9582879..d070059163d70ce9ec10a358c0e1103c710124d0
  
  #include "../locking/rtmutex_common.h"
  
 -#ifdef CONFIG_RCU_NOCB_CPU
 -static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
 -static bool __read_mostly rcu_nocb_poll;    /* Offload kthread are to poll. */
 -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
 -{
 -      return lockdep_is_held(&rdp->nocb_lock);
 -}
 -
 -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
 -{
 -      /* Race on early boot between thread creation and assignment */
 -      if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
 -              return true;
 -
 -      if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
 -              if (in_task())
 -                      return true;
 -      return false;
 -}
 -
 -#else
 -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
 -{
 -      return 0;
 -}
 -
 -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
 -{
 -      return false;
 -}
 -
 -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 -
  static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
  {
        /*
@@@ -313,7 -346,7 +313,7 @@@ void rcu_note_context_switch(bool preem
  
        trace_rcu_utilization(TPS("Start context switch"));
        lockdep_assert_irqs_disabled();
 -      WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
 +      WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
        if (rcu_preempt_depth() > 0 &&
            !t->rcu_read_unlock_special.b.blocked) {
  
@@@ -372,20 -405,17 +372,20 @@@ static int rcu_preempt_blocked_readers_
  
  static void rcu_preempt_read_enter(void)
  {
 -      current->rcu_read_lock_nesting++;
 +      WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
  }
  
  static int rcu_preempt_read_exit(void)
  {
 -      return --current->rcu_read_lock_nesting;
 +      int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
 +
 +      WRITE_ONCE(current->rcu_read_lock_nesting, ret);
 +      return ret;
  }
  
  static void rcu_preempt_depth_set(int val)
  {
 -      current->rcu_read_lock_nesting = val;
 +      WRITE_ONCE(current->rcu_read_lock_nesting, val);
  }
  
  /*
@@@ -529,7 -559,7 +529,7 @@@ rcu_preempt_deferred_qs_irqrestore(stru
                        WRITE_ONCE(rnp->exp_tasks, np);
                if (IS_ENABLED(CONFIG_RCU_BOOST)) {
                        /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
-                       drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+                       drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
                        if (&t->rcu_node_entry == rnp->boost_tasks)
                                WRITE_ONCE(rnp->boost_tasks, np);
                }
  
                /* Unboost if we were boosted. */
                if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
-                       rt_mutex_futex_unlock(&rnp->boost_mtx);
+                       rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
  
                /*
                 * If this was the last task on the expedited lists,
@@@ -1053,7 -1083,7 +1053,7 @@@ static int rcu_boost(struct rcu_node *r
         * section.
         */
        t = container_of(tb, struct task_struct, rcu_node_entry);
-       rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+       rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        /* Lock only for side effect: boosts task t's priority. */
        rt_mutex_lock(&rnp->boost_mtx);
@@@ -1449,6 -1479,1460 +1449,6 @@@ static void rcu_cleanup_after_idle(void
  
  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
  
 -#ifdef CONFIG_RCU_NOCB_CPU
 -
 -/*
 - * Offload callback processing from the boot-time-specified set of CPUs
 - * specified by rcu_nocb_mask.  For the CPUs in the set, there are kthreads
 - * created that pull the callbacks from the corresponding CPU, wait for
 - * a grace period to elapse, and invoke the callbacks.  These kthreads
 - * are organized into GP kthreads, which manage incoming callbacks, wait for
 - * grace periods, and awaken CB kthreads, and the CB kthreads, which only
 - * invoke callbacks.  Each GP kthread invokes its own CBs.  The no-CBs CPUs
 - * do a wake_up() on their GP kthread when they insert a callback into any
 - * empty list, unless the rcu_nocb_poll boot parameter has been specified,
 - * in which case each kthread actively polls its CPU.  (Which isn't so great
 - * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
 - *
 - * This is intended to be used in conjunction with Frederic Weisbecker's
 - * adaptive-idle work, which would seriously reduce OS jitter on CPUs
 - * running CPU-bound user-mode computations.
 - *
 - * Offloading of callbacks can also be used as an energy-efficiency
 - * measure because CPUs with no RCU callbacks queued are more aggressive
 - * about entering dyntick-idle mode.
 - */
 -
 -
 -/*
 - * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
 - * If the list is invalid, a warning is emitted and all CPUs are offloaded.
 - */
 -static int __init rcu_nocb_setup(char *str)
 -{
 -      alloc_bootmem_cpumask_var(&rcu_nocb_mask);
 -      if (cpulist_parse(str, rcu_nocb_mask)) {
 -              pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
 -              cpumask_setall(rcu_nocb_mask);
 -      }
 -      return 1;
 -}
 -__setup("rcu_nocbs=", rcu_nocb_setup);
 -
 -static int __init parse_rcu_nocb_poll(char *arg)
 -{
 -      rcu_nocb_poll = true;
 -      return 0;
 -}
 -early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
 -
 -/*
 - * Don't bother bypassing ->cblist if the call_rcu() rate is low.
 - * After all, the main point of bypassing is to avoid lock contention
 - * on ->nocb_lock, which only can happen at high call_rcu() rates.
 - */
 -static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
 -module_param(nocb_nobypass_lim_per_jiffy, int, 0);
 -
 -/*
 - * Acquire the specified rcu_data structure's ->nocb_bypass_lock.  If the
 - * lock isn't immediately available, increment ->nocb_lock_contended to
 - * flag the contention.
 - */
 -static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
 -      __acquires(&rdp->nocb_bypass_lock)
 -{
 -      lockdep_assert_irqs_disabled();
 -      if (raw_spin_trylock(&rdp->nocb_bypass_lock))
 -              return;
 -      atomic_inc(&rdp->nocb_lock_contended);
 -      WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
 -      smp_mb__after_atomic(); /* atomic_inc() before lock. */
 -      raw_spin_lock(&rdp->nocb_bypass_lock);
 -      smp_mb__before_atomic(); /* atomic_dec() after lock. */
 -      atomic_dec(&rdp->nocb_lock_contended);
 -}
 -
 -/*
 - * Spinwait until the specified rcu_data structure's ->nocb_lock is
 - * not contended.  Please note that this is extremely special-purpose,
 - * relying on the fact that at most two kthreads and one CPU contend for
 - * this lock, and also that the two kthreads are guaranteed to have frequent
 - * grace-period-duration time intervals between successive acquisitions
 - * of the lock.  This allows us to use an extremely simple throttling
 - * mechanism, and further to apply it only to the CPU doing floods of
 - * call_rcu() invocations.  Don't try this at home!
 - */
 -static void rcu_nocb_wait_contended(struct rcu_data *rdp)
 -{
 -      WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
 -      while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
 -              cpu_relax();
 -}
 -
 -/*
 - * Conditionally acquire the specified rcu_data structure's
 - * ->nocb_bypass_lock.
 - */
 -static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
 -{
 -      lockdep_assert_irqs_disabled();
 -      return raw_spin_trylock(&rdp->nocb_bypass_lock);
 -}
 -
 -/*
 - * Release the specified rcu_data structure's ->nocb_bypass_lock.
 - */
 -static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
 -      __releases(&rdp->nocb_bypass_lock)
 -{
 -      lockdep_assert_irqs_disabled();
 -      raw_spin_unlock(&rdp->nocb_bypass_lock);
 -}
 -
 -/*
 - * Acquire the specified rcu_data structure's ->nocb_lock, but only
 - * if it corresponds to a no-CBs CPU.
 - */
 -static void rcu_nocb_lock(struct rcu_data *rdp)
 -{
 -      lockdep_assert_irqs_disabled();
 -      if (!rcu_rdp_is_offloaded(rdp))
 -              return;
 -      raw_spin_lock(&rdp->nocb_lock);
 -}
 -
 -/*
 - * Release the specified rcu_data structure's ->nocb_lock, but only
 - * if it corresponds to a no-CBs CPU.
 - */
 -static void rcu_nocb_unlock(struct rcu_data *rdp)
 -{
 -      if (rcu_rdp_is_offloaded(rdp)) {
 -              lockdep_assert_irqs_disabled();
 -              raw_spin_unlock(&rdp->nocb_lock);
 -      }
 -}
 -
 -/*
 - * Release the specified rcu_data structure's ->nocb_lock and restore
 - * interrupts, but only if it corresponds to a no-CBs CPU.
 - */
 -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
 -                                     unsigned long flags)
 -{
 -      if (rcu_rdp_is_offloaded(rdp)) {
 -              lockdep_assert_irqs_disabled();
 -              raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 -      } else {
 -              local_irq_restore(flags);
 -      }
 -}
 -
 -/* Lockdep check that ->cblist may be safely accessed. */
 -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
 -{
 -      lockdep_assert_irqs_disabled();
 -      if (rcu_rdp_is_offloaded(rdp))
 -              lockdep_assert_held(&rdp->nocb_lock);
 -}
 -
 -/*
 - * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
 - * grace period.
 - */
 -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 -{
 -      swake_up_all(sq);
 -}
 -
 -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
 -{
 -      return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
 -}
 -
 -static void rcu_init_one_nocb(struct rcu_node *rnp)
 -{
 -      init_swait_queue_head(&rnp->nocb_gp_wq[0]);
 -      init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 -}
 -
 -/* Is the specified CPU a no-CBs CPU? */
 -bool rcu_is_nocb_cpu(int cpu)
 -{
 -      if (cpumask_available(rcu_nocb_mask))
 -              return cpumask_test_cpu(cpu, rcu_nocb_mask);
 -      return false;
 -}
 -
 -static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
 -                         struct rcu_data *rdp,
 -                         bool force, unsigned long flags)
 -      __releases(rdp_gp->nocb_gp_lock)
 -{
 -      bool needwake = false;
 -
 -      if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
 -              raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                  TPS("AlreadyAwake"));
 -              return false;
 -      }
 -
 -      if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
 -              WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
 -              del_timer(&rdp_gp->nocb_timer);
 -      }
 -
 -      if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
 -              WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
 -              needwake = true;
 -      }
 -      raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 -      if (needwake) {
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
 -              wake_up_process(rdp_gp->nocb_gp_kthread);
 -      }
 -
 -      return needwake;
 -}
 -
 -/*
 - * Kick the GP kthread for this NOCB group.
 - */
 -static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
 -{
 -      unsigned long flags;
 -      struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 -
 -      raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 -      return __wake_nocb_gp(rdp_gp, rdp, force, flags);
 -}
 -
 -/*
 - * Arrange to wake the GP kthread for this NOCB group at some future
 - * time when it is safe to do so.
 - */
 -static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
 -                             const char *reason)
 -{
 -      unsigned long flags;
 -      struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 -
 -      raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 -
 -      /*
 -       * Bypass wakeup overrides previous deferments. In case
 -       * of callback storm, no need to wake up too early.
 -       */
 -      if (waketype == RCU_NOCB_WAKE_BYPASS) {
 -              mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
 -              WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 -      } else {
 -              if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
 -                      mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
 -              if (rdp_gp->nocb_defer_wakeup < waketype)
 -                      WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
 -      }
 -
 -      raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 -
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
 -}
 -
 -/*
 - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
 - * However, if there is a callback to be enqueued and if ->nocb_bypass
 - * proves to be initially empty, just return false because the no-CB GP
 - * kthread may need to be awakened in this case.
 - *
 - * Note that this function always returns true if rhp is NULL.
 - */
 -static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 -                                   unsigned long j)
 -{
 -      struct rcu_cblist rcl;
 -
 -      WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
 -      rcu_lockdep_assert_cblist_protected(rdp);
 -      lockdep_assert_held(&rdp->nocb_bypass_lock);
 -      if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
 -              raw_spin_unlock(&rdp->nocb_bypass_lock);
 -              return false;
 -      }
 -      /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
 -      if (rhp)
 -              rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
 -      rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
 -      rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
 -      WRITE_ONCE(rdp->nocb_bypass_first, j);
 -      rcu_nocb_bypass_unlock(rdp);
 -      return true;
 -}
 -
 -/*
 - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
 - * However, if there is a callback to be enqueued and if ->nocb_bypass
 - * proves to be initially empty, just return false because the no-CB GP
 - * kthread may need to be awakened in this case.
 - *
 - * Note that this function always returns true if rhp is NULL.
 - */
 -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 -                                unsigned long j)
 -{
 -      if (!rcu_rdp_is_offloaded(rdp))
 -              return true;
 -      rcu_lockdep_assert_cblist_protected(rdp);
 -      rcu_nocb_bypass_lock(rdp);
 -      return rcu_nocb_do_flush_bypass(rdp, rhp, j);
 -}
 -
 -/*
 - * If the ->nocb_bypass_lock is immediately available, flush the
 - * ->nocb_bypass queue into ->cblist.
 - */
 -static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
 -{
 -      rcu_lockdep_assert_cblist_protected(rdp);
 -      if (!rcu_rdp_is_offloaded(rdp) ||
 -          !rcu_nocb_bypass_trylock(rdp))
 -              return;
 -      WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
 -}
 -
 -/*
 - * See whether it is appropriate to use the ->nocb_bypass list in order
 - * to control contention on ->nocb_lock.  A limited number of direct
 - * enqueues are permitted into ->cblist per jiffy.  If ->nocb_bypass
 - * is non-empty, further callbacks must be placed into ->nocb_bypass,
 - * otherwise rcu_barrier() breaks.  Use rcu_nocb_flush_bypass() to switch
 - * back to direct use of ->cblist.  However, ->nocb_bypass should not be
 - * used if ->cblist is empty, because otherwise callbacks can be stranded
 - * on ->nocb_bypass because we cannot count on the current CPU ever again
 - * invoking call_rcu().  The general rule is that if ->nocb_bypass is
 - * non-empty, the corresponding no-CBs grace-period kthread must not be
 - * in an indefinite sleep state.
 - *
 - * Finally, it is not permitted to use the bypass during early boot,
 - * as doing so would confuse the auto-initialization code.  Besides
 - * which, there is no point in worrying about lock contention while
 - * there is only one CPU in operation.
 - */
 -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 -                              bool *was_alldone, unsigned long flags)
 -{
 -      unsigned long c;
 -      unsigned long cur_gp_seq;
 -      unsigned long j = jiffies;
 -      long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 -
 -      lockdep_assert_irqs_disabled();
 -
 -      // Pure softirq/rcuc based processing: no bypassing, no
 -      // locking.
 -      if (!rcu_rdp_is_offloaded(rdp)) {
 -              *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 -              return false;
 -      }
 -
 -      // In the process of (de-)offloading: no bypassing, but
 -      // locking.
 -      if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
 -              rcu_nocb_lock(rdp);
 -              *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 -              return false; /* Not offloaded, no bypassing. */
 -      }
 -
 -      // Don't use ->nocb_bypass during early boot.
 -      if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
 -              rcu_nocb_lock(rdp);
 -              WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 -              *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 -              return false;
 -      }
 -
 -      // If we have advanced to a new jiffy, reset counts to allow
 -      // moving back from ->nocb_bypass to ->cblist.
 -      if (j == rdp->nocb_nobypass_last) {
 -              c = rdp->nocb_nobypass_count + 1;
 -      } else {
 -              WRITE_ONCE(rdp->nocb_nobypass_last, j);
 -              c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
 -              if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
 -                               nocb_nobypass_lim_per_jiffy))
 -                      c = 0;
 -              else if (c > nocb_nobypass_lim_per_jiffy)
 -                      c = nocb_nobypass_lim_per_jiffy;
 -      }
 -      WRITE_ONCE(rdp->nocb_nobypass_count, c);
 -
 -      // If there hasn't yet been all that many ->cblist enqueues
 -      // this jiffy, tell the caller to enqueue onto ->cblist.  But flush
 -      // ->nocb_bypass first.
 -      if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
 -              rcu_nocb_lock(rdp);
 -              *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 -              if (*was_alldone)
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                          TPS("FirstQ"));
 -              WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
 -              WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 -              return false; // Caller must enqueue the callback.
 -      }
 -
 -      // If ->nocb_bypass has been used too long or is too full,
 -      // flush ->nocb_bypass to ->cblist.
 -      if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
 -          ncbs >= qhimark) {
 -              rcu_nocb_lock(rdp);
 -              if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
 -                      *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 -                      if (*was_alldone)
 -                              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                                  TPS("FirstQ"));
 -                      WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 -                      return false; // Caller must enqueue the callback.
 -              }
 -              if (j != rdp->nocb_gp_adv_time &&
 -                  rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 -                  rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
 -                      rcu_advance_cbs_nowake(rdp->mynode, rdp);
 -                      rdp->nocb_gp_adv_time = j;
 -              }
 -              rcu_nocb_unlock_irqrestore(rdp, flags);
 -              return true; // Callback already enqueued.
 -      }
 -
 -      // We need to use the bypass.
 -      rcu_nocb_wait_contended(rdp);
 -      rcu_nocb_bypass_lock(rdp);
 -      ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 -      rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
 -      rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
 -      if (!ncbs) {
 -              WRITE_ONCE(rdp->nocb_bypass_first, j);
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
 -      }
 -      rcu_nocb_bypass_unlock(rdp);
 -      smp_mb(); /* Order enqueue before wake. */
 -      if (ncbs) {
 -              local_irq_restore(flags);
 -      } else {
 -              // No-CBs GP kthread might be indefinitely asleep, if so, wake.
 -              rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
 -              if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                          TPS("FirstBQwake"));
 -                      __call_rcu_nocb_wake(rdp, true, flags);
 -              } else {
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                          TPS("FirstBQnoWake"));
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -              }
 -      }
 -      return true; // Callback already enqueued.
 -}
 -
 -/*
 - * Awaken the no-CBs grace-period kthread if needed, either due to it
 - * legitimately being asleep or due to overload conditions.
 - *
 - * If warranted, also wake up the kthread servicing this CPUs queues.
 - */
 -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
 -                               unsigned long flags)
 -                               __releases(rdp->nocb_lock)
 -{
 -      unsigned long cur_gp_seq;
 -      unsigned long j;
 -      long len;
 -      struct task_struct *t;
 -
 -      // If we are being polled or there is no kthread, just leave.
 -      t = READ_ONCE(rdp->nocb_gp_kthread);
 -      if (rcu_nocb_poll || !t) {
 -              rcu_nocb_unlock_irqrestore(rdp, flags);
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                  TPS("WakeNotPoll"));
 -              return;
 -      }
 -      // Need to actually to a wakeup.
 -      len = rcu_segcblist_n_cbs(&rdp->cblist);
 -      if (was_alldone) {
 -              rdp->qlen_last_fqs_check = len;
 -              if (!irqs_disabled_flags(flags)) {
 -                      /* ... if queue was empty ... */
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -                      wake_nocb_gp(rdp, false);
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                          TPS("WakeEmpty"));
 -              } else {
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -                      wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
 -                                         TPS("WakeEmptyIsDeferred"));
 -              }
 -      } else if (len > rdp->qlen_last_fqs_check + qhimark) {
 -              /* ... or if many callbacks queued. */
 -              rdp->qlen_last_fqs_check = len;
 -              j = jiffies;
 -              if (j != rdp->nocb_gp_adv_time &&
 -                  rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 -                  rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
 -                      rcu_advance_cbs_nowake(rdp->mynode, rdp);
 -                      rdp->nocb_gp_adv_time = j;
 -              }
 -              smp_mb(); /* Enqueue before timer_pending(). */
 -              if ((rdp->nocb_cb_sleep ||
 -                   !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
 -                  !timer_pending(&rdp->nocb_timer)) {
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -                      wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
 -                                         TPS("WakeOvfIsDeferred"));
 -              } else {
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 -              }
 -      } else {
 -              rcu_nocb_unlock_irqrestore(rdp, flags);
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
 -      }
 -      return;
 -}
 -
 -/*
 - * Check if we ignore this rdp.
 - *
 - * We check that without holding the nocb lock but
 - * we make sure not to miss a freshly offloaded rdp
 - * with the current ordering:
 - *
 - *  rdp_offload_toggle()        nocb_gp_enabled_cb()
 - * -------------------------   ----------------------------
 - *    WRITE flags                 LOCK nocb_gp_lock
 - *    LOCK nocb_gp_lock           READ/WRITE nocb_gp_sleep
 - *    READ/WRITE nocb_gp_sleep    UNLOCK nocb_gp_lock
 - *    UNLOCK nocb_gp_lock         READ flags
 - */
 -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
 -{
 -      u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
 -
 -      return rcu_segcblist_test_flags(&rdp->cblist, flags);
 -}
 -
 -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
 -                                                   bool *needwake_state)
 -{
 -      struct rcu_segcblist *cblist = &rdp->cblist;
 -
 -      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 -              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
 -                      rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
 -                      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
 -                              *needwake_state = true;
 -              }
 -              return false;
 -      }
 -
 -      /*
 -       * De-offloading. Clear our flag and notify the de-offload worker.
 -       * We will ignore this rdp until it ever gets re-offloaded.
 -       */
 -      WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 -      rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
 -      if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
 -              *needwake_state = true;
 -      return true;
 -}
 -
 -
 -/*
 - * No-CBs GP kthreads come here to wait for additional callbacks to show up
 - * or for grace periods to end.
 - */
 -static void nocb_gp_wait(struct rcu_data *my_rdp)
 -{
 -      bool bypass = false;
 -      long bypass_ncbs;
 -      int __maybe_unused cpu = my_rdp->cpu;
 -      unsigned long cur_gp_seq;
 -      unsigned long flags;
 -      bool gotcbs = false;
 -      unsigned long j = jiffies;
 -      bool needwait_gp = false; // This prevents actual uninitialized use.
 -      bool needwake;
 -      bool needwake_gp;
 -      struct rcu_data *rdp;
 -      struct rcu_node *rnp;
 -      unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
 -      bool wasempty = false;
 -
 -      /*
 -       * Each pass through the following loop checks for CBs and for the
 -       * nearest grace period (if any) to wait for next.  The CB kthreads
 -       * and the global grace-period kthread are awakened if needed.
 -       */
 -      WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
 -      for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
 -              bool needwake_state = false;
 -
 -              if (!nocb_gp_enabled_cb(rdp))
 -                      continue;
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
 -              rcu_nocb_lock_irqsave(rdp, flags);
 -              if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -                      if (needwake_state)
 -                              swake_up_one(&rdp->nocb_state_wq);
 -                      continue;
 -              }
 -              bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 -              if (bypass_ncbs &&
 -                  (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
 -                   bypass_ncbs > 2 * qhimark)) {
 -                      // Bypass full or old, so flush it.
 -                      (void)rcu_nocb_try_flush_bypass(rdp, j);
 -                      bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 -              } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
 -                      rcu_nocb_unlock_irqrestore(rdp, flags);
 -                      if (needwake_state)
 -                              swake_up_one(&rdp->nocb_state_wq);
 -                      continue; /* No callbacks here, try next. */
 -              }
 -              if (bypass_ncbs) {
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                          TPS("Bypass"));
 -                      bypass = true;
 -              }
 -              rnp = rdp->mynode;
 -
 -              // Advance callbacks if helpful and low contention.
 -              needwake_gp = false;
 -              if (!rcu_segcblist_restempty(&rdp->cblist,
 -                                           RCU_NEXT_READY_TAIL) ||
 -                  (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 -                   rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
 -                      raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
 -                      needwake_gp = rcu_advance_cbs(rnp, rdp);
 -                      wasempty = rcu_segcblist_restempty(&rdp->cblist,
 -                                                         RCU_NEXT_READY_TAIL);
 -                      raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
 -              }
 -              // Need to wait on some grace period?
 -              WARN_ON_ONCE(wasempty &&
 -                           !rcu_segcblist_restempty(&rdp->cblist,
 -                                                    RCU_NEXT_READY_TAIL));
 -              if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
 -                      if (!needwait_gp ||
 -                          ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
 -                              wait_gp_seq = cur_gp_seq;
 -                      needwait_gp = true;
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 -                                          TPS("NeedWaitGP"));
 -              }
 -              if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
 -                      needwake = rdp->nocb_cb_sleep;
 -                      WRITE_ONCE(rdp->nocb_cb_sleep, false);
 -                      smp_mb(); /* CB invocation -after- GP end. */
 -              } else {
 -                      needwake = false;
 -              }
 -              rcu_nocb_unlock_irqrestore(rdp, flags);
 -              if (needwake) {
 -                      swake_up_one(&rdp->nocb_cb_wq);
 -                      gotcbs = true;
 -              }
 -              if (needwake_gp)
 -                      rcu_gp_kthread_wake();
 -              if (needwake_state)
 -                      swake_up_one(&rdp->nocb_state_wq);
 -      }
 -
 -      my_rdp->nocb_gp_bypass = bypass;
 -      my_rdp->nocb_gp_gp = needwait_gp;
 -      my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
 -
 -      if (bypass && !rcu_nocb_poll) {
 -              // At least one child with non-empty ->nocb_bypass, so set
 -              // timer in order to avoid stranding its callbacks.
 -              wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
 -                                 TPS("WakeBypassIsDeferred"));
 -      }
 -      if (rcu_nocb_poll) {
 -              /* Polling, so trace if first poll in the series. */
 -              if (gotcbs)
 -                      trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
 -              schedule_timeout_idle(1);
 -      } else if (!needwait_gp) {
 -              /* Wait for callbacks to appear. */
 -              trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
 -              swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
 -                              !READ_ONCE(my_rdp->nocb_gp_sleep));
 -              trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
 -      } else {
 -              rnp = my_rdp->mynode;
 -              trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
 -              swait_event_interruptible_exclusive(
 -                      rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
 -                      rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
 -                      !READ_ONCE(my_rdp->nocb_gp_sleep));
 -              trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
 -      }
 -      if (!rcu_nocb_poll) {
 -              raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
 -              if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
 -                      WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
 -                      del_timer(&my_rdp->nocb_timer);
 -              }
 -              WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
 -              raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
 -      }
 -      my_rdp->nocb_gp_seq = -1;
 -      WARN_ON(signal_pending(current));
 -}
 -
 -/*
 - * No-CBs grace-period-wait kthread.  There is one of these per group
 - * of CPUs, but only once at least one CPU in that group has come online
 - * at least once since boot.  This kthread checks for newly posted
 - * callbacks from any of the CPUs it is responsible for, waits for a
 - * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
 - * that then have callback-invocation work to do.
 - */
 -static int rcu_nocb_gp_kthread(void *arg)
 -{
 -      struct rcu_data *rdp = arg;
 -
 -      for (;;) {
 -              WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
 -              nocb_gp_wait(rdp);
 -              cond_resched_tasks_rcu_qs();
 -      }
 -      return 0;
 -}
 -
 -static inline bool nocb_cb_can_run(struct rcu_data *rdp)
 -{
 -      u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
 -      return rcu_segcblist_test_flags(&rdp->cblist, flags);
 -}
 -
 -static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
 -{
 -      return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
 -}
 -
 -/*
 - * Invoke any ready callbacks from the corresponding no-CBs CPU,
 - * then, if there are no more, wait for more to appear.
 - */
 -static void nocb_cb_wait(struct rcu_data *rdp)
 -{
 -      struct rcu_segcblist *cblist = &rdp->cblist;
 -      unsigned long cur_gp_seq;
 -      unsigned long flags;
 -      bool needwake_state = false;
 -      bool needwake_gp = false;
 -      bool can_sleep = true;
 -      struct rcu_node *rnp = rdp->mynode;
 -
 -      local_irq_save(flags);
 -      rcu_momentary_dyntick_idle();
 -      local_irq_restore(flags);
 -      /*
 -       * Disable BH to provide the expected environment.  Also, when
 -       * transitioning to/from NOCB mode, a self-requeuing callback might
 -       * be invoked from softirq.  A short grace period could cause both
 -       * instances of this callback would execute concurrently.
 -       */
 -      local_bh_disable();
 -      rcu_do_batch(rdp);
 -      local_bh_enable();
 -      lockdep_assert_irqs_enabled();
 -      rcu_nocb_lock_irqsave(rdp, flags);
 -      if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
 -          rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
 -          raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
 -              needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
 -              raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
 -      }
 -
 -      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
 -              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
 -                      rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
 -                      if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
 -                              needwake_state = true;
 -              }
 -              if (rcu_segcblist_ready_cbs(cblist))
 -                      can_sleep = false;
 -      } else {
 -              /*
 -               * De-offloading. Clear our flag and notify the de-offload worker.
 -               * We won't touch the callbacks and keep sleeping until we ever
 -               * get re-offloaded.
 -               */
 -              WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
 -              rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
 -              if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
 -                      needwake_state = true;
 -      }
 -
 -      WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
 -
 -      if (rdp->nocb_cb_sleep)
 -              trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
 -
 -      rcu_nocb_unlock_irqrestore(rdp, flags);
 -      if (needwake_gp)
 -              rcu_gp_kthread_wake();
 -
 -      if (needwake_state)
 -              swake_up_one(&rdp->nocb_state_wq);
 -
 -      do {
 -              swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 -                                                  nocb_cb_wait_cond(rdp));
 -
 -              // VVV Ensure CB invocation follows _sleep test.
 -              if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
 -                      WARN_ON(signal_pending(current));
 -                      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 -              }
 -      } while (!nocb_cb_can_run(rdp));
 -}
 -
 -/*
 - * Per-rcu_data kthread, but only for no-CBs CPUs.  Repeatedly invoke
 - * nocb_cb_wait() to do the dirty work.
 - */
 -static int rcu_nocb_cb_kthread(void *arg)
 -{
 -      struct rcu_data *rdp = arg;
 -
 -      // Each pass through this loop does one callback batch, and,
 -      // if there are no more ready callbacks, waits for them.
 -      for (;;) {
 -              nocb_cb_wait(rdp);
 -              cond_resched_tasks_rcu_qs();
 -      }
 -      return 0;
 -}
 -
 -/* Is a deferred wakeup of rcu_nocb_kthread() required? */
 -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
 -{
 -      return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
 -}
 -
 -/* Do a deferred wakeup of rcu_nocb_kthread(). */
 -static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
 -                                         struct rcu_data *rdp, int level,
 -                                         unsigned long flags)
 -      __releases(rdp_gp->nocb_gp_lock)
 -{
 -      int ndw;
 -      int ret;
 -
 -      if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
 -              raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 -              return false;
 -      }
 -
 -      ndw = rdp_gp->nocb_defer_wakeup;
 -      ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
 -
 -      return ret;
 -}
 -
 -/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
 -static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
 -{
 -      unsigned long flags;
 -      struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
 -
 -      WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
 -      trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
 -
 -      raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
 -      smp_mb__after_spinlock(); /* Timer expire before wakeup. */
 -      do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
 -}
 -
 -/*
 - * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
 - * This means we do an inexact common-case check.  Note that if
 - * we miss, ->nocb_timer will eventually clean things up.
 - */
 -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 -{
 -      unsigned long flags;
 -      struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 -
 -      if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
 -              return false;
 -
 -      raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 -      return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
 -}
 -
 -void rcu_nocb_flush_deferred_wakeup(void)
 -{
 -      do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
 -}
 -EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
 -
 -static int rdp_offload_toggle(struct rcu_data *rdp,
 -                             bool offload, unsigned long flags)
 -      __releases(rdp->nocb_lock)
 -{
 -      struct rcu_segcblist *cblist = &rdp->cblist;
 -      struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 -      bool wake_gp = false;
 -
 -      rcu_segcblist_offload(cblist, offload);
 -
 -      if (rdp->nocb_cb_sleep)
 -              rdp->nocb_cb_sleep = false;
 -      rcu_nocb_unlock_irqrestore(rdp, flags);
 -
 -      /*
 -       * Ignore former value of nocb_cb_sleep and force wake up as it could
 -       * have been spuriously set to false already.
 -       */
 -      swake_up_one(&rdp->nocb_cb_wq);
 -
 -      raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
 -      if (rdp_gp->nocb_gp_sleep) {
 -              rdp_gp->nocb_gp_sleep = false;
 -              wake_gp = true;
 -      }
 -      raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 -
 -      if (wake_gp)
 -              wake_up_process(rdp_gp->nocb_gp_kthread);
 -
 -      return 0;
 -}
 -
 -static long rcu_nocb_rdp_deoffload(void *arg)
 -{
 -      struct rcu_data *rdp = arg;
 -      struct rcu_segcblist *cblist = &rdp->cblist;
 -      unsigned long flags;
 -      int ret;
 -
 -      WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
 -
 -      pr_info("De-offloading %d\n", rdp->cpu);
 -
 -      rcu_nocb_lock_irqsave(rdp, flags);
 -      /*
 -       * Flush once and for all now. This suffices because we are
 -       * running on the target CPU holding ->nocb_lock (thus having
 -       * interrupts disabled), and because rdp_offload_toggle()
 -       * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
 -       * Thus future calls to rcu_segcblist_completely_offloaded() will
 -       * return false, which means that future calls to rcu_nocb_try_bypass()
 -       * will refuse to put anything into the bypass.
 -       */
 -      WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
 -      ret = rdp_offload_toggle(rdp, false, flags);
 -      swait_event_exclusive(rdp->nocb_state_wq,
 -                            !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
 -                                                      SEGCBLIST_KTHREAD_GP));
 -      /*
 -       * Lock one last time to acquire latest callback updates from kthreads
 -       * so we can later handle callbacks locally without locking.
 -       */
 -      rcu_nocb_lock_irqsave(rdp, flags);
 -      /*
 -       * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
 -       * lock is released but how about being paranoid for once?
 -       */
 -      rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
 -      /*
 -       * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
 -       * rcu_nocb_unlock_irqrestore() anymore.
 -       */
 -      raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
 -
 -      /* Sanity check */
 -      WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 -
 -
 -      return ret;
 -}
 -
 -int rcu_nocb_cpu_deoffload(int cpu)
 -{
 -      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 -      int ret = 0;
 -
 -      mutex_lock(&rcu_state.barrier_mutex);
 -      cpus_read_lock();
 -      if (rcu_rdp_is_offloaded(rdp)) {
 -              if (cpu_online(cpu)) {
 -                      ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
 -                      if (!ret)
 -                              cpumask_clear_cpu(cpu, rcu_nocb_mask);
 -              } else {
 -                      pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
 -                      ret = -EINVAL;
 -              }
 -      }
 -      cpus_read_unlock();
 -      mutex_unlock(&rcu_state.barrier_mutex);
 -
 -      return ret;
 -}
 -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
 -
 -static long rcu_nocb_rdp_offload(void *arg)
 -{
 -      struct rcu_data *rdp = arg;
 -      struct rcu_segcblist *cblist = &rdp->cblist;
 -      unsigned long flags;
 -      int ret;
 -
 -      WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
 -      /*
 -       * For now we only support re-offload, ie: the rdp must have been
 -       * offloaded on boot first.
 -       */
 -      if (!rdp->nocb_gp_rdp)
 -              return -EINVAL;
 -
 -      pr_info("Offloading %d\n", rdp->cpu);
 -      /*
 -       * Can't use rcu_nocb_lock_irqsave() while we are in
 -       * SEGCBLIST_SOFTIRQ_ONLY mode.
 -       */
 -      raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
 -
 -      /*
 -       * We didn't take the nocb lock while working on the
 -       * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
 -       * Every modification that has been done previously on
 -       * rdp->cblist must be remotely visible to the nocb kthreads
 -       * upon wake up after reading the cblist flags.
 -       *
 -       * The layout against nocb_lock enforces that ordering:
 -       *
 -       *  __rcu_nocb_rdp_offload()   nocb_cb_wait()/nocb_gp_wait()
 -       * -------------------------   ----------------------------
 -       *      WRITE callbacks           rcu_nocb_lock()
 -       *      rcu_nocb_lock()           READ flags
 -       *      WRITE flags               READ callbacks
 -       *      rcu_nocb_unlock()         rcu_nocb_unlock()
 -       */
 -      ret = rdp_offload_toggle(rdp, true, flags);
 -      swait_event_exclusive(rdp->nocb_state_wq,
 -                            rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
 -                            rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
 -
 -      return ret;
 -}
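
A minimal user-space sketch of the ordering diagram in the comment above, with a pthread mutex standing in for ->nocb_lock and plain ints standing in for the callback list and its flags (all names here are demo assumptions, not kernel code): if the reader observes the flag write under the lock, it is guaranteed to also observe the callback writes that preceded it.

/* Hedged illustrative sketch only; mutex acquire/release provides the
 * same publish/consume ordering the kernel gets from ->nocb_lock.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t nocb_lock = PTHREAD_MUTEX_INITIALIZER;
static int callbacks;   /* stands in for rdp->cblist contents */
static int offloaded;   /* stands in for the SEGCBLIST_* flags */

static void *offload_side(void *arg)
{
	(void)arg;
	callbacks = 42;                 /* WRITE callbacks (before taking the lock) */
	pthread_mutex_lock(&nocb_lock);
	offloaded = 1;                  /* WRITE flags under the lock */
	pthread_mutex_unlock(&nocb_lock);
	return NULL;
}

static void *kthread_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&nocb_lock);
	if (offloaded)                  /* READ flags under the lock ... */
		printf("callbacks=%d\n", callbacks);  /* ... then READ callbacks */
	pthread_mutex_unlock(&nocb_lock);       /* may print nothing if it runs first */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, offload_side, NULL);
	pthread_create(&b, NULL, kthread_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
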
 -
 -int rcu_nocb_cpu_offload(int cpu)
 -{
 -      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 -      int ret = 0;
 -
 -      mutex_lock(&rcu_state.barrier_mutex);
 -      cpus_read_lock();
 -      if (!rcu_rdp_is_offloaded(rdp)) {
 -              if (cpu_online(cpu)) {
 -                      ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
 -                      if (!ret)
 -                              cpumask_set_cpu(cpu, rcu_nocb_mask);
 -              } else {
 -                      pr_info("NOCB: Can't CB-offload an offline CPU\n");
 -                      ret = -EINVAL;
 -              }
 -      }
 -      cpus_read_unlock();
 -      mutex_unlock(&rcu_state.barrier_mutex);
 -
 -      return ret;
 -}
 -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
 -
 -void __init rcu_init_nohz(void)
 -{
 -      int cpu;
 -      bool need_rcu_nocb_mask = false;
 -      struct rcu_data *rdp;
 -
 -#if defined(CONFIG_NO_HZ_FULL)
 -      if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
 -              need_rcu_nocb_mask = true;
 -#endif /* #if defined(CONFIG_NO_HZ_FULL) */
 -
 -      if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
 -              if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
 -                      pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
 -                      return;
 -              }
 -      }
 -      if (!cpumask_available(rcu_nocb_mask))
 -              return;
 -
 -#if defined(CONFIG_NO_HZ_FULL)
 -      if (tick_nohz_full_running)
 -              cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
 -#endif /* #if defined(CONFIG_NO_HZ_FULL) */
 -
 -      if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
 -              pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
 -              cpumask_and(rcu_nocb_mask, cpu_possible_mask,
 -                          rcu_nocb_mask);
 -      }
 -      if (cpumask_empty(rcu_nocb_mask))
 -              pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
 -      else
 -              pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
 -                      cpumask_pr_args(rcu_nocb_mask));
 -      if (rcu_nocb_poll)
 -              pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
 -
 -      for_each_cpu(cpu, rcu_nocb_mask) {
 -              rdp = per_cpu_ptr(&rcu_data, cpu);
 -              if (rcu_segcblist_empty(&rdp->cblist))
 -                      rcu_segcblist_init(&rdp->cblist);
 -              rcu_segcblist_offload(&rdp->cblist, true);
 -              rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
 -              rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
 -      }
 -      rcu_organize_nocb_kthreads();
 -}
 -
 -/* Initialize per-rcu_data variables for no-CBs CPUs. */
 -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 -{
 -      init_swait_queue_head(&rdp->nocb_cb_wq);
 -      init_swait_queue_head(&rdp->nocb_gp_wq);
 -      init_swait_queue_head(&rdp->nocb_state_wq);
 -      raw_spin_lock_init(&rdp->nocb_lock);
 -      raw_spin_lock_init(&rdp->nocb_bypass_lock);
 -      raw_spin_lock_init(&rdp->nocb_gp_lock);
 -      timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
 -      rcu_cblist_init(&rdp->nocb_bypass);
 -}
 -
 -/*
 - * If the specified CPU is a no-CBs CPU that does not already have its
 - * rcuo CB kthread, spawn it.  Additionally, if the rcuo GP kthread
 - * for this CPU's group has not yet been created, spawn it as well.
 - */
 -static void rcu_spawn_one_nocb_kthread(int cpu)
 -{
 -      struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
 -      struct rcu_data *rdp_gp;
 -      struct task_struct *t;
 -
 -      /*
 -       * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
 -       * then nothing to do.
 -       */
 -      if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
 -              return;
 -
 -      /* If we didn't spawn the GP kthread first, reorganize! */
 -      rdp_gp = rdp->nocb_gp_rdp;
 -      if (!rdp_gp->nocb_gp_kthread) {
 -              t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
 -                              "rcuog/%d", rdp_gp->cpu);
 -              if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
 -                      return;
 -              WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
 -      }
 -
 -      /* Spawn the kthread for this CPU. */
 -      t = kthread_run(rcu_nocb_cb_kthread, rdp,
 -                      "rcuo%c/%d", rcu_state.abbr, cpu);
 -      if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
 -              return;
 -      WRITE_ONCE(rdp->nocb_cb_kthread, t);
 -      WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
 -}
 -
 -/*
 - * If the specified CPU is a no-CBs CPU that does not already have its
 - * rcuo kthread, spawn it.
 - */
 -static void rcu_spawn_cpu_nocb_kthread(int cpu)
 -{
 -      if (rcu_scheduler_fully_active)
 -              rcu_spawn_one_nocb_kthread(cpu);
 -}
 -
 -/*
 - * Once the scheduler is running, spawn rcuo kthreads for all online
 - * no-CBs CPUs.  This assumes that the early_initcall()s happen before
 - * non-boot CPUs come online -- if this changes, we will need to add
 - * some mutual exclusion.
 - */
 -static void __init rcu_spawn_nocb_kthreads(void)
 -{
 -      int cpu;
 -
 -      for_each_online_cpu(cpu)
 -              rcu_spawn_cpu_nocb_kthread(cpu);
 -}
 -
 -/* How many CB CPU IDs per GP kthread?  Default of -1 for sqrt(nr_cpu_ids). */
 -static int rcu_nocb_gp_stride = -1;
 -module_param(rcu_nocb_gp_stride, int, 0444);
 -
 -/*
 - * Initialize GP-CB relationships for all no-CBs CPUs.
 - */
 -static void __init rcu_organize_nocb_kthreads(void)
 -{
 -      int cpu;
 -      bool firsttime = true;
 -      bool gotnocbs = false;
 -      bool gotnocbscbs = true;
 -      int ls = rcu_nocb_gp_stride;
 -      int nl = 0;  /* Next GP kthread. */
 -      struct rcu_data *rdp;
 -      struct rcu_data *rdp_gp = NULL;  /* Suppress misguided gcc warn. */
 -      struct rcu_data *rdp_prev = NULL;
 -
 -      if (!cpumask_available(rcu_nocb_mask))
 -              return;
 -      if (ls == -1) {
 -              ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
 -              rcu_nocb_gp_stride = ls;
 -      }
 -
 -      /*
 -       * Each pass through this loop sets up one rcu_data structure.
 -       * Should the corresponding CPU come online in the future, then
 -       * we will spawn the needed set of rcu_nocb_kthread() kthreads.
 -       */
 -      for_each_cpu(cpu, rcu_nocb_mask) {
 -              rdp = per_cpu_ptr(&rcu_data, cpu);
 -              if (rdp->cpu >= nl) {
 -                      /* New GP kthread, set up for CBs & next GP. */
 -                      gotnocbs = true;
 -                      nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
 -                      rdp->nocb_gp_rdp = rdp;
 -                      rdp_gp = rdp;
 -                      if (dump_tree) {
 -                              if (!firsttime)
 -                                      pr_cont("%s\n", gotnocbscbs
 -                                                      ? "" : " (self only)");
 -                              gotnocbscbs = false;
 -                              firsttime = false;
 -                              pr_alert("%s: No-CB GP kthread CPU %d:",
 -                                       __func__, cpu);
 -                      }
 -              } else {
 -                      /* Another CB kthread, link to previous GP kthread. */
 -                      gotnocbscbs = true;
 -                      rdp->nocb_gp_rdp = rdp_gp;
 -                      rdp_prev->nocb_next_cb_rdp = rdp;
 -                      if (dump_tree)
 -                              pr_cont(" %d", cpu);
 -              }
 -              rdp_prev = rdp;
 -      }
 -      if (gotnocbs && dump_tree)
 -              pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
 -}
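
rcu_organize_nocb_kthreads() above groups CB kthreads under rcuog GP kthreads with a stride that defaults to nr_cpu_ids / int_sqrt(nr_cpu_ids). A minimal user-space sketch of that grouping arithmetic, assuming 16 CPUs that are all offloaded and a simplified int_sqrt() stand-in:

#include <stdio.h>

static int int_sqrt(int x)          /* simplified stand-in for the kernel helper */
{
	int r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpu_ids = 16;                            /* assumed CPU count */
	int ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);     /* default stride: 4 */
	int nl = 0, cpu, leader = -1;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu >= nl) {        /* this CPU hosts a new rcuog GP kthread */
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			leader = cpu;
			printf("CPU %2d: GP leader (rcuog/%d)\n", cpu, cpu);
		} else {                /* follower linked to the previous leader */
			printf("CPU %2d: CB kthread under rcuog/%d\n", cpu, leader);
		}
	}
	return 0;
}
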
 -
 -/*
 - * Bind the current task to the offloaded CPUs.  If there are no offloaded
 - * CPUs, leave the task unbound.  Splat if the bind attempt fails.
 - */
 -void rcu_bind_current_to_nocb(void)
 -{
 -      if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
 -              WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
 -}
 -EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
 -
 -// The ->on_cpu field is available only in CONFIG_SMP=y, so...
 -#ifdef CONFIG_SMP
 -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 -{
 -      return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
 -}
 -#else // #ifdef CONFIG_SMP
 -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
 -{
 -      return "";
 -}
 -#endif // #else #ifdef CONFIG_SMP
 -
 -/*
 - * Dump out nocb grace-period kthread state for the specified rcu_data
 - * structure.
 - */
 -static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 -{
 -      struct rcu_node *rnp = rdp->mynode;
 -
 -      pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
 -              rdp->cpu,
 -              "kK"[!!rdp->nocb_gp_kthread],
 -              "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
 -              "dD"[!!rdp->nocb_defer_wakeup],
 -              "tT"[timer_pending(&rdp->nocb_timer)],
 -              "sS"[!!rdp->nocb_gp_sleep],
 -              ".W"[swait_active(&rdp->nocb_gp_wq)],
 -              ".W"[swait_active(&rnp->nocb_gp_wq[0])],
 -              ".W"[swait_active(&rnp->nocb_gp_wq[1])],
 -              ".B"[!!rdp->nocb_gp_bypass],
 -              ".G"[!!rdp->nocb_gp_gp],
 -              (long)rdp->nocb_gp_seq,
 -              rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
 -              rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
 -              rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
 -              show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
 -}
 -
 -/* Dump out nocb kthread state for the specified rcu_data structure. */
 -static void show_rcu_nocb_state(struct rcu_data *rdp)
 -{
 -      char bufw[20];
 -      char bufr[20];
 -      struct rcu_segcblist *rsclp = &rdp->cblist;
 -      bool waslocked;
 -      bool wassleep;
 -
 -      if (rdp->nocb_gp_rdp == rdp)
 -              show_rcu_nocb_gp_state(rdp);
 -
 -      sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
 -      sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
 -      pr_info("   CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
 -              rdp->cpu, rdp->nocb_gp_rdp->cpu,
 -              rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
 -              "kK"[!!rdp->nocb_cb_kthread],
 -              "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
 -              "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
 -              "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
 -              "sS"[!!rdp->nocb_cb_sleep],
 -              ".W"[swait_active(&rdp->nocb_cb_wq)],
 -              jiffies - rdp->nocb_bypass_first,
 -              jiffies - rdp->nocb_nobypass_last,
 -              rdp->nocb_nobypass_count,
 -              ".D"[rcu_segcblist_ready_cbs(rsclp)],
 -              ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
 -              rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
 -              ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
 -              rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
 -              ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
 -              ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
 -              rcu_segcblist_n_cbs(&rdp->cblist),
 -              rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
 -              rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
 -              show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
 -
 -      /* It is OK for GP kthreads to have GP state. */
 -      if (rdp->nocb_gp_rdp == rdp)
 -              return;
 -
 -      waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
 -      wassleep = swait_active(&rdp->nocb_gp_wq);
 -      if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
 -              return;  /* Nothing untoward. */
 -
 -      pr_info("   nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
 -              "lL"[waslocked],
 -              "dD"[!!rdp->nocb_defer_wakeup],
 -              "sS"[!!rdp->nocb_gp_sleep],
 -              ".W"[wassleep]);
 -}
 -
 -#else /* #ifdef CONFIG_RCU_NOCB_CPU */
 -
 -/* No ->nocb_lock to acquire.  */
 -static void rcu_nocb_lock(struct rcu_data *rdp)
 -{
 -}
 -
 -/* No ->nocb_lock to release.  */
 -static void rcu_nocb_unlock(struct rcu_data *rdp)
 -{
 -}
 -
 -/* No ->nocb_lock to release.  */
 -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
 -                                     unsigned long flags)
 -{
 -      local_irq_restore(flags);
 -}
 -
 -/* Lockdep check that ->cblist may be safely accessed. */
 -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
 -{
 -      lockdep_assert_irqs_disabled();
 -}
 -
 -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 -{
 -}
 -
 -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
 -{
 -      return NULL;
 -}
 -
 -static void rcu_init_one_nocb(struct rcu_node *rnp)
 -{
 -}
 -
 -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 -                                unsigned long j)
 -{
 -      return true;
 -}
 -
 -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 -                              bool *was_alldone, unsigned long flags)
 -{
 -      return false;
 -}
 -
 -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
 -                               unsigned long flags)
 -{
 -      WARN_ON_ONCE(1);  /* Should be dead code! */
 -}
 -
 -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 -{
 -}
 -
 -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
 -{
 -      return false;
 -}
 -
 -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
 -{
 -      return false;
 -}
 -
 -static void rcu_spawn_cpu_nocb_kthread(int cpu)
 -{
 -}
 -
 -static void __init rcu_spawn_nocb_kthreads(void)
 -{
 -}
 -
 -static void show_rcu_nocb_state(struct rcu_data *rdp)
 -{
 -}
 -
 -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 -
  /*
   * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
   * grace-period kthread will do force_quiescent_state() processing?
@@@ -1498,17 -2982,17 +1498,17 @@@ static void noinstr rcu_dynticks_task_e
  /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
  static void rcu_dynticks_task_trace_enter(void)
  {
 -#ifdef CONFIG_TASKS_RCU_TRACE
 +#ifdef CONFIG_TASKS_TRACE_RCU
        if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
                current->trc_reader_special.b.need_mb = true;
 -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
 +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
  }
  
  /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
  static void rcu_dynticks_task_trace_exit(void)
  {
 -#ifdef CONFIG_TASKS_RCU_TRACE
 +#ifdef CONFIG_TASKS_TRACE_RCU
        if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
                current->trc_reader_special.b.need_mb = false;
 -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
 +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
  }
diff --combined kernel/sched/core.c
index 37bec9b05e31b726c75f36f63b9c9255b1d2b449,c89c1d45dd0b0076f3c14c1b9f14aa8a04bb6f23..c4462c454ab9ce27045df0bb95b1c125aef02e87
@@@ -237,30 -237,9 +237,30 @@@ static DEFINE_MUTEX(sched_core_mutex)
  static atomic_t sched_core_count;
  static struct cpumask sched_core_mask;
  
 +static void sched_core_lock(int cpu, unsigned long *flags)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      int t, i = 0;
 +
 +      local_irq_save(*flags);
 +      for_each_cpu(t, smt_mask)
 +              raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 +}
 +
 +static void sched_core_unlock(int cpu, unsigned long *flags)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      int t;
 +
 +      for_each_cpu(t, smt_mask)
 +              raw_spin_unlock(&cpu_rq(t)->__lock);
 +      local_irq_restore(*flags);
 +}
 +
  static void __sched_core_flip(bool enabled)
  {
 -      int cpu, t, i;
 +      unsigned long flags;
 +      int cpu, t;
  
        cpus_read_lock();
  
        for_each_cpu(cpu, &sched_core_mask) {
                const struct cpumask *smt_mask = cpu_smt_mask(cpu);
  
 -              i = 0;
 -              local_irq_disable();
 -              for_each_cpu(t, smt_mask) {
 -                      /* supports up to SMT8 */
 -                      raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 -              }
 +              sched_core_lock(cpu, &flags);
  
                for_each_cpu(t, smt_mask)
                        cpu_rq(t)->core_enabled = enabled;
  
 -              for_each_cpu(t, smt_mask)
 -                      raw_spin_unlock(&cpu_rq(t)->__lock);
 -              local_irq_enable();
 +              sched_core_unlock(cpu, &flags);
  
                cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
        }
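
sched_core_lock() above acquires every SMT sibling's runqueue lock in ascending cpumask order, which is what keeps concurrent core-scheduling flips on the same core from deadlocking against each other. A rough user-space analogue of that ordering rule, assuming an SMT4 core with pthread mutexes standing in for the rq locks:

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4                           /* assumed SMT4 core */
static pthread_mutex_t rq_lock[NR_SIBLINGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void core_lock(void)
{
	/* always ascending order: a consistent order prevents ABBA deadlock */
	for (int t = 0; t < NR_SIBLINGS; t++)
		pthread_mutex_lock(&rq_lock[t]);
}

static void core_unlock(void)
{
	/* release order does not matter for deadlock avoidance */
	for (int t = 0; t < NR_SIBLINGS; t++)
		pthread_mutex_unlock(&rq_lock[t]);
}

int main(void)
{
	core_lock();
	printf("all %d sibling runqueue locks held\n", NR_SIBLINGS);
	core_unlock();
	return 0;
}
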
@@@ -1007,7 -993,6 +1007,7 @@@ int get_nohz_timer_target(void
  {
        int i, cpu = smp_processor_id(), default_cpu = -1;
        struct sched_domain *sd;
 +      const struct cpumask *hk_mask;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
                if (!idle_cpu(cpu))
                default_cpu = cpu;
        }
  
 +      hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
 +
        rcu_read_lock();
        for_each_domain(cpu, sd) {
 -              for_each_cpu_and(i, sched_domain_span(sd),
 -                      housekeeping_cpumask(HK_FLAG_TIMER)) {
 +              for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
                        if (cpu == i)
                                continue;
  
@@@ -1635,23 -1619,6 +1635,23 @@@ static inline void uclamp_rq_dec(struc
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
 +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
 +                                    enum uclamp_id clamp_id)
 +{
 +      if (!p->uclamp[clamp_id].active)
 +              return;
 +
 +      uclamp_rq_dec_id(rq, p, clamp_id);
 +      uclamp_rq_inc_id(rq, p, clamp_id);
 +
 +      /*
 +       * Make sure to clear the idle flag if we've transiently reached 0
 +       * active tasks on rq.
 +       */
 +      if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
 +              rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
 +}
 +
  static inline void
  uclamp_update_active(struct task_struct *p)
  {
         * affecting a valid clamp bucket, the next time it's enqueued,
         * it will already see the updated clamp bucket value.
         */
 -      for_each_clamp_id(clamp_id) {
 -              if (p->uclamp[clamp_id].active) {
 -                      uclamp_rq_dec_id(rq, p, clamp_id);
 -                      uclamp_rq_inc_id(rq, p, clamp_id);
 -              }
 -      }
 +      for_each_clamp_id(clamp_id)
 +              uclamp_rq_reinc_id(rq, p, clamp_id);
  
        task_rq_unlock(rq, p, &rf);
  }
@@@ -2190,7 -2161,7 +2190,7 @@@ static inline bool is_cpu_allowed(struc
  
        /* Non kernel threads are not allowed during either online or offline. */
        if (!(p->flags & PF_KTHREAD))
 -              return cpu_active(cpu);
 +              return cpu_active(cpu) && task_cpu_possible(cpu, p);
  
        /* KTHREAD_IS_PER_CPU is always allowed. */
        if (kthread_is_per_cpu(p))
@@@ -2497,34 -2468,6 +2497,34 @@@ void do_set_cpus_allowed(struct task_st
        __do_set_cpus_allowed(p, new_mask, 0);
  }
  
 +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
 +                    int node)
 +{
 +      if (!src->user_cpus_ptr)
 +              return 0;
 +
 +      dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
 +      if (!dst->user_cpus_ptr)
 +              return -ENOMEM;
 +
 +      cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
 +      return 0;
 +}
 +
 +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
 +{
 +      struct cpumask *user_mask = NULL;
 +
 +      swap(p->user_cpus_ptr, user_mask);
 +
 +      return user_mask;
 +}
 +
 +void release_user_cpus_ptr(struct task_struct *p)
 +{
 +      kfree(clear_user_cpus_ptr(p));
 +}
 +
  /*
   * This function is wildly self concurrent; here be dragons.
   *
@@@ -2742,26 -2685,28 +2742,26 @@@ static int affine_move_task(struct rq *
  }
  
  /*
 - * Change a given task's CPU affinity. Migrate the thread to a
 - * proper CPU and schedule it away if the CPU it's executing on
 - * is removed from the allowed bitmask.
 - *
 - * NOTE: the caller must have a valid reference to the task, the
 - * task must not exit() & deallocate itself prematurely. The
 - * call is not atomic; no spinlocks may be held.
 + * Called with both p->pi_lock and rq->lock held; drops both before returning.
   */
 -static int __set_cpus_allowed_ptr(struct task_struct *p,
 -                                const struct cpumask *new_mask,
 -                                u32 flags)
 +static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
 +                                       const struct cpumask *new_mask,
 +                                       u32 flags,
 +                                       struct rq *rq,
 +                                       struct rq_flags *rf)
 +      __releases(rq->lock)
 +      __releases(p->pi_lock)
  {
 +      const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
 +      bool kthread = p->flags & PF_KTHREAD;
 +      struct cpumask *user_mask = NULL;
        unsigned int dest_cpu;
 -      struct rq_flags rf;
 -      struct rq *rq;
        int ret = 0;
  
 -      rq = task_rq_lock(p, &rf);
        update_rq_clock(rq);
  
 -      if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
 +      if (kthread || is_migration_disabled(p)) {
                /*
                 * Kernel threads are allowed on online && !active CPUs,
                 * however, during cpu-hot-unplug, even these might get pushed
                cpu_valid_mask = cpu_online_mask;
        }
  
 +      if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
 +              ret = -EINVAL;
 +              goto out;
 +      }
 +
        /*
         * Must re-check here, to close a race against __kthread_bind(),
         * sched_setaffinity() is not guaranteed to observe the flag.
  
        __do_set_cpus_allowed(p, new_mask, flags);
  
 -      return affine_move_task(rq, p, &rf, dest_cpu, flags);
 +      if (flags & SCA_USER)
 +              user_mask = clear_user_cpus_ptr(p);
 +
 +      ret = affine_move_task(rq, p, rf, dest_cpu, flags);
 +
 +      kfree(user_mask);
 +
 +      return ret;
  
  out:
 -      task_rq_unlock(rq, p, &rf);
 +      task_rq_unlock(rq, p, rf);
  
        return ret;
  }
  
 +/*
 + * Change a given task's CPU affinity. Migrate the thread to a
 + * proper CPU and schedule it away if the CPU it's executing on
 + * is removed from the allowed bitmask.
 + *
 + * NOTE: the caller must have a valid reference to the task, the
 + * task must not exit() & deallocate itself prematurely. The
 + * call is not atomic; no spinlocks may be held.
 + */
 +static int __set_cpus_allowed_ptr(struct task_struct *p,
 +                                const struct cpumask *new_mask, u32 flags)
 +{
 +      struct rq_flags rf;
 +      struct rq *rq;
 +
 +      rq = task_rq_lock(p, &rf);
 +      return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
 +}
 +
  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        return __set_cpus_allowed_ptr(p, new_mask, 0);
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
 +/*
 + * Change a given task's CPU affinity to the intersection of its current
 + * affinity mask and @subset_mask, writing the resulting mask to @new_mask
 + * and pointing @p->user_cpus_ptr to a copy of the old mask.
 + * If the resulting mask is empty, leave the affinity unchanged and return
 + * -EINVAL.
 + */
 +static int restrict_cpus_allowed_ptr(struct task_struct *p,
 +                                   struct cpumask *new_mask,
 +                                   const struct cpumask *subset_mask)
 +{
 +      struct cpumask *user_mask = NULL;
 +      struct rq_flags rf;
 +      struct rq *rq;
 +      int err;
 +
 +      if (!p->user_cpus_ptr) {
 +              user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
 +              if (!user_mask)
 +                      return -ENOMEM;
 +      }
 +
 +      rq = task_rq_lock(p, &rf);
 +
 +      /*
 +       * Forcefully restricting the affinity of a deadline task is
 +       * likely to cause problems, so fail and noisily override the
 +       * mask entirely.
 +       */
 +      if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
 +              err = -EPERM;
 +              goto err_unlock;
 +      }
 +
 +      if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
 +              err = -EINVAL;
 +              goto err_unlock;
 +      }
 +
 +      /*
 +       * We're about to butcher the task affinity, so keep track of what
 +       * the user asked for in case we're able to restore it later on.
 +       */
 +      if (user_mask) {
 +              cpumask_copy(user_mask, p->cpus_ptr);
 +              p->user_cpus_ptr = user_mask;
 +      }
 +
 +      return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
 +
 +err_unlock:
 +      task_rq_unlock(rq, p, &rf);
 +      kfree(user_mask);
 +      return err;
 +}
 +
 +/*
 + * Restrict the CPU affinity of task @p so that it is a subset of
 + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
 + * old affinity mask. If the resulting mask is empty, we warn and walk
 + * up the cpuset hierarchy until we find a suitable mask.
 + */
 +void force_compatible_cpus_allowed_ptr(struct task_struct *p)
 +{
 +      cpumask_var_t new_mask;
 +      const struct cpumask *override_mask = task_cpu_possible_mask(p);
 +
 +      alloc_cpumask_var(&new_mask, GFP_KERNEL);
 +
 +      /*
 +       * __migrate_task() can fail silently in the face of concurrent
 +       * offlining of the chosen destination CPU, so take the hotplug
 +       * lock to ensure that the migration succeeds.
 +       */
 +      cpus_read_lock();
 +      if (!cpumask_available(new_mask))
 +              goto out_set_mask;
 +
 +      if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
 +              goto out_free_mask;
 +
 +      /*
 +       * We failed to find a valid subset of the affinity mask for the
 +       * task, so override it based on its cpuset hierarchy.
 +       */
 +      cpuset_cpus_allowed(p, new_mask);
 +      override_mask = new_mask;
 +
 +out_set_mask:
 +      if (printk_ratelimit()) {
 +              printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
 +                              task_pid_nr(p), p->comm,
 +                              cpumask_pr_args(override_mask));
 +      }
 +
 +      WARN_ON(set_cpus_allowed_ptr(p, override_mask));
 +out_free_mask:
 +      cpus_read_unlock();
 +      free_cpumask_var(new_mask);
 +}
 +
 +static int
 +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
 +
 +/*
 + * Restore the affinity of a task @p which was previously restricted by a
 + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
 + * @p->user_cpus_ptr.
 + *
 + * It is the caller's responsibility to serialise this with any calls to
 + * force_compatible_cpus_allowed_ptr(@p).
 + */
 +void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
 +{
 +      struct cpumask *user_mask = p->user_cpus_ptr;
 +      unsigned long flags;
 +
 +      /*
 +       * Try to restore the old affinity mask. If this fails, then
 +       * we free the mask explicitly to avoid it being inherited across
 +       * a subsequent fork().
 +       */
 +      if (!user_mask || !__sched_setaffinity(p, user_mask))
 +              return;
 +
 +      raw_spin_lock_irqsave(&p->pi_lock, flags);
 +      user_mask = clear_user_cpus_ptr(p);
 +      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 +
 +      kfree(user_mask);
 +}
 +
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
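
The three helpers added above cooperate as follows: restrict_cpus_allowed_ptr() intersects the current affinity with a compatible subset and stashes the user's original mask, force_compatible_cpus_allowed_ptr() falls back to a wider mask when the intersection is empty, and relax_compatible_cpus_allowed_ptr() later restores the stashed mask. A rough user-space sketch of that flow, using a 64-bit word as a toy cpumask (the CPU numbers and the fallback mask are demo assumptions, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_task {
	uint64_t cpus_mask;     /* current effective affinity */
	uint64_t *user_mask;    /* what the user originally asked for, if saved */
};

/* restrict: intersect with @subset, remembering the old mask for later */
static int restrict_affinity(struct demo_task *p, uint64_t subset)
{
	uint64_t new_mask = p->cpus_mask & subset;

	if (!new_mask)
		return -1;                      /* empty: caller must fall back */

	if (!p->user_mask) {
		uint64_t *save = malloc(sizeof(*save));

		if (!save)
			return -1;
		*save = p->cpus_mask;           /* remember the user's request */
		p->user_mask = save;
	}
	p->cpus_mask = new_mask;
	return 0;
}

/* relax: restore the saved mask and drop the copy */
static void relax_affinity(struct demo_task *p)
{
	if (!p->user_mask)
		return;
	p->cpus_mask = *p->user_mask;
	free(p->user_mask);
	p->user_mask = NULL;
}

int main(void)
{
	struct demo_task p = { .cpus_mask = 0xff };     /* CPUs 0-7 allowed */
	uint64_t possible = 0x0f;                       /* e.g. 32-bit-capable CPUs 0-3 */

	if (restrict_affinity(&p, possible) < 0) {
		/* force_compatible_cpus_allowed_ptr() would widen the mask here */
		p.cpus_mask = possible;
	}
	printf("restricted affinity: %#llx\n", (unsigned long long)p.cpus_mask);

	relax_affinity(&p);     /* e.g. once the restriction no longer applies */
	printf("restored affinity:   %#llx\n", (unsigned long long)p.cpus_mask);
	return 0;
}
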
@@@ -3330,7 -3112,9 +3330,7 @@@ static int select_fallback_rq(int cpu, 
  
                /* Look for allowed, online CPU in same node. */
                for_each_cpu(dest_cpu, nodemask) {
 -                      if (!cpu_active(dest_cpu))
 -                              continue;
 -                      if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
 +                      if (is_cpu_allowed(p, dest_cpu))
                                return dest_cpu;
                }
        }
                /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
 -                      if (IS_ENABLED(CONFIG_CPUSETS)) {
 -                              cpuset_cpus_allowed_fallback(p);
 +                      if (cpuset_cpus_allowed_fallback(p)) {
                                state = possible;
                                break;
                        }
                         *
                         * More yuck to audit.
                         */
 -                      do_set_cpus_allowed(p, cpu_possible_mask);
 +                      do_set_cpus_allowed(p, task_cpu_possible_mask(p));
                        state = fail;
                        break;
 -
                case fail:
                        BUG();
                        break;
@@@ -3775,6 -3561,55 +3775,55 @@@ static void ttwu_queue(struct task_stru
        rq_unlock(rq, &rf);
  }
  
+ /*
+  * Invoked from try_to_wake_up() to check whether the task can be woken up.
+  *
+  * The caller holds p::pi_lock if p != current or has preemption
+  * disabled when p == current.
+  *
+  * The rules of PREEMPT_RT saved_state:
+  *
+  *   The related locking code always holds p::pi_lock when updating
+  *   p::saved_state, which means the code is fully serialized in both cases.
+  *
+  *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+  *   bits set. This allows us to distinguish all wakeup scenarios.
+  */
+ static __always_inline
+ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+ {
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+               WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+                            state != TASK_RTLOCK_WAIT);
+       }
+       if (READ_ONCE(p->__state) & state) {
+               *success = 1;
+               return true;
+       }
+ #ifdef CONFIG_PREEMPT_RT
+       /*
+        * Saved state preserves the task state across blocking on
+        * an RT lock.  If the state matches, set p::saved_state to
+        * TASK_RUNNING, but do not wake the task because it waits
+        * for a lock wakeup. Also indicate success because from
+        * the regular waker's point of view this has succeeded.
+        *
+        * After acquiring the lock the task will restore p::__state
+        * from p::saved_state which ensures that the regular
+        * wakeup is not lost. The restore will also set
+        * p::saved_state to TASK_RUNNING so any further tests will
+        * not result in false positives vs. @success
+        */
+       if (p->saved_state & state) {
+               p->saved_state = TASK_RUNNING;
+               *success = 1;
+       }
+ #endif
+       return false;
+ }
  /*
   * Notes on Program-Order guarantees on SMP systems.
   *
@@@ -3914,10 -3749,9 +3963,9 @@@ try_to_wake_up(struct task_struct *p, u
                 *  - we're serialized against set_special_state() by virtue of
                 *    it disabling IRQs (this allows not taking ->pi_lock).
                 */
-               if (!(READ_ONCE(p->__state) & state))
+               if (!ttwu_state_match(p, state, &success))
                        goto out;
  
-               success = 1;
                trace_sched_waking(p);
                WRITE_ONCE(p->__state, TASK_RUNNING);
                trace_sched_wakeup(p);
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        smp_mb__after_spinlock();
-       if (!(READ_ONCE(p->__state) & state))
+       if (!ttwu_state_match(p, state, &success))
                goto unlock;
  
        trace_sched_waking(p);
  
-       /* We're going to change ->state: */
-       success = 1;
        /*
         * Ensure we load p->on_rq _after_ p->state, otherwise it would
         * be possible to, falsely, observe p->on_rq == 0 and get stuck
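
On PREEMPT_RT, ttwu_state_match() lets a regular wakeup that only matches p->saved_state report success without actually waking the task, since the task is blocked on an rtlock and will restore its state itself. A small user-space sketch of that decision (the state constants are simplified stand-ins, not the kernel's values):

#include <stdio.h>

#define TASK_RUNNING            0x0000
#define TASK_INTERRUPTIBLE      0x0001
#define TASK_UNINTERRUPTIBLE    0x0002
#define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_RTLOCK_WAIT        0x1000

struct demo_task {
	unsigned int __state;
	unsigned int saved_state;
};

/* returns nonzero when the waker should proceed to actually wake the task */
static int state_match(struct demo_task *p, unsigned int state, int *success)
{
	if (p->__state & state) {
		*success = 1;
		return 1;
	}
	/* PREEMPT_RT: the task sleeps on an rtlock, only record the wakeup */
	if (p->saved_state & state) {
		p->saved_state = TASK_RUNNING;
		*success = 1;
	}
	return 0;
}

int main(void)
{
	/* task blocked on an rtlock while it was sleeping interruptibly */
	struct demo_task p = {
		.__state = TASK_RTLOCK_WAIT,
		.saved_state = TASK_INTERRUPTIBLE,
	};
	int success = 0;
	int wake = state_match(&p, TASK_NORMAL, &success);

	/* prints wake=0 success=1 saved_state=0: wakeup recorded, task not woken */
	printf("wake=%d success=%d saved_state=%#x\n", wake, success, p.saved_state);
	return 0;
}
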
@@@ -5874,9 -5705,11 +5919,9 @@@ static bool try_steal_cookie(int this, 
                if (p->core_occupation > dst->idle->core_occupation)
                        goto next;
  
 -              p->on_rq = TASK_ON_RQ_MIGRATING;
                deactivate_task(src, p, 0);
                set_task_cpu(p, this);
                activate_task(dst, p, 0);
 -              p->on_rq = TASK_ON_RQ_QUEUED;
  
                resched_curr(dst);
  
@@@ -5948,109 -5781,35 +5993,109 @@@ void queue_core_balance(struct rq *rq
        queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
  }
  
 -static inline void sched_core_cpu_starting(unsigned int cpu)
 +static void sched_core_cpu_starting(unsigned int cpu)
  {
        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 -      struct rq *rq, *core_rq = NULL;
 -      int i;
 +      struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
 +      unsigned long flags;
 +      int t;
 +
 +      sched_core_lock(cpu, &flags);
  
 -      core_rq = cpu_rq(cpu)->core;
 +      WARN_ON_ONCE(rq->core != rq);
 +
 +      /* if we're the first, we'll be our own leader */
 +      if (cpumask_weight(smt_mask) == 1)
 +              goto unlock;
  
 -      if (!core_rq) {
 -              for_each_cpu(i, smt_mask) {
 -                      rq = cpu_rq(i);
 -                      if (rq->core && rq->core == rq)
 -                              core_rq = rq;
 +      /* find the leader */
 +      for_each_cpu(t, smt_mask) {
 +              if (t == cpu)
 +                      continue;
 +              rq = cpu_rq(t);
 +              if (rq->core == rq) {
 +                      core_rq = rq;
 +                      break;
                }
 +      }
  
 -              if (!core_rq)
 -                      core_rq = cpu_rq(cpu);
 +      if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
 +              goto unlock;
  
 -              for_each_cpu(i, smt_mask) {
 -                      rq = cpu_rq(i);
 +      /* install and validate core_rq */
 +      for_each_cpu(t, smt_mask) {
 +              rq = cpu_rq(t);
  
 -                      WARN_ON_ONCE(rq->core && rq->core != core_rq);
 +              if (t == cpu)
                        rq->core = core_rq;
 -              }
 +
 +              WARN_ON_ONCE(rq->core != core_rq);
        }
 +
 +unlock:
 +      sched_core_unlock(cpu, &flags);
  }
 +
 +static void sched_core_cpu_deactivate(unsigned int cpu)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
 +      unsigned long flags;
 +      int t;
 +
 +      sched_core_lock(cpu, &flags);
 +
 +      /* if we're the last man standing, nothing to do */
 +      if (cpumask_weight(smt_mask) == 1) {
 +              WARN_ON_ONCE(rq->core != rq);
 +              goto unlock;
 +      }
 +
 +      /* if we're not the leader, nothing to do */
 +      if (rq->core != rq)
 +              goto unlock;
 +
 +      /* find a new leader */
 +      for_each_cpu(t, smt_mask) {
 +              if (t == cpu)
 +                      continue;
 +              core_rq = cpu_rq(t);
 +              break;
 +      }
 +
 +      if (WARN_ON_ONCE(!core_rq)) /* impossible */
 +              goto unlock;
 +
 +      /* copy the shared state to the new leader */
 +      core_rq->core_task_seq      = rq->core_task_seq;
 +      core_rq->core_pick_seq      = rq->core_pick_seq;
 +      core_rq->core_cookie        = rq->core_cookie;
 +      core_rq->core_forceidle     = rq->core_forceidle;
 +      core_rq->core_forceidle_seq = rq->core_forceidle_seq;
 +
 +      /* install new leader */
 +      for_each_cpu(t, smt_mask) {
 +              rq = cpu_rq(t);
 +              rq->core = core_rq;
 +      }
 +
 +unlock:
 +      sched_core_unlock(cpu, &flags);
 +}
 +
 +static inline void sched_core_cpu_dying(unsigned int cpu)
 +{
 +      struct rq *rq = cpu_rq(cpu);
 +
 +      if (rq->core != rq)
 +              rq->core = rq;
 +}
 +
  #else /* !CONFIG_SCHED_CORE */
  
  static inline void sched_core_cpu_starting(unsigned int cpu) {}
 +static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
 +static inline void sched_core_cpu_dying(unsigned int cpu) {}
  
  static struct task_struct *
  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
  #endif /* CONFIG_SCHED_CORE */
  
+ /*
+  * Constants for the sched_mode argument of __schedule().
+  *
+  * The mode argument allows RT enabled kernels to differentiate a
+  * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+  * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+  * optimize the AND operation out and just check for zero.
+  */
+ #define SM_NONE                       0x0
+ #define SM_PREEMPT            0x1
+ #define SM_RTLOCK_WAIT                0x2
+ #ifndef CONFIG_PREEMPT_RT
+ # define SM_MASK_PREEMPT      (~0U)
+ #else
+ # define SM_MASK_PREEMPT      SM_PREEMPT
+ #endif
  /*
   * __schedule() is the main scheduler function.
   *
   *
   * WARNING: must be called with preemption disabled!
   */
- static void __sched notrace __schedule(bool preempt)
+ static void __sched notrace __schedule(unsigned int sched_mode)
  {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        rq = cpu_rq(cpu);
        prev = rq->curr;
  
-       schedule_debug(prev, preempt);
+       schedule_debug(prev, !!sched_mode);
  
        if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
                hrtick_clear(rq);
  
        local_irq_disable();
-       rcu_note_context_switch(preempt);
+       rcu_note_context_switch(!!sched_mode);
  
        /*
         * Make sure that signal_pending_state()->signal_pending() below
         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
         */
        prev_state = READ_ONCE(prev->__state);
-       if (!preempt && prev_state) {
+       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
                if (signal_pending_state(prev_state, prev)) {
                        WRITE_ONCE(prev->__state, TASK_RUNNING);
                } else {
                migrate_disable_switch(rq, prev);
                psi_sched_switch(prev, next, !task_on_rq_queued(prev));
  
-               trace_sched_switch(preempt, prev, next);
+               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
  
                /* Also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);
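
As the comment above the SM_* constants notes, SM_MASK_PREEMPT reduces the test in __schedule() to a plain zero check on !PREEMPT_RT, while on PREEMPT_RT it isolates SM_PREEMPT so that an SM_RTLOCK_WAIT schedule still takes the voluntary/blocking path. A small user-space sketch that prints which path each mode takes under both definitions:

#include <stdio.h>

#define SM_NONE         0x0
#define SM_PREEMPT      0x1
#define SM_RTLOCK_WAIT  0x2

static void show(const char *cfg, unsigned int mask_preempt)
{
	unsigned int modes[] = { SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };
	const char *names[] = { "SM_NONE", "SM_PREEMPT", "SM_RTLOCK_WAIT" };

	for (int i = 0; i < 3; i++) {
		/* mirrors: if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) */
		int voluntary = !(modes[i] & mask_preempt);

		printf("%-12s %-14s -> %s path\n", cfg, names[i],
		       voluntary ? "voluntary/block" : "preemption");
	}
}

int main(void)
{
	show("!PREEMPT_RT", ~0U);               /* SM_MASK_PREEMPT == ~0U */
	show("PREEMPT_RT", SM_PREEMPT);         /* SM_MASK_PREEMPT == SM_PREEMPT */
	return 0;
}
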
@@@ -6239,7 -6016,7 +6302,7 @@@ void __noreturn do_task_dead(void
        /* Tell freezer to ignore us: */
        current->flags |= PF_NOFREEZE;
  
-       __schedule(false);
+       __schedule(SM_NONE);
        BUG();
  
        /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@@ -6300,7 -6077,7 +6363,7 @@@ asmlinkage __visible void __sched sched
        sched_submit_work(tsk);
        do {
                preempt_disable();
-               __schedule(false);
+               __schedule(SM_NONE);
                sched_preempt_enable_no_resched();
        } while (need_resched());
        sched_update_worker(tsk);
@@@ -6328,7 -6105,7 +6391,7 @@@ void __sched schedule_idle(void
         */
        WARN_ON_ONCE(current->__state);
        do {
-               __schedule(false);
+               __schedule(SM_NONE);
        } while (need_resched());
  }
  
@@@ -6363,6 -6140,18 +6426,18 @@@ void __sched schedule_preempt_disabled(
        preempt_disable();
  }
  
+ #ifdef CONFIG_PREEMPT_RT
+ void __sched notrace schedule_rtlock(void)
+ {
+       do {
+               preempt_disable();
+               __schedule(SM_RTLOCK_WAIT);
+               sched_preempt_enable_no_resched();
+       } while (need_resched());
+ }
+ NOKPROBE_SYMBOL(schedule_rtlock);
+ #endif
  static void __sched notrace preempt_schedule_common(void)
  {
        do {
                 */
                preempt_disable_notrace();
                preempt_latency_start(1);
-               __schedule(true);
+               __schedule(SM_PREEMPT);
                preempt_latency_stop(1);
                preempt_enable_no_resched_notrace();
  
@@@ -6460,7 -6249,7 +6535,7 @@@ asmlinkage __visible void __sched notra
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
-               __schedule(true);
+               __schedule(SM_PREEMPT);
                exception_exit(prev_ctx);
  
                preempt_latency_stop(1);
@@@ -6609,7 -6398,7 +6684,7 @@@ asmlinkage __visible void __sched preem
        do {
                preempt_disable();
                local_irq_enable();
-               __schedule(true);
+               __schedule(SM_PREEMPT);
                local_irq_disable();
                sched_preempt_enable_no_resched();
        } while (need_resched());
@@@ -7586,16 -7375,6 +7661,16 @@@ err_size
        return -E2BIG;
  }
  
 +static void get_params(struct task_struct *p, struct sched_attr *attr)
 +{
 +      if (task_has_dl_policy(p))
 +              __getparam_dl(p, attr);
 +      else if (task_has_rt_policy(p))
 +              attr->sched_priority = p->rt_priority;
 +      else
 +              attr->sched_nice = task_nice(p);
 +}
 +
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@@ -7657,8 -7436,6 +7732,8 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
        rcu_read_unlock();
  
        if (likely(p)) {
 +              if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
 +                      get_params(p, &attr);
                retval = sched_setattr(p, &attr);
                put_task_struct(p);
        }
@@@ -7807,8 -7584,12 +7882,8 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
        kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
                kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
 -      if (task_has_dl_policy(p))
 -              __getparam_dl(p, &kattr);
 -      else if (task_has_rt_policy(p))
 -              kattr.sched_priority = p->rt_priority;
 -      else
 -              kattr.sched_nice = task_nice(p);
 +      get_params(p, &kattr);
 +      kattr.sched_flags &= SCHED_FLAG_ALL;
  
  #ifdef CONFIG_UCLAMP_TASK
        /*
@@@ -7829,76 -7610,9 +7904,76 @@@ out_unlock
        return retval;
  }
  
 -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 +#ifdef CONFIG_SMP
 +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  {
 +      int ret = 0;
 +
 +      /*
 +       * If the task isn't a deadline task or admission control is
 +       * disabled then we don't care about affinity changes.
 +       */
 +      if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
 +              return 0;
 +
 +      /*
 +       * Since bandwidth control happens on root_domain basis,
 +       * if admission test is enabled, we only admit -deadline
 +       * tasks allowed to run on all the CPUs in the task's
 +       * root_domain.
 +       */
 +      rcu_read_lock();
 +      if (!cpumask_subset(task_rq(p)->rd->span, mask))
 +              ret = -EBUSY;
 +      rcu_read_unlock();
 +      return ret;
 +}
 +#endif
 +
 +static int
 +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
 +{
 +      int retval;
        cpumask_var_t cpus_allowed, new_mask;
 +
 +      if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
 +              return -ENOMEM;
 +
 +      if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
 +              retval = -ENOMEM;
 +              goto out_free_cpus_allowed;
 +      }
 +
 +      cpuset_cpus_allowed(p, cpus_allowed);
 +      cpumask_and(new_mask, mask, cpus_allowed);
 +
 +      retval = dl_task_check_affinity(p, new_mask);
 +      if (retval)
 +              goto out_free_new_mask;
 +again:
 +      retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
 +      if (retval)
 +              goto out_free_new_mask;
 +
 +      cpuset_cpus_allowed(p, cpus_allowed);
 +      if (!cpumask_subset(new_mask, cpus_allowed)) {
 +              /*
 +               * We must have raced with a concurrent cpuset update.
 +               * Just reset the cpumask to the cpuset's cpus_allowed.
 +               */
 +              cpumask_copy(new_mask, cpus_allowed);
 +              goto again;
 +      }
 +
 +out_free_new_mask:
 +      free_cpumask_var(new_mask);
 +out_free_cpus_allowed:
 +      free_cpumask_var(cpus_allowed);
 +      return retval;
 +}
 +
 +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 +{
        struct task_struct *p;
        int retval;
  
                retval = -EINVAL;
                goto out_put_task;
        }
 -      if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 -              retval = -ENOMEM;
 -              goto out_put_task;
 -      }
 -      if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
 -              retval = -ENOMEM;
 -              goto out_free_cpus_allowed;
 -      }
 -      retval = -EPERM;
 +
        if (!check_same_owner(p)) {
                rcu_read_lock();
                if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                        rcu_read_unlock();
 -                      goto out_free_new_mask;
 +                      retval = -EPERM;
 +                      goto out_put_task;
                }
                rcu_read_unlock();
        }
  
        retval = security_task_setscheduler(p);
        if (retval)
 -              goto out_free_new_mask;
 -
 -
 -      cpuset_cpus_allowed(p, cpus_allowed);
 -      cpumask_and(new_mask, in_mask, cpus_allowed);
 -
 -      /*
 -       * Since bandwidth control happens on root_domain basis,
 -       * if admission test is enabled, we only admit -deadline
 -       * tasks allowed to run on all the CPUs in the task's
 -       * root_domain.
 -       */
 -#ifdef CONFIG_SMP
 -      if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
 -              rcu_read_lock();
 -              if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
 -                      retval = -EBUSY;
 -                      rcu_read_unlock();
 -                      goto out_free_new_mask;
 -              }
 -              rcu_read_unlock();
 -      }
 -#endif
 -again:
 -      retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 +              goto out_put_task;
  
 -      if (!retval) {
 -              cpuset_cpus_allowed(p, cpus_allowed);
 -              if (!cpumask_subset(new_mask, cpus_allowed)) {
 -                      /*
 -                       * We must have raced with a concurrent cpuset
 -                       * update. Just reset the cpus_allowed to the
 -                       * cpuset's cpus_allowed
 -                       */
 -                      cpumask_copy(new_mask, cpus_allowed);
 -                      goto again;
 -              }
 -      }
 -out_free_new_mask:
 -      free_cpumask_var(new_mask);
 -out_free_cpus_allowed:
 -      free_cpumask_var(cpus_allowed);
 +      retval = __sched_setaffinity(p, in_mask);
  out_put_task:
        put_task_struct(p);
        return retval;
@@@ -8076,17 -7836,6 +8151,17 @@@ int __sched __cond_resched(void
                preempt_schedule_common();
                return 1;
        }
 +      /*
 +       * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
 +       * whether the current CPU is in an RCU read-side critical section,
 +       * so the tick can report quiescent states even for CPUs looping
 +       * in kernel context.  In contrast, in non-preemptible kernels,
 +       * RCU readers leave no in-memory hints, which means that CPU-bound
 +       * processes executing in kernel context might never report an
 +       * RCU quiescent state.  Therefore, the following code causes
 +       * cond_resched() to report a quiescent state, but only when RCU
 +       * is in urgent need of one.
 +       */
  #ifndef CONFIG_PREEMPT_RCU
        rcu_all_qs();
  #endif
@@@ -9033,8 -8782,6 +9108,8 @@@ int sched_cpu_deactivate(unsigned int c
         */
        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
                static_branch_dec_cpuslocked(&sched_smt_present);
 +
 +      sched_core_cpu_deactivate(cpu);
  #endif
  
        if (!sched_smp_initialized)
@@@ -9139,7 -8886,6 +9214,7 @@@ int sched_cpu_dying(unsigned int cpu
        calc_load_migrate(rq);
        update_max_interval();
        hrtick_clear(rq);
 +      sched_core_cpu_dying(cpu);
        return 0;
  }
  #endif
@@@ -9351,7 -9097,7 +9426,7 @@@ void __init sched_init(void
                atomic_set(&rq->nr_iowait, 0);
  
  #ifdef CONFIG_SCHED_CORE
 -              rq->core = NULL;
 +              rq->core = rq;
                rq->core_pick = NULL;
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
@@@ -10133,7 -9879,7 +10208,7 @@@ static int tg_set_cfs_bandwidth(struct 
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
 -      get_online_cpus();
 +      cpus_read_lock();
        mutex_lock(&cfs_constraints_mutex);
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
                cfs_bandwidth_usage_dec();
  out_unlock:
        mutex_unlock(&cfs_constraints_mutex);
 -      put_online_cpus();
 +      cpus_read_unlock();
  
        return ret;
  }
@@@ -10428,20 -10174,6 +10503,20 @@@ static u64 cpu_rt_period_read_uint(stru
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
 +                             struct cftype *cft)
 +{
 +      return css_tg(css)->idle;
 +}
 +
 +static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 +                              struct cftype *cft, s64 idle)
 +{
 +      return sched_group_set_idle(css_tg(css), idle);
 +}
 +#endif
 +
  static struct cftype cpu_legacy_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
        {
                .read_u64 = cpu_shares_read_u64,
                .write_u64 = cpu_shares_write_u64,
        },
 +      {
 +              .name = "idle",
 +              .read_s64 = cpu_idle_read_s64,
 +              .write_s64 = cpu_idle_write_s64,
 +      },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
        {
@@@ -10661,12 -10388,6 +10736,12 @@@ static struct cftype cpu_files[] = 
                .read_s64 = cpu_weight_nice_read_s64,
                .write_s64 = cpu_weight_nice_write_s64,
        },
 +      {
 +              .name = "idle",
 +              .flags = CFTYPE_NOT_ON_ROOT,
 +              .read_s64 = cpu_idle_read_s64,
 +              .write_s64 = cpu_idle_write_s64,
 +      },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
        {