kernel/rcu/tree_plugin.h

   1 /* SPDX-License-Identifier: GPL-2.0+ */
   2 /*
   3  * Read-Copy Update mechanism for mutual exclusion (tree-based version)
   4  * Internal non-public definitions that provide either classic
   5  * or preemptible semantics.
   6  *
   7  * Copyright Red Hat, 2009
   8  * Copyright IBM Corporation, 2009
   9  *
  10  * Author: Ingo Molnar <[email protected]>
  11  *         Paul E. McKenney <[email protected]>
  12  */
  13
  14 #include "../locking/rtmutex_common.h"
  15
  16 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
  17 {
  18         /*
  19          * In order to read the offloaded state of an rdp in a safe
  20          * and stable way and prevent from its value to be changed
  21          * under us, we must either hold the barrier mutex, the cpu
  22          * hotplug lock (read or write) or the nocb lock. Local
  23          * non-preemptible reads are also safe. NOCB kthreads and
  24          * timers have their own means of synchronization against the
  25          * offloaded state updaters.
  26          */
  27         RCU_NOCB_LOCKDEP_WARN(
  28                 !(lockdep_is_held(&rcu_state.barrier_mutex) ||
  29                   (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
  30                   lockdep_is_held(&rdp->nocb_lock) ||
  31                   lockdep_is_held(&rcu_state.nocb_mutex) ||
  32                   (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
  33                    rdp == this_cpu_ptr(&rcu_data)) ||
  34                   rcu_current_is_nocb_kthread(rdp)),
  35                 "Unsafe read of RCU_NOCB offloaded state"
  36         );
  37
  38         return rcu_segcblist_is_offloaded(&rdp->cblist);
  39 }
  40
  41 /*
  42  * Check the RCU kernel configuration parameters and print informative
  43  * messages about anything out of the ordinary.
  44  */
  45 static void __init rcu_bootup_announce_oddness(void)
  46 {
  47         if (IS_ENABLED(CONFIG_RCU_TRACE))
  48                 pr_info("\tRCU event tracing is enabled.\n");
  49         if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
  50             (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
  51                 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
  52                         RCU_FANOUT);
  53         if (rcu_fanout_exact)
  54                 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
  55         if (IS_ENABLED(CONFIG_PROVE_RCU))
  56                 pr_info("\tRCU lockdep checking is enabled.\n");
  57         if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
  58                 pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
  59         if (RCU_NUM_LVLS >= 4)
  60                 pr_info("\tFour(or more)-level hierarchy is enabled.\n");
  61         if (RCU_FANOUT_LEAF != 16)
  62                 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
  63                         RCU_FANOUT_LEAF);
  64         if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
  65                 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
  66                         rcu_fanout_leaf);
  67         if (nr_cpu_ids != NR_CPUS)
  68                 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
  69 #ifdef CONFIG_RCU_BOOST
  70         pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
  71                 kthread_prio, CONFIG_RCU_BOOST_DELAY);
  72 #endif
  73         if (blimit != DEFAULT_RCU_BLIMIT)
  74                 pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
  75         if (qhimark != DEFAULT_RCU_QHIMARK)
  76                 pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
  77         if (qlowmark != DEFAULT_RCU_QLOMARK)
  78                 pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
  79         if (qovld != DEFAULT_RCU_QOVLD)
  80                 pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
  81         if (jiffies_till_first_fqs != ULONG_MAX)
  82                 pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
  83         if (jiffies_till_next_fqs != ULONG_MAX)
  84                 pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
  85         if (jiffies_till_sched_qs != ULONG_MAX)
  86                 pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
  87         if (rcu_kick_kthreads)
  88                 pr_info("\tKick kthreads if too-long grace period.\n");
  89         if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
  90                 pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
  91         if (gp_preinit_delay)
  92                 pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
  93         if (gp_init_delay)
  94                 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
  95         if (gp_cleanup_delay)
  96                 pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
  97         if (nohz_full_patience_delay < 0) {
  98                 pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
  99                 nohz_full_patience_delay = 0;
 100         } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
 101                 pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC);
 102                 nohz_full_patience_delay = 5 * MSEC_PER_SEC;
 103         } else if (nohz_full_patience_delay) {
 104                 pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay);
 105         }
 106         nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay);
 107         if (!use_softirq)
 108                 pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
 109         if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
 110                 pr_info("\tRCU debug extended QS entry/exit.\n");
 111         rcupdate_announce_bootup_oddness();
 112 }
 113
 114 #ifdef CONFIG_PREEMPT_RCU
 115
/* Forward declarations for functions used by the code below. */
static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
static void rcu_read_unlock_special(struct task_struct *t);
 118
/*
 * Tell them what RCU they are running:  announce the preemptible
 * implementation, then report any unusual configuration settings.
 */
static void __init rcu_bootup_announce(void)
{
        pr_info("Preemptible hierarchical RCU implementation.\n");
        rcu_bootup_announce_oddness();
}
 127
/*
 * Flags for rcu_preempt_ctxt_queue() decision table.  The four bits are
 * combined into a single blkd_state value that selects where a newly
 * blocked task is queued on the ->blkd_tasks list.
 */
#define RCU_GP_TASKS    0x8     /* rnp->gp_tasks is non-NULL. */
#define RCU_EXP_TASKS   0x4     /* rnp->exp_tasks is non-NULL. */
#define RCU_GP_BLKD     0x2     /* This CPU's bit is set in rnp->qsmask. */
#define RCU_EXP_BLKD    0x1     /* This CPU's bit is set in rnp->expmask. */
 133
/*
 * Queues a task preempted within an RCU-preempt read-side critical
 * section into the appropriate location within the ->blkd_tasks list,
 * depending on the states of any ongoing normal and expedited grace
 * periods.  The ->gp_tasks pointer indicates which element the normal
 * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
 * indicates which element the expedited grace period is waiting on (again,
 * NULL if none).  If a grace period is waiting on a given element in the
 * ->blkd_tasks list, it also waits on all subsequent elements.  Thus,
 * adding a task to the tail of the list blocks any grace period that is
 * already waiting on one of the elements.  In contrast, adding a task
 * to the head of the list won't block any grace period that is already
 * waiting on one of the elements.
 *
 * This queuing is imprecise, and can sometimes make an ongoing grace
 * period wait for a task that is not strictly speaking blocking it.
 * Given the choice, we needlessly block a normal grace period rather than
 * blocking an expedited grace period.
 *
 * Note that an endless sequence of expedited grace periods still cannot
 * indefinitely postpone a normal grace period.  Eventually, all of the
 * fixed number of preempted tasks blocking the normal grace period that are
 * not also blocking the expedited grace period will resume and complete
 * their RCU read-side critical sections.  At that point, the ->gp_tasks
 * pointer will equal the ->exp_tasks pointer, at which point the end of
 * the corresponding expedited grace period will also be the end of the
 * normal grace period.
 *
 * The task that is queued is always "current".  The caller must hold
 * @rnp's ->lock with interrupts disabled, and @rnp must be the leaf
 * rcu_node structure referenced by @rdp->mynode.  This function releases
 * @rnp's ->lock before returning, but leaves interrupts disabled.
 */
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
        __releases(rnp->lock) /* But leaves interrupts disabled. */
{
        /*
         * Encode the current situation as a four-bit value:  whether
         * each kind of grace period is already waiting on queued tasks
         * (RCU_GP_TASKS, RCU_EXP_TASKS) and whether this CPU is still
         * blocking each kind of grace period (RCU_GP_BLKD, RCU_EXP_BLKD).
         */
        int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
                         (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
                         (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
                         (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
        struct task_struct *t = current;

        raw_lockdep_assert_held_rcu_node(rnp);
        WARN_ON_ONCE(rdp->mynode != rnp);
        WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
        /* RCU better not be waiting on newly onlined CPUs! */
        WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
                     rdp->grpmask);

        /*
         * Decide where to queue the newly blocked task.  In theory,
         * this could be an if-statement.  In practice, when I tried
         * that, it was quite messy.  Between them, the case labels
         * below cover all sixteen possible values of blkd_state.
         */
        switch (blkd_state) {
        case 0:
        case                RCU_EXP_TASKS:
        case                RCU_EXP_TASKS | RCU_GP_BLKD:
        case RCU_GP_TASKS:
        case RCU_GP_TASKS | RCU_EXP_TASKS:

                /*
                 * Blocking neither GP, or first task blocking the normal
                 * GP but not blocking the already-waiting expedited GP.
                 * Queue at the head of the list to avoid unnecessarily
                 * blocking the already-waiting GPs.
                 */
                list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
                break;

        case                                              RCU_EXP_BLKD:
        case                                RCU_GP_BLKD:
        case                                RCU_GP_BLKD | RCU_EXP_BLKD:
        case RCU_GP_TASKS |                               RCU_EXP_BLKD:
        case RCU_GP_TASKS |                 RCU_GP_BLKD | RCU_EXP_BLKD:
        case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:

                /*
                 * First task arriving that blocks either GP, or first task
                 * arriving that blocks the expedited GP (with the normal
                 * GP already waiting), or a task arriving that blocks
                 * both GPs with both GPs already waiting.  Queue at the
                 * tail of the list to avoid any GP waiting on any of the
                 * already queued tasks that are not blocking it.
                 */
                list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
                break;

        case                RCU_EXP_TASKS |               RCU_EXP_BLKD:
        case                RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
        case RCU_GP_TASKS | RCU_EXP_TASKS |               RCU_EXP_BLKD:

                /*
                 * Second or subsequent task blocking the expedited GP.
                 * The task either does not block the normal GP, or is the
                 * first task blocking the normal GP.  Queue just after
                 * the first task blocking the expedited GP.
                 */
                list_add(&t->rcu_node_entry, rnp->exp_tasks);
                break;

        case RCU_GP_TASKS |                 RCU_GP_BLKD:
        case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD:

                /*
                 * Second or subsequent task blocking the normal GP.
                 * The task does not block the expedited GP. Queue just
                 * after the first task blocking the normal GP.
                 */
                list_add(&t->rcu_node_entry, rnp->gp_tasks);
                break;

        default:

                /* Yet another exercise in excessive paranoia. */
                WARN_ON_ONCE(1);
                break;
        }

        /*
         * We have now queued the task.  If it was the first one to
         * block either grace period, update the ->gp_tasks and/or
         * ->exp_tasks pointers, respectively, to reference the newly
         * blocked task.
         */
        if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
                WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
                WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
        }
        if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
                WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
        /* The masks sampled into blkd_state must not have changed meanwhile. */
        WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
                     !(rnp->qsmask & rdp->grpmask));
        WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
                     !(rnp->expmask & rdp->grpmask));
        raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */

        /*
         * Report the quiescent state for the expedited GP.  This expedited
         * GP should not be able to end until we report, so there should be
         * no need to check for a subsequent expedited GP.  (Though we are
         * still in a quiescent state in any case.)
         *
         * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
         */
        if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
                rcu_report_exp_rdp(rdp);
        else
                WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
        ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
}
 280
 281 /*
 282  * Record a preemptible-RCU quiescent state for the specified CPU.
 283  * Note that this does not necessarily mean that the task currently running
 284  * on the CPU is in a quiescent state:  Instead, it means that the current
 285  * grace period need not wait on any RCU read-side critical section that
 286  * starts later on this CPU.  It also means that if the current task is
 287  * in an RCU read-side critical section, it has already added itself to
 288  * some leaf rcu_node structure's ->blkd_tasks list.  In addition to the
 289  * current task, there might be any number of other tasks blocked while
 290  * in an RCU read-side critical section.
 291  *
 292  * Unlike non-preemptible-RCU, quiescent state reports for expedited
 293  * grace periods are handled separately via deferred quiescent states
 294  * and context switch events.
 295  *
 296  * Callers to this function must disable preemption.
 297  */
 298 static void rcu_qs(void)
 299 {
 300         RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
 301         if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
 302                 trace_rcu_grace_period(TPS("rcu_preempt"),
 303                                        __this_cpu_read(rcu_data.gp_seq),
 304                                        TPS("cpuqs"));
 305                 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
 306                 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
 307                 WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
 308         }
 309 }
 310
 311 /*
 312  * We have entered the scheduler, and the current task might soon be
 313  * context-switched away from.  If this task is in an RCU read-side
 314  * critical section, we will no longer be able to rely on the CPU to
 315  * record that fact, so we enqueue the task on the blkd_tasks list.
 316  * The task will dequeue itself when it exits the outermost enclosing
 317  * RCU read-side critical section.  Therefore, the current grace period
 318  * cannot be permitted to complete until the blkd_tasks list entries
 319  * predating the current grace period drain, in other words, until
 320  * rnp->gp_tasks becomes NULL.
 321  *
 322  * Caller must disable interrupts.
 323  */
 324 void rcu_note_context_switch(bool preempt)
 325 {
 326         struct task_struct *t = current;
 327         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 328         struct rcu_node *rnp;
 329
 330         trace_rcu_utilization(TPS("Start context switch"));
 331         lockdep_assert_irqs_disabled();
 332         WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
 333         if (rcu_preempt_depth() > 0 &&
 334             !t->rcu_read_unlock_special.b.blocked) {
 335
 336                 /* Possibly blocking in an RCU read-side critical section. */
 337                 rnp = rdp->mynode;
 338                 raw_spin_lock_rcu_node(rnp);
 339                 t->rcu_read_unlock_special.b.blocked = true;
 340                 t->rcu_blocked_node = rnp;
 341
 342                 /*
 343                  * Verify the CPU's sanity, trace the preemption, and
 344                  * then queue the task as required based on the states
 345                  * of any ongoing and expedited grace periods.
 346                  */
 347                 WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
 348                 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
 349                 trace_rcu_preempt_task(rcu_state.name,
 350                                        t->pid,
 351                                        (rnp->qsmask & rdp->grpmask)
 352                                        ? rnp->gp_seq
 353                                        : rcu_seq_snap(&rnp->gp_seq));
 354                 rcu_preempt_ctxt_queue(rnp, rdp);
 355         } else {
 356                 rcu_preempt_deferred_qs(t);
 357         }
 358
 359         /*
 360          * Either we were not in an RCU read-side critical section to
 361          * begin with, or we have now recorded that critical section
 362          * globally.  Either way, we can now note a quiescent state
 363          * for this CPU.  Again, if we were in an RCU read-side critical
 364          * section, and if that critical section was blocking the current
 365          * grace period, then the fact that the task has been enqueued
 366          * means that we continue to block the current grace period.
 367          */
 368         rcu_qs();
 369         if (rdp->cpu_no_qs.b.exp)
 370                 rcu_report_exp_rdp(rdp);
 371         rcu_tasks_qs(current, preempt);
 372         trace_rcu_utilization(TPS("End context switch"));
 373 }
 374 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 375
 376 /*
 377  * Check for preempted RCU readers blocking the current grace period
 378  * for the specified rcu_node structure.  If the caller needs a reliable
 379  * answer, it must hold the rcu_node's ->lock.
 380  */
 381 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 382 {
 383         return READ_ONCE(rnp->gp_tasks) != NULL;
 384 }
 385
/*
 * Limit value for ->rcu_read_lock_nesting.  When CONFIG_PROVE_LOCKING
 * is enabled, __rcu_read_lock() and __rcu_read_unlock() warn if the
 * nesting depth exceeds this bound.
 */
#define RCU_NEST_PMAX (INT_MAX / 2)
 388
 389 static void rcu_preempt_read_enter(void)
 390 {
 391         WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
 392 }
 393
 394 static int rcu_preempt_read_exit(void)
 395 {
 396         int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
 397
 398         WRITE_ONCE(current->rcu_read_lock_nesting, ret);
 399         return ret;
 400 }
 401
/* Overwrite the current task's RCU read-side nesting depth with @val. */
static void rcu_preempt_depth_set(int val)
{
        WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
 406
 407 /*
 408  * Preemptible RCU implementation for rcu_read_lock().
 409  * Just increment ->rcu_read_lock_nesting, shared state will be updated
 410  * if we block.
 411  */
 412 void __rcu_read_lock(void)
 413 {
 414         rcu_preempt_read_enter();
 415         if (IS_ENABLED(CONFIG_PROVE_LOCKING))
 416                 WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
 417         if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
 418                 WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
 419         barrier();  /* critical section after entry code. */
 420 }
 421 EXPORT_SYMBOL_GPL(__rcu_read_lock);
 422
 423 /*
 424  * Preemptible RCU implementation for rcu_read_unlock().
 425  * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
 426  * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
 427  * invoke rcu_read_unlock_special() to clean up after a context switch
 428  * in an RCU read-side critical section and other special cases.
 429  */
 430 void __rcu_read_unlock(void)
 431 {
 432         struct task_struct *t = current;
 433
 434         barrier();  // critical section before exit code.
 435         if (rcu_preempt_read_exit() == 0) {
 436                 barrier();  // critical-section exit before .s check.
 437                 if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
 438                         rcu_read_unlock_special(t);
 439         }
 440         if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
 441                 int rrln = rcu_preempt_depth();
 442
 443                 WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
 444         }
 445 }
 446 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 447
 448 /*
 449  * Advance a ->blkd_tasks-list pointer to the next entry, instead
 450  * returning NULL if at the end of the list.
 451  */
 452 static struct list_head *rcu_next_node_entry(struct task_struct *t,
 453                                              struct rcu_node *rnp)
 454 {
 455         struct list_head *np;
 456
 457         np = t->rcu_node_entry.next;
 458         if (np == &rnp->blkd_tasks)
 459                 np = NULL;
 460         return np;
 461 }
 462
/*
 * Return true if the specified rcu_node structure has tasks that were
 * preempted within an RCU read-side critical section, that is, if its
 * ->blkd_tasks list is non-empty.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
        return !list_empty(&rnp->blkd_tasks);
}
 471
/*
 * Report deferred quiescent states.  The deferral time can
 * be quite short, for example, in the case of the call from
 * rcu_read_unlock_special().
 *
 * @t is the task whose ->rcu_read_unlock_special state is processed,
 * and @flags is the interrupt state saved by the caller, who must have
 * disabled interrupts.  Every exit path either restores @flags directly
 * or passes @flags to the helper that releases the rcu_node lock.
 */
static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
        bool empty_exp;
        bool empty_norm;
        bool empty_exp_now;
        struct list_head *np;
        bool drop_boost_mutex = false;
        struct rcu_data *rdp;
        struct rcu_node *rnp;
        union rcu_special special;

        /*
         * If RCU core is waiting for this CPU to exit its critical section,
         * report the fact that it has exited.  Because irqs are disabled,
         * t->rcu_read_unlock_special cannot change.
         */
        special = t->rcu_read_unlock_special;
        rdp = this_cpu_ptr(&rcu_data);
        /* Nothing special recorded and no expedited QS owed:  just leave. */
        if (!special.s && !rdp->cpu_no_qs.b.exp) {
                local_irq_restore(flags);
                return;
        }
        /* Clear the task's state; the snapshot in "special" drives the rest. */
        t->rcu_read_unlock_special.s = 0;
        if (special.b.need_qs) {
                if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
                        rdp->cpu_no_qs.b.norm = false;
                        rcu_report_qs_rdp(rdp);
                        udelay(rcu_unlock_delay);
                } else {
                        rcu_qs();
                }
        }

        /*
         * Respond to a request by an expedited grace period for a
         * quiescent state from this CPU.  Note that requests from
         * tasks are handled when removing the task from the
         * blocked-tasks list below.
         */
        if (rdp->cpu_no_qs.b.exp)
                rcu_report_exp_rdp(rdp);

        /* Clean up if blocked during RCU read-side critical section. */
        if (special.b.blocked) {

                /*
                 * Remove this task from the list it blocked on.  The task
                 * now remains queued on the rcu_node corresponding to the
                 * CPU it first blocked on, so there is no longer any need
                 * to loop.  Retain a WARN_ON_ONCE() out of sheer paranoia.
                 */
                rnp = t->rcu_blocked_node;
                raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
                WARN_ON_ONCE(rnp != t->rcu_blocked_node);
                WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
                /* Record the "before" state of both kinds of grace period. */
                empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
                WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
                             (!empty_norm || rnp->qsmask));
                empty_exp = sync_rcu_exp_done(rnp);
                smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
                /* Fetch the successor before unlinking this task's entry. */
                np = rcu_next_node_entry(t, rnp);
                list_del_init(&t->rcu_node_entry);
                t->rcu_blocked_node = NULL;
                trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
                                                rnp->gp_seq, t->pid);
                /* Advance any pointer that referenced the removed entry. */
                if (&t->rcu_node_entry == rnp->gp_tasks)
                        WRITE_ONCE(rnp->gp_tasks, np);
                if (&t->rcu_node_entry == rnp->exp_tasks)
                        WRITE_ONCE(rnp->exp_tasks, np);
                if (IS_ENABLED(CONFIG_RCU_BOOST)) {
                        /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
                        drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
                        if (&t->rcu_node_entry == rnp->boost_tasks)
                                WRITE_ONCE(rnp->boost_tasks, np);
                }

                /*
                 * If this was the last task on the current list, and if
                 * we aren't waiting on any CPUs, report the quiescent state.
                 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
                 * so we must take a snapshot of the expedited state.
                 */
                empty_exp_now = sync_rcu_exp_done(rnp);
                if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
                        trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
                                                         rnp->gp_seq,
                                                         0, rnp->qsmask,
                                                         rnp->level,
                                                         rnp->grplo,
                                                         rnp->grphi,
                                                         !!rnp->gp_tasks);
                        rcu_report_unblock_qs_rnp(rnp, flags);
                } else {
                        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                }

                /*
                 * If this was the last task on the expedited lists,
                 * then we need to report up the rcu_node hierarchy.
                 */
                if (!empty_exp && empty_exp_now)
                        rcu_report_exp_rnp(rnp, true);

                /* Unboost if we were boosted. */
                if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
                        rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
        } else {
                /* Task was never queued, so only the irq state needs restoring. */
                local_irq_restore(flags);
        }
}
 588
 589 /*
 590  * Is a deferred quiescent-state pending, and are we also not in
 591  * an RCU read-side critical section?  It is the caller's responsibility
 592  * to ensure it is otherwise safe to report any deferred quiescent
 593  * states.  The reason for this is that it is safe to report a
 594  * quiescent state during context switch even though preemption
 595  * is disabled.  This function cannot be expected to understand these
 596  * nuances, so the caller must handle them.
 597  */
 598 static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 599 {
 600         return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
 601                 READ_ONCE(t->rcu_read_unlock_special.s)) &&
 602                rcu_preempt_depth() == 0;
 603 }
 604
 605 /*
 606  * Report a deferred quiescent state if needed and safe to do so.
 607  * As with rcu_preempt_need_deferred_qs(), "safe" involves only
 608  * not being in an RCU read-side critical section.  The caller must
 609  * evaluate safety in terms of interrupt, softirq, and preemption
 610  * disabling.
 611  */
 612 notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 613 {
 614         unsigned long flags;
 615
 616         if (!rcu_preempt_need_deferred_qs(t))
 617                 return;
 618         local_irq_save(flags);
 619         rcu_preempt_deferred_qs_irqrestore(t, flags);
 620 }
 621
 622 /*
 623  * Minimal handler to give the scheduler a chance to re-evaluate.
 624  */
 625 static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
 626 {
 627         struct rcu_data *rdp;
 628
 629         rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
 630         rdp->defer_qs_iw_pending = false;
 631 }
 632
 633 /*
 634  * Handle special cases during rcu_read_unlock(), such as needing to
 635  * notify RCU core processing or task having blocked during the RCU
 636  * read-side critical section.
      *
      * @t is the task whose ->rcu_blocked_node is examined and which is
      * handed on to rcu_preempt_deferred_qs_irqrestore().
      *
      * Returns immediately when called from NMI context.  Otherwise, with
      * interrupts disabled locally:
      *  - If preemption, BH, or interrupts were already disabled on entry,
      *    the quiescent state is deferred: either RCU_SOFTIRQ is raised, or
      *    need-resched is set on the current task and CPU (and, in some
      *    cases, ->defer_qs_iw is queued on this CPU).  The saved interrupt
      *    state is then restored.
      *  - Otherwise rcu_preempt_deferred_qs_irqrestore() is invoked with the
      *    saved flags.
 637  */
 638 static void rcu_read_unlock_special(struct task_struct *t)
 639 {
 640         unsigned long flags;
 641         bool irqs_were_disabled;
             /* Sampled before anything below can change the preempt count. */
 642         bool preempt_bh_were_disabled =
 643                         !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
 644
 645         /* NMI handlers cannot block and cannot safely manipulate state. */
 646         if (in_nmi())
 647                 return;
 648
 649         local_irq_save(flags);
             /* flags holds the state from before the save, i.e. the caller's. */
 650         irqs_were_disabled = irqs_disabled_flags(flags);
 651         if (preempt_bh_were_disabled || irqs_were_disabled) {
 652                 bool expboost; // Expedited GP in flight or possible boosting.
 653                 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 654                 struct rcu_node *rnp = rdp->mynode;
 655
                     // expboost is set if any of the following holds:
                     //  1. The node this task blocked on has ->exp_tasks set.
                     //  2. This CPU's ->grpmask bit is set in rnp->expmask.
                     //  3. RCU_STRICT_GRACE_PERIOD is enabled and either this
                     //     CPU's bit is set in rnp->qsmask or the task has a
                     //     blocked node.
                     //  4. RCU_BOOST is enabled, interrupts were disabled on
                     //     entry, and the task has a blocked node.
 656                 expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
 657                            (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
 658                            (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
 659                            ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
 660                            (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
 661                             t->rcu_blocked_node);
 662                 // Need to defer quiescent state until everything is enabled.
 663                 if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
 664                         // Using softirq, safe to awaken, and either the
 665                         // wakeup is free or there is either an expedited
 666                         // GP in flight or a potential need to deboost.
 667                         raise_softirq_irqoff(RCU_SOFTIRQ);
 668                 } else {
 669                         // Enabling BH or preempt does reschedule, so...
 670                         // Also if no expediting and no possible deboosting,
 671                         // slow is OK.  Plus nohz_full CPUs eventually get
 672                         // tick enabled.
 673                         set_tsk_need_resched(current);
 674                         set_preempt_need_resched();
                             // Queue the irq_work only if none is already
                             // pending for this CPU and the CPU is online.
 675                         if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
 676                             expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
 677                                 // Get scheduler to re-evaluate and call hooks.
 678                                 // If !IRQ_WORK, FQS scan will eventually IPI.
 679                                 if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
 680                                     IS_ENABLED(CONFIG_PREEMPT_RT))
 681                                         rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
 682                                                                 rcu_preempt_deferred_qs_handler);
 683                                 else
 684                                         init_irq_work(&rdp->defer_qs_iw,
 685                                                       rcu_preempt_deferred_qs_handler);
                                     // Mark pending before queueing; the
                                     // handler clears this flag when it runs.
 686                                 rdp->defer_qs_iw_pending = true;
 687                                 irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
 688                         }
 689                 }
 690                 local_irq_restore(flags);
 691                 return;
 692         }
             // Nothing was disabled on entry.  The saved flags are passed to
             // the callee; no local_irq_restore() follows here, so the callee
             // presumably restores them (NOTE(review): confirm).
 693         rcu_preempt_deferred_qs_irqrestore(t, flags);
 694 }
 695
 696 /*
 697  * Check that the list of blocked tasks for the newly completed grace
 698  * period is in fact empty.  It is a serious bug to complete a grace
 699  * period that still has RCU readers blocked!  This function must be
 700  * invoked -before- updating this rnp's ->gp_seq.
 701  *
 702  * Also, if there are blocked tasks on the list, they automatically
 703  * block the newly created grace period, so set up ->gp_tasks accordingly.
      *
      * The caller must hold @rnp's lock and must have preemption disabled;
      * both conditions are checked through lockdep.  A nonzero ->qsmask
      * triggers a one-time warning.
 704  */
 705 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 706 {
 707         struct task_struct *t;
 708
 709         RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
 710         raw_lockdep_assert_held_rcu_node(rnp);
             /* Readers still blocking the old GP: warn once, dump up to 10. */
 711         if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
 712                 dump_blkd_tasks(rnp, 10);
 713         if (rcu_preempt_has_tasks(rnp) &&
 714             (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
                     /* Start ->gp_tasks at the head of the ->blkd_tasks list. */
 715                 WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
                     /* t is needed only for the pid reported by the tracepoint. */
 716                 t = container_of(rnp->gp_tasks, struct task_struct,
 717                                  rcu_node_entry);
 718                 trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
 719                                                 rnp->gp_seq, t->pid);
 720         }
 721         WARN_ON_ONCE(rnp->qsmask);
 722 }
 723
 724 /*
 725  * Check for a quiescent state from the current CPU, including voluntary
 726  * context switches for Tasks RCU.  When a task blocks, the task is
 727  * recorded in the corresponding CPU's rcu_node structure, which is checked
 728  * elsewhere, hence this function need only check for quiescent states
 729  * related to the current CPU, not to those related to tasks.
      *
      * Must be called with interrupts disabled (asserted via lockdep).
      * @user is not used by this variant of the function.
 730  */
 731 static void rcu_flavor_sched_clock_irq(int user)
 732 {
 733         struct task_struct *t = current;
 734
 735         lockdep_assert_irqs_disabled();
             /* In a reader, or with preemption or BH disabled: no QS now. */
 736         if (rcu_preempt_depth() > 0 ||
 737             (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
 738                 /* No QS, force context switch if deferred. */
 739                 if (rcu_preempt_need_deferred_qs(t)) {
 740                         set_tsk_need_resched(t);
 741                         set_preempt_need_resched();
 742                 }
 743         } else if (rcu_preempt_need_deferred_qs(t)) {
 744                 rcu_preempt_deferred_qs(t); /* Report deferred QS. */
 745                 return;
 746         } else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
 747                 rcu_qs(); /* Report immediate QS. */
 748                 return;
 749         }
 750
 751         /* If GP is oldish, ask for help from rcu_read_unlock_special(). */
             /* "Oldish" here means more than HZ jiffies past ->gp_start. */
 752         if (rcu_preempt_depth() > 0 &&
 753             __this_cpu_read(rcu_data.core_needs_qs) &&
 754             __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
 755             !t->rcu_read_unlock_special.b.need_qs &&
 756             time_after(jiffies, rcu_state.gp_start + HZ))
 757                 t->rcu_read_unlock_special.b.need_qs = true;
 758 }
 759
 760 /*
 761  * Check for a task exiting while in a preemptible-RCU read-side
 762  * critical section, clean up if so.  No need to issue warnings, as
 763  * debug_check_no_locks_held() already does this if lockdep is enabled.
 764  * Besides, if this function does anything other than just immediately
 765  * return, there was a bug of some sort.  Spewing warnings from this
 766  * function is like as not to simply obscure important prior warnings.
      *
      * Three cases, all acting on the current task:
      *  - Still queued on ->rcu_node_entry: force the nesting depth to 1 and
      *    set the ->rcu_read_unlock_special.b.blocked flag.
      *  - Not queued but nesting depth nonzero: force the depth to 1.
      *  - Neither: return without doing anything.
      * In the first two cases a single __rcu_read_unlock() follows, and then
      * rcu_preempt_deferred_qs() is invoked.
 767  */
 768 void exit_rcu(void)
 769 {
 770         struct task_struct *t = current;
 771
 772         if (unlikely(!list_empty(&current->rcu_node_entry))) {
 773                 rcu_preempt_depth_set(1);
                     /* Compiler barrier between the depth and flag updates. */
 774                 barrier();
 775                 WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
 776         } else if (unlikely(rcu_preempt_depth())) {
 777                 rcu_preempt_depth_set(1);
 778         } else {
 779                 return;
 780         }
 781         __rcu_read_unlock();
 782         rcu_preempt_deferred_qs(current);
 783 }
 784
 785 /*
 786  * Dump the blocked-tasks state, but limit the list dump to the
 787  * specified number of elements.
      *
      * @rnp:    node whose state is printed; its lock must be held by the
      *          caller (asserted via lockdep).
      * @ncheck: the ->blkd_tasks walk stops once this many entries have been
      *          printed.  The test happens after printing, so the first
      *          entry of a non-empty list is always printed.
      *
      * Output goes through pr_info()/pr_cont() and covers: @rnp's range,
      * level and sequence numbers; the ->qsmask* fields of @rnp and each of
      * its ancestors; the ->gp_tasks, ->boost_tasks and ->exp_tasks pointers;
      * the list entries; and one line per CPU in ->grplo..->grphi.
 788  */
 789 static void
 790 dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 791 {
 792         int cpu;
 793         int i;
 794         struct list_head *lhp;
 795         struct rcu_data *rdp;
 796         struct rcu_node *rnp1;
 797
 798         raw_lockdep_assert_held_rcu_node(rnp);
 799         pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
 800                 __func__, rnp->grplo, rnp->grphi, rnp->level,
 801                 (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
             /* Walk from @rnp up through ->parent until it is NULL. */
 802         for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
 803                 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
 804                         __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
 805         pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
 806                 __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks),
 807                 READ_ONCE(rnp->exp_tasks));
             /* No newline yet: the list entries are appended with pr_cont(). */
 808         pr_info("%s: ->blkd_tasks", __func__);
 809         i = 0;
 810         list_for_each(lhp, &rnp->blkd_tasks) {
 811                 pr_cont(" %p", lhp);
 812                 if (++i >= ncheck)
 813                         break;
 814         }
 815         pr_cont("\n");
 816         for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
 817                 rdp = per_cpu_ptr(&rcu_data, cpu);
                     /* ".o"[x] prints 'o' when the CPU is online, else '.'. */
 818                 pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
 819                         cpu, ".o"[rcu_rdp_cpu_online(rdp)],
 820                         (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
 821                         (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
 822         }
 823 }
 824
 825 #else /* #ifdef CONFIG_PREEMPT_RCU */
 826
 827 /*
 828  * If strict grace periods are enabled, and if the calling
 829  * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
 830  * report that quiescent state and, if requested, spin for a bit.
      *
      * Does nothing if interrupts are disabled, if preempt_count() is
      * nonzero, or if rcu_state.gp_kthread is not yet set.  Otherwise clears
      * this CPU's ->cpu_no_qs.b.norm, calls rcu_report_qs_rdp(), and then
      * delays for rcu_unlock_delay via udelay().
 831  */
 832 void rcu_read_unlock_strict(void)
 833 {
 834         struct rcu_data *rdp;
 835
 836         if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
 837                 return;
 838         rdp = this_cpu_ptr(&rcu_data);
 839         rdp->cpu_no_qs.b.norm = false;
 840         rcu_report_qs_rdp(rdp);
 841         udelay(rcu_unlock_delay);
 842 }
 843 EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
 844
 845 /*
 846  * Tell them what RCU they are running.
      *
      * Prints the implementation banner and then calls
      * rcu_bootup_announce_oddness() to report out-of-the-ordinary
      * configuration.  Marked __init.
 847  */
 848 static void __init rcu_bootup_announce(void)
 849 {
 850         pr_info("Hierarchical RCU implementation.\n");
 851         rcu_bootup_announce_oddness();
 852 }
 853
 854 /*
 855  * Note a quiescent state for PREEMPTION=n.  Because we do not need to know
 856  * how many quiescent states passed, just if there was at least one since
 857  * the start of the grace period, this just sets a flag.  The caller must
 858  * have disabled preemption.
      *
      * Returns early when this CPU's ->cpu_no_qs.s is zero.  Otherwise emits
      * the "cpuqs" tracepoint, clears ->cpu_no_qs.b.norm, and, if
      * ->cpu_no_qs.b.exp is set, calls rcu_report_exp_rdp() for this CPU.
 859  */
 860 static void rcu_qs(void)
 861 {
 862         RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
             /* NOTE(review): .s presumably overlays .b.norm and .b.exp. */
 863         if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
 864                 return;
 865         trace_rcu_grace_period(TPS("rcu_sched"),
 866                                __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
 867         __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
 868         if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
 869                 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
 870 }
 871
 872 /*
 873  * Register an urgently needed quiescent state.  If there is an
 874  * emergency, invoke rcu_momentary_eqs() to do a heavy-weight
 875  * dyntick-idle quiescent state visible to other CPUs, which will in
 876  * some cases serve for expedited as well as normal grace periods.
 877  * Either way, register a lightweight quiescent state.
      *
      * Does nothing unless this CPU's ->rcu_urgent_qs flag is set.  The flag
      * is first checked without protection, then rechecked with preemption
      * disabled; if still set it is cleared before any quiescent state is
      * reported.
 878  */
 879 void rcu_all_qs(void)
 880 {
 881         unsigned long flags;
 882
             /* Cheap check made before disabling preemption. */
 883         if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
 884                 return;
 885         preempt_disable();  // For CONFIG_PREEMPT_COUNT=y kernels
 886         /* Load rcu_urgent_qs before other flags. */
 887         if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
 888                 preempt_enable();
 889                 return;
 890         }
 891         this_cpu_write(rcu_data.rcu_urgent_qs, false);
 892         if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
                     /* rcu_momentary_eqs() is run with interrupts disabled. */
 893                 local_irq_save(flags);
 894                 rcu_momentary_eqs();
 895                 local_irq_restore(flags);
 896         }
 897         rcu_qs();
 898         preempt_enable();
 899 }
 900 EXPORT_SYMBOL_GPL(rcu_all_qs);
 901
 902 /*
 903  * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
      *
      * @preempt is passed through unchanged to rcu_tasks_qs().
      *
      * Always calls rcu_qs().  If this CPU's ->rcu_urgent_qs flag is set, the
      * flag is cleared and, when ->rcu_need_heavy_qs is also set,
      * rcu_momentary_eqs() is invoked.  Start and end are bracketed by
      * rcu_utilization tracepoints.
 904  */
 905 void rcu_note_context_switch(bool preempt)
 906 {
 907         trace_rcu_utilization(TPS("Start context switch"));
 908         rcu_qs();
 909         /* Load rcu_urgent_qs before other flags. */
 910         if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
 911                 goto out;
 912         this_cpu_write(rcu_data.rcu_urgent_qs, false);
 913         if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
 914                 rcu_momentary_eqs();
 915 out:
 916         rcu_tasks_qs(current, preempt);
 917         trace_rcu_utilization(TPS("End context switch"));
 918 }
 919 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 920
 921 /*
 922  * Because preemptible RCU does not exist, there are never any preempted
 923  * RCU readers.
      *
      * @rnp is ignored; the result is always 0.
 924  */
 925 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 926 {
 927         return 0;
 928 }
 929
 930 /*
 931  * Because there is no preemptible RCU, there can be no readers blocked.
      *
      * @rnp is ignored; the result is always false.
 932  */
 933 static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
 934 {
 935         return false;
 936 }
 937
 938 /*
 939  * Because there is no preemptible RCU, there can be no deferred quiescent
 940  * states.
      *
      * @t is ignored; the result is always false.
 941  */
 942 static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 943 {
 944         return false;
 945 }
 946
 947 // Except that we do need to respond to a request by an expedited
 948 // grace period for a quiescent state from this CPU.  Note that in
 949 // non-preemptible kernels, there can be no context switches within RCU
 950 // read-side critical sections, which in turn means that the leaf rcu_node
 951 // structure's blocked-tasks list is always empty.  There is therefore no need to
 952 // actually check it.  Instead, a quiescent state from this CPU suffices,
 953 // and this function is only called from such a quiescent state.
 954 notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 955 {
             /* @t is unused here; only this CPU's rcu_data is consulted. */
 956         struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 957
             /* Report only if an expedited QS is still owed by this CPU. */
 958         if (READ_ONCE(rdp->cpu_no_qs.b.exp))
 959                 rcu_report_exp_rdp(rdp);
 960 }
 961
 962 /*
 963  * Because there is no preemptible RCU, there can be no readers blocked,
 964  * so there is no need to check for blocked tasks.  So check only for
 965  * bogus qsmask values.
      *
      * Warns once if @rnp->qsmask is nonzero; does nothing else.
 966  */
 967 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 968 {
 969         WARN_ON_ONCE(rnp->qsmask);
 970 }
 971
 972 /*
 973  * Check to see if this CPU is in a non-context-switch quiescent state,
 974  * namely user mode and idle loop.
      *
      * @user: nonzero if the interrupt was taken from user mode.
      *
      * Calls rcu_qs() when @user is nonzero or when
      * rcu_is_cpu_rrupt_from_idle() returns true; otherwise does nothing.
 975  */
 976 static void rcu_flavor_sched_clock_irq(int user)
 977 {
 978         if (user || rcu_is_cpu_rrupt_from_idle()) {
 979
 980                 /*
 981                  * Get here if this CPU took its interrupt from user
 982                  * mode or from the idle loop, and if this is not a
 983                  * nested interrupt.  In this case, the CPU is in
 984                  * a quiescent state, so note it.
 985                  *
 986                  * No memory barrier is required here because rcu_qs()
 987                  * references only CPU-local variables that other CPUs
 988                  * neither access nor modify, at least not while the
 989                  * corresponding CPU is online.
 990                  */
 991                 rcu_qs();
 992         }
 993 }
 994
 995 /*
 996  * Because preemptible RCU does not exist, tasks cannot possibly exit
 997  * while in preemptible RCU read-side critical sections.
      *
      * This variant is therefore intentionally empty.
 998  */
 999 void exit_rcu(void)
1000 {
1001 }
1002
1003 /*
1004  * Dump the guaranteed-empty blocked-tasks state.  Trust but verify.
      *
      * Warns once if @rnp->blkd_tasks is not empty.  @ncheck is unused in
      * this variant.
1005  */
1006 static void
1007 dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
1008 {
1009         WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
1010 }
1011
1012 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1013
1014 /*
1015  * If boosting, set rcuc kthreads to realtime priority.
      *
      * @cpu selects the rcu_data whose ->rcuc_activity is stamped with the
      * current jiffies; this happens whether or not CONFIG_RCU_BOOST is set.
      * With CONFIG_RCU_BOOST, the calling task (current) is additionally
      * switched to SCHED_FIFO at priority kthread_prio.
1016  */
1017 static void rcu_cpu_kthread_setup(unsigned int cpu)
1018 {
1019         struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
1020 #ifdef CONFIG_RCU_BOOST
1021         struct sched_param sp;
1022
1023         sp.sched_priority = kthread_prio;
1024         sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1025 #endif /* #ifdef CONFIG_RCU_BOOST */
1026
1027         WRITE_ONCE(rdp->rcuc_activity, jiffies);
1028 }
1029
1030 static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
1031 {
             /*
              * With CONFIG_RCU_NOCB_CPU, report whether the current task is
              * @rdp's ->nocb_cb_kthread.  Without it, always false.
              */
1032 #ifdef CONFIG_RCU_NOCB_CPU
1033         return rdp->nocb_cb_kthread == current;
1034 #else
1035         return false;
1036 #endif
1037 }
1038
1039 /*
1040  * Is the current CPU running the RCU-callbacks kthread?
1041  * Caller must have preemption disabled.
      *
      * True when the current task is either @rdp's ->rcu_cpu_kthread_task or,
      * per rcu_is_callbacks_nocb_kthread(), its no-CBs callback kthread.
1042  */
1043 static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
1044 {
1045         return rdp->rcu_cpu_kthread_task == current ||
1046                         rcu_is_callbacks_nocb_kthread(rdp);
1047 }
1048
1049 #ifdef CONFIG_RCU_BOOST
1050
1051 /*
1052  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1053  * or ->boost_tasks, advancing the pointer to the next task in the
1054  * ->blkd_tasks list.
1055  *
1056  * Note that irqs must be enabled: boosting the task can block.
1057  * Returns 1 if there are more tasks needing to be boosted.
      * Returns 0 if both ->exp_tasks and ->boost_tasks are NULL, either on
      * the initial lockless check or on the recheck under @rnp's lock.
1058  */
1059 static int rcu_boost(struct rcu_node *rnp)
1060 {
1061         unsigned long flags;
1062         struct task_struct *t;
1063         struct list_head *tb;
1064
             /* Lockless early exit; rechecked below once the lock is held. */
1065         if (READ_ONCE(rnp->exp_tasks) == NULL &&
1066             READ_ONCE(rnp->boost_tasks) == NULL)
1067                 return 0;  /* Nothing left to boost. */
1068
1069         raw_spin_lock_irqsave_rcu_node(rnp, flags);
1070
1071         /*
1072          * Recheck under the lock: all tasks in need of boosting
1073          * might exit their RCU read-side critical sections on their own.
1074          */
1075         if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1076                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1077                 return 0;
1078         }
1079
1080         /*
1081          * Preferentially boost tasks blocking expedited grace periods.
1082          * This cannot starve the normal grace periods because a second
1083          * expedited grace period must boost all blocked tasks, including
1084          * those blocking the pre-existing normal grace period.
1085          */
1086         if (rnp->exp_tasks != NULL)
1087                 tb = rnp->exp_tasks;
1088         else
1089                 tb = rnp->boost_tasks;
1090
1091         /*
1092          * We boost task t by manufacturing an rt_mutex that appears to
1093          * be held by task t.  We leave a pointer to that rt_mutex where
1094          * task t can find it, and task t will release the mutex when it
1095          * exits its outermost RCU read-side critical section.  Then
1096          * simply acquiring this artificial rt_mutex will boost task
1097          * t's priority.  (Thanks to tglx for suggesting this approach!)
1098          *
1099          * Note that task t must acquire rnp->lock to remove itself from
1100          * the ->blkd_tasks list, which it will do from exit() if from
1101          * nowhere else.  We therefore are guaranteed that task t will
1102          * stay around at least until we drop rnp->lock.  Note that
1103          * rnp->lock also resolves races between our priority boosting
1104          * and task t's exiting its outermost RCU read-side critical
1105          * section.
1106          */
1107         t = container_of(tb, struct task_struct, rcu_node_entry);
1108         rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
1109         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1110         /* Lock only for side effect: boosts task t's priority. */
1111         rt_mutex_lock(&rnp->boost_mtx);
1112         rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
             /* Counted after rnp->lock has been dropped. */
1113         rnp->n_boosts++;
1114
1115         return READ_ONCE(rnp->exp_tasks) != NULL ||
1116                READ_ONCE(rnp->boost_tasks) != NULL;
1117 }
1118
1119 /*
1120  * Priority-boosting kthread, one per leaf rcu_node.
      *
      * @arg is the struct rcu_node this kthread serves.
      *
      * Loops forever: waits until ->boost_tasks or ->exp_tasks is non-NULL,
      * calls rcu_boost(), and records its phase in ->boost_kthread_status.
      * After more than 10 consecutive passes in which rcu_boost() reported
      * more work, it sleeps via schedule_timeout_idle(2) and resets the
      * counter.  The statements after the loop are never reached.
1121  */
1122 static int rcu_boost_kthread(void *arg)
1123 {
1124         struct rcu_node *rnp = (struct rcu_node *)arg;
1125         int spincnt = 0;
1126         int more2boost;
1127
1128         trace_rcu_utilization(TPS("Start boost kthread@init"));
1129         for (;;) {
1130                 WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
1131                 trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1132                 rcu_wait(READ_ONCE(rnp->boost_tasks) ||
1133                          READ_ONCE(rnp->exp_tasks));
1134                 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1135                 WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
1136                 more2boost = rcu_boost(rnp);
                     /* spincnt counts consecutive passes with work left over. */
1137                 if (more2boost)
1138                         spincnt++;
1139                 else
1140                         spincnt = 0;
1141                 if (spincnt > 10) {
1142                         WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
1143                         trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1144                         schedule_timeout_idle(2);
1145                         trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1146                         spincnt = 0;
1147                 }
1148         }
1149         /* NOTREACHED */
1150         trace_rcu_utilization(TPS("End boost kthread@notreached"));
1151         return 0;
1152 }
1153
1154 /*
1155  * Check to see if it is time to start boosting RCU readers that are
1156  * blocking the current grace period, and, if so, tell the per-rcu_node
1157  * kthread to start boosting them.  If there is an expedited grace
1158  * period in progress, it is always time to boost.
1159  *
1160  * The caller must hold rnp->lock, which this function releases.
1161  * The ->boost_kthread_task is immortal, so we don't need to worry
1162  * about it going away.
      *
      * @flags is the saved interrupt state used when releasing rnp->lock.
      * Every path through this function releases the lock exactly once.
1163  */
1164 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1165         __releases(rnp->lock)
1166 {
1167         raw_lockdep_assert_held_rcu_node(rnp);
             /* No kthread yet, or nothing blocking either kind of GP. */
1168         if (!rnp->boost_kthread_task ||
1169             (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
1170                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1171                 return;
1172         }
             /*
              * Boost if ->exp_tasks is set.  Otherwise boost only if
              * ->gp_tasks is set, boosting is not already under way
              * (->boost_tasks is NULL), ->qsmask is zero, and one of these
              * holds: ->boost_time has been reached, rcu_state.cbovld is
              * set, or RCU_STRICT_GRACE_PERIOD is enabled.
              */
1173         if (rnp->exp_tasks != NULL ||
1174             (rnp->gp_tasks != NULL &&
1175              rnp->boost_tasks == NULL &&
1176              rnp->qsmask == 0 &&
1177              (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
1178               IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
1179                 if (rnp->exp_tasks == NULL)
1180                         WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
1181                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                     /* The wakeup is issued after the lock has been dropped. */
1182                 rcu_wake_cond(rnp->boost_kthread_task,
1183                               READ_ONCE(rnp->boost_kthread_status));
1184         } else {
1185                 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1186         }
1187 }
1188
1189 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1190
1191 /*
1192  * Do priority-boost accounting for the start of a new grace period.
      *
      * Sets @rnp->boost_time to the current jiffies plus
      * RCU_BOOST_DELAY_JIFFIES.  That macro scales CONFIG_RCU_BOOST_DELAY by
      * HZ / 1000, rounding up, which treats the config value as milliseconds.
1193  */
1194 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1195 {
1196         rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1197 }
1198
1199 /*
1200  * Create an RCU-boost kthread for the specified node if one does not
1201  * already exist.  We only create this kthread for preemptible RCU.
      *
      * Returns without doing anything if @rnp->boost_kthread_task is already
      * set.  If kthread_create() fails, warns once and returns, leaving
      * ->boost_kthread_task unset.  On success the new task is published
      * under @rnp's lock, switched to SCHED_FIFO at kthread_prio, given its
      * affinity via rcu_thread_affine_rnp(), and then woken.
1202  */
1203 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1204 {
1205         unsigned long flags;
             /* Offset of @rnp from the root node; used in the thread name. */
1206         int rnp_index = rnp - rcu_get_root();
1207         struct sched_param sp;
1208         struct task_struct *t;
1209
1210         if (rnp->boost_kthread_task)
1211                 return;
1212
1213         t = kthread_create(rcu_boost_kthread, (void *)rnp,
1214                            "rcub/%d", rnp_index);
1215         if (WARN_ON_ONCE(IS_ERR(t)))
1216                 return;
1217
1218         raw_spin_lock_irqsave_rcu_node(rnp, flags);
1219         rnp->boost_kthread_task = t;
1220         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1221
1222         sp.sched_priority = kthread_prio;
1223         sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1224         rcu_thread_affine_rnp(t, rnp);
1225         wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1226 }
1227
1228 #else /* #ifdef CONFIG_RCU_BOOST */
1229
1230 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1231         __releases(rnp->lock)
1232 {
             /*
              * !CONFIG_RCU_BOOST variant: there is nothing to boost, so
              * just release rnp->lock and restore @flags.
              */
1233         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1234 }
1235
1236 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1237 {
             /* !CONFIG_RCU_BOOST variant: no boost accounting to do. */
1238 }
1239
1240 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1241 {
             /* !CONFIG_RCU_BOOST variant: no boost kthread is created. */
1242 }
1243
1244 #endif /* #else #ifdef CONFIG_RCU_BOOST */
1245
1246 /*
1247  * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
1248  * grace-period kthread will do force_quiescent_state() processing?
1249  * The idea is to avoid waking up RCU core processing on such a
1250  * CPU unless the grace period has extended for too long.
1251  *
1252  * This code relies on the fact that all NO_HZ_FULL CPUs are also
1253  * RCU_NOCB_CPU CPUs.
      *
      * Returns true only when CONFIG_NO_HZ_FULL is set, the current CPU is a
      * nohz_full CPU, and either no grace period is in progress or fewer than
      * HZ jiffies have elapsed since rcu_state.gp_start.  Without
      * CONFIG_NO_HZ_FULL the result is always false.
1254  */
1255 static bool rcu_nohz_full_cpu(void)
1256 {
1257 #ifdef CONFIG_NO_HZ_FULL
1258         if (tick_nohz_full_cpu(smp_processor_id()) &&
1259             (!rcu_gp_in_progress() ||
1260              time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ)))
1261                 return true;
1262 #endif /* #ifdef CONFIG_NO_HZ_FULL */
1263         return false;
1264 }
1265
1266 /*
1267  * Bind the RCU grace-period kthreads to the housekeeping CPU.
      *
      * Acts on the calling task (current).  Does nothing unless
      * tick_nohz_full_enabled() returns true; otherwise applies
      * housekeeping_affine() with HK_TYPE_RCU.
1268  */
1269 static void rcu_bind_gp_kthread(void)
1270 {
1271         if (!tick_nohz_full_enabled())
1272                 return;
1273         housekeeping_affine(current, HK_TYPE_RCU);
1274 }