// SPDX-License-Identifier: GPL-2.0+
/*
- ------ * Read-Copy Update mechanism for mutual exclusion
+ ++++++ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
*
* Copyright IBM Corporation, 2008
*
*
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
++++ +++static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
{
int old;
int new;
+ ++++++ int new_old;
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ ++++++ new_old = atomic_read(&rdp->dynticks);
do {
- ------ old = atomic_read(&rdp->dynticks);
+ ++++++ old = new_old;
if (old & RCU_DYNTICK_CTRL_CTR)
return false;
new = old | RCU_DYNTICK_CTRL_MASK;
- ------ } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old);
+ ++++++ new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
+ ++++++ } while (new_old != old);
return true;
}
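/*
 * A side note on the reworked loop above, as a minimal sketch of the idiom
 * (compute() stands in for whatever new value is being derived):
 * atomic_cmpxchg() returns the value that was actually present, so a failed
 * exchange already supplies the fresh value for the next pass and no extra
 * atomic_read() is needed inside the loop.
 *
 *	new_old = atomic_read(&v);
 *	do {
 *		old = new_old;
 *		new = compute(old);
 *		new_old = atomic_cmpxchg(&v, old, new);
 *	} while (new_old != old);
 */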
static long qhimark = DEFAULT_RCU_QHIMARK;
#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
static long qlowmark = DEFAULT_RCU_QLOMARK;
++++ +++#define DEFAULT_RCU_QOVLD_MULT 2
++++ +++#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
++++ +++static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
++++ +++static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
++++ +++module_param(qovld, long, 0444);
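/*
 * A usage sketch for the new qovld knob (assuming the "rcutree." prefix that
 * this file's module parameters are documented under): booting with
 *
 *	rcutree.qovld=-1
 *
 * keeps the wildcard value, so rcu_init() later computes
 *
 *	qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;	// twice qhimark
 *
 * while rcutree.qovld=0 leaves qovld_calc at zero, which makes
 * check_cb_ovld_locked() return early and disables overload checking.
 */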
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
incby = 1;
} else if (tick_nohz_full_cpu(rdp->cpu) &&
rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
- ------ READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
+ ++++++ READ_ONCE(rdp->rcu_urgent_qs) &&
+ ++++++ !READ_ONCE(rdp->rcu_forced_tick)) {
raw_spin_lock_rcu_node(rdp->mynode);
// Recheck under lock.
if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
- ------ rdp->rcu_forced_tick = true;
+ ++++++ WRITE_ONCE(rdp->rcu_forced_tick, true);
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
raw_spin_unlock_rcu_node(rdp->mynode);
WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
- ------ rdp->rcu_forced_tick = false;
+ ++++++ WRITE_ONCE(rdp->rcu_forced_tick, false);
}
}
rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
---- --- time_after(jiffies, rcu_state.jiffies_resched))) {
++++ +++ time_after(jiffies, rcu_state.jiffies_resched) ||
++++ +++ rcu_state.cbovld)) {
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
* So hit them over the head with the resched_cpu() hammer!
*/
if (tick_nohz_full_cpu(rdp->cpu) &&
---- --- time_after(jiffies,
---- --- READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
++++ +++ (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
++++ +++ rcu_state.cbovld)) {
WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long gp_seq_req, const char *s)
{
- ------ trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req,
- ------ rnp->level, rnp->grplo, rnp->grphi, s);
+ ++++++ trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ ++++++ gp_seq_req, rnp->level,
+ ++++++ rnp->grplo, rnp->grphi, s);
}
/*
TPS("Prestarted"));
goto unlock_out;
}
- ------ rnp->gp_seq_needed = gp_seq_req;
+ ++++++ WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
/*
* We just marked the leaf or internal node, and a
}
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
- ------ rcu_state.gp_req_activity = jiffies;
- ------ if (!rcu_state.gp_kthread) {
+ ++++++ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ ++++++ if (!READ_ONCE(rcu_state.gp_kthread)) {
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
goto unlock_out;
}
- ------ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq"));
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
ret = true; /* Caller must wake GP kthread. */
unlock_out:
/* Push furthest requested GP to leaf node and rcu_data structure. */
if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
- ------ rnp_start->gp_seq_needed = rnp->gp_seq_needed;
- ------ rdp->gp_seq_needed = rnp->gp_seq_needed;
+ ++++++ WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
+ ++++++ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
}
if (rnp != rnp_start)
raw_spin_unlock_rcu_node(rnp);
}
/*
- ------ * Awaken the grace-period kthread. Don't do a self-awaken (unless in
- ------ * an interrupt or softirq handler), and don't bother awakening when there
- ------ * is nothing for the grace-period kthread to do (as in several CPUs raced
- ------ * to awaken, and we lost), and finally don't try to awaken a kthread that
- ------ * has not yet been created. If all those checks are passed, track some
- ------ * debug information and awaken.
+ ++++++ * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
+ ++++++ * interrupt or softirq handler, in which case we just might immediately
+ ++++++ * sleep upon return, resulting in a grace-period hang), and don't bother
+ ++++++ * awakening when there is nothing for the grace-period kthread to do
+ ++++++ * (as in several CPUs raced to awaken, and we lost), and finally don't try
+ ++++++ * to awaken a kthread that has not yet been created. If all those checks
+ ++++++ * are passed, track some debug information and awaken.
*
* So why do the self-wakeup when in an interrupt or softirq handler
* in the grace-period kthread's context? Because the kthread might have
*/
static void rcu_gp_kthread_wake(void)
{
- ------ if ((current == rcu_state.gp_kthread &&
- ------ !in_irq() && !in_serving_softirq()) ||
- ------ !READ_ONCE(rcu_state.gp_flags) ||
- ------ !rcu_state.gp_kthread)
+ ++++++ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
+ ++++++
+ ++++++ if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ ++++++ !READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
- ------ if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
+ ++++++ if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
(void)rcu_segcblist_accelerate(&rdp->cblist, c);
return;
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
---- --- bool need_gp;
++++ +++ bool need_qs;
const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
rcu_segcblist_is_offloaded(&rdp->cblist);
unlikely(READ_ONCE(rdp->gpwrap))) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
++++ +++ rdp->core_needs_qs = false;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
if (!offloaded)
ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
++++ +++ if (rdp->core_needs_qs)
++++ +++ rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
}
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
* go looking for one.
*/
trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
---- --- need_gp = !!(rnp->qsmask & rdp->grpmask);
---- --- rdp->cpu_no_qs.b.norm = need_gp;
---- --- rdp->core_needs_qs = need_gp;
++++ +++ need_qs = !!(rnp->qsmask & rdp->grpmask);
++++ +++ rdp->cpu_no_qs.b.norm = need_qs;
++++ +++ rdp->core_needs_qs = need_qs;
zero_cpu_stall_ticks(rdp);
}
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
- ------ rdp->gp_seq_needed = rnp->gp_seq_needed;
+ ++++++ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
- ------ trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
rcu_state.gp_state = RCU_GP_WAIT_FQS;
ret = swait_event_idle_timeout_exclusive(
/* If time for quiescent-state forcing, do it. */
if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
(gf & RCU_GP_FLAG_FQS)) {
- ------ trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsstart"));
rcu_gp_fqs(first_gp_fqs);
first_gp_fqs = false;
- ------ trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsend"));
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- ------ trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswaitsig"));
ret = 1; /* Keep old FQS timing. */
j = jiffies;
*/
static void rcu_gp_cleanup(void)
{
---- --- unsigned long gp_duration;
++++ +++ int cpu;
bool needgp = false;
++++ +++ unsigned long gp_duration;
unsigned long new_gp_seq;
bool offloaded;
struct rcu_data *rdp;
needgp = __note_gp_changes(rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
needgp = rcu_future_gp_cleanup(rnp) || needgp;
++++ +++ // Reset overload indication for CPUs no longer overloaded
++++ +++ if (rcu_is_leaf_node(rnp))
++++ +++ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
++++ +++ rdp = per_cpu_ptr(&rcu_data, cpu);
++++ +++ check_cb_ovld_locked(rdp, rnp);
++++ +++ }
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
rcu_segcblist_is_offloaded(&rdp->cblist);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
- ------ rcu_state.gp_req_activity = jiffies;
+ ++++++ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ rcu_state.gp_seq,
TPS("newreq"));
} else {
WRITE_ONCE(rcu_state.gp_flags,
/* Handle grace-period start. */
for (;;) {
- ------ trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
rcu_state.gp_state = RCU_GP_WAIT_GPS;
swait_event_idle_exclusive(rcu_state.gp_wq,
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- ------ trace_rcu_grace_period(rcu_state.name,
- ------ READ_ONCE(rcu_state.gp_seq),
+ ++++++ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwaitsig"));
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
rcu_preempt_blocked_readers_cgp(rnp));
- ------ rnp->qsmask &= ~mask;
+ ++++++ WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
mask, rnp->qsmask, rnp->level,
rnp->grplo, rnp->grphi,
rnp_c = rnp;
rnp = rnp->parent;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ------ oldmask = rnp_c->qsmask;
+ ++++++ oldmask = READ_ONCE(rnp_c->qsmask);
}
/*
return;
}
mask = rdp->grpmask;
++++ +++ if (rdp->cpu == smp_processor_id())
++++ +++ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
return 0;
blkd = !!(rnp->qsmask & rdp->grpmask);
- ------ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
+ ++++++ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
return 0;
}
struct rcu_data *rdp;
struct rcu_node *rnp;
++++ +++ rcu_state.cbovld = rcu_state.cbovldnext;
++++ +++ rcu_state.cbovldnext = false;
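// Here ->cbovld becomes the overload summary accumulated during the
// previous scan, while ->cbovldnext is re-accumulated below (one |= per
// leaf rcu_node) so that the next pass sees a fresh summary.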
rcu_for_each_leaf_node(rnp) {
cond_resched_tasks_rcu_qs();
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
++++ +++ rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
if (!IS_ENABLED(CONFIG_PREEMPT_RCU) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
}
/*
---- --- * Helper function for call_rcu() and friends. The cpu argument will
---- --- * normally be -1, indicating "currently running CPU". It may specify
---- --- * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier()
---- --- * is expected to specify a CPU.
++++ +++ * Check and if necessary update the leaf rcu_node structure's
++++ +++ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
++++ +++ * number of queued RCU callbacks. The caller must hold the leaf rcu_node
++++ +++ * structure's ->lock.
+ */
++++ +++static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp)
++++ +++{
++++ +++ raw_lockdep_assert_held_rcu_node(rnp);
++++ +++ if (qovld_calc <= 0)
++++ +++ return; // Early boot and wildcard value set.
++++ +++ if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc)
++++ +++ WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask);
++++ +++ else
++++ +++ WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
++++ +++}
++++ +++
++++ +++/*
++++ +++ * Check and if necessary update the leaf rcu_node structure's
++++ +++ * ->cbovldmask bit corresponding to the current CPU based on that CPU's
++++ +++ * number of queued RCU callbacks. No locks need be held, but the
++++ +++ * caller must have disabled interrupts.
++++ +++ *
++++ +++ * Note that this function ignores the possibility that there are a lot
++++ +++ * of callbacks all of which have already seen the end of their respective
++++ +++ * grace periods. This omission is due to the need for no-CBs CPUs to
++++ +++ * be holding ->nocb_lock to do this check, which is too heavy for a
++++ +++ * common-case operation.
+ ++ +++ */
++++ +++static void check_cb_ovld(struct rcu_data *rdp)
++++ +++{
++++ +++ struct rcu_node *const rnp = rdp->mynode;
++++ +++
++++ +++ if (qovld_calc <= 0 ||
++++ +++ ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) ==
++++ +++ !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
++++ +++ return; // Early boot wildcard value or already set correctly.
++++ +++ raw_spin_lock_rcu_node(rnp);
++++ +++ check_cb_ovld_locked(rdp, rnp);
++++ +++ raw_spin_unlock_rcu_node(rnp);
++++ +++}
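/*
 * The function above is an instance of the usual lockless-precheck pattern:
 * figure out the desired state of the ->cbovldmask bit without the lock and
 * acquire ->lock only when that bit actually needs to change.  A minimal
 * sketch of the shape:
 *
 *	bool want = rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc;
 *	bool have = !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask);
 *
 *	if (want == have)
 *		return;				// nothing to change, lock avoided
 *	raw_spin_lock_rcu_node(rnp);
 *	check_cb_ovld_locked(rdp, rnp);		// recheck and update under ->lock
 *	raw_spin_unlock_rcu_node(rnp);
 */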
++++ +++
++++ +++/* Helper function for call_rcu() and friends. */
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
rcu_segcblist_init(&rdp->cblist);
}
++++ +++ check_cb_ovld(rdp);
if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
return; // Enqueued onto ->nocb_bypass, so just leave.
---- --- /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
++++ +++ // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
#define KFREE_DRAIN_JIFFIES (HZ / 50)
#define KFREE_N_BATCHES 2
++ +++++/*
++ +++++ * This macro defines how many entries the "records" array
++ +++++ * will contain. It is chosen so that the size of the
++ +++++ * kfree_rcu_bulk_data structure is exactly one page.
++ +++++ */
++ +++++#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
++ +++++
++ +++++/**
++ +++++ * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
++ +++++ * @nr_records: Number of active pointers in the array
++ +++++ * @records: Array of the kfree_rcu() pointers
++ +++++ * @next: Next bulk object in the block chain
++ +++++ * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
++ +++++ */
++ +++++struct kfree_rcu_bulk_data {
++ +++++ unsigned long nr_records;
++ +++++ void *records[KFREE_BULK_MAX_ENTR];
++ +++++ struct kfree_rcu_bulk_data *next;
++ +++++ struct rcu_head *head_free_debug;
++ +++++};
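/*
 * Worked example of the sizing above, assuming a 64-bit kernel with 4 KiB
 * pages: the three non-array fields (nr_records, next, head_free_debug)
 * account for the "- 3", so
 *
 *	KFREE_BULK_MAX_ENTR = PAGE_SIZE / sizeof(void *) - 3
 *	                    = 4096 / 8 - 3 = 509
 *
 * and sizeof(struct kfree_rcu_bulk_data) = (509 + 3) * 8 = 4096 bytes, which
 * is exactly what the WARN_ON_ONCE() in kfree_call_rcu_add_ptr_to_bulk()
 * checks against PAGE_SIZE.
 */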
++ +++++
/**
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
++ +++++ * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
++ +++++ struct kfree_rcu_bulk_data *bhead_free;
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
++ +++++ * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
++ +++++ * @bcached: Keeps at most one block cached for later reuse when building chains of blocks
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
*/
struct kfree_rcu_cpu {
struct rcu_head *head;
++ +++++ struct kfree_rcu_bulk_data *bhead;
++ +++++ struct kfree_rcu_bulk_data *bcached;
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
spinlock_t lock;
struct delayed_work monitor_work;
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
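/*
 * A rough per-CPU data-flow sketch tying the structures above together:
 *
 *	kfree_call_rcu()
 *	  -> krcp->bhead (bulk blocks) or krcp->head (emergency singles)
 *	       -> queue_kfree_rcu_work(): detach onto a free channel of one
 *	          krw_arr[] batch (->bhead_free / ->head_free), queue_rcu_work()
 *	            -> kfree_rcu_work() after a grace period:
 *	               kfree_bulk() the blocks, kfree() the singles.
 */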
++ +++++static __always_inline void
++ +++++debug_rcu_head_unqueue_bulk(struct rcu_head *head)
++ +++++{
++ +++++#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
++ +++++ for (; head; head = head->next)
++ +++++ debug_rcu_head_unqueue(head);
++ +++++#endif
++ +++++}
++ +++++
/*
* This function is invoked in workqueue context after a grace period.
-- ----- * It frees all the objects queued on ->head_free.
++ +++++ * It frees all the objects queued on ->bhead_free or ->head_free.
*/
static void kfree_rcu_work(struct work_struct *work)
{
unsigned long flags;
struct rcu_head *head, *next;
++ +++++ struct kfree_rcu_bulk_data *bhead, *bnext;
struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
spin_lock_irqsave(&krcp->lock, flags);
head = krwp->head_free;
krwp->head_free = NULL;
++ +++++ bhead = krwp->bhead_free;
++ +++++ krwp->bhead_free = NULL;
spin_unlock_irqrestore(&krcp->lock, flags);
-- ----- // List "head" is now private, so traverse locklessly.
++ +++++ /* "bhead" is now private, so traverse locklessly. */
++ +++++ for (; bhead; bhead = bnext) {
++ +++++ bnext = bhead->next;
++ +++++
++ +++++ debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
++ +++++
++ +++++ rcu_lock_acquire(&rcu_callback_map);
++ +++++ trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
++ +++++ bhead->nr_records, bhead->records);
++ +++++
++ +++++ kfree_bulk(bhead->nr_records, bhead->records);
++ +++++ rcu_lock_release(&rcu_callback_map);
++ +++++
++ +++++ if (cmpxchg(&krcp->bcached, NULL, bhead))
++ +++++ free_page((unsigned long) bhead);
++ +++++
++ +++++ cond_resched_tasks_rcu_qs();
++ +++++ }
++ +++++
++ +++++ /*
++ +++++ * This is the emergency case only. It can happen under low-memory
++ +++++ * conditions, when an allocation fails and so the "bulk" path
++ +++++ * temporarily cannot be used.
++ +++++ */
for (; head; head = next) {
unsigned long offset = (unsigned long)head->func;
next = head->next;
-- ----- // Potentially optimize with kfree_bulk in future.
debug_rcu_head_unqueue(head);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
-- ----- if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
-- ----- /* Could be optimized with kfree_bulk() in future. */
++ +++++ if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
kfree((void *)head - offset);
-- ----- }
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
*/
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
++ +++++ struct kfree_rcu_cpu_work *krwp;
++ +++++ bool queued = false;
int i;
-- ----- struct kfree_rcu_cpu_work *krwp = NULL;
lockdep_assert_held(&krcp->lock);
-- ----- for (i = 0; i < KFREE_N_BATCHES; i++)
-- ----- if (!krcp->krw_arr[i].head_free) {
-- ----- krwp = &(krcp->krw_arr[i]);
-- ----- break;
-- ----- }
-- ----- // If a previous RCU batch is in progress, we cannot immediately
-- ----- // queue another one, so return false to tell caller to retry.
-- ----- if (!krwp)
-- ----- return false;
++ +++++ for (i = 0; i < KFREE_N_BATCHES; i++) {
++ +++++ krwp = &(krcp->krw_arr[i]);
-- ----- krwp->head_free = krcp->head;
-- ----- krcp->head = NULL;
-- ----- INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
-- ----- queue_rcu_work(system_wq, &krwp->rcu_work);
-- ----- return true;
++ +++++ /*
++ +++++ * Try to detach bhead or head and attach it to the corresponding
++ +++++ * free channel, if that channel is available. A previous RCU
++ +++++ * batch may still be in progress, in which case another one
++ +++++ * cannot be queued immediately, so false is returned to tell
++ +++++ * the caller to retry.
++ +++++ */
++ +++++ if ((krcp->bhead && !krwp->bhead_free) ||
++ +++++ (krcp->head && !krwp->head_free)) {
++ +++++ /* Channel 1. */
++ +++++ if (!krwp->bhead_free) {
++ +++++ krwp->bhead_free = krcp->bhead;
++ +++++ krcp->bhead = NULL;
++ +++++ }
++ +++++
++ +++++ /* Channel 2. */
++ +++++ if (!krwp->head_free) {
++ +++++ krwp->head_free = krcp->head;
++ +++++ krcp->head = NULL;
++ +++++ }
++ +++++
++ +++++ /*
++ +++++ * There is one work item per batch, so each batch provides two
++ +++++ * "free channels" that it can handle: "bhead_free" and "head_free".
++ +++++ * The work item may already be pending when the two channels are
++ +++++ * detached one after the other.
++ +++++ */
++ +++++ queue_rcu_work(system_wq, &krwp->rcu_work);
++ +++++ queued = true;
++ +++++ }
++ +++++ }
++ +++++
++ +++++ return queued;
}
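/*
 * With KFREE_N_BATCHES == 2, the loop above lets one batch sit waiting for
 * its grace period while the other accepts the next detach.  Rough timeline:
 *
 *	t0: batch 0 takes ->bhead/->head and its work is queued  (in flight)
 *	t1: further kfree_rcu() requests accumulate on krcp->bhead
 *	t2: batch 1's channels are free, so the next drain detaches onto it
 *	t3: only if every batch's matching channel is still busy (or there is
 *	    nothing to detach) does "queued" stay false, making the caller retry.
 */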
static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
spin_unlock_irqrestore(&krcp->lock, flags);
}
++ +++++static inline bool
++ +++++kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
++ +++++ struct rcu_head *head, rcu_callback_t func)
++ +++++{
++ +++++ struct kfree_rcu_bulk_data *bnode;
++ +++++
++ +++++ if (unlikely(!krcp->initialized))
++ +++++ return false;
++ +++++
++ +++++ lockdep_assert_held(&krcp->lock);
++ +++++
++ +++++ /* Check if a new block is required. */
++ +++++ if (!krcp->bhead ||
++ +++++ krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
++ +++++ bnode = xchg(&krcp->bcached, NULL);
++ +++++ if (!bnode) {
++ +++++ WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
++ +++++
++ +++++ bnode = (struct kfree_rcu_bulk_data *)
++ +++++ __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
++ +++++ }
++ +++++
++ +++++ /* Switch to emergency path. */
++ +++++ if (unlikely(!bnode))
++ +++++ return false;
++ +++++
++ +++++ /* Initialize the new block. */
++ +++++ bnode->nr_records = 0;
++ +++++ bnode->next = krcp->bhead;
++ +++++ bnode->head_free_debug = NULL;
++ +++++
++ +++++ /* Attach it to the head. */
++ +++++ krcp->bhead = bnode;
++ +++++ }
++ +++++
++ +++++#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
++ +++++ head->func = func;
++ +++++ head->next = krcp->bhead->head_free_debug;
++ +++++ krcp->bhead->head_free_debug = head;
++ +++++#endif
++ +++++
++ +++++ /* Finally insert. */
++ +++++ krcp->bhead->records[krcp->bhead->nr_records++] =
++ +++++ (void *) head - (unsigned long) func;
++ +++++
++ +++++ return true;
++ +++++}
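/*
 * The ->bcached field used above is a single-slot page cache shared between
 * the freeing and allocating paths, built from two lock-free operations:
 *
 *	producer (kfree_rcu_work):
 *		if (cmpxchg(&krcp->bcached, NULL, bhead))	// slot taken?
 *			free_page((unsigned long) bhead);	// yes, really free
 *
 *	consumer (kfree_call_rcu_add_ptr_to_bulk):
 *		bnode = xchg(&krcp->bcached, NULL);		// grab it, if any
 *
 * At most one block is retained per CPU, bounding the memory held back while
 * still avoiding __get_free_page() in the common case.
 */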
++ +++++
/*
-- ----- * Queue a request for lazy invocation of kfree() after a grace period.
++ +++++ * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
++ +++++ * period. Note that two paths are maintained: the main one uses the
++ +++++ * kfree_bulk() interface, and the emergency one is used only when the main
++ +++++ * path temporarily cannot be used due to memory pressure.
*
* Each kfree_call_rcu() request is added to a batch. The batch will be drained
-- ----- * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
-- ----- * will be kfree'd in workqueue context. This allows us to:
-- ----- *
-- ----- * 1. Batch requests together to reduce the number of grace periods during
-- ----- * heavy kfree_rcu() load.
-- ----- *
-- ----- * 2. It makes it possible to use kfree_bulk() on a large number of
-- ----- * kfree_rcu() requests thus reducing cache misses and the per-object
-- ----- * overhead of kfree().
++ +++++ * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch
++ +++++ * will be freed in workqueue context. This allows us to batch requests
++ +++++ * together to reduce the number of grace periods during heavy kfree_rcu()
++ +++++ * load.
*/
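/*
 * Caller-side sketch (hypothetical struct and field names): kfree_call_rcu()
 * below is normally reached through the kfree_rcu() wrapper, which encodes
 * the offset of the rcu_head within the enclosing object as the "func"
 * argument; __is_kfree_rcu_offset() relies on that encoding.
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rh;
 *	};
 *
 *	static void release_foo(struct foo *p)
 *	{
 *		kfree_rcu(p, rh);	// frees *p after a grace period
 *	}
 */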
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__func__, head);
goto unlock_return;
}
-- ----- head->func = func;
-- ----- head->next = krcp->head;
-- ----- krcp->head = head;
++ +++++
++ +++++ /*
++ +++++ * Under high memory pressure GFP_NOWAIT can fail, in which
++ +++++ * case the emergency path is used instead.
++ +++++ */
++ +++++ if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
++ +++++ head->func = func;
++ +++++ head->next = krcp->head;
++ +++++ krcp->head = head;
++ +++++ }
// Set timer to drain after KFREE_DRAIN_JIFFIES.
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
/*
* RCU callback function for rcu_barrier(). If we are last, wake
* up the task executing rcu_barrier().
+ ++++++ *
+ ++++++ * Note that the value of rcu_state.barrier_sequence must be captured
+ ++++++ * before the atomic_dec_and_test(). Otherwise, if this CPU is not last,
+ ++++++ * other CPUs might count the value down to zero before this CPU gets
+ ++++++ * around to invoking rcu_barrier_trace(), which might result in bogus
+ ++++++ * data from the next instance of rcu_barrier().
*/
static void rcu_barrier_callback(struct rcu_head *rhp)
{
+ ++++++ unsigned long __maybe_unused s = rcu_state.barrier_sequence;
+ ++++++
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
- ------ rcu_barrier_trace(TPS("LastCB"), -1,
- ------ rcu_state.barrier_sequence);
+ ++++++ rcu_barrier_trace(TPS("LastCB"), -1, s);
complete(&rcu_state.barrier_completion);
} else {
- ------ rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence);
+ ++++++ rcu_barrier_trace(TPS("CB"), -1, s);
}
}
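/*
 * A concrete interleaving (hypothetical CPU numbering) showing why the early
 * snapshot of ->barrier_sequence matters:
 *
 *	CPU 0: s = rcu_state.barrier_sequence;		// snapshot taken
 *	CPU 0: atomic_dec_and_test() -> false		// not the last CPU
 *	CPU 1: atomic_dec_and_test() -> true, complete()	// rcu_barrier() returns
 *	CPU 2: starts the next rcu_barrier(), advancing ->barrier_sequence
 *	CPU 0: rcu_barrier_trace(TPS("CB"), -1, s);	// still traces the old value
 *
 * Without the snapshot, CPU 0's trace could report the next instance's
 * sequence number.
 */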
/*
* Called with preemption disabled, and from cross-cpu IRQ context.
*/
- ------static void rcu_barrier_func(void *unused)
+ ++++++static void rcu_barrier_func(void *cpu_in)
{
- ------ struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
+ ++++++ uintptr_t cpu = (uintptr_t)cpu_in;
+ ++++++ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
*/
void rcu_barrier(void)
{
- ------ int cpu;
+ ++++++ uintptr_t cpu;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
- ------ * Initialize the count to one rather than to zero in order to
- ------ * avoid a too-soon return to zero in case of a short grace period
- ------ * (or preemption of this task). Exclude CPU-hotplug operations
- ------ * to ensure that no offline CPU has callbacks queued.
+ ++++++ * Initialize the count to two rather than to zero in order
+ ++++++ * to avoid a too-soon return to zero in case of an immediate
+ ++++++ * invocation of the just-enqueued callback (or preemption of
+ ++++++ * this task). Exclude CPU-hotplug operations to ensure that no
+ ++++++ * offline non-offloaded CPU has callbacks queued.
*/
init_completion(&rcu_state.barrier_completion);
- ------ atomic_set(&rcu_state.barrier_cpu_count, 1);
+ ++++++ atomic_set(&rcu_state.barrier_cpu_count, 2);
get_online_cpus();
/*
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- ------ if (!cpu_online(cpu) &&
+ ++++++ if (cpu_is_offline(cpu) &&
!rcu_segcblist_is_offloaded(&rdp->cblist))
continue;
- ------ if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+ ++++++ if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
rcu_state.barrier_sequence);
- ------ smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
+ ++++++ smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
+ ++++++ } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
+ ++++++ cpu_is_offline(cpu)) {
+ ++++++ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
+ ++++++ rcu_state.barrier_sequence);
+ ++++++ local_irq_disable();
+ ++++++ rcu_barrier_func((void *)cpu);
+ ++++++ local_irq_enable();
+ ++++++ } else if (cpu_is_offline(cpu)) {
+ ++++++ rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
+ ++++++ rcu_state.barrier_sequence);
} else {
rcu_barrier_trace(TPS("OnlineNQ"), cpu,
rcu_state.barrier_sequence);
* Now that we have an rcu_barrier_callback() callback on each
* CPU, and thus each counted, remove the initial count.
*/
- ------ if (atomic_dec_and_test(&rcu_state.barrier_cpu_count))
+ ++++++ if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count))
complete(&rcu_state.barrier_completion);
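/*
 * Bookkeeping sketch for ->barrier_cpu_count (with N the number of CPUs that
 * end up with an rcu_barrier_callback() queued; the enqueue paths, not shown
 * here, increment the count once per queued callback):
 *
 *	atomic_set(..., 2);		// guard against premature zero
 *	N x atomic_inc(...)		// one per entrained callback
 *	atomic_sub_and_test(2, ...)	// drop the guard (true only if N == 0)
 *	N x atomic_dec_and_test(...)	// callbacks; the last one complete()s
 *
 * The count can reach zero only after the guard of two is removed and every
 * queued callback has run, so rcu_barrier() cannot return early.
 */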
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rdp->beenonline = true; /* We have now been online. */
- ------ rdp->gp_seq = rnp->gp_seq;
- ------ rdp->gp_seq_needed = rnp->gp_seq;
+ ++++++ rdp->gp_seq = READ_ONCE(rnp->gp_seq);
+ ++++++ rdp->gp_seq_needed = rdp->gp_seq;
rdp->cpu_no_qs.b.norm = true;
rdp->core_needs_qs = false;
rdp->rcu_iw_pending = false;
- ------ rdp->rcu_iw_gp_seq = rnp->gp_seq - 1;
+ ++++++ rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
rnp = rdp->mynode;
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ------ rnp->qsmaskinitnext |= mask;
+ ++++++ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
oldmask ^= rnp->expmaskinitnext;
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
- ------ rnp->qsmaskinitnext &= ~mask;
+ ++++++ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
raw_spin_unlock(&rcu_state.ofl_lock);
}
rnp = rcu_get_root();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ------ rcu_state.gp_kthread = t;
+ ++++++ WRITE_ONCE(rcu_state.gp_activity, jiffies);
+ ++++++ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ ++++++ // Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
+ ++++++ smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
rcu_spawn_nocb_kthreads();
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
spin_lock_init(&krcp->lock);
-- ----- for (i = 0; i < KFREE_N_BATCHES; i++)
++ +++++ for (i = 0; i < KFREE_N_BATCHES; i++) {
++ +++++ INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
++ +++++ }
++ +++++
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
krcp->initialized = true;
}
rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_par_gp_wq);
srcu_init();
++++ +++
++++ +++ /* Fill in default value for rcutree.qovld boot parameter. */
++++ +++ /* -After- the rcu_node ->lock fields are initialized! */
++++ +++ if (qovld < 0)
++++ +++ qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
++++ +++ else
++++ +++ qovld_calc = qovld;
}
#include "tree_stall.h"