insecure, please do not use on production kernels.
debug_locks_verbose=
- [KNL] verbose self-tests
- Format=<0|1>
+ [KNL] verbose locking self-tests
+ Format: <int>
Print debugging info while doing the locking API
self-tests.
- We default to 0 (no extra messages), setting it to
- 1 will print _a lot_ more information - normally
- only useful to kernel developers.
+ Bitmask for the various LOCKTYPE_ tests. Defaults to 0
+ (no extra messages), setting it to -1 (all bits set)
+ will print _a lot_ more information - normally only
+ useful to lockdep developers.
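For example, booting with debug_locks_verbose=-1 enables
verbose output for all of the LOCKTYPE_ self-tests, while
the default of 0 keeps the self-tests quiet.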
debug_objects [KNL] Enable object debugging
For example, to override I2C bus2:
omap_mux=i2c2_scl.i2c2_scl=0x100,i2c2_sda.i2c2_sda=0x100
- oprofile.timer= [HW]
- Use timer interrupt instead of performance counters
-
- oprofile.cpu_type= Force an oprofile cpu type
- This might be useful if you have an older oprofile
- userland or if you want common events.
- Format: { arch_perfmon }
- arch_perfmon: [X86] Force use of architectural
- perfmon on Intel CPUs instead of the
- CPU specific event set.
- timer: [X86] Force use of architectural NMI
- timer mode (see also oprofile.timer
- for generic hr timer mode)
-
oops=panic Always panic on oopses. Default is to just kill the
process, but there is a small probability of
deadlocking the machine.
Format: {"off"}
Disable Hardware Transactional Memory
+ preempt= [KNL]
+ Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+ none - Limited to cond_resched() calls
+ voluntary - Limited to cond_resched() and might_sleep() calls
+ full - Any section that isn't explicitly preempt disabled
+ can be preempted anytime.
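For example, to boot with full preemption on a kernel
built with CONFIG_PREEMPT_DYNAMIC=y, add to the kernel
command line:
preempt=full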
+
print-fatal-signals=
[KNL] debug: print fatal signals
value, meaning that RCU_SOFTIRQ is used by default.
Specify rcutree.use_softirq=0 to use rcuc kthreads.
+ But note that CONFIG_PREEMPT_RT=y kernels disable
+ this kernel boot parameter, forcibly setting it
+ to zero.
+
rcutree.rcu_fanout_exact= [KNL]
Disable autobalancing of the rcu_node combining
tree. This is used by rcutorture, and might
Set wakeup interval for idle CPUs that have
RCU callbacks (RCU_FAST_NO_HZ=y).
- rcutree.rcu_idle_lazy_gp_delay= [KNL]
- Set wakeup interval for idle CPUs that have
- only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
- Lazy RCU callbacks are those which RCU can
- prove do nothing more than free memory.
-
rcutree.rcu_kick_kthreads= [KNL]
Cause the grace-period kthread to get an extra
wake_up() if it sleeps three times longer than
stress RCU, they don't participate in the actual
test, hence the "fake".
+ rcutorture.nocbs_nthreads= [KNL]
+ Set number of RCU callback-offload togglers.
+ Zero (the default) disables toggling.
+
+ rcutorture.nocbs_toggle= [KNL]
+ Set the delay in milliseconds between successive
+ callback-offload toggling attempts.
+
rcutorture.nreaders= [KNL]
Set number of RCU readers. The value -1 selects
N-1, where N is the number of CPUs. A value
only normal grace-period primitives. No effect
on CONFIG_TINY_RCU kernels.
+ But note that CONFIG_PREEMPT_RT=y kernels enable
+ this kernel boot parameter, forcibly setting
+ it to the value one, that is, converting any
+ post-boot attempt at an expedited RCU grace
+ period to instead use normal non-expedited
+ grace-period processing.
+
rcupdate.rcu_task_ipi_delay= [KNL]
Set time in jiffies during which RCU tasks will
avoid sending IPIs, starting with the beginning
refscale.verbose= [KNL]
Enable additional printk() statements.
+ refscale.verbose_batched= [KNL]
+ Batch the additional printk() statements. If zero
+ (the default) or negative, print everything. Otherwise,
+ print every Nth verbose statement, where N is the value
+ specified.
+
relax_domain_level=
[KNL, SMP] Set scheduler's default relax_domain_level.
See Documentation/admin-guide/cgroup-v1/cpusets.rst.
are running concurrently, especially on systems
with rotating-rust storage.
+ torture.verbose_sleep_frequency= [KNL]
+ Specifies how many verbose printk()s should be
+ emitted between each sleep. The default of zero
+ disables verbose-printk() sleeping.
+
+ torture.verbose_sleep_duration= [KNL]
+ Duration of each verbose-printk() sleep in jiffies.
+
tp720= [HW,PS2]
tpm_suspend_pcr=[HW,TPM]
config GENERIC_ENTRY
bool
-config OPROFILE
- tristate "OProfile system profiling"
- depends on PROFILING
- depends on HAVE_OPROFILE
- select RING_BUFFER
- select RING_BUFFER_ALLOW_SWAP
- help
- OProfile is a profiling system capable of profiling the
- whole system, include the kernel, kernel modules, libraries,
- and applications.
-
- If unsure, say N.
-
-config OPROFILE_EVENT_MULTIPLEX
- bool "OProfile multiplexing support (EXPERIMENTAL)"
- default n
- depends on OPROFILE && X86
- help
- The number of hardware counters is limited. The multiplexing
- feature enables OProfile to gather more events than counters
- are provided by the hardware. This is realized by switching
- between events at a user specified time interval.
-
- If unsure, say N.
-
-config HAVE_OPROFILE
- bool
-
-config OPROFILE_NMI_TIMER
- def_bool y
- depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !PPC64
-
config KPROBES
bool "Kprobes"
depends on MODULES
bool
depends on HAVE_STATIC_CALL
+ config HAVE_PREEMPT_DYNAMIC
+ bool
+ depends on HAVE_STATIC_CALL
+ depends on GENERIC_ENTRY
+ help
+ Select this if the architecture supports boot-time selection of
+ the preemption mode on top of static calls. It is strongly advised
+ to support inline static calls to avoid any overhead.
+
config ARCH_WANT_LD_ORPHAN_WARN
bool
help
If a 32-bit architecture requires 64-bit arguments to be split into
pairs of 32-bit arguments, select this option.
+config ARCH_HAS_ELFCORE_COMPAT
+ bool
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
#define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK))
#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+ max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
/*
* scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
/*
* Wake up the active spu_contexts.
- *
- * When the awakened processes see their "notify_active" flag is set,
- * they will call spu_switch_notify().
*/
for_each_online_node(node) {
struct spu *spu;
spu_switch_log_notify(spu, ctx, SWITCH_LOG_START, 0);
spu_restore(&ctx->csa, spu);
spu->timestamp = jiffies;
- spu_switch_notify(spu, ctx);
ctx->state = SPU_STATE_RUNNABLE;
spuctx_switch_state(ctx, SPU_UTIL_USER);
*/
atomic_dec_if_positive(&ctx->gang->aff_sched_count);
- spu_switch_notify(spu, NULL);
spu_unmap_mappings(ctx);
spu_save(&ctx->csa, spu);
spu_switch_log_notify(spu, ctx, SWITCH_LOG_STOP, 0);
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
+ select ARCH_HAS_ELFCORE_COMPAT
config FORCE_DYNAMIC_FTRACE
def_bool y
select HAVE_MOVE_PMD
select HAVE_MOVE_PUD
select HAVE_NMI
- select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_PREEMPT_DYNAMIC
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
config HPET_EMULATE_RTC
def_bool y
- depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+ depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
config APB_TIMER
def_bool y if X86_INTEL_MID
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_THERMAL_VECTOR
- def_bool y
- depends on X86_MCE_INTEL
-
source "arch/x86/events/Kconfig"
config X86_LEGACY_VM86
depends on X86_64
select ARCH_WANT_OLD_COMPAT_IPC
select BINFMT_ELF
- select COMPAT_BINFMT_ELF
select COMPAT_OLD_SIGACTION
help
Include code to run legacy 32-bit programs under a
#define THERMAL_TABLE(name)
#endif
+#ifdef CONFIG_DTPM
+#define DTPM_TABLE() \
+ . = ALIGN(8); \
+ __dtpm_table = .; \
+ KEEP(*(__dtpm_table)) \
+ __dtpm_table_end = .;
+#else
+#define DTPM_TABLE()
+#endif
+
#define KERNEL_DTB() \
STRUCT_ALIGN(); \
__dtb_start = .; \
. = ALIGN(8); \
__start_static_call_sites = .; \
KEEP(*(.static_call_sites)) \
- __stop_static_call_sites = .;
+ __stop_static_call_sites = .; \
+ __start_static_call_tramp_key = .; \
+ KEEP(*(.static_call_tramp_key)) \
+ __stop_static_call_tramp_key = .;
/*
* Allow architectures to handle ro_after_init data on their
ACPI_PROBE_TABLE(irqchip) \
ACPI_PROBE_TABLE(timer) \
THERMAL_TABLE(governor) \
+ DTPM_TABLE() \
EARLYCON_TABLE() \
LSM_TABLE() \
EARLY_LSM_TABLE() \
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
#define ulong2long(a) (*(long *)(&(a)))
+#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
+#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b)))
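+/*
+ * Illustrative check (values chosen for this example only) of the
+ * wraparound-safe ordering the USHORT_CMP_* helpers provide: once a
+ * 16-bit counter wraps, 0xfff0 still sorts before 0x0005, because
+ * 0xfff0 - 0x0005 == 0xffeb, which exceeds USHRT_MAX / 2.  Hence
+ * USHORT_CMP_LT(0xfff0, 0x0005) and USHORT_CMP_GE(0x0005, 0xfff0)
+ * both evaluate to true.
+ */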
/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
+int rcu_nocb_cpu_offload(int cpu);
+int rcu_nocb_cpu_deoffload(int cpu);
+ void rcu_nocb_flush_deferred_wakeup(void);
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_init_nohz(void) { }
+static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
+static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
+ static inline void rcu_nocb_flush_deferred_wakeup(void) { }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
/**
*/
#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
-/*
- * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
- */
-#define __kvfree_rcu(head, offset) \
- do { \
- BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
- kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
- } while (0)
-
/**
* kfree_rcu() - kfree an object after a grace period.
- * @ptr: pointer to kfree
- * @rhf: the name of the struct rcu_head within the type of @ptr.
+ * @ptr: pointer to kfree for both single- and double-argument invocations.
+ * @rhf: the name of the struct rcu_head within the type of @ptr,
+ * but only for double-argument invocations.
*
* Many rcu callbacks functions just call kfree() on the base structure.
* These functions are trivial, but their size adds up, and furthermore
* Because the functions are not allowed in the low-order 4096 bytes of
* kernel virtual memory, offsets up to 4095 bytes can be accommodated.
* If the offset is larger than 4095 bytes, a compile-time error will
- * be generated in __kvfree_rcu(). If this error is triggered, you can
+ * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
* either fall back to use of call_rcu() or rearrange the structure to
* position the rcu_head structure into the first 4096 bytes.
*
* The BUILD_BUG_ON check must not involve any function calls, hence the
* checks are done in macros here.
*/
-#define kfree_rcu(ptr, rhf) \
-do { \
- typeof (ptr) ___p = (ptr); \
- \
- if (___p) \
- __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
-} while (0)
+#define kfree_rcu kvfree_rcu
/**
* kvfree_rcu() - kvfree an object after a grace period.
kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
-#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
+#define kvfree_rcu_arg_2(ptr, rhf) \
+do { \
+ typeof (ptr) ___p = (ptr); \
+ \
+ if (___p) { \
+ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \
+ kvfree_call_rcu(&((___p)->rhf), (rcu_callback_t)(unsigned long) \
+ (offsetof(typeof(*(ptr)), rhf))); \
+ } \
+} while (0)
+
#define kvfree_rcu_arg_1(ptr) \
do { \
typeof(ptr) ___p = (ptr); \
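For reference, a minimal usage sketch of the two invocation forms this
hunk distinguishes; struct foo and free_foo_examples() are made-up names
for illustration, not part of the patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int a;
	struct rcu_head rh;
};

static void free_foo_examples(struct foo *fp, char *buf)
{
	/* Double-argument form: offsetof(struct foo, rh) is checked at build time. */
	kvfree_rcu(fp, rh);

	/*
	 * Single-argument form: no rcu_head needed, but it may block,
	 * so it must only be used from sleepable context.
	 */
	kvfree_rcu(buf);
}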
i.e. put less load on throttled CPUs than on non/less throttled ones.
This requires the architecture to implement
- arch_set_thermal_pressure() and arch_get_thermal_pressure().
+ arch_set_thermal_pressure() and arch_scale_thermal_pressure().
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
bool "Profiling support"
help
Say Y here to enable the extended profiling support mechanisms used
- by profilers such as OProfile.
+ by profilers.
#
# Place an empty function call at each tracepoint site. Can be
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
+#include <linux/buildid.h>
#include "internal.h"
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
+static atomic_t nr_build_id_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
groups->index = 0;
}
+ static inline struct cgroup *event_cgroup(const struct perf_event *event)
+ {
+ struct cgroup *cgroup = NULL;
+
+ #ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp)
+ cgroup = event->cgrp->css.cgroup;
+ #endif
+
+ return cgroup;
+ }
+
/*
* Compare function for event groups;
*
* Implements complex key that first sorts by CPU and then by virtual index
* which provides ordering when rotating groups for the same CPU.
*/
- static bool
- perf_event_groups_less(struct perf_event *left, struct perf_event *right)
+ static __always_inline int
+ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
+ const u64 left_group_index, const struct perf_event *right)
{
- if (left->cpu < right->cpu)
- return true;
- if (left->cpu > right->cpu)
- return false;
+ if (left_cpu < right->cpu)
+ return -1;
+ if (left_cpu > right->cpu)
+ return 1;
#ifdef CONFIG_CGROUP_PERF
- if (left->cgrp != right->cgrp) {
- if (!left->cgrp || !left->cgrp->css.cgroup) {
- /*
- * Left has no cgroup but right does, no cgroups come
- * first.
- */
- return true;
+ {
+ const struct cgroup *right_cgroup = event_cgroup(right);
+
+ if (left_cgroup != right_cgroup) {
+ if (!left_cgroup) {
+ /*
+ * Left has no cgroup but right does, no
+ * cgroups come first.
+ */
+ return -1;
+ }
+ if (!right_cgroup) {
+ /*
+ * Right has no cgroup but left does, no
+ * cgroups come first.
+ */
+ return 1;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
+ return -1;
+
+ return 1;
}
- if (!right->cgrp || !right->cgrp->css.cgroup) {
- /*
- * Right has no cgroup but left does, no cgroups come
- * first.
- */
- return false;
- }
- /* Two dissimilar cgroups, order by id. */
- if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
- return true;
-
- return false;
}
#endif
- if (left->group_index < right->group_index)
- return true;
- if (left->group_index > right->group_index)
- return false;
+ if (left_group_index < right->group_index)
+ return -1;
+ if (left_group_index > right->group_index)
+ return 1;
+
+ return 0;
+ }
- return false;
+ #define __node_2_pe(node) \
+ rb_entry((node), struct perf_event, group_node)
+
+ static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
+ {
+ struct perf_event *e = __node_2_pe(a);
+ return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
+ __node_2_pe(b)) < 0;
+ }
+
+ struct __group_key {
+ int cpu;
+ struct cgroup *cgroup;
+ };
+
+ static inline int __group_cmp(const void *key, const struct rb_node *node)
+ {
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
}
/*
perf_event_groups_insert(struct perf_event_groups *groups,
struct perf_event *event)
{
- struct perf_event *node_event;
- struct rb_node *parent;
- struct rb_node **node;
-
event->group_index = ++groups->index;
- node = &groups->tree.rb_node;
- parent = *node;
-
- while (*node) {
- parent = *node;
- node_event = container_of(*node, struct perf_event, group_node);
-
- if (perf_event_groups_less(event, node_event))
- node = &parent->rb_left;
- else
- node = &parent->rb_right;
- }
-
- rb_link_node(&event->group_node, parent, node);
- rb_insert_color(&event->group_node, &groups->tree);
+ rb_add(&event->group_node, &groups->tree, __group_less);
}
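As a side note, a minimal sketch of the rb_add() pattern used above,
assuming the rb_add() helper introduced by this series; demo_node,
demo_less() and demo_insert() are illustrative names only:

#include <linux/rbtree.h>
#include <linux/types.h>

struct demo_node {
	struct rb_node node;
	u64 key;
};

/* Return true when @a sorts strictly before @b. */
static bool demo_less(struct rb_node *a, const struct rb_node *b)
{
	return rb_entry(a, struct demo_node, node)->key <
	       rb_entry(b, struct demo_node, node)->key;
}

static void demo_insert(struct rb_root *root, struct demo_node *dn)
{
	/* rb_add() performs the descent, linking and rebalancing itself. */
	rb_add(&dn->node, root, demo_less);
}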
/*
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
struct cgroup *cgrp)
{
- struct perf_event *node_event = NULL, *match = NULL;
- struct rb_node *node = groups->tree.rb_node;
- #ifdef CONFIG_CGROUP_PERF
- u64 node_cgrp_id, cgrp_id = 0;
-
- if (cgrp)
- cgrp_id = cgrp->kn->id;
- #endif
-
- while (node) {
- node_event = container_of(node, struct perf_event, group_node);
-
- if (cpu < node_event->cpu) {
- node = node->rb_left;
- continue;
- }
- if (cpu > node_event->cpu) {
- node = node->rb_right;
- continue;
- }
- #ifdef CONFIG_CGROUP_PERF
- node_cgrp_id = 0;
- if (node_event->cgrp && node_event->cgrp->css.cgroup)
- node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = cpu,
+ .cgroup = cgrp,
+ };
+ struct rb_node *node;
- if (cgrp_id < node_cgrp_id) {
- node = node->rb_left;
- continue;
- }
- if (cgrp_id > node_cgrp_id) {
- node = node->rb_right;
- continue;
- }
- #endif
- match = node_event;
- node = node->rb_left;
- }
+ node = rb_find_first(&key, &groups->tree, __group_cmp);
+ if (node)
+ return __node_2_pe(node);
- return match;
+ return NULL;
}
/*
static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
- struct perf_event *next;
- #ifdef CONFIG_CGROUP_PERF
- u64 curr_cgrp_id = 0;
- u64 next_cgrp_id = 0;
- #endif
-
- next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next == NULL || next->cpu != event->cpu)
- return NULL;
-
- #ifdef CONFIG_CGROUP_PERF
- if (event->cgrp && event->cgrp->css.cgroup)
- curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+ struct __group_key key = {
+ .cpu = event->cpu,
+ .cgroup = event_cgroup(event),
+ };
+ struct rb_node *next;
- if (next->cgrp && next->cgrp->css.cgroup)
- next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+ next = rb_next_match(&key, &event->group_node, __group_cmp);
+ if (next)
+ return __node_2_pe(next);
- if (curr_cgrp_id != next_cgrp_id)
- return NULL;
- #endif
- return next;
+ return NULL;
}
/*
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_dec(&nr_build_id_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
u64 ino;
u64 ino_generation;
u32 prot, flags;
+ u8 build_id[BUILD_ID_SIZE_MAX];
+ u32 build_id_size;
struct {
struct perf_event_header header;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
u32 type = mmap_event->event_id.header.type;
+ bool use_build_id;
int ret;
if (!perf_event_mmap_match(event, data))
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
+ use_build_id = event->attr.build_id && mmap_event->build_id_size;
+
+ if (event->attr.mmap2 && use_build_id)
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
+
perf_output_put(&handle, mmap_event->event_id);
if (event->attr.mmap2) {
- perf_output_put(&handle, mmap_event->maj);
- perf_output_put(&handle, mmap_event->min);
- perf_output_put(&handle, mmap_event->ino);
- perf_output_put(&handle, mmap_event->ino_generation);
+ if (use_build_id) {
+ u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
+
+ __output_copy(&handle, size, 4);
+ __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
+ } else {
+ perf_output_put(&handle, mmap_event->maj);
+ perf_output_put(&handle, mmap_event->min);
+ perf_output_put(&handle, mmap_event->ino);
+ perf_output_put(&handle, mmap_event->ino_generation);
+ }
perf_output_put(&handle, mmap_event->prot);
perf_output_put(&handle, mmap_event->flags);
}
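/*
 * Editorial aside: the 4-byte size prefix plus the BUILD_ID_SIZE_MAX
 * (20-byte) buffer written above overlay exactly the 24 bytes otherwise
 * taken by maj/min/ino/ino_generation (u32 + u32 + u64 + u64), so the
 * PERF_RECORD_MMAP2 record keeps the same size in both variants.
 */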
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
+ if (atomic_read(&nr_build_id_events))
+ build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
+ if (event->attr.build_id)
+ atomic_inc(&nr_build_id_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
return 1;
}
+ #define __node_2_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, tree_entry)
+
+ static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+ return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b));
+ }
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &lock->waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&waiter->tree_entry, parent, link);
- rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost);
+ rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
}
static void
RB_CLEAR_NODE(&waiter->tree_entry);
}
+ #define __node_2_pi_waiter(node) \
+ rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+
+ static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+ {
+ return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+ }
+
static void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- struct rb_node **link = &task->pi_waiters.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct rt_mutex_waiter *entry;
- bool leftmost = true;
-
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
- if (rt_mutex_waiter_less(waiter, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
- }
- }
-
- rb_link_node(&waiter->pi_tree_entry, parent, link);
- rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost);
+ rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
}
static void
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * Futex variant, that since futex variants do not use the fast-path, can be
- * simple and will not need to retry.
+ * __rt_mutex_futex_unlock - Futex variant which, since futex variants
+ * do not use the fast-path, can be simple and will not need to retry.
+ *
+ * @lock: The rt_mutex to be unlocked
+ * @wake_q: The wake queue head from which to get the next lock waiter
*/
bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
EXPORT_SYMBOL_GPL(rt_mutex_destroy);
/**
- * __rt_mutex_init - initialize the rt lock
+ * __rt_mutex_init - initialize the rt_mutex
*
- * @lock: the rt lock to be initialized
+ * @lock: The rt_mutex to be initialized
+ * @name: The lock name used for debugging
+ * @key: The lock class key used for debugging
*
- * Initialize the rt lock to unlocked state.
+ * Initialize the rt_mutex to unlocked state.
*
- * Initializing of a locked rt lock is not allowed
+ * Initializing of a locked rt_mutex is not allowed
*/
void __rt_mutex_init(struct rt_mutex *lock, const char *name,
struct lock_class_key *key)
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+#ifdef CONFIG_RCU_NOCB_CPU
+ .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
+#endif
};
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = true;
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
rdp = this_cpu_ptr(&rcu_data);
- do_nocb_deferred_wakeup(rdp);
rcu_prepare_for_idle();
rcu_preempt_deferred_qs(current);
EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
+
+ #if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)
+ /*
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace/guest resume.
+ */
+ static void late_wakeup_func(struct irq_work *work)
+ {
+ }
+
+ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+ IRQ_WORK_INIT(late_wakeup_func);
+
+ /*
+ * If either:
+ *
+ * 1) the task is about to enter guest mode and $ARCH doesn't support KVM generic work, or
+ * 2) the task is about to enter user mode and $ARCH doesn't support generic entry,
+ *
+ * then late RCU wakeups aren't supported in the resched loops and our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled again.
+ */
+ static void noinstr rcu_irq_work_resched(void)
+ {
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
+ return;
+
+ if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ return;
+
+ instrumentation_begin();
+ if (do_nocb_deferred_wakeup(rdp) && need_resched())
+ irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+ instrumentation_end();
+ }
+
+ #else
+ static inline void rcu_irq_work_resched(void) { }
+ #endif
+
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
noinstr void rcu_user_enter(void)
{
lockdep_assert_irqs_disabled();
+
+ /*
+ * Other than the generic entry implementation, we may be past the last
+ * rescheduling opportunity in the entry code. Trigger a self-IPI
+ * that will fire and reschedule once we resume in user/guest mode.
+ */
+ rcu_irq_work_resched();
rcu_eqs_enter(true);
}
+
#endif /* CONFIG_NO_HZ_FULL */
/**
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
return ret;
}
* go offline later. Please also refer to "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- rcu_state.gp_state = RCU_GP_ONOFF;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
rcu_for_each_leaf_node(rnp) {
smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
firstseq = READ_ONCE(rnp->ofl_seq);
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
*/
- rcu_state.gp_state = RCU_GP_INIT;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
rcu_for_each_node_breadth_first(rnp) {
rcu_gp_slow(gp_init_delay);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
ret = 0;
for (;;) {
if (!ret) {
- rcu_state.jiffies_force_qs = jiffies + j;
+ WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
+ /*
+ * Update jiffies_force_qs before the RCU_GP_WAIT_FQS state
+ * update; this ordering is required by the stall checks.
+ */
+ smp_wmb();
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
- rcu_state.gp_state = RCU_GP_WAIT_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
ret = swait_event_idle_timeout_exclusive(
rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DOING_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
rcu_seq_end(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- rcu_state.gp_state = RCU_GP_IDLE;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
for (;;) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
- rcu_state.gp_state = RCU_GP_WAIT_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
swait_event_idle_exclusive(rcu_state.gp_wq,
READ_ONCE(rcu_state.gp_flags) &
RCU_GP_FLAG_INIT);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DONE_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
/* Locking provides needed memory barrier. */
if (rcu_gp_init())
break;
rcu_gp_fqs_loop();
/* Handle grace-period end. */
- rcu_state.gp_state = RCU_GP_CLEANUP;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
rcu_gp_cleanup();
- rcu_state.gp_state = RCU_GP_CLEANED;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
}
}
static void rcu_do_batch(struct rcu_data *rdp)
{
int div;
+ bool __maybe_unused empty;
unsigned long flags;
const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count;
+ long bl, count = 0;
long pending, tlimit = 0;
/* If no callbacks are ready, just return. */
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
if (offloaded)
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
+ count++;
debug_rcu_head_unqueue(rhp);
rcu_lock_acquire(&rcu_callback_map);
/*
* Stop only if limit reached and CPU has something to do.
- * Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl && !offloaded &&
+ if (count >= bl && !offloaded &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
if (unlikely(tlimit)) {
/* only call local_clock() every 32 callbacks */
- if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ if (likely((count & 31) || local_clock() < tlimit))
continue;
/* Exceeded the time limit, so leave. */
break;
}
- if (offloaded) {
- WARN_ON_ONCE(in_serving_softirq());
+ if (!in_serving_softirq()) {
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
local_irq_save(flags);
rcu_nocb_lock(rdp);
- count = -rcl.len;
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
- smp_mb(); /* List handling before counting for rcu_barrier(). */
- rcu_segcblist_insert_count(&rdp->cblist, &rcl);
+ rcu_segcblist_add_len(&rdp->cblist, -count);
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ empty = rcu_segcblist_empty(&rdp->cblist);
+ WARN_ON_ONCE(count == 0 && !empty);
WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- count != 0 && rcu_segcblist_empty(&rdp->cblist));
+ count != 0 && empty);
+ WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+ WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
rcu_nocb_unlock_irqrestore(rdp, flags);
void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
}
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
- local_irq_save(flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active)))
rcu_do_batch(rdp);
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
struct rcu_data *rdp;
bool was_alldone;
* Use rcu:rcu_callback trace event to find the previous
* time callback was passed to __call_rcu().
*/
- WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
- head, head->func);
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+
/* Go handle any RCU core processing required. */
if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
goto unlock_return;
}
+ kasan_record_aux_stack(ptr);
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
run_page_cache_worker(krcp);
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ lockdep_assert_irqs_disabled();
+
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->blimit = blimit;
- if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !rcu_segcblist_is_offloaded(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ /*
+ * Lock in case the CB/GP kthreads are still around handling
+ * old callbacks (longer term we should flush all callbacks
+ * before completing CPU offline)
+ */
+ rcu_nocb_lock(rdp);
+ if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
+ rcu_nocb_unlock(rdp);
/*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
+ // Do any dangling deferred wakeups.
+ do_nocb_deferred_wakeup(rdp);
+
/* QS for any half-done expedited grace period. */
preempt_disable();
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
atomic_t nocb_lock_contended; /* Contention experienced. */
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
+#define RCU_NOCB_WAKE_OFF -1
#define RCU_NOCB_WAKE_NOT 0
#define RCU_NOCB_WAKE 1
#define RCU_NOCB_WAKE_FORCE 2
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
{
struct task_struct *t = current;
+ lockdep_assert_irqs_disabled();
if (user || rcu_is_cpu_rrupt_from_idle()) {
rcu_note_voluntary_context_switch(current);
}
* Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
- static void wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
+ static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
+ unsigned long flags)
__releases(rdp->nocb_lock)
{
bool needwake = false;
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("AlreadyAwake"));
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
del_timer(&rdp->nocb_timer);
rcu_nocb_unlock_irqrestore(rdp, flags);
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return needwake;
}
/*
static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
const char *reason)
{
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
+ return;
if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
mod_timer(&rdp->nocb_timer, jiffies + 1);
if (rdp->nocb_defer_wakeup < waketype)
__call_rcu_nocb_wake(rdp, true, flags);
}
+/*
+ * Check whether this rdp should be ignored.
+ *
+ * We check this without holding the nocb lock, but
+ * we make sure not to miss a freshly offloaded rdp
+ * thanks to the following ordering:
+ *
+ *   rdp_offload_toggle()           nocb_gp_enabled_cb()
+ *   -------------------------      ----------------------------
+ *   WRITE flags                    LOCK nocb_gp_lock
+ *   LOCK nocb_gp_lock              READ/WRITE nocb_gp_sleep
+ *   READ/WRITE nocb_gp_sleep       UNLOCK nocb_gp_lock
+ *   UNLOCK nocb_gp_lock            READ flags
+ */
+static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ }
+ return true;
+ }
+
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it gets re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ return false;
+}
+
+
/*
* No-CBs GP kthreads come here to wait for additional callbacks to show up
* or for grace periods to end.
*/
WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ bool needwake_state = false;
+
+ if (!nocb_gp_enabled_cb(rdp))
+ continue;
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
rcu_nocb_lock_irqsave(rdp, flags);
+ if (!nocb_gp_update_state(rdp, &needwake_state)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue;
+ }
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
if (bypass_ncbs &&
(time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
continue; /* No callbacks here, try next. */
}
if (bypass_ncbs) {
}
if (needwake_gp)
rcu_gp_kthread_wake();
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
}
my_rdp->nocb_gp_bypass = bypass;
return 0;
}
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
/*
* Invoke any ready callbacks from the corresponding no-CBs CPU,
* then, if there are no more, wait for more to appear.
*/
static void nocb_cb_wait(struct rcu_data *rdp)
{
+ struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
+ bool needwake_state = false;
bool needwake_gp = false;
struct rcu_node *rnp = rdp->mynode;
local_bh_enable();
lockdep_assert_irqs_enabled();
rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- return;
- }
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
WRITE_ONCE(rdp->nocb_cb_sleep, true);
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and will keep sleeping
+ * until we get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- !READ_ONCE(rdp->nocb_cb_sleep));
- if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
- /* ^^^ Ensure CB invocation follows _sleep test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
}
/*
/* Is a deferred wakeup of rcu_nocb_kthread() required? */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
- return READ_ONCE(rdp->nocb_defer_wakeup);
+ return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
- static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
unsigned long flags;
int ndw;
+ int ret;
rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
+ return false;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
}
/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
* This means we do an inexact common-case check. Note that if
* we miss, ->nocb_timer will eventually clean things up.
*/
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
if (rcu_nocb_need_deferred_wakeup(rdp))
- do_nocb_deferred_wakeup_common(rdp);
+ return do_nocb_deferred_wakeup_common(rdp);
+ return false;
+ }
+
+ void rcu_nocb_flush_deferred_wakeup(void)
+ {
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
}
+ EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return 0;
+}
+
+static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * If there is still pending offloaded work, the offline
+ * CPU won't help much with handling it.
+ */
+ if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return -EBUSY;
+ }
+
+ ret = rdp_offload_toggle(rdp, false, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+ SEGCBLIST_KTHREAD_GP));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /* Make sure nocb timer won't stay around */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ del_timer_sync(&rdp->nocb_timer);
+
+ /*
+ * Flush bypass. While IRQs are disabled and once we set
+ * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be
+ * enqueued on bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_nocb_flush_bypass(rdp, NULL, jiffies);
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ /*
+ * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore. Theoretically we
+ * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs
+ * disabled now, but let's be paranoid.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ return __rcu_nocb_rdp_deoffload(rdp);
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ if (rdp == rdp->nocb_gp_rdp) {
+ pr_info("Can't deoffload an rdp GP leader (yet)\n");
+ return -EINVAL;
+ }
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ else
+ ret = __rcu_nocb_rdp_deoffload(rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static int __rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * For now we only support re-offload, i.e., the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+ /*
+ * Can't use rcu_nocb_lock_irqsave() while we are in
+ * SEGCBLIST_SOFTIRQ_ONLY mode.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ /* Re-enable nocb timer */
+ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Every modification that was previously done on
+ * rdp->cblist must be visible remotely to the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ *   __rcu_nocb_rdp_offload()       nocb_cb_wait()/nocb_gp_wait()
+ *   -------------------------      ----------------------------
+ *   WRITE callbacks                rcu_nocb_lock()
+ *   rcu_nocb_lock()                READ flags
+ *   WRITE flags                    READ callbacks
+ *   rcu_nocb_unlock()              rcu_nocb_unlock()
+ */
+ ret = rdp_offload_toggle(rdp, true, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ return ret;
+}
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ return __rcu_nocb_rdp_offload(rdp);
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ else
+ ret = __rcu_nocb_rdp_offload(rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
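A hedged sketch of how a caller might use the new interfaces; the
function name nocb_toggle_demo() is hypothetical, e.g. for a test
module. Both calls may sleep and return 0 on success:

#include <linux/rcupdate.h>

static int nocb_toggle_demo(int cpu)
{
	int ret;

	/* Hand the CPU's callbacks back to softirq/rcuc processing... */
	ret = rcu_nocb_cpu_deoffload(cpu);
	if (ret)
		return ret;

	/* ...then offload them to the rcuo kthreads again. */
	return rcu_nocb_cpu_offload(cpu);
}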
+
void __init rcu_init_nohz(void)
{
int cpu;
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
}
rcu_organize_nocb_kthreads();
}
{
init_swait_queue_head(&rdp->nocb_cb_wq);
init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
raw_spin_lock_init(&rdp->nocb_lock);
raw_spin_lock_init(&rdp->nocb_bypass_lock);
raw_spin_lock_init(&rdp->nocb_gp_lock);
}
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
/*
* Dump out nocb grace-period kthread state for the specified rcu_data
* structure.
{
struct rcu_node *rnp = rdp->mynode;
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
rdp->cpu,
"kK"[!!rdp->nocb_gp_kthread],
"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
".B"[!!rdp->nocb_gp_bypass],
".G"[!!rdp->nocb_gp_gp],
(long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
}
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
+ char bufw[20];
+ char bufr[20];
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
bool wastimer;
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
- pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
"cC"[!!atomic_read(&rdp->nocb_lock_contended)],
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
- ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
- ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
/* It is OK for GP kthreads to have GP state. */
if (rdp->nocb_gp_rdp == rdp)
return false;
}
- static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+ static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
+ return false;
}
static void rcu_spawn_cpu_nocb_kthread(int cpu)
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
+ ktime_t time = rq->hrtick_time;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time;
s64 delta;
/*
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
- time = ktime_add_ns(timer->base->get_time(), delta);
-
- hrtimer_set_expires(timer, time);
+ rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
if (rq == this_rq())
__hrtick_restart(rq);
/**
* try_invoke_on_locked_down_task - Invoke a function on task in fixed state
- * @p: Process for which the function is to be invoked.
+ * @p: Process for which the function is to be invoked, can be @current.
* @func: Function to invoke.
* @arg: Argument to function.
*
*/
bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
{
- bool ret = false;
struct rq_flags rf;
+ bool ret = false;
struct rq *rq;
- lockdep_assert_irqs_enabled();
- raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
if (p->on_rq) {
rq = __task_rq_lock(p, &rf);
if (task_rq(p) == rq)
ret = func(p, arg);
}
}
- raw_spin_unlock_irq(&p->pi_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
}
schedule_debug(prev, preempt);
- if (sched_feat(HRTICK))
+ if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+ EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+ #endif
+
+
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
*
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+ #endif
+
#endif /* CONFIG_PREEMPTION */
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+
+ #include <linux/entry-common.h>
+
+ /*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+ enum {
+ preempt_dynamic_none = 0,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+ };
+
+ static int preempt_dynamic_mode = preempt_dynamic_full;
+
+ static int sched_dynamic_mode(const char *str)
+ {
+ if (!strcmp(str, "none"))
+ return 0;
+
+ if (!strcmp(str, "voluntary"))
+ return 1;
+
+ if (!strcmp(str, "full"))
+ return 2;
+
+ return -1;
+ }
+
+ static void sched_dynamic_update(int mode)
+ {
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ static_call_update(cond_resched, __cond_resched);
+ static_call_update(might_resched, __cond_resched);
+ static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL);
+ static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL);
+ static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0);
+ static_call_update(preempt_schedule, __preempt_schedule_func);
+ static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+ static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+ }
+
+ static int __init setup_preempt_mode(char *str)
+ {
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0; /* __setup() handlers return 0 when the arg is unhandled */
+ }
+
+ sched_dynamic_update(mode);
+ return 1; /* ...and 1 once it has been consumed */
+ }
+ __setup("preempt=", setup_preempt_mode);
+
+ #ifdef CONFIG_SCHED_DEBUG
+
+ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+ {
+ char buf[16];
+ int mode;
+
+ if (cnt > 15)
+ cnt = 15;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ mode = sched_dynamic_mode(strstrip(buf));
+ if (mode < 0)
+ return mode;
+
+ sched_dynamic_update(mode);
+
+ *ppos += cnt;
+
+ return cnt;
+ }
+
+ static int sched_dynamic_show(struct seq_file *m, void *v)
+ {
+ static const char *preempt_modes[] = {
+ "none", "voluntary", "full"
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, "(");
+ seq_puts(m, preempt_modes[i]);
+ if (preempt_dynamic_mode == i)
+ seq_puts(m, ")");
+
+ seq_puts(m, " ");
+ }
+
+ seq_puts(m, "\n");
+ return 0;
+ }
+
+ static int sched_dynamic_open(struct inode *inode, struct file *filp)
+ {
+ return single_open(filp, sched_dynamic_show, NULL);
+ }
+
+ static const struct file_operations sched_dynamic_fops = {
+ .open = sched_dynamic_open,
+ .write = sched_dynamic_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ };
+
+ static __init int sched_init_debug_dynamic(void)
+ {
+ debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
+ return 0;
+ }
+ late_initcall(sched_init_debug_dynamic);
+
+ #endif /* CONFIG_SCHED_DEBUG */
+ #endif /* CONFIG_PREEMPT_DYNAMIC */
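Taken together, the debugfs file above lets the preemption model be inspected and switched at runtime. A hypothetical userspace sketch (assuming debugfs is mounted at /sys/kernel/debug; per sched_dynamic_show(), the current mode is printed in parentheses):

#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f;

	/* Select the "voluntary" model; sched_dynamic_write() parses this. */
	f = fopen("/sys/kernel/debug/sched_preempt", "w");
	if (!f)
		return 1;
	fputs("voluntary", f);
	fclose(f);

	/* Read back; prints e.g. "none (voluntary) full". */
	f = fopen("/sys/kernel/debug/sched_preempt", "r");
	if (f && fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);
	if (f)
		fclose(f);
	return 0;
}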
+
/*
* This is the entry point to schedule() from kernel preemption
* off of irq context.
* @p: the task in question.
*
* Return: The priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
+ *
+ * sched policy         return value   kernel prio    user prio/nice
+ *
+ * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
+ * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
+ * deadline                     -101             -1           0
*/
int task_prio(const struct task_struct *p)
{
return cpu_rq(cpu)->idle;
}
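For example, reading the task_prio() table above: a SCHED_NORMAL task at nice -10 has kernel prio 120 - 10 = 110, so task_prio() returns 110 - 100 = 10; a SCHED_FIFO task at user rtprio 50 has kernel prio 99 - 50 = 49 and task_prio() returns 49 - 100 = -51.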
+ #ifdef CONFIG_SMP
+ /*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
+ struct task_struct *p)
+ {
+ unsigned long dl_util, util, irq;
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!uclamp_is_used() &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+ return max;
+ }
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU; this can
+ * happen because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spent on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized), we can directly add them
+ * to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_rq_util_with(rq, util, p);
+
+ dl_util = cpu_util_dl(rq);
+
+ /*
+ * For frequency selection we do not make cpu_util_dl() a permanent part
+ * of this sum because we want to use cpu_bw_dl() later on, but we need
+ * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ * that we select f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if (util + dl_util >= max)
+ return max;
+
+ /*
+ * OTOH, for energy computation we need the estimated running time, so
+ * include util_dl and ignore dl_bw.
+ */
+ if (type == ENERGY_UTIL)
+ util += dl_util;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ *            max - irq
+ * U' = irq + --------- * U
+ *               max
+ */
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
+
+ /*
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
+ */
+ if (type == FREQUENCY_UTIL)
+ util += cpu_bw_dl(rq);
+
+ return min(max, util);
+ }
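To make the IRQ scaling step concrete: with max = 1024, irq = 128 and a combined CFS+RT utilization of 512, scale_irq_capacity() gives 512 * (1024 - 128) / 1024 = 448, and adding the irq term back yields U' = 448 + 128 = 576. The naive sum 512 + 128 = 640 would over-count, because the 512 was measured against rq->clock_task, which only advances during the (max - irq) share of wall time.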
+
+ unsigned long sched_cpu_util(int cpu, unsigned long max)
+ {
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ ENERGY_UTIL, NULL);
+ }
+ #endif /* CONFIG_SMP */
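As exported here, sched_cpu_util() gives other subsystems a uniform view of CPU load. A hypothetical in-kernel consumer, sketched against this patch's two-argument signature (cpu_busy_percent() is an illustrative name; arch_scale_cpu_capacity() supplies the max argument):

#include <linux/sched.h>
#include <linux/sched/topology.h>

static unsigned long cpu_busy_percent(int cpu)
{
	unsigned long max = arch_scale_cpu_capacity(cpu);
	unsigned long util = sched_cpu_util(cpu, max);

	return util * 100 / max; /* 0..100; effective_cpu_util() clamps at max */
}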
+
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
+ if (attr->sched_priority > MAX_RT_PRIO-1)
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
return 0;
}
- #ifndef CONFIG_PREEMPTION
- int __sched _cond_resched(void)
+ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
+ int __sched __cond_resched(void)
{
if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
+ #ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
+ #endif
return 0;
}
- EXPORT_SYMBOL(_cond_resched);
+ EXPORT_SYMBOL(__cond_resched);
+ #endif
+
+ #ifdef CONFIG_PREEMPT_DYNAMIC
+ DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
+ EXPORT_STATIC_CALL_TRAMP(cond_resched);
+
+ DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
+ EXPORT_STATIC_CALL_TRAMP(might_resched);
#endif
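The DEFINE_STATIC_CALL_RET0()/EXPORT_STATIC_CALL_TRAMP() pairs above are only half of the wiring; call sites dispatch through the same static calls. A sketch of the header-side counterpart (modelled on the series' kernel.h changes; exact details may differ):

DECLARE_STATIC_CALL(cond_resched, __cond_resched);

static __always_inline int _cond_resched(void)
{
	/* Retargeted by sched_dynamic_update(): __cond_resched for the
	 * none/voluntary models, a return-0 trampoline for full. */
	return static_call(cond_resched)();
}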
/*
switch (policy) {
case SCHED_FIFO:
case SCHED_RR:
- ret = MAX_USER_RT_PRIO-1;
+ ret = MAX_RT_PRIO-1;
break;
case SCHED_DEADLINE:
case SCHED_NORMAL:
struct rq_flags rf;
int ret;
+ /*
+ * Remove CPU from nohz.idle_cpus_mask to prevent it from participating
+ * in load balancing while inactive.
+ */
+ nohz_balance_exit_idle(rq);
+
set_cpu_active(cpu, false);
/*
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(rq);
hrtick_clear(rq);
return 0;
}