blkg_destroy(blkg);
spin_unlock(&blkcg->lock);
}
+
+ q->root_blkg = NULL;
+ q->root_rl.blkg = NULL;
}
/*
struct cftype blkcg_files[] = {
{
.name = "stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = blkcg_print_stat,
},
{ } /* terminate */
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
- #include <linux/memcontrol.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>
int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_destroy(struct backing_dev_info *bdi);
+void bdi_exit(struct backing_dev_info *bdi);
__printf(3, 4)
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+void bdi_unregister(struct backing_dev_info *bdi);
+
int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
+void bdi_destroy(struct backing_dev_info *bdi);
+
void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
bool range_cyclic, enum wb_reason reason);
void wb_start_background_writeback(struct bdi_writeback *wb);
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
- return cgroup_on_dfl(mem_cgroup_root_css->cgroup) &&
- cgroup_on_dfl(blkcg_root_css->cgroup) &&
+ return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ cgroup_subsys_on_dfl(io_cgrp_subsys) &&
bdi_cap_account_dirty(bdi) &&
(bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
(inode->i_sb->s_iflags & SB_I_CGROUPWB);
rcu_read_unlock();
}
-struct wb_iter {
- int start_memcg_id;
- struct radix_tree_iter tree_iter;
- void **slot;
-};
-
-static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
- struct backing_dev_info *bdi)
-{
- struct radix_tree_iter *titer = &iter->tree_iter;
-
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- if (iter->start_memcg_id >= 0) {
- iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
- iter->start_memcg_id = -1;
- } else {
- iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
- }
-
- if (!iter->slot)
- iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
- if (iter->slot)
- return *iter->slot;
- return NULL;
-}
-
-static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
- struct backing_dev_info *bdi,
- int start_memcg_id)
-{
- iter->start_memcg_id = start_memcg_id;
-
- if (start_memcg_id)
- return __wb_iter_next(iter, bdi);
- else
- return &bdi->wb;
-}
-
-/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
- * @wb_cur: cursor struct bdi_writeback pointer
- * @bdi: bdi to walk wb's of
- * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_memcg_id: memcg ID to start iteration from
- *
- * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
- * to be used as temp storage during iteration. rcu_read_lock() must be
- * held throughout iteration.
- */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
- for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
- (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
-
#else /* CONFIG_CGROUP_WRITEBACK */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
}
-struct wb_iter {
- int next_id;
-};
-
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
- for ((iter)->next_id = (start_blkcg_id); \
- ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
-
static inline int inode_congested(struct inode *inode, int cong_bits)
{
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
extern struct files_struct init_files;
extern struct fs_struct init_fs;
- #ifdef CONFIG_CGROUPS
- #define INIT_GROUP_RWSEM(sig) \
- .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
- #else
- #define INIT_GROUP_RWSEM(sig)
- #endif
-
#ifdef CONFIG_CPUSETS
#define INIT_CPUSET_SEQ(tsk) \
.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
.rlim = INIT_RLIMITS, \
.cputimer = { \
.cputime_atomic = INIT_CPUTIME_ATOMIC, \
- .running = 0, \
+ .running = false, \
+ .checking_timer = false, \
}, \
INIT_PREV_CPUTIME(sig) \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
- INIT_GROUP_RWSEM(sig) \
}
extern struct nsproxy init_nsproxy;
*
* DEFINE_STATIC_KEY_TRUE(key);
* DEFINE_STATIC_KEY_FALSE(key);
- * static_key_likely()
- * statick_key_unlikely()
+ * static_branch_likely()
+ * static_branch_unlikely()
*
* Jump labels provide an interface to generate dynamic branches using
* self-modifying code. Assuming toolchain and architecture support, if we
* statement, setting the key to true requires us to patch in a jump
* to the out-of-line of true branch.
*
- * In addtion to static_branch_{enable,disable}, we can also reference count
+ * In addition to static_branch_{enable,disable}, we can also reference count
* the key or branch direction via static_branch_{inc,dec}. Thus,
* static_branch_inc() can be thought of as a 'make more true' and
- * static_branch_dec() as a 'make more false'. The inc()/dec()
- * interface is meant to be used exclusively from the inc()/dec() for a given
- * key.
+ * static_branch_dec() as a 'make more false'.
*
* Since this relies on modifying code, the branch modifying functions
* must be considered absolute slow paths (machine wide synchronization etc.).
#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled
- static inline bool static_key_enabled(struct static_key *key)
- {
- return static_key_count(key) > 0;
- }
-
static inline void static_key_enable(struct static_key *key)
{
int count = static_key_count(key);
#define DEFINE_STATIC_KEY_FALSE(name) \
struct static_key_false name = STATIC_KEY_FALSE_INIT
+ extern bool ____wrong_branch_error(void);
+
+ #define static_key_enabled(x) \
+ ({ \
+ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \
+ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
+ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+ ____wrong_branch_error(); \
+ static_key_count((struct static_key *)x) > 0; \
+ })
+
#ifdef HAVE_JUMP_LABEL
/*
* See jump_label_type() / jump_label_init_type().
*/
- extern bool ____wrong_branch_error(void);
-
#define static_branch_likely(x) \
({ \
bool branch; \
/* OOM-Killer disable */
int oom_kill_disable;
+ /* handle for "memory.events" */
+ struct cgroup_file events_file;
+
/* protect arrays of thresholds */
struct mutex thresholds_lock;
* percpu counter.
*/
struct mem_cgroup_stat_cpu __percpu *stat;
- spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct cg_proto tcp_mem;
unsigned int nr)
{
this_cpu_add(memcg->stat->events[idx], nr);
+ cgroup_file_notify(&memcg->events_file);
}
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
static inline bool mem_cgroup_disabled(void)
{
- if (memory_cgrp_subsys.disabled)
- return true;
- return false;
+ return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
/*
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
- unsigned long *pdirty, unsigned long *pwriteback);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback);
#else /* CONFIG_CGROUP_WRITEBACK */
}
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
- unsigned long *pavail,
+ unsigned long *pfilepages,
+ unsigned long *pheadroom,
unsigned long *pdirty,
unsigned long *pwriteback)
{
.sum_exec_runtime = ATOMIC64_INIT(0), \
}
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
+/*
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
+ *
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
/*
- * Disable preemption until the scheduler is running.
- * Reset by start_kernel()->sched_init()->init_idle().
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
*
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
- * before the scheduler is active -- see should_resched().
+ * preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
*/
-#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
/**
* struct thread_group_cputimer - thread group interval timer counts
* @cputime_atomic: atomic thread group interval timers.
- * @running: non-zero when there are timers running and
- * @cputime receives updates.
+ * @running: true when there are timers running and
+ * @cputime_atomic receives updates.
+ * @checking_timer: true when a thread in the group is in the
+ * process of checking for thread group timers.
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.
*/
struct thread_group_cputimer {
struct task_cputime_atomic cputime_atomic;
- int running;
+ bool running;
+ bool checking_timer;
};
#include <linux/rwsem.h>
unsigned audit_tty_log_passwd;
struct tty_audit_buf *tty_audit_buf;
#endif
- #ifdef CONFIG_CGROUPS
- /*
- * group_rwsem prevents new tasks from entering the threadgroup and
- * member tasks from exiting,a more specifically, setting of
- * PF_EXITING. fork and exit paths are protected with this rwsem
- * using threadgroup_change_begin/end(). Users which require
- * threadgroup to remain stable should use threadgroup_[un]lock()
- * which also takes care of exec path. Currently, cgroup is the
- * only user.
- */
- struct rw_semaphore group_rwsem;
- #endif
oom_flags_t oom_flags;
short oom_score_adj; /* OOM kill score adjustment */
struct hlist_node uidhash_node;
kuid_t uid;
-#ifdef CONFIG_PERF_EVENTS
+#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
atomic_long_t locked_vm;
#endif
};
#endif
};
-extern struct sched_domain_topology_level *sched_domain_topology;
-
extern void set_sched_topology(struct sched_domain_topology_level *tl);
extern void wake_up_if_idle(int cpu);
/*
* The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
- * runnable on a rq into its weight. For cfs_rq, it is the aggregated
- * such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 1) load_avg factors frequency scaling into the amount of time that a
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ * aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
* that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
* For cfs_rq, it is the aggregated such times of all runnable and
* blocked sched_entities.
union rcu_special {
struct {
- bool blocked;
- bool need_qs;
- } b;
- short s;
+ u8 blocked;
+ u8 need_qs;
+ u8 exp_need_qs;
+ u8 pad; /* Otherwise the compiler can store garbage here. */
+ } b; /* Bits. */
+ u32 s; /* Set of bits. */
};
struct rcu_node;
static int perf_sample_allowed_ns __read_mostly =
DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
-void update_perf_cpu_limits(void)
+static void update_perf_cpu_limits(void)
{
u64 tmp = perf_sample_period_ns;
* mode SWOUT : schedule out everything
* mode SWIN : schedule in based on cgroup for next
*/
-void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
PERF_EVENT_STATE_INACTIVE;
}
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
+ nr += nr_siblings;
size += sizeof(u64);
}
event->read_size = size;
}
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
struct perf_sample_data *data;
- u64 sample_type = event->attr.sample_type;
u16 size = 0;
- perf_event__read_size(event);
-
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
event->header_size = size;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+ __perf_event_read_size(event,
+ event->group_leader->nr_siblings);
+ __perf_event_header_size(event, event->attr.sample_type);
+}
+
static void perf_event__id_header_size(struct perf_event *event)
{
struct perf_sample_data *data;
event->id_header_size = size;
}
+static bool perf_event_validate_size(struct perf_event *event)
+{
+ /*
+ * The values computed here will be over-written when we actually
+ * attach the event.
+ */
+ __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+ __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+ perf_event__id_header_size(event);
+
+ /*
+ * Sum the lot; should not exceed the 64k limit we have on records.
+ * Conservative limit to allow for callchains and other variable fields.
+ */
+ if (event->read_size + event->header_size +
+ event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+ return false;
+
+ return true;
+}
+
static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
- pmu->start_txn(pmu);
+ pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
rcu_read_unlock();
}
+struct perf_read_data {
+ struct perf_event *event;
+ bool group;
+ int ret;
+};
+
/*
* Cross CPU call to read the hardware event
*/
static void __perf_event_read(void *info)
{
- struct perf_event *event = info;
+ struct perf_read_data *data = info;
+ struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct pmu *pmu = event->pmu;
/*
* If this is a task context, we need to check whether it is
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
+
update_event_times(event);
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!data->group) {
+ pmu->read(event);
+ data->ret = 0;
+ goto unlock;
+ }
+
+ pmu->start_txn(pmu, PERF_PMU_TXN_READ);
+
+ pmu->read(event);
+
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ update_event_times(sub);
+ if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ /*
+ * Use sibling's PMU rather than @event's since
+ * sibling could be on different (eg: software) PMU.
+ */
+ sub->pmu->read(sub);
+ }
+ }
+
+ data->ret = pmu->commit_txn(pmu);
+
+unlock:
raw_spin_unlock(&ctx->lock);
}
return val;
}
-static u64 perf_event_read(struct perf_event *event)
+static int perf_event_read(struct perf_event *event, bool group)
{
+ int ret = 0;
+
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data = {
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
smp_call_function_single(event->oncpu,
- __perf_event_read, event, 1);
+ __perf_event_read, &data, 1);
+ ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
- update_event_times(event);
+ if (group)
+ update_group_times(event);
+ else
+ update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
- return perf_event_count(event);
+ return ret;
}
/*
* see the comment there.
*
* 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
+ * perf_read_group(), which takes faults while
* holding ctx->mutex, however this is called after
* the last filedesc died, so there is no possibility
* to trigger the AB-BA case.
*running = 0;
mutex_lock(&event->child_mutex);
- total += perf_event_read(event);
+
+ (void)perf_event_read(event, false);
+ total += perf_event_count(event);
+
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
*running += event->total_time_running +
atomic64_read(&event->child_total_time_running);
list_for_each_entry(child, &event->child_list, child_list) {
- total += perf_event_read(child);
+ (void)perf_event_read(child, false);
+ total += perf_event_count(child);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
}
EXPORT_SYMBOL_GPL(perf_event_read_value);
-static int perf_event_read_group(struct perf_event *event,
- u64 read_format, char __user *buf)
+static int __perf_read_group_add(struct perf_event *leader,
+ u64 read_format, u64 *values)
{
- struct perf_event *leader = event->group_leader, *sub;
- struct perf_event_context *ctx = leader->ctx;
- int n = 0, size = 0, ret;
- u64 count, enabled, running;
- u64 values[5];
+ struct perf_event *sub;
+ int n = 1; /* skip @nr */
+ int ret;
- lockdep_assert_held(&ctx->mutex);
+ ret = perf_event_read(leader, true);
+ if (ret)
+ return ret;
- count = perf_event_read_value(leader, &enabled, &running);
+ /*
+ * Since we co-schedule groups, {enabled,running} times of siblings
+ * will be identical to those of the leader, so we only publish one
+ * set.
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] += leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
- values[n++] = 1 + leader->nr_siblings;
- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = enabled;
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = running;
- values[n++] = count;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] += leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ /*
+ * Write {count,id} tuples for every sibling.
+ */
+ values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- size = n * sizeof(u64);
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ values[n++] += perf_event_count(sub);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+ }
- if (copy_to_user(buf, values, size))
- return -EFAULT;
+ return 0;
+}
- ret = size;
+static int perf_read_group(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ struct perf_event *leader = event->group_leader, *child;
+ struct perf_event_context *ctx = leader->ctx;
+ int ret;
+ u64 *values;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- n = 0;
+ lockdep_assert_held(&ctx->mutex);
- values[n++] = perf_event_read_value(sub, &enabled, &running);
- if (read_format & PERF_FORMAT_ID)
- values[n++] = primary_event_id(sub);
+ values = kzalloc(event->read_size, GFP_KERNEL);
+ if (!values)
+ return -ENOMEM;
- size = n * sizeof(u64);
+ values[0] = 1 + leader->nr_siblings;
- if (copy_to_user(buf + ret, values, size)) {
- return -EFAULT;
- }
+ /*
+ * By locking the child_mutex of the leader we effectively
+ * lock the child list of all siblings.. XXX explain how.
+ */
+ mutex_lock(&leader->child_mutex);
- ret += size;
+ ret = __perf_read_group_add(leader, read_format, values);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(child, &leader->child_list, child_list) {
+ ret = __perf_read_group_add(child, read_format, values);
+ if (ret)
+ goto unlock;
}
+ mutex_unlock(&leader->child_mutex);
+
+ ret = event->read_size;
+ if (copy_to_user(buf, values, event->read_size))
+ ret = -EFAULT;
+ goto out;
+
+unlock:
+ mutex_unlock(&leader->child_mutex);
+out:
+ kfree(values);
return ret;
}
-static int perf_event_read_one(struct perf_event *event,
+static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
* Read the performance event - simple non blocking version for now
*/
static ssize_t
-perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
u64 read_format = event->attr.read_format;
int ret;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (read_format & PERF_FORMAT_GROUP)
- ret = perf_event_read_group(event, read_format, buf);
+ ret = perf_read_group(event, read_format, buf);
else
- ret = perf_event_read_one(event, read_format, buf);
+ ret = perf_read_one(event, read_format, buf);
return ret;
}
int ret;
ctx = perf_event_ctx_lock(event);
- ret = perf_read_hw(event, buf, count);
+ ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
return ret;
static void _perf_event_reset(struct perf_event *event)
{
- (void)perf_event_read(event);
+ (void)perf_event_read(event, false);
local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
- perf_output_put(handle, data->raw->size);
- __output_copy(handle, data->raw->data,
- data->raw->size);
+ u32 raw_size = data->raw->size;
+ u32 real_size = round_up(raw_size + sizeof(u32),
+ sizeof(u64)) - sizeof(u32);
+ u64 zero = 0;
+
+ perf_output_put(handle, real_size);
+ __output_copy(handle, data->raw->data, raw_size);
+ if (real_size - raw_size)
+ __output_copy(handle, &zero, real_size - raw_size);
} else {
struct {
u32 size;
else
size += sizeof(u32);
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header->size += size;
+ header->size += round_up(size, sizeof(u64));
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
{
}
+static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
+{
+}
+
static int perf_pmu_nop_int(struct pmu *pmu)
{
return 0;
}
-static void perf_pmu_start_txn(struct pmu *pmu)
+static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
+
+static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
+ __this_cpu_write(nop_txn_flags, flags);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_disable(pmu);
}
static int perf_pmu_commit_txn(struct pmu *pmu)
{
+ unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+ __this_cpu_write(nop_txn_flags, 0);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return 0;
+
perf_pmu_enable(pmu);
return 0;
}
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
+ unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+ __this_cpu_write(nop_txn_flags, 0);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_enable(pmu);
}
pmu->commit_txn = perf_pmu_commit_txn;
pmu->cancel_txn = perf_pmu_cancel_txn;
} else {
- pmu->start_txn = perf_pmu_nop_void;
+ pmu->start_txn = perf_pmu_nop_txn;
pmu->commit_txn = perf_pmu_nop_int;
pmu->cancel_txn = perf_pmu_nop_void;
}
return ret;
}
-struct pmu *perf_init_event(struct perf_event *event)
+static struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
int idx;
if (move_group) {
gctx = group_leader->ctx;
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ } else {
+ mutex_lock(&ctx->mutex);
+ }
+
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_locked;
+ }
+
+ /*
+ * Must be under the same ctx::mutex as perf_install_in_context(),
+ * because we need to serialize with concurrent event creation.
+ */
+ if (!exclusive_event_installable(event, ctx)) {
+ /* exclusive and group stuff are assumed mutually exclusive */
+ WARN_ON_ONCE(move_group);
+
+ err = -EBUSY;
+ goto err_locked;
+ }
+
+ WARN_ON_ONCE(ctx->parent_ctx);
+ if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
perf_remove_from_context(group_leader, false);
list_for_each_entry(sibling, &group_leader->sibling_list,
perf_remove_from_context(sibling, false);
put_ctx(gctx);
}
- } else {
- mutex_lock(&ctx->mutex);
- }
-
- WARN_ON_ONCE(ctx->parent_ctx);
- if (move_group) {
/*
* Wait for everybody to stop referencing the events through
* the old lists, before installing it on new lists.
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
- }
- if (!exclusive_event_installable(event, ctx)) {
- err = -EBUSY;
- mutex_unlock(&ctx->mutex);
- fput(event_file);
- goto err_context;
+ /*
+ * Now that all events are installed in @ctx, nothing
+ * references @gctx anymore, so drop the last reference we have
+ * on it.
+ */
+ put_ctx(gctx);
}
+ /*
+ * Precalculate sample_data sizes; do while holding ctx::mutex such
+ * that we're serialized against further additions and before
+ * perf_install_in_context() which is the point the event is active and
+ * can use these values.
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group) {
+ if (move_group)
mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- }
mutex_unlock(&ctx->mutex);
put_online_cpus();
list_add_tail(&event->owner_entry, ¤t->perf_event_list);
mutex_unlock(¤t->perf_event_mutex);
- /*
- * Precalculate sample_data sizes
- */
- perf_event__header_size(event);
- perf_event__id_header_size(event);
-
/*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
fd_install(event_fd, event_file);
return event_fd;
+err_locked:
+ if (move_group)
+ mutex_unlock(&gctx->mutex);
+ mutex_unlock(&ctx->mutex);
+/* err_file: */
+ fput(event_file);
err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
task_function_call(task, __perf_cgroup_move, task);
}
- static void perf_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
- {
- task_function_call(task, __perf_cgroup_move, task);
- }
-
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
- .exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ cgroup_free(tsk);
task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
- sig->cputimer.running = 1;
+ sig->cputimer.running = true;
}
/* The timer lists. */
tty_audit_fork(sig);
sched_autogroup_fork(sig);
- #ifdef CONFIG_CGROUPS
- init_rwsem(&sig->group_rwsem);
- #endif
-
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
return cpu;
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i)) {
+ if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
cpu = i;
goto unlock;
}
}
}
+
+ if (!is_housekeeping_cpu(cpu))
+ cpu = housekeeping_any_cpu();
unlock:
rcu_read_unlock();
return cpu;
/*
* SCHED_IDLE tasks get minimal weight:
*/
- if (p->policy == SCHED_IDLE) {
+ if (idle_policy(p->policy)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
load->inv_weight = prio_to_wmult[prio];
}
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(rq, p);
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(rq, p);
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
}
if (running)
put_prev_task(rq, p);
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
}
/*
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p, new_cpu);
+ p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
}
struct rq *src_rq, *dst_rq;
int ret = -EAGAIN;
+ if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+ return -EAGAIN;
+
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
double_raw_lock(&arg->src_task->pi_lock,
&arg->dst_task->pi_lock);
double_rq_lock(src_rq, dst_rq);
+
if (task_cpu(arg->dst_task) != arg->dst_cpu)
goto unlock;
goto out;
}
+ /* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- /* No more Mr. Nice Guy. */
- cpuset_cpus_allowed_fallback(p);
- state = possible;
- break;
-
+ if (IS_ENABLED(CONFIG_CPUSETS)) {
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+ }
+ /* fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
#endif /* CONFIG_SCHEDSTATS */
}
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
#endif /* CONFIG_NUMA_BALANCING */
}
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
void set_numabalancing_state(bool enabled)
{
if (enabled)
- sched_feat_set("NUMA");
+ static_branch_enable(&sched_numa_balancing);
else
- sched_feat_set("NO_NUMA");
+ static_branch_disable(&sched_numa_balancing);
}
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
- numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
{
struct ctl_table t;
int err;
- int state = numabalancing_enabled;
+ int state = static_branch_likely(&sched_numa_balancing);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /* Initialize new task's runnable average */
+ init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Nothing relies on rq->lock after this, so its fine to
+ * drop it.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
task_rq_unlock(rq, p, &flags);
}
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- trace_sched_switch(prev, next);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(&rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
rq->prev_mm = NULL;
/*
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
* schedule one last time. The schedule call will never return, and
* the scheduled task must drop that reference.
- * The test for TASK_DEAD must occur while the runqueue locks are
- * still held, otherwise prev could be scheduled on another cpu, die
- * there before we look at prev->state, and then the reference would
- * be dropped twice.
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * running on another CPU and we could rave with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
*/
prev_state = prev->state;
vtime_task_switch(prev);
{
struct rq *rq;
- /* finish_task_switch() drops rq->lock and enables preemtion */
- preempt_disable();
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
rq = finish_task_switch(prev);
balance_callback(rq);
preempt_enable();
/*
* Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
*/
bool single_task_running(void)
{
- if (cpu_rq(smp_processor_id())->nr_running == 1)
- return true;
- else
- return false;
+ return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+ BUG_ON(task_stack_end_corrupted(prev));
#endif
- /*
- * Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path. Otherwise whine
- * if we are scheduling when we should not.
- */
- if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+ if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
rcu_sleep_check();
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
rcu_note_context_switch();
prev = rq->curr;
+ /*
+ * do_exit() calls schedule() with preemption disabled as an exception;
+ * however we must fix that up, otherwise the next task will see an
+ * inconsistent (higher) preempt count.
+ *
+ * It also avoids the below schedule_debug() test from complaining
+ * about this.
+ */
+ if (unlikely(prev->state == TASK_DEAD))
+ preempt_enable_no_resched_notrace();
+
schedule_debug(prev);
if (sched_feat(HRTICK))
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
switch_count = &prev->nivcsw;
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
rq->curr = next;
++*switch_count;
+ trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else {
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule();
+ __schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
}
static void __sched notrace preempt_schedule_common(void)
{
do {
- preempt_active_enter();
- __schedule();
- preempt_active_exit();
+ preempt_disable_notrace();
+ __schedule(true);
+ preempt_enable_no_resched_notrace();
/*
* Check again in case we missed a preemption opportunity
return;
do {
- /*
- * Use raw __prempt_count() ops that don't call function.
- * We can't call functions before disabling preemption which
- * disarm preemption tracing recursions.
- */
- __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
- barrier();
+ preempt_disable_notrace();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule();
+ __schedule(true);
exception_exit(prev_ctx);
- barrier();
- __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ preempt_enable_no_resched_notrace();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
prev_state = exception_enter();
do {
- preempt_active_enter();
+ preempt_disable();
local_irq_enable();
- __schedule();
+ __schedule(true);
local_irq_disable();
- preempt_active_exit();
+ sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
struct rq *rq;
const struct sched_class *prev_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag = ENQUEUE_REPLENISH;
+ enqueue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag = ENQUEUE_HEAD;
+ enqueue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
}
queued = task_on_rq_queued(p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
} else {
reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- if (policy != SCHED_DEADLINE &&
- policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
+ if (!valid_policy(policy))
return -EINVAL;
}
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (idle_policy(p->policy) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
+ int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ if (oldprio <= p->prio)
+ enqueue_flags |= ENQUEUE_HEAD;
+
+ enqueue_task(rq, p, enqueue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
{
return _sched_setscheduler(p, policy, param, false);
}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+ /*
+ * Its possible that init_idle() gets called multiple times on a task,
+ * in that case do_set_cpus_allowed() will not do the right thing.
+ *
+ * And since this is boot we can forgo the serialization.
+ */
+ set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
rq->curr = rq->idle = idle;
idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
raw_spin_unlock(&rq->lock);
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
task_rq_unlock(rq, p, &flags);
}
#endif /* CONFIG_NUMA_BALANCING */
break;
/*
- * Ensure rq->lock covers the entire task selection
- * until the migration.
+ * pick_next_task assumes pinned rq->lock.
*/
lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
+ /*
+ * Rules for changing task_struct::cpus_allowed are holding
+ * both pi_lock and rq->lock, such that holding either
+ * stabilizes the mask.
+ *
+ * Drop rq->lock is not quite as disastrous as it usually is
+ * because !cpu_active at this point, which means load-balance
+ * will not interfere. Also, stop-machine.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&next->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Since we're inside stop-machine, _nothing_ should have
+ * changed the task, WARN if weird stuff happened, because in
+ * that case the above rq->lock drop is a fail too.
+ */
+ if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+ raw_spin_unlock(&next->pi_lock);
+ continue;
+ }
+
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- lockdep_unpin_lock(&rq->lock);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
}
+ raw_spin_unlock(&next->pi_lock);
}
rq->stop = stop;
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ int cpu = (long)hcpu;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
set_cpu_rq_start_time();
return NOTIFY_OK;
+
case CPU_ONLINE:
/*
* At this point a starting CPU has marked itself as online via
* set_cpu_online(). But it might not yet have marked itself
* as active, which is essential from here on.
- *
- * Thus, fall-through and help the starting CPU along.
*/
+ set_cpu_active(cpu, true);
+ stop_machine_unpark(cpu);
+ return NOTIFY_OK;
+
case CPU_DOWN_FAILED:
- set_cpu_active((long)hcpu, true);
+ set_cpu_active(cpu, true);
return NOTIFY_OK;
+
default:
return NOTIFY_DONE;
}
{ NULL, },
};
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- /* nohz_full won't take effect without isolating the cpus. */
- tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
sched_init_numa();
/*
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
- int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+ int nested = preempt_count() + rcu_preempt_depth();
return (nested == preempt_offset);
}
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, 0);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE);
if (unlikely(running))
put_prev_task(rq, tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, queued);
+ tsk->sched_class->task_move_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, 0);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE);
task_rq_unlock(rq, tsk, &flags);
}
sched_move_task(task);
}
- static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
- {
- sched_move_task(task);
- }
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .exit = cpu_cgroup_exit,
.legacy_cftypes = cpu_files,
.early_init = 1,
};
memcg = page->mem_cgroup;
- if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+ if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
rcu_read_unlock();
}
/*
+ * Return page count for single (non recursive) @memcg.
+ *
* Implementation Note: reading percpu statistics for memcg.
*
* Both of vmstat[] and percpu_counter has threshold and do periodic
* synchronization to implement "quick" read. There are trade-off between
* reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronizion of counter in memcg's counter.
+ * a periodic synchronization of counter in memcg's counter.
*
* But this _read() function is used for user interface now. The user accounts
* memory usage by memory cgroup and he _always_ requires exact value because
*
* If there are kernel internal actions which can make use of some not-exact
* value, and reading all cpu value can be performance bottleneck in some
- * common workload, threashold and synchonization as vmstat[] should be
+ * common workload, threshold and synchronization as vmstat[] should be
* implemented.
*/
-static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static unsigned long
+mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
{
long val = 0;
int cpu;
+ /* Per-cpu values can be negative, use a signed accumulator */
for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
+ /*
+ * Summing races with updates, so val may be negative. Avoid exposing
+ * transient negative values.
+ */
+ if (val < 0)
+ val = 0;
return val;
}
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
- pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+ pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
}
enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
- long val = 0;
+ unsigned long val = 0;
- /* Per-cpu values can be negative, use a signed accumulator */
for_each_mem_cgroup_tree(iter, memcg)
val += mem_cgroup_read_stat(iter, idx);
- if (val < 0) /* race ? */
- val = 0;
return val;
}
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- if (cgroup_has_tasks(memcg->css.cgroup) ||
+ if (cgroup_is_populated(memcg->css.cgroup) ||
(memcg->use_hierarchy && memcg_has_children(memcg)))
err = -EBUSY;
mutex_unlock(&memcg_create_mutex);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
- seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+ seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
}
(u64)memsw * PAGE_SIZE);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- long long val = 0;
+ unsigned long long val = 0;
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
- seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
}
for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
+ threshold <<= PAGE_SHIFT;
mutex_lock(&memcg->thresholds_lock);
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
- * @pavail: out parameter for number of available pages
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
* @pdirty: out parameter for number of dirty pages
* @pwriteback: out parameter for number of pages under writeback
*
- * Determine the numbers of available, dirty, and writeback pages in @wb's
- * memcg. Dirty and writeback are self-explanatory. Available is a bit
- * more involved.
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
+ * is a bit more involved.
*
- * A memcg's headroom is "min(max, high) - used". The available memory is
- * calculated as the lowest headroom of itself and the ancestors plus the
- * number of pages already being used for file pages. Note that this
- * doesn't consider the actual amount of available memory in the system.
- * The caller should further cap *@pavail accordingly.
+ * A memcg's headroom is "min(max, high) - used". In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors. Note that this doesn't consider the actual amount of
+ * available memory in the system. The caller should further cap
+ * *@pheadroom accordingly.
*/
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
- unsigned long *pdirty, unsigned long *pwriteback)
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- unsigned long head_room = PAGE_COUNTER_MAX;
- unsigned long file_pages;
*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ *pheadroom = PAGE_COUNTER_MAX;
- file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(memcg->memory.limit, memcg->high);
unsigned long used = page_counter_read(&memcg->memory);
- head_room = min(head_room, ceiling - min(ceiling, used));
+ *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
}
-
- *pavail = file_pages + head_room;
}
#else /* CONFIG_CGROUP_WRITEBACK */
{
.name = "cgroup.event_control", /* XXX: for compat */
.write = memcg_write_event_control,
- .flags = CFTYPE_NO_PREFIX,
- .mode = S_IWUGO,
+ .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
},
{
.name = "swappiness",
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
goto out_free_stat;
- spin_lock_init(&memcg->pcp_counter_lock);
return memcg;
out_free_stat:
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *from;
- struct task_struct *p;
+ struct task_struct *leader, *p;
struct mm_struct *mm;
unsigned long move_flags;
int ret = 0;
if (!move_flags)
return 0;
- p = cgroup_taskset_first(tset);
+ /*
+ * Multi-process migrations only happen on the default hierarchy
+ * where charge immigration is not used. Perform charge
+ * immigration if @tset contains a leader and whine if there are
+ * multiple.
+ */
+ p = NULL;
+ cgroup_taskset_for_each_leader(leader, tset) {
+ WARN_ON_ONCE(p);
+ p = leader;
+ }
+ if (!p)
+ return 0;
+
from = mem_cgroup_from_task(p);
VM_BUG_ON(from == memcg);
* guarantees that @root doesn't have any children, so turning it
* on for the root memcg is enough.
*/
- if (cgroup_on_dfl(root_css->cgroup))
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
root_mem_cgroup->use_hierarchy = true;
else
root_mem_cgroup->use_hierarchy = false;
{
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
{ } /* terminate */
if (!memcg)
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
- if (cgroup_on_dfl(memcg->css.cgroup))
- if (memcg->css.cgroup)
++ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return true;
#endif
return false;