dynticks subsystem by forcing the context tracking on all
CPUs in the system.
- Say Y only if you're working on the developpement of an
+ Say Y only if you're working on the development of an
architecture backend for the context tracking.
Say N otherwise, this option brings an overhead that you
config ARCH_SUPPORTS_NUMA_BALANCING
bool
+#
+# For architectures that know their GCC __int128 support is sound
+#
+config ARCH_SUPPORTS_INT128
+ bool
+
# For architectures that (ab)use NUMA to represent different memory regions
# all cpu-local but of different latencies, such as SuperH.
#
menuconfig CGROUPS
boolean "Control Group support"
- depends on EVENTFD
help
This option adds support for grouping sets of processes together, for
use with process control subsystems such as Cpusets, CFS, memory
bool "Memory Resource Controller for Control Groups"
depends on RESOURCE_COUNTERS
select MM_OWNER
+ select EVENTFD
help
Provides a memory resource controller that manages both anonymous
memory and page cache. (See Documentation/cgroups/memory.txt)
config SCHED_AUTOGROUP
bool "Automatic process group scheduling"
- select EVENTFD
select CGROUPS
select CGROUP_SCHED
select FAIR_GROUP_SCHED
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
- #include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
- #include <linux/eventfd.h>
- #include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
- #include <linux/file.h>
#include <linux/atomic.h>
+ /*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+ #define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
static DEFINE_MUTEX(cgroup_root_mutex);
+ #define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+ "cgroup_mutex or RCU read lock required");
+
+ #ifdef CONFIG_LOCKDEP
+ #define cgroup_assert_mutex_or_root_locked() \
+ WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \
+ !lockdep_is_held(&cgroup_root_mutex)))
+ #else
+ #define cgroup_assert_mutex_or_root_locked() do { } while (0)
+ #endif
+
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
*/
static struct workqueue_struct *cgroup_destroy_wq;
+ /*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+ static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
- /*
- * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
- */
- struct cfent {
- struct list_head node;
- struct dentry *dentry;
- struct cftype *type;
- struct cgroup_subsys_state *css;
-
- /* file xattrs */
- struct simple_xattrs xattrs;
- };
-
- /*
- * cgroup_event represents events which userspace want to receive.
- */
- struct cgroup_event {
- /*
- * css which the event belongs to.
- */
- struct cgroup_subsys_state *css;
- /*
- * Control file which the event associated.
- */
- struct cftype *cft;
- /*
- * eventfd to signal userspace about the event.
- */
- struct eventfd_ctx *eventfd;
- /*
- * Each of these stored in a list by the cgroup.
- */
- struct list_head list;
- /*
- * All fields below needed to unregister event when
- * userspace closes eventfd.
- */
- poll_table pt;
- wait_queue_head_t *wqh;
- wait_queue_t wait;
- struct work_struct remove;
- };
-
/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static int cgroup_file_release(struct inode *inode, struct file *file);
+ static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
+ /**
+ * for_each_css - iterate all css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_mutex.
+ */
+ #define for_each_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = rcu_dereference_check( \
+ (cgrp)->subsys[(ssid)], \
+ lockdep_is_held(&cgroup_mutex)))) { } \
+ else
+
/**
* for_each_subsys - iterate all loaded cgroup subsystems
* @ss: the iteration cursor
- * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
*
- * Should be called under cgroup_mutex.
+ * Iterates through all loaded subsystems. Should be called under
+ * cgroup_mutex or cgroup_root_mutex.
*/
- #define for_each_subsys(ss, i) \
- for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \
- if (({ lockdep_assert_held(&cgroup_mutex); \
- !((ss) = cgroup_subsys[i]); })) { } \
+ #define for_each_subsys(ss, ssid) \
+ for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \
+ (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((ss) = cgroup_subsys[(ssid)])) { } \
else
/**
for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[i]) || true); (i)++)
- /* iterate each subsystem attached to a hierarchy */
- #define for_each_root_subsys(root, ss) \
- list_for_each_entry((ss), &(root)->subsys_list, sibling)
-
/* iterate across the active hierarchies */
#define for_each_active_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
*/
deactivate_super(cgrp->root->sb);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ cgroup_pidlist_destroy_all(cgrp);
simple_xattrs_free(&cgrp->xattrs);
struct cgroup *cgrp = dentry->d_fsdata;
BUG_ON(!(cgroup_is_dead(cgrp)));
+
+ /*
+ * XXX: cgrp->id is only used to look up css's. As cgroup
+ * and css's lifetimes will be decoupled, it should be made
+ * per-subsystem and moved to css->id so that lookups are
+ * successful until the target css is released.
+ */
+ idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ cgrp->id = -1;
+
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
cgroup_css(cgroup_dummy_top, ss));
cgroup_css(cgrp, ss)->cgroup = cgrp;
- list_move(&ss->sibling, &root->subsys_list);
ss->root = root;
if (ss->bind)
ss->bind(cgroup_css(cgrp, ss));
RCU_INIT_POINTER(cgrp->subsys[i], NULL);
cgroup_subsys[i]->root = &cgroup_dummy_root;
- list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
/* subsystem is now free - drop reference on module */
module_put(ss->module);
{
struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
struct cgroup_subsys *ss;
+ int ssid;
mutex_lock(&cgroup_root_mutex);
- for_each_root_subsys(root, ss)
- seq_printf(seq, ",%s", ss->name);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->name);
if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->dummy_css.cgroup = cgrp;
- INIT_LIST_HEAD(&cgrp->event_list);
- spin_lock_init(&cgrp->event_list_lock);
simple_xattrs_init(&cgrp->xattrs);
}
{
struct cgroup *cgrp = &root->top_cgroup;
- INIT_LIST_HEAD(&root->subsys_list);
INIT_LIST_HEAD(&root->root_list);
root->number_of_cgroups = 1;
cgrp->root = root;
return ERR_PTR(ret);
}
- static void cgroup_kill_sb(struct super_block *sb) {
+ static void cgroup_kill_sb(struct super_block *sb)
+ {
struct cgroupfs_root *root = sb->s_fs_info;
struct cgroup *cgrp = &root->top_cgroup;
struct cgrp_cset_link *link, *tmp_link;
bool threadgroup)
{
int retval, i, group_size;
- struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_subsys_state *css, *failed_css = NULL;
/* threadgroup list cursor and array */
struct task_struct *leader = tsk;
struct task_and_cgroup *tc;
/*
* step 1: check that we can legitimately attach to the cgroup.
*/
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
- if (ss->can_attach) {
- retval = ss->can_attach(css, &tset);
+ for_each_css(css, i, cgrp) {
+ if (css->ss->can_attach) {
+ retval = css->ss->can_attach(css, &tset);
if (retval) {
- failed_ss = ss;
+ failed_css = css;
goto out_cancel_attach;
}
}
/*
* step 4: do subsystem attach callbacks.
*/
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
- if (ss->attach)
- ss->attach(css, &tset);
- }
+ for_each_css(css, i, cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, &tset);
/*
* step 5: success! and cleanup
}
out_cancel_attach:
if (retval) {
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
- if (ss == failed_ss)
+ for_each_css(css, i, cgrp) {
+ if (css == failed_css)
break;
- if (ss->cancel_attach)
- ss->cancel_attach(css, &tset);
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, &tset);
}
}
out_free_group_list:
tsk = find_task_by_vpid(pid);
if (!tsk) {
rcu_read_unlock();
- ret= -ESRCH;
+ ret = -ESRCH;
goto out_unlock_cgroup;
}
/*
return 0;
}
- static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *seq)
+ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgrp = css->cgroup;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
return 0;
}
- static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *seq)
+ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
return 0;
}
/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64
- static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- const char __user *userbuf, size_t nbytes,
- loff_t *unused_ppos)
+ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
{
- char buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- char *end;
+ struct cfent *cfe = __d_cfe(file->f_dentry);
+ struct cftype *cft = __d_cft(file->f_dentry);
+ struct cgroup_subsys_state *css = cfe->css;
+ size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;
+ char *buf;
+ int ret;
- if (!nbytes)
- return -EINVAL;
- if (nbytes >= sizeof(buffer))
+ if (nbytes >= max_bytes)
return -E2BIG;
- if (copy_from_user(buffer, userbuf, nbytes))
- return -EFAULT;
- buffer[nbytes] = 0; /* nul-terminate */
- if (cft->write_u64) {
- u64 val = simple_strtoull(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_u64(css, cft, val);
+ buf = kmalloc(nbytes + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, userbuf, nbytes)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ buf[nbytes] = '\0';
+
+ if (cft->write_string) {
+ ret = cft->write_string(css, cft, strstrip(buf));
+ } else if (cft->write_u64) {
+ unsigned long long v;
+ ret = kstrtoull(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_u64(css, cft, v);
+ } else if (cft->write_s64) {
+ long long v;
+ ret = kstrtoll(buf, 0, &v);
+ if (!ret)
+ ret = cft->write_s64(css, cft, v);
+ } else if (cft->trigger) {
+ ret = cft->trigger(css, (unsigned int)cft->private);
} else {
- s64 val = simple_strtoll(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
- retval = cft->write_s64(css, cft, val);
+ ret = -EINVAL;
}
- if (!retval)
- retval = nbytes;
- return retval;
+ out_free:
+ kfree(buf);
+ return ret ?: nbytes;
}
- static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- const char __user *userbuf, size_t nbytes,
- loff_t *unused_ppos)
+ /*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+ static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
- char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
- int retval = 0;
- size_t max_bytes = cft->max_write_len;
- char *buffer = local_buffer;
+ struct cftype *cft = seq_cft(seq);
- if (!max_bytes)
- max_bytes = sizeof(local_buffer) - 1;
- if (nbytes >= max_bytes)
- return -E2BIG;
- /* Allocate a dynamic buffer if we need one */
- if (nbytes >= sizeof(local_buffer)) {
- buffer = kmalloc(nbytes + 1, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
- }
- if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
- retval = -EFAULT;
- goto out;
+ if (cft->seq_start) {
+ return cft->seq_start(seq, ppos);
+ } else {
+ /*
+ * The same behavior and code as single_open(). Returns
+ * !NULL if pos is at the beginning; otherwise, NULL.
+ */
+ return NULL + !*ppos;
}
-
- buffer[nbytes] = 0; /* nul-terminate */
- retval = cft->write_string(css, cft, strstrip(buffer));
- if (!retval)
- retval = nbytes;
- out:
- if (buffer != local_buffer)
- kfree(buffer);
- return retval;
}
- static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
- size_t nbytes, loff_t *ppos)
+ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup_subsys_state *css = cfe->css;
+ struct cftype *cft = seq_cft(seq);
- if (cft->write)
- return cft->write(css, cft, file, buf, nbytes, ppos);
- if (cft->write_u64 || cft->write_s64)
- return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
- if (cft->write_string)
- return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
- if (cft->trigger) {
- int ret = cft->trigger(css, (unsigned int)cft->private);
- return ret ? ret : nbytes;
+ if (cft->seq_next) {
+ return cft->seq_next(seq, v, ppos);
+ } else {
+ /*
+ * The same behavior and code as single_open(), always
+ * terminate after the initial read.
+ */
+ ++*ppos;
+ return NULL;
}
- return -EINVAL;
}
- static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+ static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- u64 val = cft->read_u64(css, cft);
- int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+ struct cftype *cft = seq_cft(seq);
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+ if (cft->seq_stop)
+ cft->seq_stop(seq, v);
}
- static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
- char tmp[CGROUP_LOCAL_BUFFER_SIZE];
- s64 val = cft->read_s64(css, cft);
- int len = sprintf(tmp, "%lld\n", (long long) val);
+ struct cftype *cft = seq_cft(m);
+ struct cgroup_subsys_state *css = seq_css(m);
- return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
- }
+ if (cft->seq_show)
+ return cft->seq_show(m, arg);
- static ssize_t cgroup_file_read(struct file *file, char __user *buf,
- size_t nbytes, loff_t *ppos)
- {
- struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
- struct cgroup_subsys_state *css = cfe->css;
-
- if (cft->read)
- return cft->read(css, cft, file, buf, nbytes, ppos);
if (cft->read_u64)
- return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
- if (cft->read_s64)
- return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
- return -EINVAL;
- }
-
- /*
- * seqfile ops/methods for returning structured data. Currently just
- * supports string->u64 maps, but can be extended in future.
- */
-
- static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
- {
- struct seq_file *sf = cb->state;
- return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
- }
-
- static int cgroup_seqfile_show(struct seq_file *m, void *arg)
- {
- struct cfent *cfe = m->private;
- struct cftype *cft = cfe->type;
- struct cgroup_subsys_state *css = cfe->css;
-
- if (cft->read_map) {
- struct cgroup_map_cb cb = {
- .fill = cgroup_map_add,
- .state = m,
- };
- return cft->read_map(css, cft, &cb);
- }
- return cft->read_seq_string(css, cft, m);
+ seq_printf(m, "%llu\n", cft->read_u64(css, cft));
+ else if (cft->read_s64)
+ seq_printf(m, "%lld\n", cft->read_s64(css, cft));
+ else
+ return -EINVAL;
+ return 0;
}
- static const struct file_operations cgroup_seqfile_operations = {
- .read = seq_read,
- .write = cgroup_file_write,
- .llseek = seq_lseek,
- .release = cgroup_file_release,
+ static struct seq_operations cgroup_seq_operations = {
+ .start = cgroup_seqfile_start,
+ .next = cgroup_seqfile_next,
+ .stop = cgroup_seqfile_stop,
+ .show = cgroup_seqfile_show,
};
static int cgroup_file_open(struct inode *inode, struct file *file)
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
struct cgroup_subsys_state *css;
+ struct cgroup_open_file *of;
int err;
err = generic_file_open(inode, file);
WARN_ON_ONCE(cfe->css && cfe->css != css);
cfe->css = css;
- if (cft->read_map || cft->read_seq_string) {
- file->f_op = &cgroup_seqfile_operations;
- err = single_open(file, cgroup_seqfile_show, cfe);
- } else if (cft->open) {
- err = cft->open(inode, file);
+ of = __seq_open_private(file, &cgroup_seq_operations,
+ sizeof(struct cgroup_open_file));
+ if (of) {
+ of->cfe = cfe;
+ return 0;
}
- if (css->ss && err)
+ if (css->ss)
css_put(css);
- return err;
+ return -ENOMEM;
}
static int cgroup_file_release(struct inode *inode, struct file *file)
{
struct cfent *cfe = __d_cfe(file->f_dentry);
- struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup_subsys_state *css = cfe->css;
- int ret = 0;
- if (cft->release)
- ret = cft->release(inode, file);
if (css->ss)
css_put(css);
- if (file->f_op == &cgroup_seqfile_operations)
- single_release(inode, file);
- return ret;
+ return seq_release_private(inode, file);
}
/*
}
static const struct file_operations cgroup_file_operations = {
- .read = cgroup_file_read,
+ .read = seq_read,
.write = cgroup_file_write,
.llseek = generic_file_llseek,
.open = cgroup_file_open,
.removexattr = cgroup_removexattr,
};
- /*
- * Check if a file is a control file
- */
- static inline struct cftype *__file_cft(struct file *file)
- {
- if (file_inode(file)->i_fop != &cgroup_file_operations)
- return ERR_PTR(-EINVAL);
- return __d_cft(file->f_dentry);
- }
-
static int cgroup_create_file(struct dentry *dentry, umode_t mode,
struct super_block *sb)
{
if (cft->mode)
return cft->mode;
- if (cft->read || cft->read_u64 || cft->read_s64 ||
- cft->read_map || cft->read_seq_string)
+ if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
- if (cft->write || cft->write_u64 || cft->write_s64 ||
- cft->write_string || cft->trigger)
+ if (cft->write_u64 || cft->write_s64 || cft->write_string ||
+ cft->trigger)
mode |= S_IWUSR;
return mode;
* @parent_css: css whose children to walk
*
* This function returns the next child of @parent_css and should be called
- * under RCU read lock. The only requirement is that @parent_css and
- * @pos_css are accessible. The next sibling is guaranteed to be returned
- * regardless of their states.
+ * under either cgroup_mutex or RCU read lock. The only requirement is
+ * that @parent_css and @pos_css are accessible. The next sibling is
+ * guaranteed to be returned regardless of their states.
*/
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
struct cgroup *cgrp = parent_css->cgroup;
struct cgroup *next;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ cgroup_assert_mutex_or_rcu_locked();
/*
* @pos could already have been removed. Once a cgroup is removed,
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
*
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section. This
- * function will return the correct next descendant as long as both @pos
- * and @root are accessible and @pos is a descendant of @root.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @root are accessible and @pos is a descendant of @root.
*/
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
{
struct cgroup_subsys_state *next;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit @root */
if (!pos)
* is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section. This
- * function will return the correct rightmost descendant as long as @pos is
- * accessible.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct rightmost descendant as
+ * long as @pos is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last, *tmp;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ cgroup_assert_mutex_or_rcu_locked();
do {
last = pos;
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
*
- * While this function requires RCU read locking, it doesn't require the
- * whole traversal to be contained in a single RCU critical section. This
- * function will return the correct next descendant as long as both @pos
- * and @cgroup are accessible and @pos is a descendant of @cgroup.
+ * While this function requires cgroup_mutex or RCU read locking, it
+ * doesn't require the whole traversal to be contained in a single critical
+ * section. This function will return the correct next descendant as long
+ * as both @pos and @cgroup are accessible and @pos is a descendant of
+ * @cgroup.
*/
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
{
struct cgroup_subsys_state *next;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
pid_t *list;
/* how many elements the above list has */
int length;
- /* how many files are using the current array */
- int use_count;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
- /* protects the other fields */
- struct rw_semaphore rwsem;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
};
/*
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
+
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
kfree(p);
}
+ /*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+ static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+ {
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+ }
+
+ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+ {
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
+ }
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+ }
+
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
return dest;
}
+ /*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+ static pid_t pid_fry(pid_t pid)
+ {
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+ }
+
+ static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+ {
+ if (cgroup_sane_behavior(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+ }
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
+ static int fried_cmppid(const void *a, const void *b)
+ {
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+ }
+
+ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+ {
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+ }
+
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
- static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
+ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = task_active_pid_ns(current);
- /*
- * We can't drop the pidlist_mutex before taking the l->rwsem in case
- * the last ref-holder is trying to remove l from the list at the same
- * time. Holding the pidlist_mutex precludes somebody taking whichever
- * list we find out from under us - compare release_pid_array().
- */
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry(l, &cgrp->pidlists, links) {
- if (l->key.type == type && l->key.ns == ns) {
- /* make sure l doesn't vanish out from under us */
- down_write(&l->rwsem);
- mutex_unlock(&cgrp->pidlist_mutex);
- return l;
- }
- }
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
+ if (!l)
return l;
- }
- init_rwsem(&l->rwsem);
- down_write(&l->rwsem);
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
- l->key.ns = get_pid_ns(ns);
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
- mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
struct task_struct *tsk;
struct cgroup_pidlist *l;
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (cgroup_sane_behavior(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
- l = cgroup_pidlist_find(cgrp, type);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
+ mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
- /* store array, freeing old if necessary - lock already held */
+
+ /* store array, freeing old if necessary */
pidlist_free(l->list);
l->list = array;
l->length = length;
- l->use_count++;
- up_write(&l->rwsem);
*lp = l;
return 0;
}
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
- struct cgroup_pidlist *l = s->private;
+ struct cgroup_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
- int *iter;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
- down_read(&l->rwsem);
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
- if (l->list[mid] == pid) {
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
- *pos = *iter;
+ *pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
- struct cgroup_pidlist *l = s->private;
- up_read(&l->rwsem);
+ struct cgroup_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct cgroup_pidlist *l = s->private;
+ struct cgroup_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
if (p >= end) {
return NULL;
} else {
- *pos = *p;
+ *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
.show = cgroup_pidlist_show,
};
- static void cgroup_release_pid_array(struct cgroup_pidlist *l)
- {
- /*
- * the case where we're the last user of this particular pidlist will
- * have us remove it from the cgroup's list, which entails taking the
- * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
- * pidlist_mutex, we have to take pidlist_mutex first.
- */
- mutex_lock(&l->owner->pidlist_mutex);
- down_write(&l->rwsem);
- BUG_ON(!l->use_count);
- if (!--l->use_count) {
- /* we're the last user if refcount is 0; remove and free */
- list_del(&l->links);
- mutex_unlock(&l->owner->pidlist_mutex);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- up_write(&l->rwsem);
- kfree(l);
- return;
- }
- mutex_unlock(&l->owner->pidlist_mutex);
- up_write(&l->rwsem);
- }
-
- static int cgroup_pidlist_release(struct inode *inode, struct file *file)
- {
- struct cgroup_pidlist *l;
- if (!(file->f_mode & FMODE_READ))
- return 0;
- /*
- * the seq_file will only be initialized if the file was opened for
- * reading; hence we check if it's not null only in that case.
- */
- l = ((struct seq_file *)file->private_data)->private;
- cgroup_release_pid_array(l);
- return seq_release(inode, file);
- }
-
- static const struct file_operations cgroup_pidlist_operations = {
- .read = seq_read,
- .llseek = seq_lseek,
- .write = cgroup_file_write,
- .release = cgroup_pidlist_release,
- };
-
- /*
- * The following functions handle opens on a file that displays a pidlist
- * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
- * in the cgroup.
- */
- /* helper function for the two below it */
- static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
- {
- struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- struct cgroup_pidlist *l;
- int retval;
-
- /* Nothing to do for write-only files */
- if (!(file->f_mode & FMODE_READ))
- return 0;
-
- /* have the array populated */
- retval = pidlist_array_load(cgrp, type, &l);
- if (retval)
- return retval;
- /* configure file information */
- file->f_op = &cgroup_pidlist_operations;
-
- retval = seq_open(file, &cgroup_pidlist_seq_operations);
- if (retval) {
- cgroup_release_pid_array(l);
- return retval;
- }
- ((struct seq_file *)file->private_data)->private = l;
- return 0;
- }
- static int cgroup_tasks_open(struct inode *unused, struct file *file)
- {
- return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
- }
- static int cgroup_procs_open(struct inode *unused, struct file *file)
- {
- return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
- }
-
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
{
deactivate_super(sb);
}
- /*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
- static void cgroup_event_remove(struct work_struct *work)
- {
- struct cgroup_event *event = container_of(work, struct cgroup_event,
- remove);
- struct cgroup_subsys_state *css = event->css;
-
- remove_wait_queue(event->wqh, &event->wait);
-
- event->cft->unregister_event(css, event->cft, event->eventfd);
-
- /* Notify userspace the event is going away. */
- eventfd_signal(event->eventfd, 1);
-
- eventfd_ctx_put(event->eventfd);
- kfree(event);
- css_put(css);
- }
-
- /*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
- static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
- {
- struct cgroup_event *event = container_of(wait,
- struct cgroup_event, wait);
- struct cgroup *cgrp = event->css->cgroup;
- unsigned long flags = (unsigned long)key;
-
- if (flags & POLLHUP) {
- /*
- * If the event has been detached at cgroup removal, we
- * can simply return knowing the other side will cleanup
- * for us.
- *
- * We can't race against event freeing since the other
- * side will require wqh->lock via remove_wait_queue(),
- * which we hold.
- */
- spin_lock(&cgrp->event_list_lock);
- if (!list_empty(&event->list)) {
- list_del_init(&event->list);
- /*
- * We are in atomic context, but cgroup_event_remove()
- * may sleep, so we have to call it in workqueue.
- */
- schedule_work(&event->remove);
- }
- spin_unlock(&cgrp->event_list_lock);
- }
-
- return 0;
- }
-
- static void cgroup_event_ptable_queue_proc(struct file *file,
- wait_queue_head_t *wqh, poll_table *pt)
- {
- struct cgroup_event *event = container_of(pt,
- struct cgroup_event, pt);
-
- event->wqh = wqh;
- add_wait_queue(wqh, &event->wait);
- }
-
- /*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
- static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
- struct cftype *cft, const char *buffer)
- {
- struct cgroup *cgrp = dummy_css->cgroup;
- struct cgroup_event *event;
- struct cgroup_subsys_state *cfile_css;
- unsigned int efd, cfd;
- struct fd efile;
- struct fd cfile;
- char *endp;
- int ret;
-
- efd = simple_strtoul(buffer, &endp, 10);
- if (*endp != ' ')
- return -EINVAL;
- buffer = endp + 1;
-
- cfd = simple_strtoul(buffer, &endp, 10);
- if ((*endp != ' ') && (*endp != '\0'))
- return -EINVAL;
- buffer = endp + 1;
-
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&event->list);
- init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
- init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
- INIT_WORK(&event->remove, cgroup_event_remove);
-
- efile = fdget(efd);
- if (!efile.file) {
- ret = -EBADF;
- goto out_kfree;
- }
-
- event->eventfd = eventfd_ctx_fileget(efile.file);
- if (IS_ERR(event->eventfd)) {
- ret = PTR_ERR(event->eventfd);
- goto out_put_efile;
- }
-
- cfile = fdget(cfd);
- if (!cfile.file) {
- ret = -EBADF;
- goto out_put_eventfd;
- }
-
- /* the process need read permission on control file */
- /* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile.file), MAY_READ);
- if (ret < 0)
- goto out_put_cfile;
-
- event->cft = __file_cft(cfile.file);
- if (IS_ERR(event->cft)) {
- ret = PTR_ERR(event->cft);
- goto out_put_cfile;
- }
-
- if (!event->cft->ss) {
- ret = -EBADF;
- goto out_put_cfile;
- }
-
- /*
- * Determine the css of @cfile, verify it belongs to the same
- * cgroup as cgroup.event_control, and associate @event with it.
- * Remaining events are automatically removed on cgroup destruction
- * but the removal is asynchronous, so take an extra ref.
- */
- rcu_read_lock();
-
- ret = -EINVAL;
- event->css = cgroup_css(cgrp, event->cft->ss);
- cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
- if (event->css && event->css == cfile_css && css_tryget(event->css))
- ret = 0;
-
- rcu_read_unlock();
- if (ret)
- goto out_put_cfile;
-
- if (!event->cft->register_event || !event->cft->unregister_event) {
- ret = -EINVAL;
- goto out_put_css;
- }
-
- ret = event->cft->register_event(event->css, event->cft,
- event->eventfd, buffer);
- if (ret)
- goto out_put_css;
-
- efile.file->f_op->poll(efile.file, &event->pt);
-
- spin_lock(&cgrp->event_list_lock);
- list_add(&event->list, &cgrp->event_list);
- spin_unlock(&cgrp->event_list_lock);
-
- fdput(cfile);
- fdput(efile);
-
- return 0;
-
- out_put_css:
- css_put(event->css);
- out_put_cfile:
- fdput(cfile);
- out_put_eventfd:
- eventfd_ctx_put(event->eventfd);
- out_put_efile:
- fdput(efile);
- out_kfree:
- kfree(event);
-
- return ret;
- }
-
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
- .open = cgroup_procs_open,
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
.write_u64 = cgroup_procs_write,
- .release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
- {
- .name = "cgroup.event_control",
- .write_string = cgroup_write_event_control,
- .mode = S_IWUGO,
- },
{
.name = "cgroup.clone_children",
.flags = CFTYPE_INSANE,
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
- .read_seq_string = cgroup_sane_behavior_show,
+ .seq_show = cgroup_sane_behavior_show,
},
/*
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
- .open = cgroup_tasks_open,
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
.write_u64 = cgroup_tasks_write,
- .release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
- .read_seq_string = cgroup_release_agent_show,
+ .seq_show = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
+ rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL);
call_rcu(&css->rcu_head, css_free_rcu_fn);
}
RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
}
+ /**
+ * create_css - create a cgroup_subsys_state
+ * @cgrp: the cgroup new css will be associated with
+ * @ss: the subsys of new css
+ *
+ * Create a new css associated with @cgrp - @ss pair. On success, the new
+ * css is online and installed in @cgrp with all interface files created.
+ * Returns 0 on success, -errno on failure.
+ */
+ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ {
+ struct cgroup *parent = cgrp->parent;
+ struct cgroup_subsys_state *css;
+ int err;
+
+ lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ css = ss->css_alloc(cgroup_css(parent, ss));
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+
+ err = percpu_ref_init(&css->refcnt, css_release);
+ if (err)
+ goto err_free;
+
+ init_css(css, ss, cgrp);
+
+ err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
+ if (err)
+ goto err_free;
+
+ err = online_css(css);
+ if (err)
+ goto err_free;
+
+ dget(cgrp->dentry);
+ css_get(css->parent);
+
+ if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
+ parent->parent) {
+ pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
+ current->comm, current->pid, ss->name);
+ if (!strcmp(ss->name, "memory"))
+ pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
+ ss->warned_broken_hierarchy = true;
+ }
+
+ return 0;
+
+ err_free:
+ percpu_ref_cancel_init(&css->refcnt);
+ ss->css_free(css);
+ return err;
+ }
+
/*
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
umode_t mode)
{
- struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
struct cgroup *cgrp;
struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
- int err = 0;
+ int ssid, err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css;
-
- css = ss->css_alloc(cgroup_css(parent, ss));
- if (IS_ERR(css)) {
- err = PTR_ERR(css);
- goto err_free_all;
- }
- css_ar[ss->subsys_id] = css;
-
- err = percpu_ref_init(&css->refcnt, css_release);
- if (err)
- goto err_free_all;
-
- init_css(css, ss, cgrp);
- }
-
/*
* Create directory. cgroup_create_file() returns with the new
* directory locked on success so that it can be populated without
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
- goto err_free_all;
+ goto err_unlock;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
/* hold a ref to the parent's dentry */
dget(parent->dentry);
- /* creation succeeded, notify subsystems */
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
- err = online_css(css);
- if (err)
- goto err_destroy;
-
- /* each css holds a ref to the cgroup's dentry and parent css */
- dget(dentry);
- css_get(css->parent);
-
- /* mark it consumed for error path */
- css_ar[ss->subsys_id] = NULL;
-
- if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
- parent->parent) {
- pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
- if (!strcmp(ss->name, "memory"))
- pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
- ss->warned_broken_hierarchy = true;
- }
- }
-
+ /*
+ * @cgrp is now fully operational. If something fails after this
+ * point, it'll be released via the normal destruction path.
+ */
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
if (err)
goto err_destroy;
- err = cgroup_populate_dir(cgrp, root->subsys_mask);
- if (err)
- goto err_destroy;
+ /* let's create and online css's */
+ for_each_subsys(ss, ssid) {
+ if (root->subsys_mask & (1 << ssid)) {
+ err = create_css(cgrp, ss);
+ if (err)
+ goto err_destroy;
+ }
+ }
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
- err_free_all:
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
- if (css) {
- percpu_ref_cancel_init(&css->refcnt);
- ss->css_free(css);
- }
- }
+ err_unlock:
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
return err;
err_destroy:
- for_each_root_subsys(root, ss) {
- struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
-
- if (css) {
- percpu_ref_cancel_init(&css->refcnt);
- ss->css_free(css);
- }
- }
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&dentry->d_inode->i_mutex);
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
- struct cgroup_event *event, *tmp;
- struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
struct cgroup *child;
bool empty;
+ int ssid;
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
* will be invoked to perform the rest of destruction once the
* percpu refs of all css's are confirmed to be killed.
*/
- for_each_root_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
-
- if (css)
- kill_css(css);
- }
+ for_each_css(css, ssid, cgrp)
+ kill_css(css);
/*
* Mark @cgrp dead. This prevents further task migration and child
dget(d);
cgroup_d_remove_dir(d);
- /*
- * Unregister events and notify userspace.
- * Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace.
- */
- spin_lock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
- list_del_init(&event->list);
- schedule_work(&event->remove);
- }
- spin_unlock(&cgrp->event_list_lock);
-
return 0;
};
/* delete this cgroup from parent->children */
list_del_rcu(&cgrp->sibling);
- /*
- * We should remove the cgroup object from idr before its grace
- * period starts, so we won't be looking up a cgroup while the
- * cgroup is being freed.
- */
- idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
- cgrp->id = -1;
-
dput(d);
set_bit(CGRP_RELEASABLE, &parent->flags);
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
- list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
/* We don't handle early failures gracefully */
cgroup_init_cftsets(ss);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
cgroup_subsys[ss->subsys_id] = ss;
/*
if (IS_ERR(css)) {
/* failure case - need to deassign the cgroup_subsys[] slot. */
cgroup_subsys[ss->subsys_id] = NULL;
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
- list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
/* our new subsystem will be attached to the dummy hierarchy. */
write_unlock(&css_set_lock);
ret = online_css(css);
- if (ret)
+ if (ret) {
+ ss->css_free(css);
goto err_unload;
+ }
/* success! */
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
return 0;
err_unload:
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
/* @ss can't be mounted here as try_module_get() would fail */
cgroup_unload_subsys(ss);
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cgrp_cset_link *link;
+ struct cgroup_subsys_state *css;
BUG_ON(ss->module == NULL);
BUG_ON(ss->root != &cgroup_dummy_root);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
- offline_css(cgroup_css(cgroup_dummy_top, ss));
+ css = cgroup_css(cgroup_dummy_top, ss);
+ if (css)
+ offline_css(css);
/* deassign the subsys_id */
cgroup_subsys[ss->subsys_id] = NULL;
- /* remove subsystem from the dummy root's list of subsystems */
- list_del_init(&ss->sibling);
-
/*
* disentangle the css from all css_sets attached to the dummy
* top. as in loading, we need to pay our respects to the hashtable
* need to free before marking as null because ss->css_free needs
* the cgrp->subsys pointer to find their state.
*/
- ss->css_free(cgroup_css(cgroup_dummy_top, ss));
+ if (css)
+ ss->css_free(css);
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
+
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+
return 0;
}
core_initcall(cgroup_wq_init);
for_each_active_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int count = 0;
+ int ssid, count = 0;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_root_subsys(root, ss)
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
* @dentry: directory dentry of interest
* @ss: subsystem of interest
*
- * Must be called under RCU read lock. The caller is responsible for
- * pinning the returned css if it needs to be accessed outside the RCU
- * critical section.
+ * Must be called under cgroup_mutex or RCU read lock. The caller is
+ * responsible for pinning the returned css if it needs to be accessed
+ * outside the critical section.
*/
struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
{
struct cgroup *cgrp;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ cgroup_assert_mutex_or_rcu_locked();
/* is @dentry a cgroup dir? */
if (!dentry->d_inode ||
{
struct cgroup *cgrp;
- rcu_lockdep_assert(rcu_read_lock_held() ||
- lockdep_is_held(&cgroup_mutex),
- "css_from_id() needs proper protection");
+ cgroup_assert_mutex_or_rcu_locked();
cgrp = idr_find(&ss->root->cgroup_idr, id);
if (cgrp)
return count;
}
- static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
- struct cftype *cft,
- struct seq_file *seq)
+ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
struct cgrp_cset_link *link;
struct css_set *cset;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
- static int cgroup_css_links_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *seq)
+ static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
+ struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
read_lock(&css_set_lock);
{
.name = "current_css_set_cg_links",
- .read_seq_string = current_css_set_cg_links_read,
+ .seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
- .read_seq_string = cgroup_css_links_read,
+ .seq_show = cgroup_css_links_read,
},
{
*/
int sysctl_sched_rt_runtime = 950000;
-
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
{
int prio;
- if (task_has_rt_policy(p))
+ if (task_has_dl_policy(p))
+ prio = MAX_DL_PRIO-1;
+ else if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio)
+ } else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
* TIF_NEED_RESCHED remotely (for the first time) will also send
* this IPI.
*/
- if (tif_need_resched())
- set_preempt_need_resched();
+ preempt_fold_need_resched();
if (llist_empty(&this_rq()->wake_list)
&& !tick_nohz_full_cpu(smp_processor_id())
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
+ RB_CLEAR_NODE(&p->dl.rb_node);
+ hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ p->dl.dl_runtime = p->dl.runtime = 0;
+ p->dl.dl_deadline = p->dl.deadline = 0;
+ p->dl.dl_period = 0;
+ p->dl.flags = 0;
+
INIT_LIST_HEAD(&p->rt.run_list);
#ifdef CONFIG_PREEMPT_NOTIFIERS
/*
* fork()/clone()-time setup:
*/
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
unsigned long flags;
int cpu = get_cpu();
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
- if (task_has_rt_policy(p)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
p->sched_reset_on_fork = 0;
}
- if (!rt_prio(p->prio))
+ if (dl_prio(p->prio)) {
+ put_cpu();
+ return -EAGAIN;
+ } else if (rt_prio(p->prio)) {
+ p->sched_class = &rt_sched_class;
+ } else {
p->sched_class = &fair_sched_class;
+ }
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
init_task_preempt_count(p);
#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ RB_CLEAR_NODE(&p->pushable_dl_tasks);
#endif
put_cpu();
+ return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 20;
+
+ /*
+ * Doing this here saves a lot of checks in all
+ * the calling paths, and returning zero seems
+ * safe for them anyway.
+ */
+ if (period == 0)
+ return 0;
+
+ return div64_u64(runtime << 20, period);
+}
+
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->rd->dl_bw;
}
+static inline int dl_bw_cpus(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ int cpus = 0;
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cpus++;
+
+ return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+ return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+ const struct sched_attr *attr)
+{
+
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ u64 period = attr->sched_period;
+ u64 runtime = attr->sched_runtime;
+ u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+ int cpus, err = -1;
+
+ if (new_bw == p->dl.dl_bw)
+ return 0;
+
+ /*
+ * Either if a task, enters, leave, or stays -deadline but changes
+ * its parameters, we may need to update accordingly the total
+ * allocated bandwidth of the container.
+ */
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(task_cpu(p));
+ if (dl_policy(policy) && !task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+ !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ __dl_add(dl_b, new_bw);
+ err = 0;
+ } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+ __dl_clear(dl_b, p->dl.dl_bw);
+ err = 0;
+ }
+ raw_spin_unlock(&dl_b->lock);
+
+ return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
if (unlikely(prev_state == TASK_DEAD)) {
task_numa_free(prev);
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq, cpu);
+ trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
}
{
/*
* Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path for now.
- * Otherwise, whine if we are scheduling when we should not be.
+ * schedule() atomically, we ignore that path. Otherwise whine
+ * if we are scheduling when we should not.
*/
- if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+ if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
__schedule_bug(prev);
rcu_sleep_check();
} while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
/*
* this is the entry point to schedule() from kernel preemption
exception_exit(prev_state);
}
-#endif /* CONFIG_PREEMPT */
-
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, on_rq, running;
+ int oldprio, on_rq, running, enqueue_flag = 0;
struct rq *rq;
const struct sched_class *prev_class;
- BUG_ON(prio < 0 || prio > MAX_PRIO);
+ BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p);
}
trace_sched_pi_setprio(p, prio);
+ p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->on_rq;
if (running)
p->sched_class->put_prev_task(rq, p);
- if (rt_prio(prio))
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+ dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+ p->dl.dl_boosted = 1;
+ p->dl.dl_throttled = 0;
+ enqueue_flag = ENQUEUE_REPLENISH;
+ } else
+ p->dl.dl_boosted = 0;
+ p->sched_class = &dl_sched_class;
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
+ if (oldprio < prio)
+ enqueue_flag = ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
- else
+ } else {
+ if (dl_prio(oldprio))
+ p->dl.dl_boosted = 0;
p->sched_class = &fair_sched_class;
+ }
p->prio = prio;
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+ enqueue_task(rq, p, enqueue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
__task_rq_unlock(rq);
}
#endif
+
void set_user_nice(struct task_struct *p, long nice)
{
int old_prio, delta, on_rq;
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
- * SCHED_FIFO/SCHED_RR:
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
*/
- if (task_has_rt_policy(p)) {
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
return pid ? find_task_by_vpid(pid) : current;
}
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ init_dl_task_timer(dl_se);
+ dl_se->dl_runtime = attr->sched_runtime;
+ dl_se->dl_deadline = attr->sched_deadline;
+ dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+ dl_se->flags = attr->sched_flags;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_throttled = 0;
+ dl_se->dl_new = 1;
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
{
+ int policy = attr->sched_policy;
+
+ if (policy == -1) /* setparam */
+ policy = p->policy;
+
p->policy = policy;
- p->rt_priority = prio;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+
p->normal_prio = normal_prio(p);
- /* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
- if (rt_prio(p->prio))
+
+ if (dl_prio(p->prio))
+ p->sched_class = &dl_sched_class;
+ else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
+
set_load_weight(p);
}
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ attr->sched_priority = p->rt_priority;
+ attr->sched_runtime = dl_se->dl_runtime;
+ attr->sched_deadline = dl_se->dl_deadline;
+ attr->sched_period = dl_se->dl_period;
+ attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+ return attr && attr->sched_deadline != 0 &&
+ (attr->sched_period == 0 ||
+ (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
+ (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
+ attr->sched_runtime >= (2 << (DL_SCALE - 1));
+}
+
/*
* check the target process has a UID that matches the current process's
*/
return match;
}
-static int __sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
+ int policy = attr->sched_policy;
unsigned long flags;
const struct sched_class *prev_class;
struct rq *rq;
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
} else {
- reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
- policy &= ~SCHED_RESET_ON_FORK;
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ if (policy != SCHED_DEADLINE &&
+ policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
policy != SCHED_IDLE)
return -EINVAL;
}
+ if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+ return -EINVAL;
+
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
- if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
- (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+ if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if (rt_policy(policy) != (param->sched_priority != 0))
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (user && !capable(CAP_SYS_NICE)) {
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < TASK_NICE(p) &&
+ !can_nice(p, attr->sched_nice))
+ return -EPERM;
+ }
+
if (rt_policy(policy)) {
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
return -EPERM;
/* can't increase priority */
- if (param->sched_priority > p->rt_priority &&
- param->sched_priority > rlim_rtprio)
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
return -EPERM;
}
/*
* If not changing anything there's no need to proceed further:
*/
- if (unlikely(policy == p->policy && (!rt_policy(policy) ||
- param->sched_priority == p->rt_priority))) {
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy))
+ goto change;
+
task_rq_unlock(rq, p, &flags);
return 0;
}
+change:
-#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
/*
* Do not allow realtime tasks into groups that have no runtime
* assigned.
task_rq_unlock(rq, p, &flags);
return -EPERM;
}
- }
#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, &p->cpus_allowed) ||
+ rq->rd->dl_bw.bw == 0) {
+ task_rq_unlock(rq, p, &flags);
+ return -EPERM;
+ }
+ }
+#endif
+ }
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
task_rq_unlock(rq, p, &flags);
goto recheck;
}
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+ task_rq_unlock(rq, p, &flags);
+ return -EBUSY;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
oldprio = p->prio;
prev_class = p->sched_class;
- __setscheduler(rq, p, policy, param->sched_priority);
+ __setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
return 0;
}
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /*
+ * Fixup the legacy SCHED_RESET_ON_FORK hack
+ */
+ if (policy & SCHED_RESET_ON_FORK) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check);
+}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
* @p: the task in question.
int sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, true);
+ return _sched_setscheduler(p, policy, param, true);
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
/**
* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
* @p: the task in question.
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
const struct sched_param *param)
{
- return __sched_setscheduler(p, policy, param, false);
+ return _sched_setscheduler(p, policy, param, false);
}
static int
return retval;
}
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+ return -EFAULT;
+
+ /*
+ * zero the full structure, so that a short copy will be nice.
+ */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE) /* silly large */
+ goto err_size;
+
+ if (!size) /* abi compat */
+ size = SCHED_ATTR_SIZE_VER0;
+
+ if (size < SCHED_ATTR_SIZE_VER0)
+ goto err_size;
+
+ /*
+ * If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(*attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(*attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ ret = get_user(val, addr);
+ if (ret)
+ return ret;
+ if (val)
+ goto err_size;
+ }
+ size = sizeof(*attr);
+ }
+
+ ret = copy_from_user(attr, uattr, size);
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * XXX: do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+ return ret;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ ret = -E2BIG;
+ goto out;
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
return do_sched_setscheduler(pid, -1, param);
}
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+{
+ struct sched_attr attr;
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0)
+ return -EINVAL;
+
+ if (sched_copy_attr(uattr, &attr))
+ return -EFAULT;
+
+ rcu_read_lock();
+ retval = -ESRCH;
+ p = find_process_by_pid(pid);
+ if (p != NULL)
+ retval = sched_setattr(p, &attr);
+ rcu_read_unlock();
+
+ return retval;
+}
+
/**
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
* @pid: the pid in question.
if (retval)
goto out_unlock;
+ if (task_has_dl_policy(p)) {
+ retval = -EINVAL;
+ goto out_unlock;
+ }
lp.sched_priority = p->rt_priority;
rcu_read_unlock();
return retval;
}
+static int sched_read_attr(struct sched_attr __user *uattr,
+ struct sched_attr *attr,
+ unsigned int usize)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_WRITE, uattr, usize))
+ return -EFAULT;
+
+ /*
+ * If we're handed a smaller struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. old
+ * user-space does not get uncomplete information.
+ */
+ if (usize < sizeof(*attr)) {
+ unsigned char *addr;
+ unsigned char *end;
+
+ addr = (void *)attr + usize;
+ end = (void *)attr + sizeof(*attr);
+
+ for (; addr < end; addr++) {
+ if (*addr)
+ goto err_size;
+ }
+
+ attr->size = usize;
+ }
+
+ ret = copy_to_user(uattr, attr, usize);
+ if (ret)
+ return -EFAULT;
+
+out:
+ return ret;
+
+err_size:
+ ret = -E2BIG;
+ goto out;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size)
+{
+ struct sched_attr attr = {
+ .size = sizeof(struct sched_attr),
+ };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || size > PAGE_SIZE ||
+ size < SCHED_ATTR_SIZE_VER0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ p = find_process_by_pid(pid);
+ retval = -ESRCH;
+ if (!p)
+ goto out_unlock;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
+ attr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, &attr);
+ else if (task_has_rt_policy(p))
+ attr.sched_priority = p->rt_priority;
+ else
+ attr.sched_nice = TASK_NICE(p);
+
+ rcu_read_unlock();
+
+ retval = sched_read_attr(uattr, &attr, size);
+ return retval;
+
+out_unlock:
+ rcu_read_unlock();
+ return retval;
+}
+
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
cpumask_var_t cpus_allowed, new_mask;
if (retval)
goto out_unlock;
+
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+#ifdef CONFIG_SMP
+ if (task_has_dl_policy(p)) {
+ const struct cpumask *span = task_rq(p)->rd->span;
+
+ if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+ retval = -EBUSY;
+ goto out_unlock;
+ }
+ }
+#endif
again:
retval = set_cpus_allowed_ptr(p, new_mask);
}
double_rq_lock(rq, p_rq);
- while (task_rq(p) != p_rq) {
+ if (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
goto again;
}
case SCHED_RR:
ret = MAX_USER_RT_PRIO-1;
break;
+ case SCHED_DEADLINE:
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
case SCHED_RR:
ret = 1;
break;
+ case SCHED_DEADLINE:
case SCHED_NORMAL:
case SCHED_BATCH:
case SCHED_IDLE:
static int sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ unsigned long flags;
+ long cpu = (long)hcpu;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
+ set_cpu_active(cpu, false);
+
+ /* explicitly allow suspend */
+ if (!(action & CPU_TASKS_FROZEN)) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ bool overflow;
+ int cpus;
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+ if (overflow)
+ return notifier_from_errno(-EBUSY);
+ }
return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
}
+
+ return NOTIFY_DONE;
}
static int __init migration_init(void)
struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
+ cpudl_cleanup(&rd->cpudl);
+ free_cpumask_var(rd->dlo_mask);
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rt yet then
+ * If we dont want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
goto out;
if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
goto free_online;
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_dlo_mask;
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
free_rto_mask:
free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+ free_cpumask_var(rd->dlo_mask);
free_online:
free_cpumask_var(rd->online);
free_span:
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
+ struct sched_domain *busy_sd = NULL;
int id = cpu;
int size = 1;
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
- rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
+ busy_sd = sd->parent; /* sd_busy */
}
+ rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
* die on a /0 trap.
*/
sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+ sg->sgp->power_orig = sg->sgp->power;
/*
* Make sure the first group of this domain contains the
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
+ init_sched_dl_class();
}
#else
void __init sched_init_smp(void)
#endif /* CONFIG_CPUMASK_OFFSTACK */
}
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth,
+ global_rt_period(), global_rt_runtime());
+
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
-
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
+ init_dl_rq(&rq->dl, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
-#ifdef CONFIG_RT_MUTEXES
- plist_head_init(&init_task.pi_waiters);
-#endif
-
/*
* The boot idle thread does lazy MMU switching as well:
*/
static void normalize_task(struct rq *rq, struct task_struct *p)
{
const struct sched_class *prev_class = p->sched_class;
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ };
int old_prio = p->prio;
int on_rq;
on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
- __setscheduler(rq, p, SCHED_NORMAL, 0);
+ __setscheduler(rq, p, &attr);
if (on_rq) {
enqueue_task(rq, p, 0);
resched_task(rq->curr);
p->se.statistics.block_start = 0;
#endif
- if (!rt_task(p)) {
+ if (!dl_task(p) && !rt_task(p)) {
/*
* Renice negative nice level userspace
* tasks back to 0:
}
#endif /* CONFIG_CGROUP_SCHED */
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
- if (runtime == RUNTIME_INF)
- return 1ULL << 20;
-
- return div64_u64(runtime << 20, period);
-}
-#endif
-
#ifdef CONFIG_RT_GROUP_SCHED
/*
* Ensure that the real time constraints are schedulable.
do_div(rt_period_us, NSEC_PER_USEC);
return rt_period_us;
}
+#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
static int sched_rt_global_constraints(void)
{
- u64 runtime, period;
int ret = 0;
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- runtime = global_rt_runtime();
- period = global_rt_period();
-
- /*
- * Sanity check on the sysctl variables.
- */
- if (runtime > period && runtime != RUNTIME_INF)
- return -EINVAL;
-
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
ret = __rt_schedulable(NULL, 0, 0);
static int sched_rt_global_constraints(void)
{
unsigned long flags;
- int i;
-
- if (sysctl_sched_rt_period <= 0)
- return -EINVAL;
-
- /*
- * There's always some RT tasks in the root group
- * -- migration, kstopmachine etc..
- */
- if (sysctl_sched_rt_runtime == 0)
- return -EBUSY;
+ int i, ret = 0;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
}
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- return 0;
+ return ret;
}
#endif /* CONFIG_RT_GROUP_SCHED */
-int sched_rr_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int sched_dl_global_constraints(void)
{
- int ret;
- static DEFINE_MUTEX(mutex);
+ u64 runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+ u64 new_bw = to_ratio(period, runtime);
+ int cpu, ret = 0;
- mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- /* make sure that internally we keep jiffies */
- /* also, writing zero resets timeslice to default */
- if (!ret && write) {
- sched_rr_timeslice = sched_rr_timeslice <= 0 ?
- RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(cpu) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock(&dl_b->lock);
+ if (new_bw < dl_b->total_bw)
+ ret = -EBUSY;
+ raw_spin_unlock(&dl_b->lock);
+
+ if (ret)
+ break;
}
- mutex_unlock(&mutex);
+
return ret;
}
+static void sched_dl_do_global(void)
+{
+ u64 new_bw = -1;
+ int cpu;
+
+ def_dl_bandwidth.dl_period = global_rt_period();
+ def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+ if (global_rt_runtime() != RUNTIME_INF)
+ new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(cpu) {
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+
+ raw_spin_lock(&dl_b->lock);
+ dl_b->bw = new_bw;
+ raw_spin_unlock(&dl_b->lock);
+ }
+}
+
+static int sched_rt_global_validate(void)
+{
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+ def_rt_bandwidth.rt_runtime = global_rt_runtime();
+ def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret;
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
+ int ret;
mutex_lock(&mutex);
old_period = sysctl_sched_rt_period;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (!ret && write) {
+ ret = sched_rt_global_validate();
+ if (ret)
+ goto undo;
+
ret = sched_rt_global_constraints();
- if (ret) {
- sysctl_sched_rt_period = old_period;
- sysctl_sched_rt_runtime = old_runtime;
- } else {
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period =
- ns_to_ktime(global_rt_period());
- }
+ if (ret)
+ goto undo;
+
+ ret = sched_dl_global_constraints();
+ if (ret)
+ goto undo;
+
+ sched_rt_do_global();
+ sched_dl_do_global();
+ }
+ if (0) {
+undo:
+ sysctl_sched_rt_period = old_period;
+ sysctl_sched_rt_runtime = old_runtime;
}
mutex_unlock(&mutex);
return ret;
}
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
#ifdef CONFIG_CGROUP_SCHED
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
return ret;
}
- static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
- struct cgroup_map_cb *cb)
+ static int cpu_stats_show(struct seq_file *sf, void *v)
{
- struct task_group *tg = css_tg(css);
+ struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
- cb->fill(cb, "nr_periods", cfs_b->nr_periods);
- cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
- cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+ seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+ seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+ seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
return 0;
}
},
{
.name = "stat",
- .read_map = cpu_stats_show,
+ .seq_show = cpu_stats_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
+ #include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
+ #include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
struct eventfd_ctx *eventfd;
};
+ /*
+ * cgroup_event represents events which userspace want to receive.
+ */
+ struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below needed to unregister event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+ };
+
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
atomic_t numainfo_updating;
#endif
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
+ nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/* internal only representation about the status of kmem accounting. */
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
- struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
- {
- return &mem_cgroup_from_css(css)->vmpressure;
- }
-
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
- goto bypass;
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
/*
* We always charge the cgroup the mm_struct belongs to.
}
#ifdef CONFIG_SLABINFO
- static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
return val << PAGE_SHIFT;
}
- static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- char str[64];
u64 val;
- int name, len;
+ int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
BUG();
}
- len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
- return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+ return val;
}
static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
#endif
#ifdef CONFIG_NUMA
- static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+ static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
const char *name;
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
- static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *m)
+ static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
mem_cgroup_oom_notify_cb(iter);
}
- static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
return ret;
}
- static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+ static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+ {
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+ }
+
+ static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+ {
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+ }
+
+ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
mutex_unlock(&memcg->thresholds_lock);
}
- static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+ static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+ {
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+ }
+
+ static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+ {
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+ }
+
+ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
- enum res_type type = MEMFILE_TYPE(cft->private);
- BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
return 0;
}
- static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
- enum res_type type = MEMFILE_TYPE(cft->private);
-
- BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
spin_unlock(&memcg_oom_lock);
}
- static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct cgroup_map_cb *cb)
+ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-
- cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- if (atomic_read(&memcg->under_oom))
- cb->fill(cb, "under_oom", 1);
- else
- cb->fill(cb, "under_oom", 0);
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
}
#endif
+ /*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+ /*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+ static void memcg_event_remove(struct work_struct *work)
+ {
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+ }
+
+ /*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+ static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+ {
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+ }
+
+ static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+ {
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+ }
+
+ /*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+ static int memcg_write_event_control(struct cgroup_subsys_state *css,
+ struct cftype *cft, const char *buffer)
+ {
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ const char *name;
+ char *endp;
+ int ret;
+
+ efd = simple_strtoul(buffer, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buffer = endp + 1;
+
+ cfd = simple_strtoul(buffer, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buffer = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cfile.file->f_dentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ rcu_read_lock();
+
+ ret = -EINVAL;
+ cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
+ &mem_cgroup_subsys);
+ if (cfile_css == css && css_tryget(css))
+ ret = 0;
+
+ rcu_read_unlock();
+ if (ret)
+ goto out_put_cfile;
+
+ ret = event->register_event(memcg, event->eventfd, buffer);
+ if (ret)
+ goto out_put_css;
+
+ efile.file->f_op->poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return 0;
+
+ out_put_css:
+ css_put(css);
+ out_put_cfile:
+ fdput(cfile);
+ out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+ out_put_efile:
+ fdput(efile);
+ out_kfree:
+ kfree(event);
+
+ return ret;
+ }
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
- .read_seq_string = memcg_stat_show,
+ .seq_show = memcg_stat_show,
},
{
.name = "force_empty",
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
+ {
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write_string = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX,
+ .mode = S_IWUGO,
+ },
{
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
},
{
.name = "oom_control",
- .read_map = mem_cgroup_oom_control_read,
+ .seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .register_event = mem_cgroup_oom_register_event,
- .unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
- .register_event = vmpressure_register_event,
- .unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .read_seq_string = memcg_numa_stat_show,
+ .seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .read_seq_string = mem_cgroup_slabinfo_read,
+ .seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
.write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
.trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->event_list);
+ spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event, *tmp;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace.
+ */
+ spin_lock(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget()
+ * rcu_read_unlock()
+ * disable css_tryget()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() to long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);