[linux.git] / kernel / cgroup_pids.c

/*
 * Process number limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
 * after a certain limit is reached.
 *
 * Since it is trivial to hit the task limit without hitting any kmemcg limits
 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
 * of the number of tasks in a cgroup.
 *
 * In order to use the `pids` controller, set the maximum number of tasks in
 * pids.max (this is not available in the root cgroup for obvious reasons). The
 * number of processes currently in the cgroup is given by pids.current.
 * Organisational operations are not blocked by cgroup policies, so it is
 * possible to have pids.current > pids.max. However, it is not possible to
 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
 * would cause a cgroup policy to be violated.
 *
 * To set a cgroup to have no limit, set pids.max to "max". This is the default
 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
 * stringent limit in the hierarchy is followed).
 *
 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
 * a superset of parent/child/pids.current.
 *
 * Copyright (C) 2015 Aleksa Sarai <[email protected]>
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License.  See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>

/*
 * Sentinel stored in pids_cgroup.limit to mean "no limit": one more than
 * PID_MAX_LIMIT. The ULL suffix makes the whole expression unsigned long long.
 */
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
/* Text form of %PIDS_MAX accepted by pids_max_write() and printed by pids_max_show(). */
#define PIDS_MAX_STR "max"

/*
 * Per-cgroup state of the pids controller: the embedded css plus the current
 * charge and the configured limit for this level of the hierarchy.
 */
struct pids_cgroup {
	/*
	 * Embedded generic state; css_pids() recovers the pids_cgroup from it
	 * with container_of().
	 * NOTE(review): the hierarchy walks stop when parent_pids() yields
	 * NULL, which presumably relies on css staying at offset 0 so that a
	 * NULL parent css maps to a NULL pids_cgroup - confirm before
	 * reordering members.
	 */
	struct cgroup_subsys_state	css;

	/*
	 * Use 64-bit types so that we can safely represent "max" as
	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	 */
	/* Number of pids currently charged to this cgroup (hierarchical). */
	atomic64_t			counter;
	/* Limit enforced by pids_try_charge(); %PIDS_MAX means unlimited. */
	int64_t				limit;
};

/* Map a generic css back to the pids_cgroup that embeds it. */
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
{
	struct pids_cgroup *owner = container_of(css, struct pids_cgroup, css);

	return owner;
}

/* Return the pids_cgroup that embeds the parent css of @pids. */
static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
{
	struct cgroup_subsys_state *parent_css = pids->css.parent;

	return css_pids(parent_css);
}

static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
	struct pids_cgroup *pids;

	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
	if (!pids)
		return ERR_PTR(-ENOMEM);

	pids->limit = PIDS_MAX;
	atomic64_set(&pids->counter, 0);
	return &pids->css;
}

/* Release the pids_cgroup that embeds @css. */
static void pids_css_free(struct cgroup_subsys_state *css)
{
	struct pids_cgroup *doomed = css_pids(css);

	kfree(doomed);
}

/**
 * pids_cancel - uncharge the local pid count
 * @pids: the pid cgroup state
 * @num: the number of pids to cancel
 *
 * This function will WARN if the pid count goes under 0, because such a case is
 * a bug in the pids controller proper.
 */
static void pids_cancel(struct pids_cgroup *pids, int num)
{
	/*
	 * A negative count (or overflow for that matter) is invalid,
	 * and indicates a bug in the `pids` controller proper.
	 */
	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
}

/**
 * pids_uncharge - drop pids from a cgroup and all of its ancestors
 * @pids: the pid cgroup state to start from
 * @num: the number of pids to uncharge
 */
static void pids_uncharge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *level = pids;

	/* Walk from @pids up the hierarchy until there is no parent left. */
	while (level) {
		pids_cancel(level, num);
		level = parent_pids(level);
	}
}

/**
 * pids_charge - unconditionally add pids to a cgroup and all of its ancestors
 * @pids: the pid cgroup state to start from
 * @num: the number of pids to charge
 *
 * The configured limit is *not* consulted, so this cannot fail and may leave
 * a counter above its limit. The callers in this file are the attach,
 * attach-rollback and post-fork re-association paths.
 */
static void pids_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *level = pids;

	/* Walk from @pids up the hierarchy until there is no parent left. */
	while (level) {
		atomic64_add(num, &level->counter);
		level = parent_pids(level);
	}
}

/**
 * pids_try_charge - add pids to a cgroup and its ancestors, honouring limits
 * @pids: the pid cgroup state to start from
 * @num: the number of pids to charge
 *
 * Each level is charged in turn, starting at @pids. If a level ends up above
 * its limit, everything charged so far (that level included) is rolled back.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN.
 */
static int pids_try_charge(struct pids_cgroup *pids, int num)
{
	struct pids_cgroup *level, *undo;

	for (level = pids; level; level = parent_pids(level)) {
		int64_t charged = atomic64_add_return(num, &level->counter);

		/*
		 * The counter cannot grow beyond the maximum number of
		 * pid_t, so a level whose limit is %PIDS_MAX never trips
		 * this check.
		 */
		if (charged > level->limit)
			break;
	}

	/* Ran off the top of the hierarchy: every level accepted the charge. */
	if (!level)
		return 0;

	/* Undo the charge on every level from @pids up to the one that failed. */
	undo = pids;
	for (;;) {
		pids_cancel(undo, num);
		if (undo == level)
			break;
		undo = parent_pids(undo);
	}

	return -EAGAIN;
}

/*
 * Move the charge of every task in @tset from the cgroup it currently
 * belongs to over to the destination @css. Limits are not checked here, so
 * this always returns 0.
 */
static int pids_can_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	struct pids_cgroup *dst = css_pids(css);
	struct task_struct *task;

	cgroup_taskset_for_each(task, tset) {
		/*
		 * The source css is not pinned: cgroup core keeps it from
		 * being freed until the migration has either completed or
		 * failed, which covers cancel_attach() as well.
		 */
		struct pids_cgroup *src = css_pids(task_css(task, pids_cgrp_id));

		pids_charge(dst, 1);
		pids_uncharge(src, 1);
	}

	return 0;
}

/*
 * Undo pids_can_attach(): give the charge of every task in @tset back to the
 * cgroup the task belongs to and take it away from the destination @css.
 */
static void pids_cancel_attach(struct cgroup_subsys_state *css,
			       struct cgroup_taskset *tset)
{
	struct pids_cgroup *dst = css_pids(css);
	struct task_struct *task;

	cgroup_taskset_for_each(task, tset) {
		struct pids_cgroup *src = css_pids(task_css(task, pids_cgrp_id));

		pids_charge(src, 1);
		pids_uncharge(dst, 1);
	}
}

static int pids_can_fork(struct task_struct *task, void **priv_p)
{
	struct cgroup_subsys_state *css;
	struct pids_cgroup *pids;
	int err;

	/*
	 * Use the "current" task_css for the pids subsystem as the tentative
	 * css. It is possible we will charge the wrong hierarchy, in which
	 * case we will forcefully revert/reapply the charge on the right
	 * hierarchy after it is committed to the task proper.
	 */
	css = task_get_css(current, pids_cgrp_id);
	pids = css_pids(css);

	err = pids_try_charge(pids, 1);
	if (err)
		goto err_css_put;

	*priv_p = css;
	return 0;

err_css_put:
	css_put(css);
	return err;
}

/*
 * Undo pids_can_fork(): remove the one pid charged to the css carried in
 * @priv and drop the reference taken on it.
 */
static void pids_cancel_fork(struct task_struct *task, void *priv)
{
	struct cgroup_subsys_state *charged_css = priv;

	pids_uncharge(css_pids(charged_css), 1);
	css_put(charged_css);
}

/*
 * Finalise the fork charge. @priv carries the css that pids_can_fork()
 * charged; if @task ended up associated with a different css, the charge is
 * moved there. Both css references are dropped before returning.
 */
static void pids_fork(struct task_struct *task, void *priv)
{
	struct cgroup_subsys_state *charged_css = priv;
	struct cgroup_subsys_state *actual_css;
	struct pids_cgroup *charged, *actual;

	charged = css_pids(charged_css);
	actual_css = task_get_css(task, pids_cgrp_id);
	actual = css_pids(actual_css);

	/*
	 * A changed association means the tentative charge landed on the
	 * wrong hierarchy, so move it to the current one. The association
	 * can only change through an organisational event, so it's okay to
	 * ignore the limit here.
	 */
	if (actual != charged) {
		pids_uncharge(charged, 1);
		pids_charge(actual, 1);
	}

	css_put(actual_css);
	css_put(charged_css);
}

/* Exit callback: remove one pid from @old_css and all of its ancestors. */
static void pids_exit(struct cgroup_subsys_state *css,
		      struct cgroup_subsys_state *old_css,
		      struct task_struct *task)
{
	pids_uncharge(css_pids(old_css), 1);
}

/*
 * Write handler for the "max" file. Accepts either the string "max" (no
 * limit) or an integer in the range [0, %PIDS_MAX). Returns @nbytes on
 * success, the kstrtoll() error if the text is not a number, or -EINVAL if
 * the number is out of range.
 */
static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
			      size_t nbytes, loff_t off)
{
	struct pids_cgroup *pids = css_pids(of_css(of));
	char *text = strstrip(buf);
	int64_t new_limit;

	if (strcmp(text, PIDS_MAX_STR) == 0) {
		new_limit = PIDS_MAX;
	} else {
		int err = kstrtoll(text, 0, &new_limit);

		if (err)
			return err;

		if (new_limit < 0 || new_limit >= PIDS_MAX)
			return -EINVAL;
	}

	/*
	 * No mutex around the update: it is not critical that a fork()
	 * racing with this write observes the new limit.
	 */
	pids->limit = new_limit;
	return nbytes;
}

/*
 * Show handler for the "max" file: prints the limit as a decimal number, or
 * "max" when the limit is %PIDS_MAX or above. Always returns 0.
 */
static int pids_max_show(struct seq_file *sf, void *v)
{
	struct pids_cgroup *pids = css_pids(seq_css(sf));
	/* Read the limit once so the test and the print see the same value. */
	int64_t limit = pids->limit;

	if (limit < PIDS_MAX)
		seq_printf(sf, "%lld\n", limit);
	else
		seq_printf(sf, "%s\n", PIDS_MAX_STR);

	return 0;
}

static s64 pids_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct pids_cgroup *pids = css_pids(css);

	return atomic64_read(&pids->counter);
}

/*
 * Control files of the pids controller: "max" (the limit, readable and
 * writable) and "current" (the present counter, read-only).
 */
static struct cftype pids_files[] = {
	{
		.name = "max",
		.write = pids_max_write,
		.seq_show = pids_max_show,
		/* The limit file is not created in the root cgroup. */
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.read_s64 = pids_current_read,
	},
	{ }	/* terminate */
};

/*
 * Subsystem descriptor: wires the callbacks defined in this file into the
 * cgroup core.
 */
struct cgroup_subsys pids_cgrp_subsys = {
	.css_alloc	= pids_css_alloc,
	.css_free	= pids_css_free,
	.can_attach 	= pids_can_attach,
	.cancel_attach 	= pids_cancel_attach,
	.can_fork	= pids_can_fork,
	.cancel_fork	= pids_cancel_fork,
	.fork		= pids_fork,
	.exit		= pids_exit,
	/* The same set of control files is registered for both cftype lists. */
	.legacy_cftypes	= pids_files,
	.dfl_cftypes	= pids_files,
};
Commit	Line	Data
49b786ea AS	1	/*
	2	* Process number limiting controller for cgroups.
	3	*
	4	* Used to allow a cgroup hierarchy to stop any new processes from fork()ing
	5	* after a certain limit is reached.
	6	*
	7	* Since it is trivial to hit the task limit without hitting any kmemcg limits
	8	* in place, PIDs are a fundamental resource. As such, PID exhaustion must be
	9	* preventable in the scope of a cgroup hierarchy by allowing resource limiting
	10	* of the number of tasks in a cgroup.
	11	*
	12	* In order to use the `pids` controller, set the maximum number of tasks in
	13	* pids.max (this is not available in the root cgroup for obvious reasons). The
	14	* number of processes currently in the cgroup is given by pids.current.
	15	* Organisational operations are not blocked by cgroup policies, so it is
	16	* possible to have pids.current > pids.max. However, it is not possible to
	17	* violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
	18	* would cause a cgroup policy to be violated.
	19	*
	20	* To set a cgroup to have no limit, set pids.max to "max". This is the default
	21	* for all new cgroups (N.B. that PID limits are hierarchical, so the most
	22	* stringent limit in the hierarchy is followed).
	23	*
	24	* pids.current tracks all child cgroup hierarchies, so parent/pids.current is
	25	* a superset of parent/child/pids.current.
	26	*
	27	* Copyright (C) 2015 Aleksa Sarai <[email protected]>
	28	*
	29	* This file is subject to the terms and conditions of version 2 of the GNU
	30	* General Public License. See the file COPYING in the main directory of the
	31	* Linux distribution for more details.
	32	*/
	33
	34	#include <linux/kernel.h>
	35	#include <linux/threads.h>
	36	#include <linux/atomic.h>
	37	#include <linux/cgroup.h>
	38	#include <linux/slab.h>
	39
	40	#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
	41	#define PIDS_MAX_STR "max"
	42
	43	struct pids_cgroup {
	44	struct cgroup_subsys_state css;
	45
	46	/*
	47	* Use 64-bit types so that we can safely represent "max" as
	48	* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
	49	*/
	50	atomic64_t counter;
	51	int64_t limit;
	52	};
	53
	54	static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
	55	{
	56	return container_of(css, struct pids_cgroup, css);
	57	}
	58
	59	static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
	60	{
	61	return css_pids(pids->css.parent);
	62	}
	63
	64	static struct cgroup_subsys_state *
65	pids_css_alloc(struct cgroup_subsys_state *parent)
66	{
67	struct pids_cgroup *pids;
68
69	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
70	if (!pids)
71	return ERR_PTR(-ENOMEM);
72
73	pids->limit = PIDS_MAX;
74	atomic64_set(&pids->counter, 0);
75	return &pids->css;
76	}
77
78	static void pids_css_free(struct cgroup_subsys_state *css)
79	{
80	kfree(css_pids(css));
81	}
82
83	/**
84	* pids_cancel - uncharge the local pid count
85	* @pids: the pid cgroup state
86	* @num: the number of pids to cancel
87	*
88	* This function will WARN if the pid count goes under 0, because such a case is
89	* a bug in the pids controller proper.
90	*/
91	static void pids_cancel(struct pids_cgroup *pids, int num)
92	{
93	/*
94	* A negative count (or overflow for that matter) is invalid,
95	* and indicates a bug in the `pids` controller proper.
96	*/
97	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
98	}
99
100	/**
101	* pids_uncharge - hierarchically uncharge the pid count
102	* @pids: the pid cgroup state
103	* @num: the number of pids to uncharge
104	*/
105	static void pids_uncharge(struct pids_cgroup *pids, int num)
106	{
107	struct pids_cgroup *p;
108
109	for (p = pids; p; p = parent_pids(p))
110	pids_cancel(p, num);
111	}
112
113	/**
114	* pids_charge - hierarchically charge the pid count
115	* @pids: the pid cgroup state
116	* @num: the number of pids to charge
117	*
118	* This function does not follow the pid limit set. It cannot fail and the new
119	* pid count may exceed the limit. This is only used for reverting failed
120	* attaches, where there is no other way out than violating the limit.
121	*/
122	static void pids_charge(struct pids_cgroup *pids, int num)
123	{
124	struct pids_cgroup *p;
125
126	for (p = pids; p; p = parent_pids(p))
127	atomic64_add(num, &p->counter);
128	}
129
130	/**
131	* pids_try_charge - hierarchically try to charge the pid count
132	* @pids: the pid cgroup state
133	* @num: the number of pids to charge
134	*
135	* This function follows the set limit. It will fail if the charge would cause
136	* the new value to exceed the hierarchical limit. Returns 0 if the charge
137	* succeded, otherwise -EAGAIN.
138	*/
139	static int pids_try_charge(struct pids_cgroup *pids, int num)
140	{
	141	struct pids_cgroup *p, *q;
142
143	for (p = pids; p; p = parent_pids(p)) {
144	int64_t new = atomic64_add_return(num, &p->counter);
145
146	/*
147	* Since new is capped to the maximum number of pid_t, if
148	* p->limit is %PIDS_MAX then we know that this test will never
149	* fail.
150	*/
151	if (new > p->limit)
152	goto revert;
153	}
154
155	return 0;
156
157	revert:
158	for (q = pids; q != p; q = parent_pids(q))
159	pids_cancel(q, num);
160	pids_cancel(p, num);
161
162	return -EAGAIN;
163	}
164
165	static int pids_can_attach(struct cgroup_subsys_state *css,
166	struct cgroup_taskset *tset)
167	{
168	struct pids_cgroup *pids = css_pids(css);
169	struct task_struct *task;
170
171	cgroup_taskset_for_each(task, tset) {
172	struct cgroup_subsys_state *old_css;
173	struct pids_cgroup *old_pids;
174
175	/*
ce523995 AS	176	* No need to pin @old_css between here and cancel_attach()
	177	* because cgroup core protects it from being freed before
	178	* the migration completes or fails.
49b786ea	179	*/
ce523995	180	old_css = task_css(task, pids_cgrp_id);
49b786ea AS	181	old_pids = css_pids(old_css);
	182
	183	pids_charge(pids, 1);
	184	pids_uncharge(old_pids, 1);
	185	}
	186
	187	return 0;
	188	}
	189
	190	static void pids_cancel_attach(struct cgroup_subsys_state *css,
	191	struct cgroup_taskset *tset)
	192	{
	193	struct pids_cgroup *pids = css_pids(css);
	194	struct task_struct *task;
	195
	196	cgroup_taskset_for_each(task, tset) {
	197	struct cgroup_subsys_state *old_css;
	198	struct pids_cgroup *old_pids;
	199
	200	old_css = task_css(task, pids_cgrp_id);
	201	old_pids = css_pids(old_css);
	202
	203	pids_charge(old_pids, 1);
	204	pids_uncharge(pids, 1);
49b786ea AS	205	}
	206	}
	207
49b786ea AS	208	static int pids_can_fork(struct task_struct *task, void **priv_p)
	209	{
	210	struct cgroup_subsys_state *css;
	211	struct pids_cgroup *pids;
	212	int err;
	213
	214	/*
	215	* Use the "current" task_css for the pids subsystem as the tentative
	216	* css. It is possible we will charge the wrong hierarchy, in which
	217	* case we will forcefully revert/reapply the charge on the right
	218	* hierarchy after it is committed to the task proper.
	219	*/
	220	css = task_get_css(current, pids_cgrp_id);
	221	pids = css_pids(css);
	222
	223	err = pids_try_charge(pids, 1);
	224	if (err)
	225	goto err_css_put;
	226
	227	*priv_p = css;
	228	return 0;
	229
	230	err_css_put:
	231	css_put(css);
	232	return err;
	233	}
	234
	235	static void pids_cancel_fork(struct task_struct *task, void *priv)
	236	{
	237	struct cgroup_subsys_state *css = priv;
	238	struct pids_cgroup *pids = css_pids(css);
	239
	240	pids_uncharge(pids, 1);
	241	css_put(css);
	242	}
	243
	244	static void pids_fork(struct task_struct *task, void *priv)
	245	{
	246	struct cgroup_subsys_state *css;
	247	struct cgroup_subsys_state *old_css = priv;
	248	struct pids_cgroup *pids;
	249	struct pids_cgroup *old_pids = css_pids(old_css);
	250
	251	css = task_get_css(task, pids_cgrp_id);
	252	pids = css_pids(css);
	253
	254	/*
	255	* If the association has changed, we have to revert and reapply the
	256	* charge/uncharge on the wrong hierarchy to the current one. Since
	257	* the association can only change due to an organisation event, its
	258	* okay for us to ignore the limit in this case.
	259	*/
	260	if (pids != old_pids) {
	261	pids_uncharge(old_pids, 1);
	262	pids_charge(pids, 1);
	263	}
	264
	265	css_put(css);
	266	css_put(old_css);
	267	}
	268
	269	static void pids_exit(struct cgroup_subsys_state *css,
	270	struct cgroup_subsys_state *old_css,
	271	struct task_struct *task)
272	{
273	struct pids_cgroup *pids = css_pids(old_css);
274
275	pids_uncharge(pids, 1);
276	}
277
	278	static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
279	size_t nbytes, loff_t off)
280	{
281	struct cgroup_subsys_state *css = of_css(of);
282	struct pids_cgroup *pids = css_pids(css);
283	int64_t limit;
284	int err;
285
286	buf = strstrip(buf);
287	if (!strcmp(buf, PIDS_MAX_STR)) {
288	limit = PIDS_MAX;
289	goto set_limit;
290	}
291
292	err = kstrtoll(buf, 0, &limit);
293	if (err)
294	return err;
295
	296	if (limit < 0 || limit >= PIDS_MAX)
297	return -EINVAL;
298
299	set_limit:
300	/*
301	* Limit updates don't need to be mutex'd, since it isn't
302	* critical that any racing fork()s follow the new limit.
303	*/
304	pids->limit = limit;
305	return nbytes;
306	}
307
	308	static int pids_max_show(struct seq_file *sf, void *v)
309	{
310	struct cgroup_subsys_state *css = seq_css(sf);
311	struct pids_cgroup *pids = css_pids(css);
312	int64_t limit = pids->limit;
313
314	if (limit >= PIDS_MAX)
315	seq_printf(sf, "%s\n", PIDS_MAX_STR);
316	else
317	seq_printf(sf, "%lld\n", limit);
318
319	return 0;
320	}
321
322	static s64 pids_current_read(struct cgroup_subsys_state *css,
323	struct cftype *cft)
324	{
325	struct pids_cgroup *pids = css_pids(css);
326
327	return atomic64_read(&pids->counter);
328	}
329
330	static struct cftype pids_files[] = {
331	{
332	.name = "max",
333	.write = pids_max_write,
334	.seq_show = pids_max_show,
335	.flags = CFTYPE_NOT_ON_ROOT,
336	},
337	{
338	.name = "current",
339	.read_s64 = pids_current_read,
340	},
341	{ } /* terminate */
342	};
343
344	struct cgroup_subsys pids_cgrp_subsys = {
345	.css_alloc = pids_css_alloc,
346	.css_free = pids_css_free,
49b786ea AS	347	.can_attach = pids_can_attach,
	348	.cancel_attach = pids_cancel_attach,
	349	.can_fork = pids_can_fork,
	350	.cancel_fork = pids_cancel_fork,
	351	.fork = pids_fork,
	352	.exit = pids_exit,
	353	.legacy_cftypes = pids_files,
	354	.dfl_cftypes = pids_files,
	355	};