/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given IDs on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. In the worst-case
 * allocation scenario, when all but one of a million possible PIDs are
 * already allocated, we scan 32 list entries and at most PAGE_SIZE bytes.
 * The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <[email protected]>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <[email protected]>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */
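
/*
 * Sizing sketch for the claims above (illustrative, assuming 4 KiB
 * pages): BITS_PER_PAGE is PAGE_SIZE * 8 = 32768, so a million PIDs
 * span ceil(1000000 / 32768) = 31 bitmap pages -- roughly the "32
 * list entries" scanned in the worst case, each a per-page bitmap of
 * at most PAGE_SIZE bytes.
 */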

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>

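/*
 * Hash both the pid number and the owning namespace pointer, so that
 * the same numeric pid in different namespaces tends to land in
 * different hash buckets.
 */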
#define pid_hashfn(nr, ns)	\
	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
static struct hlist_head *pid_hash;
static unsigned int pidhash_shift = 4;
struct pid init_struct_pid = INIT_STRUCT_PID;

int pid_max = PID_MAX_DEFAULT;

#define RESERVED_PIDS		300

int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

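/*
 * Convert a (bitmap page, bit offset) pair back into a pid number.
 * For illustration, assuming 4 KiB pages (BITS_PER_PAGE == 32768):
 * bit 100 of pidmap[2] corresponds to pid 2 * 32768 + 100 = 65636.
 */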
static inline int mk_pid(struct pid_namespace *pid_ns,
		struct pidmap *map, int off)
{
	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}

#define find_next_offset(map, off)	\
	find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales up to 4 million PIDs at runtime.
 */
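
/*
 * Concretely (illustrative, assuming 4 KiB pages): the default
 * pid_max of 32768 fits in the single page allocated up front in
 * pidmap_init(), while the 4-million-PID limit would grow this to
 * 128 pages, each allocated only when its PID range is first used.
 */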
struct pid_namespace init_pid_ns = {
	.kref = KREF_INIT(2),
	.pidmap = {
		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
	},
	.last_pid = 0,
	.nr_hashed = PIDNS_HASH_ADDING,
	.level = 0,
	.child_reaper = &init_task,
	.user_ns = &init_user_ns,
	.ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
	.ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * Note: disable interrupts while the pidmap_lock is held as an
 * interrupt might come in and do read_lock(&tasklist_lock).
 *
 * If we don't disable interrupts there is a nasty deadlock between
 * detach_pid()->free_pid() and another cpu that does
 * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 * read_lock(&tasklist_lock);
 *
 * After we clean up the tasklist_lock and know there are no
 * irq handlers that take it we can leave the interrupts enabled.
 * For now it is easier to be safe than to prove it can't happen.
 */
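
/*
 * A sketch of that deadlock (hypothetical interleaving):
 *
 *	CPU0					CPU1
 *	----					----
 *	write_lock_irq(&tasklist_lock)
 *						spin_lock(&pidmap_lock)
 *	detach_pid()->free_pid():
 *	  spin_lock(&pidmap_lock)		<interrupt>
 *	  ... spins ...				  read_lock(&tasklist_lock)
 *						  ... spins ...
 *
 * CPU0 holds tasklist_lock and waits for pidmap_lock; CPU1 holds
 * pidmap_lock and its interrupt handler waits for tasklist_lock.
 * Disabling interrupts while pidmap_lock is held keeps the handler
 * from nesting inside the critical section, breaking the cycle.
 */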

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

static void free_pidmap(struct upid *upid)
{
	int nr = upid->nr;
	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
	int offset = nr & BITS_PER_PAGE_MASK;

	clear_bit(offset, map->page);
	atomic_inc(&map->nr_free);
}

/*
 * If we started walking pids at 'base', is 'a' seen before 'b'?
 */
static int pid_before(int base, int a, int b)
{
	/*
	 * This is the same as saying
	 *
	 *	(a - base) mod 2^32 < (b - base) mod 2^32
	 *
	 * and that mapping orders 'a' and 'b' with respect to 'base'.
	 */
	return (unsigned)(a - base) < (unsigned)(b - base);
}
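
/*
 * Worked example (values above RESERVED_PIDS so the walk can reach
 * them): walking up from base = 32000 wraps around before reaching
 * lower pids, so pid 400 is seen before pid 31000.  And indeed:
 *
 *	(unsigned)(400 - 32000)   = 4294935696
 *	(unsigned)(31000 - 32000) = 4294966296
 *
 * so pid_before(32000, 400, 31000) is true while
 * pid_before(32000, 31000, 400) is false.
 */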

/*
 * We might be racing with someone else trying to set pid_ns->last_pid
 * at the pid allocation time (there's also a sysctl for this, but racing
 * with this one is OK, see comment in kernel/pid_namespace.c about it).
 * We want the winner to have the "later" value, because if the
 * "earlier" value prevails, then a pid may get reused immediately.
 *
 * Since pids roll over, it is not sufficient to just pick the bigger
 * value. We have to consider where we started counting from.
 *
 * 'base' is the value of pid_ns->last_pid that we observed when
 * we started looking for a pid.
 *
 * 'pid' is the pid that we eventually found.
 */
static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
{
	int prev;
	int last_write = base;
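	/*
	 * Retry until we either install 'pid' ourselves (the cmpxchg()
	 * returns what we expected) or lose the race to a value that is
	 * already later than 'pid' with respect to 'base'.
	 */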
	do {
		prev = last_write;
		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
}

static int alloc_pidmap(struct pid_namespace *pid_ns)
{
	int i, offset, max_scan, pid, last = pid_ns->last_pid;
	struct pidmap *map;

	pid = last + 1;
	if (pid >= pid_max)
		pid = RESERVED_PIDS;
	offset = pid & BITS_PER_PAGE_MASK;
	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
	/*
	 * If last_pid points into the middle of the map->page we
	 * want to scan this bitmap block twice, the second time
	 * we start with offset == 0 (or RESERVED_PIDS).
	 */
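	/*
	 * E.g. (illustrative, assuming 4 KiB pages) with pid_max ==
	 * 32768 == BITS_PER_PAGE there is one bitmap page, and
	 * DIV_ROUND_UP() yields 1: max_scan is 1 (two passes) when we
	 * start mid-page, and 0 (one pass) when offset == 0.
	 */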
	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
	for (i = 0; i <= max_scan; ++i) {
		if (unlikely(!map->page)) {
			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
			/*
			 * Free the page if someone raced with us
			 * installing it:
			 */
			spin_lock_irq(&pidmap_lock);
			if (!map->page) {
				map->page = page;
				page = NULL;
			}
			spin_unlock_irq(&pidmap_lock);
			kfree(page);
			if (unlikely(!map->page))
				return -ENOMEM;
		}
		if (likely(atomic_read(&map->nr_free))) {
			for ( ; ; ) {
				if (!test_and_set_bit(offset, map->page)) {
					atomic_dec(&map->nr_free);
					set_last_pid(pid_ns, last, pid);
					return pid;
				}
				offset = find_next_offset(map, offset);
				if (offset >= BITS_PER_PAGE)
					break;
				pid = mk_pid(pid_ns, map, offset);
				if (pid >= pid_max)
					break;
			}
		}
		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
			++map;
			offset = 0;
		} else {
			map = &pid_ns->pidmap[0];
			offset = RESERVED_PIDS;
			if (unlikely(last == offset))
				break;
		}
		pid = mk_pid(pid_ns, map, offset);
	}
	return -EAGAIN;
}

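/*
 * Find the first allocated pid after 'last' in the namespace's
 * bitmaps, or -1 if there is none.  Used by find_ge_pid() below to
 * iterate over pids for procfs.
 */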
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
	int offset;
	struct pidmap *map, *end;

	if (last >= PID_MAX_LIMIT)
		return -1;

	offset = (last + 1) & BITS_PER_PAGE_MASK;
	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
	for (; map < end; map++, offset = 0) {
		if (unlikely(!map->page))
			continue;
		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
		if (offset < BITS_PER_PAGE)
			return mk_pid(pid_ns, map, offset);
	}
	return -1;
}

void put_pid(struct pid *pid)
{
	struct pid_namespace *ns;

	if (!pid)
		return;

	ns = pid->numbers[pid->level].ns;
	if ((atomic_read(&pid->count) == 1) ||
	     atomic_dec_and_test(&pid->count)) {
		kmem_cache_free(ns->pid_cachep, pid);
		put_pid_ns(ns);
	}
}
EXPORT_SYMBOL_GPL(put_pid);

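/*
 * RCU callback: drop the reference only after a grace period, so that
 * concurrent lookups under rcu_read_lock() that found this pid in the
 * hash can still safely take their own reference via get_pid().
 */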
static void delayed_put_pid(struct rcu_head *rhp)
{
	struct pid *pid = container_of(rhp, struct pid, rcu);
	put_pid(pid);
}

void free_pid(struct pid *pid)
{
	/* We can be called with write_lock_irq(&tasklist_lock) held */
	int i;
	unsigned long flags;

	spin_lock_irqsave(&pidmap_lock, flags);
	for (i = 0; i <= pid->level; i++) {
		struct upid *upid = pid->numbers + i;
		struct pid_namespace *ns = upid->ns;
		hlist_del_rcu(&upid->pid_chain);
		switch (--ns->nr_hashed) {
		case 2:
		case 1:
			/* When all that is left in the pid namespace
			 * is the reaper wake up the reaper.  The reaper
			 * may be sleeping in zap_pid_ns_processes().
			 */
			wake_up_process(ns->child_reaper);
			break;
		case PIDNS_HASH_ADDING:
			/* Handle a fork failure of the first process */
			WARN_ON(ns->child_reaper);
			ns->nr_hashed = 0;
			/* fall through */
		case 0:
			schedule_work(&ns->proc_work);
			break;
		}
	}
	spin_unlock_irqrestore(&pidmap_lock, flags);

	for (i = 0; i <= pid->level; i++)
		free_pidmap(pid->numbers + i);

	call_rcu(&pid->rcu, delayed_put_pid);
}

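/*
 * Allocate a struct pid carrying one upid per namespace level:
 * numbers[ns->level] holds the pid number seen inside 'ns' itself,
 * and numbers[0] the number seen from the init namespace.  All upids
 * are then hashed in one pass under pidmap_lock.
 */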
struct pid *alloc_pid(struct pid_namespace *ns)
{
	struct pid *pid;
	enum pid_type type;
	int i, nr;
	struct pid_namespace *tmp;
	struct upid *upid;
	int retval = -ENOMEM;

	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	tmp = ns;
	pid->level = ns->level;
	for (i = ns->level; i >= 0; i--) {
		nr = alloc_pidmap(tmp);
		if (nr < 0) {
			retval = nr;
			goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
	}

	if (unlikely(is_child_reaper(pid))) {
		if (pid_ns_prepare_proc(ns)) {
			disable_pid_allocation(ns);
			goto out_free;
		}
	}

	get_pid_ns(ns);
	atomic_set(&pid->count, 1);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	upid = pid->numbers + ns->level;
	spin_lock_irq(&pidmap_lock);
	if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
		goto out_unlock;
	for ( ; upid >= pid->numbers; --upid) {
		hlist_add_head_rcu(&upid->pid_chain,
				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
		upid->ns->nr_hashed++;
	}
	spin_unlock_irq(&pidmap_lock);

	return pid;

out_unlock:
	spin_unlock_irq(&pidmap_lock);
	put_pid_ns(ns);

out_free:
	while (++i <= ns->level)
		free_pidmap(pid->numbers + i);

	kmem_cache_free(ns->pid_cachep, pid);
	return ERR_PTR(retval);
}

void disable_pid_allocation(struct pid_namespace *ns)
{
	spin_lock_irq(&pidmap_lock);
	ns->nr_hashed &= ~PIDNS_HASH_ADDING;
	spin_unlock_irq(&pidmap_lock);
}

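/*
 * Look up a struct pid by number within a namespace.  The hash chain
 * walk relies on RCU, so callers are expected to hold rcu_read_lock().
 * Illustrative usage:
 *
 *	rcu_read_lock();
 *	pid = find_pid_ns(nr, ns);
 *	if (pid)
 *		...
 *	rcu_read_unlock();
 */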
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
	struct upid *pnr;

	hlist_for_each_entry_rcu(pnr,
			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
		if (pnr->nr == nr && pnr->ns == ns)
			return container_of(pnr, struct pid,
					numbers[ns->level]);

	return NULL;
}
EXPORT_SYMBOL_GPL(find_pid_ns);

struct pid *find_vpid(int nr)
{
	return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * attach_pid() must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
	struct pid_link *link = &task->pids[type];
	hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
}

static void __change_pid(struct task_struct *task, enum pid_type type,
			struct pid *new)
{
	struct pid_link *link;
	struct pid *pid;
	int tmp;

	link = &task->pids[type];
	pid = link->pid;

	hlist_del_rcu(&link->node);
	link->pid = new;

	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
		if (!hlist_empty(&pid->tasks[tmp]))
			return;

	free_pid(pid);
}

void detach_pid(struct task_struct *task, enum pid_type type)
{
	__change_pid(task, type, NULL);
}

void change_pid(struct task_struct *task, enum pid_type type,
		struct pid *pid)
{
	__change_pid(task, type, pid);
	attach_pid(task, type);
}

/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
			enum pid_type type)
{
	new->pids[type].pid = old->pids[type].pid;
	hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
}

struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result = NULL;
	if (pid) {
		struct hlist_node *first;
		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
					      lockdep_tasklist_lock_is_held());
		if (first)
			result = hlist_entry(first, struct task_struct, pids[(type)].node);
	}
	return result;
}
EXPORT_SYMBOL(pid_task);

/*
 * Must be called under rcu_read_lock().
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct task_struct *find_task_by_vpid(pid_t vnr)
{
	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid *pid;
	rcu_read_lock();
	if (type != PIDTYPE_PID)
		task = task->group_leader;
	pid = get_pid(rcu_dereference(task->pids[type].pid));
	rcu_read_unlock();
	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result;
	rcu_read_lock();
	result = pid_task(pid, type);
	if (result)
		get_task_struct(result);
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

struct pid *find_get_pid(pid_t nr)
{
	struct pid *pid;

	rcu_read_lock();
	pid = get_pid(find_vpid(nr));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);

pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *upid;
	pid_t nr = 0;

	if (pid && ns->level <= pid->level) {
		upid = &pid->numbers[ns->level];
		if (upid->ns == ns)
			nr = upid->nr;
	}
	return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

pid_t pid_vnr(struct pid *pid)
{
	return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
			struct pid_namespace *ns)
{
	pid_t nr = 0;

	rcu_read_lock();
	if (!ns)
		ns = task_active_pid_ns(current);
	if (likely(pid_alive(task))) {
		if (type != PIDTYPE_PID) {
			if (type == __PIDTYPE_TGID)
				type = PIDTYPE_PID;
			task = task->group_leader;
		}
		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
	}
	rcu_read_unlock();

	return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
	struct pid *pid;

	do {
		pid = find_pid_ns(nr, ns);
		if (pid)
			break;
		nr = next_pidmap(ns, nr);
	} while (nr > 0);

	return pid;
}

/*
 * The pid hash table is scaled according to the amount of memory in
 * the machine, from a minimum of 16 slots up to 4096 slots at one
 * gigabyte or more.
 */
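/*
 * For example, the default pidhash_shift of 4 gives 1 << 4 = 16 hash
 * buckets; alloc_large_system_hash() below may raise the shift so the
 * table grows, up to the 4096-bucket ceiling passed as 'high_limit'.
 */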
void __init pidhash_init(void)
{
	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
					   &pidhash_shift, NULL,
					   0, 4096);
}

void __init pidmap_init(void)
{
	/* Verify no one has done anything silly: */
	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);

	/* bump default and minimum pid_max based on number of cpus */
	pid_max = min(pid_max_max, max_t(int, pid_max,
				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
	pid_max_min = max_t(int, pid_max_min,
				PIDS_PER_CPU_MIN * num_possible_cpus());
	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	/* Reserve PID 0. We never call free_pidmap(0) */
	set_bit(0, init_pid_ns.pidmap[0].page);
	atomic_dec(&init_pid_ns.pidmap[0].nr_free);

	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}