]>
Commit | Line | Data |
---|---|---|
50f4f2d1 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cb12fd8e | 2 | #include <linux/anon_inodes.h> |
50f4f2d1 CB |
3 | #include <linux/file.h> |
4 | #include <linux/fs.h> | |
cdda1f26 | 5 | #include <linux/cgroup.h> |
50f4f2d1 CB |
6 | #include <linux/magic.h> |
7 | #include <linux/mount.h> | |
8 | #include <linux/pid.h> | |
cb12fd8e | 9 | #include <linux/pidfs.h> |
50f4f2d1 CB |
10 | #include <linux/pid_namespace.h> |
11 | #include <linux/poll.h> | |
12 | #include <linux/proc_fs.h> | |
13 | #include <linux/proc_ns.h> | |
14 | #include <linux/pseudo_fs.h> | |
5b08bd40 | 15 | #include <linux/ptrace.h> |
50f4f2d1 CB |
16 | #include <linux/seq_file.h> |
17 | #include <uapi/linux/pidfd.h> | |
5b08bd40 CB |
18 | #include <linux/ipc_namespace.h> |
19 | #include <linux/time_namespace.h> | |
20 | #include <linux/utsname.h> | |
21 | #include <net/net_namespace.h> | |
50f4f2d1 | 22 | |
b28ddcc3 | 23 | #include "internal.h" |
5b08bd40 | 24 | #include "mount.h" |
b28ddcc3 | 25 | |
50f4f2d1 CB |
26 | #ifdef CONFIG_PROC_FS |
27 | /** | |
28 | * pidfd_show_fdinfo - print information about a pidfd | |
29 | * @m: proc fdinfo file | |
30 | * @f: file referencing a pidfd | |
31 | * | |
32 | * Pid: | |
33 | * This function will print the pid that a given pidfd refers to in the | |
34 | * pid namespace of the procfs instance. | |
35 | * If the pid namespace of the process is not a descendant of the pid | |
36 | * namespace of the procfs instance 0 will be shown as its pid. This is | |
37 | * similar to calling getppid() on a process whose parent is outside of | |
38 | * its pid namespace. | |
39 | * | |
40 | * NSpid: | |
41 | * If pid namespaces are supported then this function will also print | |
42 | * the pid a given pidfd refers to for all descendant pid namespaces | |
43 | * starting from the current pid namespace of the instance, i.e. the | |
44 | * Pid field and the first entry in the NSpid field will be identical. | |
45 | * If the pid namespace of the process is not a descendant of the pid | |
46 | * namespace of the procfs instance 0 will be shown as its first NSpid | |
47 | * entry and no others will be shown. | |
48 | * Note that this differs from the Pid and NSpid fields in | |
49 | * /proc/<pid>/status where Pid and NSpid are always shown relative to | |
50 | * the pid namespace of the procfs instance. The difference becomes | |
51 | * obvious when sending around a pidfd between pid namespaces from a | |
52 | * different branch of the tree, i.e. where no ancestral relation is | |
53 | * present between the pid namespaces: | |
54 | * - create two new pid namespaces ns1 and ns2 in the initial pid | |
55 | * namespace (also take care to create new mount namespaces in the | |
56 | * new pid namespace and mount procfs) | |
57 | * - create a process with a pidfd in ns1 | |
58 | * - send pidfd from ns1 to ns2 | |
59 | * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid | |
60 | * have exactly one entry, which is 0 | |
61 | */ | |
62 | static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) | |
63 | { | |
cb12fd8e | 64 | struct pid *pid = pidfd_pid(f); |
50f4f2d1 CB |
65 | struct pid_namespace *ns; |
66 | pid_t nr = -1; | |
67 | ||
68 | if (likely(pid_has_task(pid, PIDTYPE_PID))) { | |
69 | ns = proc_pid_ns(file_inode(m->file)->i_sb); | |
70 | nr = pid_nr_ns(pid, ns); | |
71 | } | |
72 | ||
73 | seq_put_decimal_ll(m, "Pid:\t", nr); | |
74 | ||
75 | #ifdef CONFIG_PID_NS | |
76 | seq_put_decimal_ll(m, "\nNSpid:\t", nr); | |
77 | if (nr > 0) { | |
78 | int i; | |
79 | ||
80 | /* If nr is non-zero it means that 'pid' is valid and that | |
81 | * ns, i.e. the pid namespace associated with the procfs | |
82 | * instance, is in the pid namespace hierarchy of pid. | |
83 | * Start at one below the already printed level. | |
84 | */ | |
85 | for (i = ns->level + 1; i <= pid->level; i++) | |
86 | seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); | |
87 | } | |
88 | #endif | |
89 | seq_putc(m, '\n'); | |
90 | } | |
91 | #endif | |
92 | ||
93 | /* | |
94 | * Poll support for process exit notification. | |
95 | */ | |
96 | static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) | |
97 | { | |
cb12fd8e | 98 | struct pid *pid = pidfd_pid(file); |
50f4f2d1 CB |
99 | bool thread = file->f_flags & PIDFD_THREAD; |
100 | struct task_struct *task; | |
101 | __poll_t poll_flags = 0; | |
102 | ||
103 | poll_wait(file, &pid->wait_pidfd, pts); | |
104 | /* | |
105 | * Depending on PIDFD_THREAD, inform pollers when the thread | |
106 | * or the whole thread-group exits. | |
107 | */ | |
108 | guard(rcu)(); | |
109 | task = pid_task(pid, PIDTYPE_PID); | |
110 | if (!task) | |
111 | poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; | |
112 | else if (task->exit_state && (thread || thread_group_empty(task))) | |
113 | poll_flags = EPOLLIN | EPOLLRDNORM; | |
114 | ||
115 | return poll_flags; | |
116 | } | |
117 | ||
cdda1f26 LB |
118 | static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) |
119 | { | |
120 | struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; | |
121 | size_t usize = _IOC_SIZE(cmd); | |
122 | struct pidfd_info kinfo = {}; | |
123 | struct user_namespace *user_ns; | |
124 | const struct cred *c; | |
125 | __u64 mask; | |
126 | #ifdef CONFIG_CGROUPS | |
127 | struct cgroup *cgrp; | |
128 | #endif | |
129 | ||
130 | if (!uinfo) | |
131 | return -EINVAL; | |
132 | if (usize < PIDFD_INFO_SIZE_VER0) | |
133 | return -EINVAL; /* First version, no smaller struct possible */ | |
134 | ||
135 | if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) | |
136 | return -EFAULT; | |
137 | ||
138 | c = get_task_cred(task); | |
139 | if (!c) | |
140 | return -ESRCH; | |
141 | ||
142 | /* Unconditionally return identifiers and credentials, the rest only on request */ | |
143 | ||
144 | user_ns = current_user_ns(); | |
145 | kinfo.ruid = from_kuid_munged(user_ns, c->uid); | |
146 | kinfo.rgid = from_kgid_munged(user_ns, c->gid); | |
147 | kinfo.euid = from_kuid_munged(user_ns, c->euid); | |
148 | kinfo.egid = from_kgid_munged(user_ns, c->egid); | |
149 | kinfo.suid = from_kuid_munged(user_ns, c->suid); | |
150 | kinfo.sgid = from_kgid_munged(user_ns, c->sgid); | |
151 | kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); | |
152 | kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); | |
153 | kinfo.mask |= PIDFD_INFO_CREDS; | |
154 | put_cred(c); | |
155 | ||
156 | #ifdef CONFIG_CGROUPS | |
157 | rcu_read_lock(); | |
158 | cgrp = task_dfl_cgroup(task); | |
159 | kinfo.cgroupid = cgroup_id(cgrp); | |
160 | kinfo.mask |= PIDFD_INFO_CGROUPID; | |
161 | rcu_read_unlock(); | |
162 | #endif | |
163 | ||
164 | /* | |
165 | * Copy pid/tgid last, to reduce the chances the information might be | |
166 | * stale. Note that it is not possible to ensure it will be valid as the | |
167 | * task might return as soon as the copy_to_user finishes, but that's ok | |
168 | * and userspace expects that might happen and can act accordingly, so | |
169 | * this is just best-effort. What we can do however is checking that all | |
170 | * the fields are set correctly, or return ESRCH to avoid providing | |
171 | * incomplete information. */ | |
172 | ||
173 | kinfo.ppid = task_ppid_nr_ns(task, NULL); | |
174 | kinfo.tgid = task_tgid_vnr(task); | |
175 | kinfo.pid = task_pid_vnr(task); | |
176 | kinfo.mask |= PIDFD_INFO_PID; | |
177 | ||
178 | if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) | |
179 | return -ESRCH; | |
180 | ||
181 | /* | |
182 | * If userspace and the kernel have the same struct size it can just | |
183 | * be copied. If userspace provides an older struct, only the bits that | |
184 | * userspace knows about will be copied. If userspace provides a new | |
185 | * struct, only the bits that the kernel knows about will be copied. | |
186 | */ | |
187 | if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) | |
188 | return -EFAULT; | |
189 | ||
190 | return 0; | |
191 | } | |
192 | ||
5b08bd40 CB |
193 | static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
194 | { | |
195 | struct task_struct *task __free(put_task) = NULL; | |
196 | struct nsproxy *nsp __free(put_nsproxy) = NULL; | |
197 | struct pid *pid = pidfd_pid(file); | |
9b3e1504 | 198 | struct ns_common *ns_common = NULL; |
8a460677 | 199 | struct pid_namespace *pid_ns; |
5b08bd40 | 200 | |
5b08bd40 CB |
201 | task = get_pid_task(pid, PIDTYPE_PID); |
202 | if (!task) | |
203 | return -ESRCH; | |
204 | ||
cdda1f26 LB |
205 | /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ |
206 | if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) | |
207 | return pidfd_info(task, cmd, arg); | |
208 | ||
209 | if (arg) | |
210 | return -EINVAL; | |
211 | ||
5b08bd40 CB |
212 | scoped_guard(task_lock, task) { |
213 | nsp = task->nsproxy; | |
214 | if (nsp) | |
215 | get_nsproxy(nsp); | |
216 | } | |
217 | if (!nsp) | |
218 | return -ESRCH; /* just pretend it didn't exist */ | |
219 | ||
220 | /* | |
221 | * We're trying to open a file descriptor to the namespace so perform a | |
222 | * filesystem cred ptrace check. Also, we mirror nsfs behavior. | |
223 | */ | |
224 | if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) | |
225 | return -EACCES; | |
226 | ||
227 | switch (cmd) { | |
228 | /* Namespaces that hang of nsproxy. */ | |
229 | case PIDFD_GET_CGROUP_NAMESPACE: | |
9b3e1504 CB |
230 | if (IS_ENABLED(CONFIG_CGROUPS)) { |
231 | get_cgroup_ns(nsp->cgroup_ns); | |
232 | ns_common = to_ns_common(nsp->cgroup_ns); | |
233 | } | |
5b08bd40 CB |
234 | break; |
235 | case PIDFD_GET_IPC_NAMESPACE: | |
9b3e1504 CB |
236 | if (IS_ENABLED(CONFIG_IPC_NS)) { |
237 | get_ipc_ns(nsp->ipc_ns); | |
238 | ns_common = to_ns_common(nsp->ipc_ns); | |
239 | } | |
5b08bd40 CB |
240 | break; |
241 | case PIDFD_GET_MNT_NAMESPACE: | |
242 | get_mnt_ns(nsp->mnt_ns); | |
243 | ns_common = to_ns_common(nsp->mnt_ns); | |
244 | break; | |
245 | case PIDFD_GET_NET_NAMESPACE: | |
9b3e1504 CB |
246 | if (IS_ENABLED(CONFIG_NET_NS)) { |
247 | ns_common = to_ns_common(nsp->net_ns); | |
248 | get_net_ns(ns_common); | |
249 | } | |
5b08bd40 CB |
250 | break; |
251 | case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: | |
9b3e1504 CB |
252 | if (IS_ENABLED(CONFIG_PID_NS)) { |
253 | get_pid_ns(nsp->pid_ns_for_children); | |
254 | ns_common = to_ns_common(nsp->pid_ns_for_children); | |
255 | } | |
5b08bd40 CB |
256 | break; |
257 | case PIDFD_GET_TIME_NAMESPACE: | |
9b3e1504 CB |
258 | if (IS_ENABLED(CONFIG_TIME_NS)) { |
259 | get_time_ns(nsp->time_ns); | |
260 | ns_common = to_ns_common(nsp->time_ns); | |
261 | } | |
5b08bd40 CB |
262 | break; |
263 | case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: | |
9b3e1504 CB |
264 | if (IS_ENABLED(CONFIG_TIME_NS)) { |
265 | get_time_ns(nsp->time_ns_for_children); | |
266 | ns_common = to_ns_common(nsp->time_ns_for_children); | |
267 | } | |
5b08bd40 CB |
268 | break; |
269 | case PIDFD_GET_UTS_NAMESPACE: | |
9b3e1504 CB |
270 | if (IS_ENABLED(CONFIG_UTS_NS)) { |
271 | get_uts_ns(nsp->uts_ns); | |
272 | ns_common = to_ns_common(nsp->uts_ns); | |
273 | } | |
5b08bd40 CB |
274 | break; |
275 | /* Namespaces that don't hang of nsproxy. */ | |
276 | case PIDFD_GET_USER_NAMESPACE: | |
9b3e1504 CB |
277 | if (IS_ENABLED(CONFIG_USER_NS)) { |
278 | rcu_read_lock(); | |
279 | ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); | |
280 | rcu_read_unlock(); | |
281 | } | |
5b08bd40 CB |
282 | break; |
283 | case PIDFD_GET_PID_NAMESPACE: | |
9b3e1504 CB |
284 | if (IS_ENABLED(CONFIG_PID_NS)) { |
285 | rcu_read_lock(); | |
8a460677 CB |
286 | pid_ns = task_active_pid_ns(task); |
287 | if (pid_ns) | |
288 | ns_common = to_ns_common(get_pid_ns(pid_ns)); | |
9b3e1504 CB |
289 | rcu_read_unlock(); |
290 | } | |
5b08bd40 CB |
291 | break; |
292 | default: | |
293 | return -ENOIOCTLCMD; | |
294 | } | |
295 | ||
9b3e1504 CB |
296 | if (!ns_common) |
297 | return -EOPNOTSUPP; | |
298 | ||
5b08bd40 CB |
299 | /* open_namespace() unconditionally consumes the reference */ |
300 | return open_namespace(ns_common); | |
301 | } | |
302 | ||
cb12fd8e | 303 | static const struct file_operations pidfs_file_operations = { |
50f4f2d1 CB |
304 | .poll = pidfd_poll, |
305 | #ifdef CONFIG_PROC_FS | |
306 | .show_fdinfo = pidfd_show_fdinfo, | |
307 | #endif | |
5b08bd40 CB |
308 | .unlocked_ioctl = pidfd_ioctl, |
309 | .compat_ioctl = compat_ptr_ioctl, | |
50f4f2d1 | 310 | }; |
cb12fd8e CB |
311 | |
312 | struct pid *pidfd_pid(const struct file *file) | |
313 | { | |
314 | if (file->f_op != &pidfs_file_operations) | |
315 | return ERR_PTR(-EBADF); | |
cb12fd8e | 316 | return file_inode(file)->i_private; |
cb12fd8e CB |
317 | } |
318 | ||
cb12fd8e | 319 | static struct vfsmount *pidfs_mnt __ro_after_init; |
cb12fd8e | 320 | |
9d9539db CB |
321 | #if BITS_PER_LONG == 32 |
322 | /* | |
323 | * Provide a fallback mechanism for 32-bit systems so processes remain | |
324 | * reliably comparable by inode number even on those systems. | |
325 | */ | |
326 | static DEFINE_IDA(pidfd_inum_ida); | |
327 | ||
328 | static int pidfs_inum(struct pid *pid, unsigned long *ino) | |
329 | { | |
330 | int ret; | |
331 | ||
332 | ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, | |
333 | UINT_MAX, GFP_ATOMIC); | |
334 | if (ret < 0) | |
335 | return -ENOSPC; | |
336 | ||
337 | *ino = ret; | |
338 | return 0; | |
339 | } | |
340 | ||
341 | static inline void pidfs_free_inum(unsigned long ino) | |
342 | { | |
343 | if (ino > 0) | |
344 | ida_free(&pidfd_inum_ida, ino); | |
345 | } | |
346 | #else | |
347 | static inline int pidfs_inum(struct pid *pid, unsigned long *ino) | |
348 | { | |
349 | *ino = pid->ino; | |
350 | return 0; | |
351 | } | |
352 | #define pidfs_free_inum(ino) ((void)(ino)) | |
353 | #endif | |
354 | ||
cb12fd8e CB |
355 | /* |
356 | * The vfs falls back to simple_setattr() if i_op->setattr() isn't | |
357 | * implemented. Let's reject it completely until we have a clean | |
358 | * permission concept for pidfds. | |
359 | */ | |
360 | static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, | |
361 | struct iattr *attr) | |
362 | { | |
363 | return -EOPNOTSUPP; | |
364 | } | |
365 | ||
db3d841a LT |
366 | |
367 | /* | |
368 | * User space expects pidfs inodes to have no file type in st_mode. | |
369 | * | |
370 | * In particular, 'lsof' has this legacy logic: | |
371 | * | |
372 | * type = s->st_mode & S_IFMT; | |
373 | * switch (type) { | |
374 | * ... | |
375 | * case 0: | |
376 | * if (!strcmp(p, "anon_inode")) | |
377 | * Lf->ntype = Ntype = N_ANON_INODE; | |
378 | * | |
379 | * to detect our old anon_inode logic. | |
380 | * | |
381 | * Rather than mess with our internal sane inode data, just fix it | |
382 | * up here in getattr() by masking off the format bits. | |
383 | */ | |
cb12fd8e CB |
384 | static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, |
385 | struct kstat *stat, u32 request_mask, | |
386 | unsigned int query_flags) | |
387 | { | |
388 | struct inode *inode = d_inode(path->dentry); | |
389 | ||
390 | generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); | |
db3d841a | 391 | stat->mode &= ~S_IFMT; |
cb12fd8e CB |
392 | return 0; |
393 | } | |
394 | ||
395 | static const struct inode_operations pidfs_inode_operations = { | |
396 | .getattr = pidfs_getattr, | |
397 | .setattr = pidfs_setattr, | |
398 | }; | |
399 | ||
400 | static void pidfs_evict_inode(struct inode *inode) | |
401 | { | |
402 | struct pid *pid = inode->i_private; | |
403 | ||
404 | clear_inode(inode); | |
405 | put_pid(pid); | |
9d9539db | 406 | pidfs_free_inum(inode->i_ino); |
cb12fd8e CB |
407 | } |
408 | ||
409 | static const struct super_operations pidfs_sops = { | |
410 | .drop_inode = generic_delete_inode, | |
411 | .evict_inode = pidfs_evict_inode, | |
412 | .statfs = simple_statfs, | |
413 | }; | |
414 | ||
db3d841a LT |
415 | /* |
416 | * 'lsof' has knowledge of our historical anon_inode use, and expects | |
417 | * the pidfs dentry name to start with 'anon_inode'. | |
418 | */ | |
cb12fd8e CB |
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	/* Every pidfs dentry reports the same fixed name. */
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
423 | ||
424 | static const struct dentry_operations pidfs_dentry_operations = { | |
425 | .d_delete = always_delete_dentry, | |
426 | .d_dname = pidfs_dname, | |
2558e3b2 | 427 | .d_prune = stashed_dentry_prune, |
cb12fd8e CB |
428 | }; |
429 | ||
9d9539db | 430 | static int pidfs_init_inode(struct inode *inode, void *data) |
e9c5263c CB |
431 | { |
432 | inode->i_private = data; | |
433 | inode->i_flags |= S_PRIVATE; | |
434 | inode->i_mode |= S_IRWXU; | |
435 | inode->i_op = &pidfs_inode_operations; | |
436 | inode->i_fop = &pidfs_file_operations; | |
9d9539db CB |
437 | /* |
438 | * Inode numbering for pidfs start at RESERVED_PIDS + 1. This | |
439 | * avoids collisions with the root inode which is 1 for pseudo | |
440 | * filesystems. | |
441 | */ | |
442 | return pidfs_inum(data, &inode->i_ino); | |
e9c5263c CB |
443 | } |
444 | ||
/* Drop the struct pid reference passed in as opaque @data. */
static void pidfs_put_data(void *data)
{
	put_pid(data);
}
450 | ||
451 | static const struct stashed_operations pidfs_stashed_ops = { | |
452 | .init_inode = pidfs_init_inode, | |
453 | .put_data = pidfs_put_data, | |
454 | }; | |
455 | ||
cb12fd8e CB |
456 | static int pidfs_init_fs_context(struct fs_context *fc) |
457 | { | |
458 | struct pseudo_fs_context *ctx; | |
459 | ||
460 | ctx = init_pseudo(fc, PID_FS_MAGIC); | |
461 | if (!ctx) | |
462 | return -ENOMEM; | |
463 | ||
464 | ctx->ops = &pidfs_sops; | |
465 | ctx->dops = &pidfs_dentry_operations; | |
e9c5263c | 466 | fc->s_fs_info = (void *)&pidfs_stashed_ops; |
cb12fd8e CB |
467 | return 0; |
468 | } | |
469 | ||
470 | static struct file_system_type pidfs_type = { | |
471 | .name = "pidfs", | |
472 | .init_fs_context = pidfs_init_fs_context, | |
473 | .kill_sb = kill_anon_super, | |
474 | }; | |
475 | ||
476 | struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) | |
477 | { | |
478 | ||
cb12fd8e | 479 | struct file *pidfd_file; |
b28ddcc3 CB |
480 | struct path path; |
481 | int ret; | |
cb12fd8e | 482 | |
9d9539db | 483 | ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); |
b28ddcc3 CB |
484 | if (ret < 0) |
485 | return ERR_PTR(ret); | |
486 | ||
487 | pidfd_file = dentry_open(&path, flags, current_cred()); | |
488 | path_put(&path); | |
cb12fd8e CB |
489 | return pidfd_file; |
490 | } | |
491 | ||
492 | void __init pidfs_init(void) | |
493 | { | |
494 | pidfs_mnt = kern_mount(&pidfs_type); | |
495 | if (IS_ERR(pidfs_mnt)) | |
496 | panic("Failed to mount pidfs pseudo filesystem"); | |
cb12fd8e | 497 | } |