]>
Commit | Line | Data |
---|---|---|
50f4f2d1 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cb12fd8e | 2 | #include <linux/anon_inodes.h> |
50f4f2d1 CB |
3 | #include <linux/file.h> |
4 | #include <linux/fs.h> | |
5 | #include <linux/magic.h> | |
6 | #include <linux/mount.h> | |
7 | #include <linux/pid.h> | |
cb12fd8e | 8 | #include <linux/pidfs.h> |
50f4f2d1 CB |
9 | #include <linux/pid_namespace.h> |
10 | #include <linux/poll.h> | |
11 | #include <linux/proc_fs.h> | |
12 | #include <linux/proc_ns.h> | |
13 | #include <linux/pseudo_fs.h> | |
5b08bd40 | 14 | #include <linux/ptrace.h> |
50f4f2d1 CB |
15 | #include <linux/seq_file.h> |
16 | #include <uapi/linux/pidfd.h> | |
5b08bd40 CB |
17 | #include <linux/ipc_namespace.h> |
18 | #include <linux/time_namespace.h> | |
19 | #include <linux/utsname.h> | |
20 | #include <net/net_namespace.h> | |
50f4f2d1 | 21 | |
b28ddcc3 | 22 | #include "internal.h" |
5b08bd40 | 23 | #include "mount.h" |
b28ddcc3 | 24 | |
50f4f2d1 CB |
25 | #ifdef CONFIG_PROC_FS |
26 | /** | |
27 | * pidfd_show_fdinfo - print information about a pidfd | |
28 | * @m: proc fdinfo file | |
29 | * @f: file referencing a pidfd | |
30 | * | |
31 | * Pid: | |
32 | * This function will print the pid that a given pidfd refers to in the | |
33 | * pid namespace of the procfs instance. | |
34 | * If the pid namespace of the process is not a descendant of the pid | |
35 | * namespace of the procfs instance 0 will be shown as its pid. This is | |
36 | * similar to calling getppid() on a process whose parent is outside of | |
37 | * its pid namespace. | |
38 | * | |
39 | * NSpid: | |
40 | * If pid namespaces are supported then this function will also print | |
41 | * the pid of a given pidfd refers to for all descendant pid namespaces | |
42 | * starting from the current pid namespace of the instance, i.e. the | |
43 | * Pid field and the first entry in the NSpid field will be identical. | |
44 | * If the pid namespace of the process is not a descendant of the pid | |
45 | * namespace of the procfs instance 0 will be shown as its first NSpid | |
46 | * entry and no others will be shown. | |
47 | * Note that this differs from the Pid and NSpid fields in | |
48 | * /proc/<pid>/status where Pid and NSpid are always shown relative to | |
49 | * the pid namespace of the procfs instance. The difference becomes | |
50 | * obvious when sending around a pidfd between pid namespaces from a | |
51 | * different branch of the tree, i.e. where no ancestral relation is | |
52 | * present between the pid namespaces: | |
53 | * - create two new pid namespaces ns1 and ns2 in the initial pid | |
54 | * namespace (also take care to create new mount namespaces in the | |
55 | * new pid namespace and mount procfs) | |
56 | * - create a process with a pidfd in ns1 | |
57 | * - send pidfd from ns1 to ns2 | |
58 | * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid | |
59 | * have exactly one entry, which is 0 | |
60 | */ | |
61 | static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) | |
62 | { | |
cb12fd8e | 63 | struct pid *pid = pidfd_pid(f); |
50f4f2d1 CB |
64 | struct pid_namespace *ns; |
65 | pid_t nr = -1; | |
66 | ||
67 | if (likely(pid_has_task(pid, PIDTYPE_PID))) { | |
68 | ns = proc_pid_ns(file_inode(m->file)->i_sb); | |
69 | nr = pid_nr_ns(pid, ns); | |
70 | } | |
71 | ||
72 | seq_put_decimal_ll(m, "Pid:\t", nr); | |
73 | ||
74 | #ifdef CONFIG_PID_NS | |
75 | seq_put_decimal_ll(m, "\nNSpid:\t", nr); | |
76 | if (nr > 0) { | |
77 | int i; | |
78 | ||
79 | /* If nr is non-zero it means that 'pid' is valid and that | |
80 | * ns, i.e. the pid namespace associated with the procfs | |
81 | * instance, is in the pid namespace hierarchy of pid. | |
82 | * Start at one below the already printed level. | |
83 | */ | |
84 | for (i = ns->level + 1; i <= pid->level; i++) | |
85 | seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); | |
86 | } | |
87 | #endif | |
88 | seq_putc(m, '\n'); | |
89 | } | |
90 | #endif | |
91 | ||
92 | /* | |
93 | * Poll support for process exit notification. | |
94 | */ | |
95 | static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) | |
96 | { | |
cb12fd8e | 97 | struct pid *pid = pidfd_pid(file); |
50f4f2d1 CB |
98 | bool thread = file->f_flags & PIDFD_THREAD; |
99 | struct task_struct *task; | |
100 | __poll_t poll_flags = 0; | |
101 | ||
102 | poll_wait(file, &pid->wait_pidfd, pts); | |
103 | /* | |
104 | * Depending on PIDFD_THREAD, inform pollers when the thread | |
105 | * or the whole thread-group exits. | |
106 | */ | |
107 | guard(rcu)(); | |
108 | task = pid_task(pid, PIDTYPE_PID); | |
109 | if (!task) | |
110 | poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; | |
111 | else if (task->exit_state && (thread || thread_group_empty(task))) | |
112 | poll_flags = EPOLLIN | EPOLLRDNORM; | |
113 | ||
114 | return poll_flags; | |
115 | } | |
116 | ||
5b08bd40 CB |
117 | static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
118 | { | |
119 | struct task_struct *task __free(put_task) = NULL; | |
120 | struct nsproxy *nsp __free(put_nsproxy) = NULL; | |
121 | struct pid *pid = pidfd_pid(file); | |
9b3e1504 | 122 | struct ns_common *ns_common = NULL; |
5b08bd40 CB |
123 | |
124 | if (arg) | |
125 | return -EINVAL; | |
126 | ||
127 | task = get_pid_task(pid, PIDTYPE_PID); | |
128 | if (!task) | |
129 | return -ESRCH; | |
130 | ||
131 | scoped_guard(task_lock, task) { | |
132 | nsp = task->nsproxy; | |
133 | if (nsp) | |
134 | get_nsproxy(nsp); | |
135 | } | |
136 | if (!nsp) | |
137 | return -ESRCH; /* just pretend it didn't exist */ | |
138 | ||
139 | /* | |
140 | * We're trying to open a file descriptor to the namespace so perform a | |
141 | * filesystem cred ptrace check. Also, we mirror nsfs behavior. | |
142 | */ | |
143 | if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) | |
144 | return -EACCES; | |
145 | ||
146 | switch (cmd) { | |
147 | /* Namespaces that hang of nsproxy. */ | |
148 | case PIDFD_GET_CGROUP_NAMESPACE: | |
9b3e1504 CB |
149 | if (IS_ENABLED(CONFIG_CGROUPS)) { |
150 | get_cgroup_ns(nsp->cgroup_ns); | |
151 | ns_common = to_ns_common(nsp->cgroup_ns); | |
152 | } | |
5b08bd40 CB |
153 | break; |
154 | case PIDFD_GET_IPC_NAMESPACE: | |
9b3e1504 CB |
155 | if (IS_ENABLED(CONFIG_IPC_NS)) { |
156 | get_ipc_ns(nsp->ipc_ns); | |
157 | ns_common = to_ns_common(nsp->ipc_ns); | |
158 | } | |
5b08bd40 CB |
159 | break; |
160 | case PIDFD_GET_MNT_NAMESPACE: | |
161 | get_mnt_ns(nsp->mnt_ns); | |
162 | ns_common = to_ns_common(nsp->mnt_ns); | |
163 | break; | |
164 | case PIDFD_GET_NET_NAMESPACE: | |
9b3e1504 CB |
165 | if (IS_ENABLED(CONFIG_NET_NS)) { |
166 | ns_common = to_ns_common(nsp->net_ns); | |
167 | get_net_ns(ns_common); | |
168 | } | |
5b08bd40 CB |
169 | break; |
170 | case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: | |
9b3e1504 CB |
171 | if (IS_ENABLED(CONFIG_PID_NS)) { |
172 | get_pid_ns(nsp->pid_ns_for_children); | |
173 | ns_common = to_ns_common(nsp->pid_ns_for_children); | |
174 | } | |
5b08bd40 CB |
175 | break; |
176 | case PIDFD_GET_TIME_NAMESPACE: | |
9b3e1504 CB |
177 | if (IS_ENABLED(CONFIG_TIME_NS)) { |
178 | get_time_ns(nsp->time_ns); | |
179 | ns_common = to_ns_common(nsp->time_ns); | |
180 | } | |
5b08bd40 CB |
181 | break; |
182 | case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: | |
9b3e1504 CB |
183 | if (IS_ENABLED(CONFIG_TIME_NS)) { |
184 | get_time_ns(nsp->time_ns_for_children); | |
185 | ns_common = to_ns_common(nsp->time_ns_for_children); | |
186 | } | |
5b08bd40 CB |
187 | break; |
188 | case PIDFD_GET_UTS_NAMESPACE: | |
9b3e1504 CB |
189 | if (IS_ENABLED(CONFIG_UTS_NS)) { |
190 | get_uts_ns(nsp->uts_ns); | |
191 | ns_common = to_ns_common(nsp->uts_ns); | |
192 | } | |
5b08bd40 CB |
193 | break; |
194 | /* Namespaces that don't hang of nsproxy. */ | |
195 | case PIDFD_GET_USER_NAMESPACE: | |
9b3e1504 CB |
196 | if (IS_ENABLED(CONFIG_USER_NS)) { |
197 | rcu_read_lock(); | |
198 | ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); | |
199 | rcu_read_unlock(); | |
200 | } | |
5b08bd40 CB |
201 | break; |
202 | case PIDFD_GET_PID_NAMESPACE: | |
9b3e1504 CB |
203 | if (IS_ENABLED(CONFIG_PID_NS)) { |
204 | rcu_read_lock(); | |
205 | ns_common = to_ns_common( get_pid_ns(task_active_pid_ns(task))); | |
206 | rcu_read_unlock(); | |
207 | } | |
5b08bd40 CB |
208 | break; |
209 | default: | |
210 | return -ENOIOCTLCMD; | |
211 | } | |
212 | ||
9b3e1504 CB |
213 | if (!ns_common) |
214 | return -EOPNOTSUPP; | |
215 | ||
5b08bd40 CB |
216 | /* open_namespace() unconditionally consumes the reference */ |
217 | return open_namespace(ns_common); | |
218 | } | |
219 | ||
cb12fd8e | 220 | static const struct file_operations pidfs_file_operations = { |
50f4f2d1 CB |
221 | .poll = pidfd_poll, |
222 | #ifdef CONFIG_PROC_FS | |
223 | .show_fdinfo = pidfd_show_fdinfo, | |
224 | #endif | |
5b08bd40 CB |
225 | .unlocked_ioctl = pidfd_ioctl, |
226 | .compat_ioctl = compat_ptr_ioctl, | |
50f4f2d1 | 227 | }; |
cb12fd8e CB |
228 | |
229 | struct pid *pidfd_pid(const struct file *file) | |
230 | { | |
231 | if (file->f_op != &pidfs_file_operations) | |
232 | return ERR_PTR(-EBADF); | |
cb12fd8e | 233 | return file_inode(file)->i_private; |
cb12fd8e CB |
234 | } |
235 | ||
cb12fd8e | 236 | static struct vfsmount *pidfs_mnt __ro_after_init; |
cb12fd8e | 237 | |
9d9539db CB |
238 | #if BITS_PER_LONG == 32 |
239 | /* | |
240 | * Provide a fallback mechanism for 32-bit systems so processes remain | |
241 | * reliably comparable by inode number even on those systems. | |
242 | */ | |
243 | static DEFINE_IDA(pidfd_inum_ida); | |
244 | ||
245 | static int pidfs_inum(struct pid *pid, unsigned long *ino) | |
246 | { | |
247 | int ret; | |
248 | ||
249 | ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, | |
250 | UINT_MAX, GFP_ATOMIC); | |
251 | if (ret < 0) | |
252 | return -ENOSPC; | |
253 | ||
254 | *ino = ret; | |
255 | return 0; | |
256 | } | |
257 | ||
258 | static inline void pidfs_free_inum(unsigned long ino) | |
259 | { | |
260 | if (ino > 0) | |
261 | ida_free(&pidfd_inum_ida, ino); | |
262 | } | |
263 | #else | |
264 | static inline int pidfs_inum(struct pid *pid, unsigned long *ino) | |
265 | { | |
266 | *ino = pid->ino; | |
267 | return 0; | |
268 | } | |
269 | #define pidfs_free_inum(ino) ((void)(ino)) | |
270 | #endif | |
271 | ||
cb12fd8e CB |
272 | /* |
273 | * The vfs falls back to simple_setattr() if i_op->setattr() isn't | |
274 | * implemented. Let's reject it completely until we have a clean | |
275 | * permission concept for pidfds. | |
276 | */ | |
277 | static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, | |
278 | struct iattr *attr) | |
279 | { | |
280 | return -EOPNOTSUPP; | |
281 | } | |
282 | ||
db3d841a LT |
283 | |
284 | /* | |
285 | * User space expects pidfs inodes to have no file type in st_mode. | |
286 | * | |
287 | * In particular, 'lsof' has this legacy logic: | |
288 | * | |
289 | * type = s->st_mode & S_IFMT; | |
290 | * switch (type) { | |
291 | * ... | |
292 | * case 0: | |
293 | * if (!strcmp(p, "anon_inode")) | |
294 | * Lf->ntype = Ntype = N_ANON_INODE; | |
295 | * | |
296 | * to detect our old anon_inode logic. | |
297 | * | |
298 | * Rather than mess with our internal sane inode data, just fix it | |
299 | * up here in getattr() by masking off the format bits. | |
300 | */ | |
cb12fd8e CB |
301 | static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, |
302 | struct kstat *stat, u32 request_mask, | |
303 | unsigned int query_flags) | |
304 | { | |
305 | struct inode *inode = d_inode(path->dentry); | |
306 | ||
307 | generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); | |
db3d841a | 308 | stat->mode &= ~S_IFMT; |
cb12fd8e CB |
309 | return 0; |
310 | } | |
311 | ||
312 | static const struct inode_operations pidfs_inode_operations = { | |
313 | .getattr = pidfs_getattr, | |
314 | .setattr = pidfs_setattr, | |
315 | }; | |
316 | ||
317 | static void pidfs_evict_inode(struct inode *inode) | |
318 | { | |
319 | struct pid *pid = inode->i_private; | |
320 | ||
321 | clear_inode(inode); | |
322 | put_pid(pid); | |
9d9539db | 323 | pidfs_free_inum(inode->i_ino); |
cb12fd8e CB |
324 | } |
325 | ||
326 | static const struct super_operations pidfs_sops = { | |
327 | .drop_inode = generic_delete_inode, | |
328 | .evict_inode = pidfs_evict_inode, | |
329 | .statfs = simple_statfs, | |
330 | }; | |
331 | ||
db3d841a LT |
/*
 * Build the name shown for a pidfs dentry.
 *
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
340 | ||
341 | static const struct dentry_operations pidfs_dentry_operations = { | |
342 | .d_delete = always_delete_dentry, | |
343 | .d_dname = pidfs_dname, | |
2558e3b2 | 344 | .d_prune = stashed_dentry_prune, |
cb12fd8e CB |
345 | }; |
346 | ||
9d9539db | 347 | static int pidfs_init_inode(struct inode *inode, void *data) |
e9c5263c CB |
348 | { |
349 | inode->i_private = data; | |
350 | inode->i_flags |= S_PRIVATE; | |
351 | inode->i_mode |= S_IRWXU; | |
352 | inode->i_op = &pidfs_inode_operations; | |
353 | inode->i_fop = &pidfs_file_operations; | |
9d9539db CB |
354 | /* |
355 | * Inode numbering for pidfs start at RESERVED_PIDS + 1. This | |
356 | * avoids collisions with the root inode which is 1 for pseudo | |
357 | * filesystems. | |
358 | */ | |
359 | return pidfs_inum(data, &inode->i_ino); | |
e9c5263c CB |
360 | } |
361 | ||
/* Drop the struct pid reference carried in @data. */
static void pidfs_put_data(void *data)
{
	put_pid(data);
}
367 | ||
368 | static const struct stashed_operations pidfs_stashed_ops = { | |
369 | .init_inode = pidfs_init_inode, | |
370 | .put_data = pidfs_put_data, | |
371 | }; | |
372 | ||
cb12fd8e CB |
373 | static int pidfs_init_fs_context(struct fs_context *fc) |
374 | { | |
375 | struct pseudo_fs_context *ctx; | |
376 | ||
377 | ctx = init_pseudo(fc, PID_FS_MAGIC); | |
378 | if (!ctx) | |
379 | return -ENOMEM; | |
380 | ||
381 | ctx->ops = &pidfs_sops; | |
382 | ctx->dops = &pidfs_dentry_operations; | |
e9c5263c | 383 | fc->s_fs_info = (void *)&pidfs_stashed_ops; |
cb12fd8e CB |
384 | return 0; |
385 | } | |
386 | ||
387 | static struct file_system_type pidfs_type = { | |
388 | .name = "pidfs", | |
389 | .init_fs_context = pidfs_init_fs_context, | |
390 | .kill_sb = kill_anon_super, | |
391 | }; | |
392 | ||
393 | struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) | |
394 | { | |
395 | ||
cb12fd8e | 396 | struct file *pidfd_file; |
b28ddcc3 CB |
397 | struct path path; |
398 | int ret; | |
cb12fd8e | 399 | |
9d9539db | 400 | ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); |
b28ddcc3 CB |
401 | if (ret < 0) |
402 | return ERR_PTR(ret); | |
403 | ||
404 | pidfd_file = dentry_open(&path, flags, current_cred()); | |
405 | path_put(&path); | |
cb12fd8e CB |
406 | return pidfd_file; |
407 | } | |
408 | ||
409 | void __init pidfs_init(void) | |
410 | { | |
411 | pidfs_mnt = kern_mount(&pidfs_type); | |
412 | if (IS_ERR(pidfs_mnt)) | |
413 | panic("Failed to mount pidfs pseudo filesystem"); | |
cb12fd8e | 414 | } |