]>
Commit | Line | Data |
---|---|---|
50f4f2d1 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cb12fd8e | 2 | #include <linux/anon_inodes.h> |
50f4f2d1 CB |
3 | #include <linux/file.h> |
4 | #include <linux/fs.h> | |
5 | #include <linux/magic.h> | |
6 | #include <linux/mount.h> | |
7 | #include <linux/pid.h> | |
cb12fd8e | 8 | #include <linux/pidfs.h> |
50f4f2d1 CB |
9 | #include <linux/pid_namespace.h> |
10 | #include <linux/poll.h> | |
11 | #include <linux/proc_fs.h> | |
12 | #include <linux/proc_ns.h> | |
13 | #include <linux/pseudo_fs.h> | |
14 | #include <linux/seq_file.h> | |
15 | #include <uapi/linux/pidfd.h> | |
16 | ||
b28ddcc3 CB |
17 | #include "internal.h" |
18 | ||
50f4f2d1 CB |
19 | #ifdef CONFIG_PROC_FS |
20 | /** | |
21 | * pidfd_show_fdinfo - print information about a pidfd | |
22 | * @m: proc fdinfo file | |
23 | * @f: file referencing a pidfd | |
24 | * | |
25 | * Pid: | |
26 | * This function will print the pid that a given pidfd refers to in the | |
27 | * pid namespace of the procfs instance. | |
28 | * If the pid namespace of the process is not a descendant of the pid | |
29 | * namespace of the procfs instance 0 will be shown as its pid. This is | |
30 | * similar to calling getppid() on a process whose parent is outside of | |
31 | * its pid namespace. | |
32 | * | |
33 | * NSpid: | |
34 | * If pid namespaces are supported then this function will also print | |
35 | * the pid of a given pidfd refers to for all descendant pid namespaces | |
36 | * starting from the current pid namespace of the instance, i.e. the | |
37 | * Pid field and the first entry in the NSpid field will be identical. | |
38 | * If the pid namespace of the process is not a descendant of the pid | |
39 | * namespace of the procfs instance 0 will be shown as its first NSpid | |
40 | * entry and no others will be shown. | |
41 | * Note that this differs from the Pid and NSpid fields in | |
42 | * /proc/<pid>/status where Pid and NSpid are always shown relative to | |
43 | * the pid namespace of the procfs instance. The difference becomes | |
44 | * obvious when sending around a pidfd between pid namespaces from a | |
45 | * different branch of the tree, i.e. where no ancestral relation is | |
46 | * present between the pid namespaces: | |
47 | * - create two new pid namespaces ns1 and ns2 in the initial pid | |
48 | * namespace (also take care to create new mount namespaces in the | |
49 | * new pid namespace and mount procfs) | |
50 | * - create a process with a pidfd in ns1 | |
51 | * - send pidfd from ns1 to ns2 | |
52 | * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid | |
53 | * have exactly one entry, which is 0 | |
54 | */ | |
55 | static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) | |
56 | { | |
cb12fd8e | 57 | struct pid *pid = pidfd_pid(f); |
50f4f2d1 CB |
58 | struct pid_namespace *ns; |
59 | pid_t nr = -1; | |
60 | ||
61 | if (likely(pid_has_task(pid, PIDTYPE_PID))) { | |
62 | ns = proc_pid_ns(file_inode(m->file)->i_sb); | |
63 | nr = pid_nr_ns(pid, ns); | |
64 | } | |
65 | ||
66 | seq_put_decimal_ll(m, "Pid:\t", nr); | |
67 | ||
68 | #ifdef CONFIG_PID_NS | |
69 | seq_put_decimal_ll(m, "\nNSpid:\t", nr); | |
70 | if (nr > 0) { | |
71 | int i; | |
72 | ||
73 | /* If nr is non-zero it means that 'pid' is valid and that | |
74 | * ns, i.e. the pid namespace associated with the procfs | |
75 | * instance, is in the pid namespace hierarchy of pid. | |
76 | * Start at one below the already printed level. | |
77 | */ | |
78 | for (i = ns->level + 1; i <= pid->level; i++) | |
79 | seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); | |
80 | } | |
81 | #endif | |
82 | seq_putc(m, '\n'); | |
83 | } | |
84 | #endif | |
85 | ||
86 | /* | |
87 | * Poll support for process exit notification. | |
88 | */ | |
89 | static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) | |
90 | { | |
cb12fd8e | 91 | struct pid *pid = pidfd_pid(file); |
50f4f2d1 CB |
92 | bool thread = file->f_flags & PIDFD_THREAD; |
93 | struct task_struct *task; | |
94 | __poll_t poll_flags = 0; | |
95 | ||
96 | poll_wait(file, &pid->wait_pidfd, pts); | |
97 | /* | |
98 | * Depending on PIDFD_THREAD, inform pollers when the thread | |
99 | * or the whole thread-group exits. | |
100 | */ | |
101 | guard(rcu)(); | |
102 | task = pid_task(pid, PIDTYPE_PID); | |
103 | if (!task) | |
104 | poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; | |
105 | else if (task->exit_state && (thread || thread_group_empty(task))) | |
106 | poll_flags = EPOLLIN | EPOLLRDNORM; | |
107 | ||
108 | return poll_flags; | |
109 | } | |
110 | ||
cb12fd8e | 111 | static const struct file_operations pidfs_file_operations = { |
50f4f2d1 CB |
112 | .poll = pidfd_poll, |
113 | #ifdef CONFIG_PROC_FS | |
114 | .show_fdinfo = pidfd_show_fdinfo, | |
115 | #endif | |
116 | }; | |
cb12fd8e CB |
117 | |
118 | struct pid *pidfd_pid(const struct file *file) | |
119 | { | |
120 | if (file->f_op != &pidfs_file_operations) | |
121 | return ERR_PTR(-EBADF); | |
cb12fd8e | 122 | return file_inode(file)->i_private; |
cb12fd8e CB |
123 | } |
124 | ||
/*
 * Mount of the pidfs pseudo filesystem. Assigned from kern_mount() in
 * pidfs_init() and passed to path_from_stashed() in pidfs_alloc_file().
 */
cb12fd8e | 125 | static struct vfsmount *pidfs_mnt __ro_after_init;
cb12fd8e | 126 | |
9d9539db CB |
127 | #if BITS_PER_LONG == 32 |
128 | /* | |
129 | * Provide a fallback mechanism for 32-bit systems so processes remain | |
130 | * reliably comparable by inode number even on those systems. | |
131 | */ | |
132 | static DEFINE_IDA(pidfd_inum_ida); | |
133 | ||
134 | static int pidfs_inum(struct pid *pid, unsigned long *ino) | |
135 | { | |
136 | int ret; | |
137 | ||
138 | ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, | |
139 | UINT_MAX, GFP_ATOMIC); | |
140 | if (ret < 0) | |
141 | return -ENOSPC; | |
142 | ||
143 | *ino = ret; | |
144 | return 0; | |
145 | } | |
146 | ||
147 | static inline void pidfs_free_inum(unsigned long ino) | |
148 | { | |
149 | if (ino > 0) | |
150 | ida_free(&pidfd_inum_ida, ino); | |
151 | } | |
152 | #else | |
153 | static inline int pidfs_inum(struct pid *pid, unsigned long *ino) | |
154 | { | |
155 | *ino = pid->ino; | |
156 | return 0; | |
157 | } | |
158 | #define pidfs_free_inum(ino) ((void)(ino)) | |
159 | #endif | |
160 | ||
cb12fd8e CB |
/*
 * Refuse every attribute change on pidfs inodes.
 *
 * Without an i_op->setattr() implementation the vfs would fall back to
 * simple_setattr(). Reject the operation outright until there is a
 * clean permission concept for pidfds.
 *
 * Return: always -EOPNOTSUPP.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return -EOPNOTSUPP;
}
171 | ||
db3d841a LT |
172 | |
173 | /* | |
174 | * User space expects pidfs inodes to have no file type in st_mode. | |
175 | * | |
176 | * In particular, 'lsof' has this legacy logic: | |
177 | * | |
178 | * type = s->st_mode & S_IFMT; | |
179 | * switch (type) { | |
180 | * ... | |
181 | * case 0: | |
182 | * if (!strcmp(p, "anon_inode")) | |
183 | * Lf->ntype = Ntype = N_ANON_INODE; | |
184 | * | |
185 | * to detect our old anon_inode logic. | |
186 | * | |
187 | * Rather than mess with our internal sane inode data, just fix it | |
188 | * up here in getattr() by masking off the format bits. | |
189 | */ | |
cb12fd8e CB |
190 | static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, |
191 | struct kstat *stat, u32 request_mask, | |
192 | unsigned int query_flags) | |
193 | { | |
194 | struct inode *inode = d_inode(path->dentry); | |
195 | ||
196 | generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); | |
db3d841a | 197 | stat->mode &= ~S_IFMT; |
cb12fd8e CB |
198 | return 0; |
199 | } | |
200 | ||
201 | static const struct inode_operations pidfs_inode_operations = { | |
202 | .getattr = pidfs_getattr, | |
203 | .setattr = pidfs_setattr, | |
204 | }; | |
205 | ||
206 | static void pidfs_evict_inode(struct inode *inode) | |
207 | { | |
208 | struct pid *pid = inode->i_private; | |
209 | ||
210 | clear_inode(inode); | |
211 | put_pid(pid); | |
9d9539db | 212 | pidfs_free_inum(inode->i_ino); |
cb12fd8e CB |
213 | } |
214 | ||
215 | static const struct super_operations pidfs_sops = { | |
216 | .drop_inode = generic_delete_inode, | |
217 | .evict_inode = pidfs_evict_inode, | |
218 | .statfs = simple_statfs, | |
219 | }; | |
220 | ||
db3d841a LT |
/*
 * Produce the name shown for a pidfs dentry.
 *
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	char *name = dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");

	return name;
}
229 | ||
230 | static const struct dentry_operations pidfs_dentry_operations = { | |
231 | .d_delete = always_delete_dentry, | |
232 | .d_dname = pidfs_dname, | |
2558e3b2 | 233 | .d_prune = stashed_dentry_prune, |
cb12fd8e CB |
234 | }; |
235 | ||
9d9539db | 236 | static int pidfs_init_inode(struct inode *inode, void *data) |
e9c5263c CB |
237 | { |
238 | inode->i_private = data; | |
239 | inode->i_flags |= S_PRIVATE; | |
240 | inode->i_mode |= S_IRWXU; | |
241 | inode->i_op = &pidfs_inode_operations; | |
242 | inode->i_fop = &pidfs_file_operations; | |
9d9539db CB |
243 | /* |
244 | * Inode numbering for pidfs start at RESERVED_PIDS + 1. This | |
245 | * avoids collisions with the root inode which is 1 for pseudo | |
246 | * filesystems. | |
247 | */ | |
248 | return pidfs_inum(data, &inode->i_ino); | |
e9c5263c CB |
249 | } |
250 | ||
/*
 * stashed_operations hook: drop the struct pid reference carried in
 * @data.
 */
static void pidfs_put_data(void *data)
{
	put_pid(data);
}
256 | ||
257 | static const struct stashed_operations pidfs_stashed_ops = { | |
258 | .init_inode = pidfs_init_inode, | |
259 | .put_data = pidfs_put_data, | |
260 | }; | |
261 | ||
cb12fd8e CB |
262 | static int pidfs_init_fs_context(struct fs_context *fc) |
263 | { | |
264 | struct pseudo_fs_context *ctx; | |
265 | ||
266 | ctx = init_pseudo(fc, PID_FS_MAGIC); | |
267 | if (!ctx) | |
268 | return -ENOMEM; | |
269 | ||
270 | ctx->ops = &pidfs_sops; | |
271 | ctx->dops = &pidfs_dentry_operations; | |
e9c5263c | 272 | fc->s_fs_info = (void *)&pidfs_stashed_ops; |
cb12fd8e CB |
273 | return 0; |
274 | } | |
275 | ||
276 | static struct file_system_type pidfs_type = { | |
277 | .name = "pidfs", | |
278 | .init_fs_context = pidfs_init_fs_context, | |
279 | .kill_sb = kill_anon_super, | |
280 | }; | |
281 | ||
282 | struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) | |
283 | { | |
284 | ||
cb12fd8e | 285 | struct file *pidfd_file; |
b28ddcc3 CB |
286 | struct path path; |
287 | int ret; | |
cb12fd8e | 288 | |
9d9539db | 289 | ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); |
b28ddcc3 CB |
290 | if (ret < 0) |
291 | return ERR_PTR(ret); | |
292 | ||
293 | pidfd_file = dentry_open(&path, flags, current_cred()); | |
294 | path_put(&path); | |
cb12fd8e CB |
295 | return pidfd_file; |
296 | } | |
297 | ||
298 | void __init pidfs_init(void) | |
299 | { | |
300 | pidfs_mnt = kern_mount(&pidfs_type); | |
301 | if (IS_ERR(pidfs_mnt)) | |
302 | panic("Failed to mount pidfs pseudo filesystem"); | |
cb12fd8e | 303 | } |