]>
Commit | Line | Data |
---|---|---|
50f4f2d1 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cb12fd8e | 2 | #include <linux/anon_inodes.h> |
50f4f2d1 CB |
3 | #include <linux/file.h> |
4 | #include <linux/fs.h> | |
5 | #include <linux/magic.h> | |
6 | #include <linux/mount.h> | |
7 | #include <linux/pid.h> | |
cb12fd8e | 8 | #include <linux/pidfs.h> |
50f4f2d1 CB |
9 | #include <linux/pid_namespace.h> |
10 | #include <linux/poll.h> | |
11 | #include <linux/proc_fs.h> | |
12 | #include <linux/proc_ns.h> | |
13 | #include <linux/pseudo_fs.h> | |
14 | #include <linux/seq_file.h> | |
15 | #include <uapi/linux/pidfd.h> | |
16 | ||
17 | static int pidfd_release(struct inode *inode, struct file *file) | |
18 | { | |
cb12fd8e | 19 | #ifndef CONFIG_FS_PID |
50f4f2d1 CB |
20 | struct pid *pid = file->private_data; |
21 | ||
22 | file->private_data = NULL; | |
23 | put_pid(pid); | |
cb12fd8e | 24 | #endif |
50f4f2d1 CB |
25 | return 0; |
26 | } | |
27 | ||
28 | #ifdef CONFIG_PROC_FS | |
29 | /** | |
30 | * pidfd_show_fdinfo - print information about a pidfd | |
31 | * @m: proc fdinfo file | |
32 | * @f: file referencing a pidfd | |
33 | * | |
34 | * Pid: | |
35 | * This function will print the pid that a given pidfd refers to in the | |
36 | * pid namespace of the procfs instance. | |
37 | * If the pid namespace of the process is not a descendant of the pid | |
38 | * namespace of the procfs instance 0 will be shown as its pid. This is | |
39 | * similar to calling getppid() on a process whose parent is outside of | |
40 | * its pid namespace. | |
41 | * | |
42 | * NSpid: | |
43 | * If pid namespaces are supported then this function will also print | |
44 | * the pid of a given pidfd refers to for all descendant pid namespaces | |
45 | * starting from the current pid namespace of the instance, i.e. the | |
46 | * Pid field and the first entry in the NSpid field will be identical. | |
47 | * If the pid namespace of the process is not a descendant of the pid | |
48 | * namespace of the procfs instance 0 will be shown as its first NSpid | |
49 | * entry and no others will be shown. | |
50 | * Note that this differs from the Pid and NSpid fields in | |
51 | * /proc/<pid>/status where Pid and NSpid are always shown relative to | |
52 | * the pid namespace of the procfs instance. The difference becomes | |
53 | * obvious when sending around a pidfd between pid namespaces from a | |
54 | * different branch of the tree, i.e. where no ancestral relation is | |
55 | * present between the pid namespaces: | |
56 | * - create two new pid namespaces ns1 and ns2 in the initial pid | |
57 | * namespace (also take care to create new mount namespaces in the | |
58 | * new pid namespace and mount procfs) | |
59 | * - create a process with a pidfd in ns1 | |
60 | * - send pidfd from ns1 to ns2 | |
61 | * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid | |
62 | * have exactly one entry, which is 0 | |
63 | */ | |
64 | static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) | |
65 | { | |
cb12fd8e | 66 | struct pid *pid = pidfd_pid(f); |
50f4f2d1 CB |
67 | struct pid_namespace *ns; |
68 | pid_t nr = -1; | |
69 | ||
70 | if (likely(pid_has_task(pid, PIDTYPE_PID))) { | |
71 | ns = proc_pid_ns(file_inode(m->file)->i_sb); | |
72 | nr = pid_nr_ns(pid, ns); | |
73 | } | |
74 | ||
75 | seq_put_decimal_ll(m, "Pid:\t", nr); | |
76 | ||
77 | #ifdef CONFIG_PID_NS | |
78 | seq_put_decimal_ll(m, "\nNSpid:\t", nr); | |
79 | if (nr > 0) { | |
80 | int i; | |
81 | ||
82 | /* If nr is non-zero it means that 'pid' is valid and that | |
83 | * ns, i.e. the pid namespace associated with the procfs | |
84 | * instance, is in the pid namespace hierarchy of pid. | |
85 | * Start at one below the already printed level. | |
86 | */ | |
87 | for (i = ns->level + 1; i <= pid->level; i++) | |
88 | seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); | |
89 | } | |
90 | #endif | |
91 | seq_putc(m, '\n'); | |
92 | } | |
93 | #endif | |
94 | ||
95 | /* | |
96 | * Poll support for process exit notification. | |
97 | */ | |
98 | static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) | |
99 | { | |
cb12fd8e | 100 | struct pid *pid = pidfd_pid(file); |
50f4f2d1 CB |
101 | bool thread = file->f_flags & PIDFD_THREAD; |
102 | struct task_struct *task; | |
103 | __poll_t poll_flags = 0; | |
104 | ||
105 | poll_wait(file, &pid->wait_pidfd, pts); | |
106 | /* | |
107 | * Depending on PIDFD_THREAD, inform pollers when the thread | |
108 | * or the whole thread-group exits. | |
109 | */ | |
110 | guard(rcu)(); | |
111 | task = pid_task(pid, PIDTYPE_PID); | |
112 | if (!task) | |
113 | poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; | |
114 | else if (task->exit_state && (thread || thread_group_empty(task))) | |
115 | poll_flags = EPOLLIN | EPOLLRDNORM; | |
116 | ||
117 | return poll_flags; | |
118 | } | |
119 | ||
cb12fd8e | 120 | static const struct file_operations pidfs_file_operations = { |
50f4f2d1 CB |
121 | .release = pidfd_release, |
122 | .poll = pidfd_poll, | |
123 | #ifdef CONFIG_PROC_FS | |
124 | .show_fdinfo = pidfd_show_fdinfo, | |
125 | #endif | |
126 | }; | |
cb12fd8e CB |
127 | |
128 | struct pid *pidfd_pid(const struct file *file) | |
129 | { | |
130 | if (file->f_op != &pidfs_file_operations) | |
131 | return ERR_PTR(-EBADF); | |
132 | #ifdef CONFIG_FS_PID | |
133 | return file_inode(file)->i_private; | |
134 | #else | |
135 | return file->private_data; | |
136 | #endif | |
137 | } | |
138 | ||
139 | #ifdef CONFIG_FS_PID | |
140 | static struct vfsmount *pidfs_mnt __ro_after_init; | |
141 | static struct super_block *pidfs_sb __ro_after_init; | |
142 | ||
143 | /* | |
144 | * The vfs falls back to simple_setattr() if i_op->setattr() isn't | |
145 | * implemented. Let's reject it completely until we have a clean | |
146 | * permission concept for pidfds. | |
147 | */ | |
148 | static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, | |
149 | struct iattr *attr) | |
150 | { | |
151 | return -EOPNOTSUPP; | |
152 | } | |
153 | ||
154 | static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, | |
155 | struct kstat *stat, u32 request_mask, | |
156 | unsigned int query_flags) | |
157 | { | |
158 | struct inode *inode = d_inode(path->dentry); | |
159 | ||
160 | generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); | |
161 | return 0; | |
162 | } | |
163 | ||
164 | static const struct inode_operations pidfs_inode_operations = { | |
165 | .getattr = pidfs_getattr, | |
166 | .setattr = pidfs_setattr, | |
167 | }; | |
168 | ||
169 | static void pidfs_evict_inode(struct inode *inode) | |
170 | { | |
171 | struct pid *pid = inode->i_private; | |
172 | ||
173 | clear_inode(inode); | |
174 | put_pid(pid); | |
175 | } | |
176 | ||
177 | static const struct super_operations pidfs_sops = { | |
178 | .drop_inode = generic_delete_inode, | |
179 | .evict_inode = pidfs_evict_inode, | |
180 | .statfs = simple_statfs, | |
181 | }; | |
182 | ||
183 | static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) | |
184 | { | |
185 | return dynamic_dname(buffer, buflen, "pidfd:[%lu]", | |
186 | d_inode(dentry)->i_ino); | |
187 | } | |
188 | ||
189 | static const struct dentry_operations pidfs_dentry_operations = { | |
190 | .d_delete = always_delete_dentry, | |
191 | .d_dname = pidfs_dname, | |
192 | }; | |
193 | ||
194 | static int pidfs_init_fs_context(struct fs_context *fc) | |
195 | { | |
196 | struct pseudo_fs_context *ctx; | |
197 | ||
198 | ctx = init_pseudo(fc, PID_FS_MAGIC); | |
199 | if (!ctx) | |
200 | return -ENOMEM; | |
201 | ||
202 | ctx->ops = &pidfs_sops; | |
203 | ctx->dops = &pidfs_dentry_operations; | |
204 | return 0; | |
205 | } | |
206 | ||
207 | static struct file_system_type pidfs_type = { | |
208 | .name = "pidfs", | |
209 | .init_fs_context = pidfs_init_fs_context, | |
210 | .kill_sb = kill_anon_super, | |
211 | }; | |
212 | ||
213 | struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) | |
214 | { | |
215 | ||
216 | struct inode *inode; | |
217 | struct file *pidfd_file; | |
218 | ||
219 | inode = iget_locked(pidfs_sb, pid->ino); | |
220 | if (!inode) | |
221 | return ERR_PTR(-ENOMEM); | |
222 | ||
223 | if (inode->i_state & I_NEW) { | |
224 | /* | |
225 | * Inode numbering for pidfs start at RESERVED_PIDS + 1. | |
226 | * This avoids collisions with the root inode which is 1 | |
227 | * for pseudo filesystems. | |
228 | */ | |
229 | inode->i_ino = pid->ino; | |
230 | inode->i_mode = S_IFREG | S_IRUGO; | |
231 | inode->i_op = &pidfs_inode_operations; | |
232 | inode->i_fop = &pidfs_file_operations; | |
233 | inode->i_flags |= S_IMMUTABLE; | |
234 | inode->i_private = get_pid(pid); | |
235 | simple_inode_init_ts(inode); | |
236 | unlock_new_inode(inode); | |
237 | } | |
238 | ||
239 | pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags, | |
240 | &pidfs_file_operations); | |
241 | if (IS_ERR(pidfd_file)) | |
242 | iput(inode); | |
243 | ||
244 | return pidfd_file; | |
245 | } | |
246 | ||
247 | void __init pidfs_init(void) | |
248 | { | |
249 | pidfs_mnt = kern_mount(&pidfs_type); | |
250 | if (IS_ERR(pidfs_mnt)) | |
251 | panic("Failed to mount pidfs pseudo filesystem"); | |
252 | ||
253 | pidfs_sb = pidfs_mnt->mnt_sb; | |
254 | } | |
255 | ||
256 | #else /* !CONFIG_FS_PID */ | |
257 | ||
258 | struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) | |
259 | { | |
260 | struct file *pidfd_file; | |
261 | ||
262 | pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, | |
263 | flags | O_RDWR); | |
264 | if (IS_ERR(pidfd_file)) | |
265 | return pidfd_file; | |
266 | ||
267 | get_pid(pid); | |
268 | return pidfd_file; | |
269 | } | |
270 | ||
271 | void __init pidfs_init(void) { } | |
272 | #endif |