]> Git Repo - linux.git/blame - mm/memfd.c
Merge tag 'spi-v6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi
[linux.git] / mm / memfd.c
CommitLineData
5d752600
MK
1/*
2 * memfd_create system call and file sealing support
3 *
4 * Code was originally included in shmem.c, and broken out to facilitate
5 * use by hugetlbfs as well as tmpfs.
6 *
7 * This file is released under the GPL.
8 */
9
10#include <linux/fs.h>
11#include <linux/vfs.h>
12#include <linux/pagemap.h>
13#include <linux/file.h>
14#include <linux/mm.h>
15#include <linux/sched/signal.h>
16#include <linux/khugepaged.h>
17#include <linux/syscalls.h>
18#include <linux/hugetlb.h>
19#include <linux/shmem_fs.h>
20#include <linux/memfd.h>
105ff533 21#include <linux/pid_namespace.h>
5d752600
MK
22#include <uapi/linux/memfd.h>
23
/*
 * Pinned folios are tracked with an xarray mark. Introducing a new tag would
 * expand every xa_node by 8 bytes, so an existing one is reused instead:
 * PAGECACHE_TAG_TOWRITE, which we firmly believe is never set or cleared on
 * tmpfs or hugetlbfs because they are memory-only filesystems.
 */
#define MEMFD_TAG_PINNED	PAGECACHE_TAG_TOWRITE

/* Index of the final rescan in memfd_wait_for_pins(); about 150ms max. */
#define LAST_SCAN		4
31
b4d02baa
DH
32static bool memfd_folio_has_extra_refs(struct folio *folio)
33{
34 return folio_ref_count(folio) - folio_mapcount(folio) !=
35 folio_nr_pages(folio);
36}
37
ef3038a5 38static void memfd_tag_pins(struct xa_state *xas)
5d752600 39{
b4d02baa 40 struct folio *folio;
f2b277c4 41 int latency = 0;
5d752600
MK
42
43 lru_add_drain();
5d752600 44
ef3038a5 45 xas_lock_irq(xas);
b4d02baa
DH
46 xas_for_each(xas, folio, ULONG_MAX) {
47 if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
ef3038a5 48 xas_set_mark(xas, MEMFD_TAG_PINNED);
5d752600 49
b4d02baa 50 if (++latency < XA_CHECK_SCHED)
ef3038a5 51 continue;
f2b277c4 52 latency = 0;
ef3038a5
MW
53
54 xas_pause(xas);
55 xas_unlock_irq(xas);
56 cond_resched();
57 xas_lock_irq(xas);
5d752600 58 }
ef3038a5 59 xas_unlock_irq(xas);
5d752600
MK
60}
61
89c1905d
VK
62/*
63 * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c).
64 * It is mainly called to allocate a folio in a memfd when the caller
65 * (memfd_pin_folios()) cannot find a folio in the page cache at a given
66 * index in the mapping.
67 */
68struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
69{
70#ifdef CONFIG_HUGETLB_PAGE
71 struct folio *folio;
72 gfp_t gfp_mask;
73 int err;
74
75 if (is_file_hugepages(memfd)) {
76 /*
77 * The folio would most likely be accessed by a DMA driver,
78 * therefore, we have zone memory constraints where we can
79 * alloc from. Also, the folio will be pinned for an indefinite
80 * amount of time, so it is not expected to be migrated away.
81 */
82 gfp_mask = htlb_alloc_mask(hstate_file(memfd));
83 gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
84
85 folio = alloc_hugetlb_folio_nodemask(hstate_file(memfd),
86 numa_node_id(),
87 NULL,
88 gfp_mask,
89 false);
90 if (folio && folio_try_get(folio)) {
91 err = hugetlb_add_to_page_cache(folio,
92 memfd->f_mapping,
93 idx);
94 if (err) {
95 folio_put(folio);
96 free_huge_folio(folio);
97 return ERR_PTR(err);
98 }
99 return folio;
100 }
101 return ERR_PTR(-ENOMEM);
102 }
103#endif
104 return shmem_read_folio(memfd->f_mapping, idx);
105}
106
5d752600
MK
107/*
108 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
109 * via get_user_pages(), drivers might have some pending I/O without any active
b4d02baa 110 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
5d752600
MK
111 * and see whether it has an elevated ref-count. If so, we tag them and wait for
112 * them to be dropped.
113 * The caller must guarantee that no new user will acquire writable references
b4d02baa 114 * to those folios to avoid races.
5d752600
MK
115 */
116static int memfd_wait_for_pins(struct address_space *mapping)
117{
2313216f 118 XA_STATE(xas, &mapping->i_pages, 0);
b4d02baa 119 struct folio *folio;
5d752600
MK
120 int error, scan;
121
ef3038a5 122 memfd_tag_pins(&xas);
5d752600
MK
123
124 error = 0;
125 for (scan = 0; scan <= LAST_SCAN; scan++) {
f2b277c4 126 int latency = 0;
2313216f
MW
127
128 if (!xas_marked(&xas, MEMFD_TAG_PINNED))
5d752600
MK
129 break;
130
131 if (!scan)
132 lru_add_drain_all();
133 else if (schedule_timeout_killable((HZ << scan) / 200))
134 scan = LAST_SCAN;
135
2313216f
MW
136 xas_set(&xas, 0);
137 xas_lock_irq(&xas);
b4d02baa 138 xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
2313216f 139 bool clear = true;
f2b277c4 140
b4d02baa
DH
141 if (!xa_is_value(folio) &&
142 memfd_folio_has_extra_refs(folio)) {
5d752600
MK
143 /*
144 * On the last scan, we clean up all those tags
145 * we inserted; but make a note that we still
b4d02baa 146 * found folios pinned.
5d752600 147 */
2313216f
MW
148 if (scan == LAST_SCAN)
149 error = -EBUSY;
150 else
151 clear = false;
5d752600 152 }
2313216f
MW
153 if (clear)
154 xas_clear_mark(&xas, MEMFD_TAG_PINNED);
f2b277c4 155
b4d02baa 156 if (++latency < XA_CHECK_SCHED)
2313216f 157 continue;
f2b277c4 158 latency = 0;
5d752600 159
2313216f
MW
160 xas_pause(&xas);
161 xas_unlock_irq(&xas);
162 cond_resched();
163 xas_lock_irq(&xas);
5d752600 164 }
2313216f 165 xas_unlock_irq(&xas);
5d752600
MK
166 }
167
168 return error;
169}
170
171static unsigned int *memfd_file_seals_ptr(struct file *file)
172{
173 if (shmem_file(file))
174 return &SHMEM_I(file_inode(file))->seals;
175
176#ifdef CONFIG_HUGETLBFS
177 if (is_file_hugepages(file))
178 return &HUGETLBFS_I(file_inode(file))->seals;
179#endif
180
181 return NULL;
182}
183
/* Every seal bit accepted by memfd_add_seals(); other bits yield -EINVAL. */
#define F_ALL_SEALS	(F_SEAL_SEAL | \
			 F_SEAL_SHRINK | \
			 F_SEAL_GROW | \
			 F_SEAL_WRITE | \
			 F_SEAL_FUTURE_WRITE | \
			 F_SEAL_EXEC)
5d752600
MK
190
191static int memfd_add_seals(struct file *file, unsigned int seals)
192{
193 struct inode *inode = file_inode(file);
194 unsigned int *file_seals;
195 int error;
196
197 /*
198 * SEALING
199 * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
200 * but restrict access to a specific subset of file operations. Seals
201 * can only be added, but never removed. This way, mutually untrusted
202 * parties can share common memory regions with a well-defined policy.
203 * A malicious peer can thus never perform unwanted operations on a
204 * shared object.
205 *
206 * Seals are only supported on special tmpfs or hugetlbfs files and
207 * always affect the whole underlying inode. Once a seal is set, it
208 * may prevent some kinds of access to the file. Currently, the
209 * following seals are defined:
210 * SEAL_SEAL: Prevent further seals from being set on this file
211 * SEAL_SHRINK: Prevent the file from shrinking
212 * SEAL_GROW: Prevent the file from growing
213 * SEAL_WRITE: Prevent write access to the file
6fd73538 214 * SEAL_EXEC: Prevent modification of the exec bits in the file mode
5d752600
MK
215 *
216 * As we don't require any trust relationship between two parties, we
217 * must prevent seals from being removed. Therefore, sealing a file
218 * only adds a given set of seals to the file, it never touches
219 * existing seals. Furthermore, the "setting seals"-operation can be
220 * sealed itself, which basically prevents any further seal from being
221 * added.
222 *
223 * Semantics of sealing are only defined on volatile files. Only
224 * anonymous tmpfs and hugetlbfs files support sealing. More
225 * importantly, seals are never written to disk. Therefore, there's
226 * no plan to support it on other file types.
227 */
228
229 if (!(file->f_mode & FMODE_WRITE))
230 return -EPERM;
231 if (seals & ~(unsigned int)F_ALL_SEALS)
232 return -EINVAL;
233
234 inode_lock(inode);
235
236 file_seals = memfd_file_seals_ptr(file);
237 if (!file_seals) {
238 error = -EINVAL;
239 goto unlock;
240 }
241
242 if (*file_seals & F_SEAL_SEAL) {
243 error = -EPERM;
244 goto unlock;
245 }
246
247 if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
248 error = mapping_deny_writable(file->f_mapping);
249 if (error)
250 goto unlock;
251
252 error = memfd_wait_for_pins(file->f_mapping);
253 if (error) {
254 mapping_allow_writable(file->f_mapping);
255 goto unlock;
256 }
257 }
258
c4f75bc8
JX
259 /*
260 * SEAL_EXEC implys SEAL_WRITE, making W^X from the start.
261 */
262 if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
263 seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
264
5d752600
MK
265 *file_seals |= seals;
266 error = 0;
267
268unlock:
269 inode_unlock(inode);
270 return error;
271}
272
273static int memfd_get_seals(struct file *file)
274{
275 unsigned int *seals = memfd_file_seals_ptr(file);
276
277 return seals ? *seals : -EINVAL;
278}
279
f7b8f70b 280long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
5d752600
MK
281{
282 long error;
283
284 switch (cmd) {
285 case F_ADD_SEALS:
5d752600
MK
286 error = memfd_add_seals(file, arg);
287 break;
288 case F_GET_SEALS:
289 error = memfd_get_seals(file);
290 break;
291 default:
292 error = -EINVAL;
293 break;
294 }
295
296 return error;
297}
298
/*
 * memfd names are stored as "memfd:" followed by the user-supplied name; the
 * user part may use whatever NAME_MAX leaves after the prefix.
 */
#define MFD_NAME_PREFIX		"memfd:"
#define MFD_NAME_PREFIX_LEN	(sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN	(NAME_MAX - MFD_NAME_PREFIX_LEN)

/* Flag bits memfd_create() accepts (huge page size encoding aside). */
#define MFD_ALL_FLAGS	(MFD_CLOEXEC | \
			 MFD_ALLOW_SEALING | \
			 MFD_HUGETLB | \
			 MFD_NOEXEC_SEAL | \
			 MFD_EXEC)
5d752600 304
72de2591
JX
/*
 * Apply the memfd_noexec scope of the caller's active pid namespace to
 * *@flags.
 *
 * With CONFIG_SYSCTL:
 *  - if neither MFD_EXEC nor MFD_NOEXEC_SEAL is set in *@flags, one of them
 *    is added: MFD_NOEXEC_SEAL when the scope is at least
 *    MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL, MFD_EXEC otherwise;
 *  - if the scope is at least MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED and
 *    MFD_NOEXEC_SEAL is still not set, a rate-limited error is logged and
 *    -EACCES is returned.
 * Without CONFIG_SYSCTL, *@flags is left untouched.
 *
 * Returns 0 unless the request is refused as described above.
 */
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
	struct pid_namespace *ns = task_active_pid_ns(current);
	int scope = pidns_memfd_noexec_scope(ns);
	bool exec_choice_made = *flags & (MFD_EXEC | MFD_NOEXEC_SEAL);

	if (!exec_choice_made) {
		if (scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
			*flags |= MFD_NOEXEC_SEAL;
		else
			*flags |= MFD_EXEC;
	}

	if (scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED &&
	    !(*flags & MFD_NOEXEC_SEAL)) {
		pr_err_ratelimited(
			"%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
			current->comm, task_pid_nr(current), scope);
		return -EACCES;
	}
#endif
	return 0;
}
327
5d752600
MK
328SYSCALL_DEFINE2(memfd_create,
329 const char __user *, uname,
330 unsigned int, flags)
331{
332 unsigned int *file_seals;
333 struct file *file;
334 int fd, error;
335 char *name;
336 long len;
337
338 if (!(flags & MFD_HUGETLB)) {
339 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
340 return -EINVAL;
341 } else {
342 /* Allow huge page size encoding in flags. */
343 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
344 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
345 return -EINVAL;
346 }
347
105ff533
JX
348 /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
349 if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
350 return -EINVAL;
351
202e1422
AS
352 error = check_sysctl_memfd_noexec(&flags);
353 if (error < 0)
354 return error;
72de2591 355
5d752600
MK
356 /* length includes terminating zero */
357 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
358 if (len <= 0)
359 return -EFAULT;
360 if (len > MFD_NAME_MAX_LEN + 1)
361 return -EINVAL;
362
363 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
364 if (!name)
365 return -ENOMEM;
366
367 strcpy(name, MFD_NAME_PREFIX);
368 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
369 error = -EFAULT;
370 goto err_name;
371 }
372
373 /* terminating-zero may have changed after strnlen_user() returned */
374 if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
375 error = -EFAULT;
376 goto err_name;
377 }
378
379 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
380 if (fd < 0) {
381 error = fd;
382 goto err_name;
383 }
384
385 if (flags & MFD_HUGETLB) {
83c1fd76 386 file = hugetlb_file_setup(name, 0, VM_NORESERVE,
5d752600
MK
387 HUGETLB_ANONHUGE_INODE,
388 (flags >> MFD_HUGE_SHIFT) &
389 MFD_HUGE_MASK);
390 } else
391 file = shmem_file_setup(name, 0, VM_NORESERVE);
392 if (IS_ERR(file)) {
393 error = PTR_ERR(file);
394 goto err_fd;
395 }
396 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
c9c554f2 397 file->f_flags |= O_LARGEFILE;
5d752600 398
105ff533
JX
399 if (flags & MFD_NOEXEC_SEAL) {
400 struct inode *inode = file_inode(file);
401
402 inode->i_mode &= ~0111;
403 file_seals = memfd_file_seals_ptr(file);
935d44ac
RS
404 if (file_seals) {
405 *file_seals &= ~F_SEAL_SEAL;
406 *file_seals |= F_SEAL_EXEC;
407 }
105ff533
JX
408 } else if (flags & MFD_ALLOW_SEALING) {
409 /* MFD_EXEC and MFD_ALLOW_SEALING are set */
5d752600 410 file_seals = memfd_file_seals_ptr(file);
935d44ac
RS
411 if (file_seals)
412 *file_seals &= ~F_SEAL_SEAL;
5d752600
MK
413 }
414
415 fd_install(fd, file);
416 kfree(name);
417 return fd;
418
419err_fd:
420 put_unused_fd(fd);
421err_name:
422 kfree(name);
423 return error;
424}
This page took 0.40955 seconds and 4 git commands to generate.