mm/mlock.c (linux.git, at "mm/munlock: delete munlock_vma_pages_all(), allow oomreap")
// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

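/*
 * Return true when the current task is allowed to mlock memory: either
 * RLIMIT_MEMLOCK is non-zero (the actual charge is checked against the
 * limit later, at mlock time) or the task has CAP_IPC_LOCK.
 */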
bool can_do_mlock(void)
{
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return true;
	if (capable(CAP_IPC_LOCK))
		return true;
	return false;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked pages are marked with the PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 */

/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	int nr_pages;

	if (!TestClearPageMlocked(page))
		return;

	nr_pages = thp_nr_pages(page);
	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	/*
	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
	 * in __pagevec_lru_add_fn().
	 *
	 * See __pagevec_lru_add_fn for more explanation.
	 */
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race: the page has already moved to the
		 * evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
	}
}

/*
 * Mark the page as mlocked if it is not already.
 * If the page is on the LRU, isolate it and put it back so that it moves to
 * the unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);

	if (!TestSetPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked, either a normal page or THP page head
 */
void munlock_vma_page(struct page *page)
{
	/* Serialize with page migration */
	BUG_ON(!PageLocked(page));

	VM_BUG_ON_PAGE(PageTail(page), page);

	if (TestClearPageMlocked(page)) {
		int nr_pages = thp_nr_pages(page);

		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
		if (!isolate_lru_page(page)) {
			putback_lru_page(page);
			count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
		} else if (PageUnevictable(page)) {
			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
		}
	}
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 */
static void munlock_vma_pages_range(struct vm_area_struct *vma,
				    unsigned long start, unsigned long end)
{
	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;

	/* Reimplementation to follow in a later commit */
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);
	vm_flags_t old_flags = vma->vm_flags;

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
	    vma_is_dax(vma) || vma_is_secretmem(vma))
		/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
		goto out;

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, vma_anon_name(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	else if (old_flags & VM_LOCKED)
		nr_pages = 0;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, populate_vma_page_range will bring it back.
	 */

	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}

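/*
 * Walk the VMAs covering [start, start + len) and apply @flags (VM_LOCKED,
 * optionally VM_LOCKONFAULT) to each of them, or clear the lock flags when
 * @flags is 0.  The caller must hold mmap_lock for write.
 */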
static int apply_vma_lock_flags(unsigned long start, size_t len,
				vm_flags_t flags)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(offset_in_page(start));
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;

		newflags |= flags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

/*
 * Go through the VMAs covering the range and sum the sizes of the parts that
 * are already mlocked.  Note that the deferred-locking case (mlock2() with
 * MLOCK_ONFAULT) is counted as well.
 *
 * Return value: number of previously mlocked pages in the range.
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
		unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	unsigned long count = 0;

	if (mm == NULL)
		mm = current->mm;

	vma = find_vma(mm, start);
	if (vma == NULL)
		return 0;

	for (; vma ; vma = vma->vm_next) {
		if (start >= vma->vm_end)
			continue;
		if (start + len <= vma->vm_start)
			break;
		if (vma->vm_flags & VM_LOCKED) {
			if (start > vma->vm_start)
				count -= (start - vma->vm_start);
			if (start + len < vma->vm_end) {
				count += start + len - vma->vm_start;
				break;
			}
			count += vma->vm_end - vma->vm_start;
		}
	}

	return count >> PAGE_SHIFT;
}

/*
 * Convert a get_user_pages() return value to a POSIX mlock() error.
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

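/*
 * Common implementation of mlock() and mlock2(): check permissions and
 * RLIMIT_MEMLOCK, apply the requested lock flags to the VMAs in the range,
 * then populate the range and translate any fault error into the POSIX
 * mlock() error codes.
 */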
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	locked += current->mm->locked_vm;
	if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
		/*
		 * The requested region may intersect with areas that are
		 * already mlocked; that part is already accounted in
		 * mm->locked_vm and must not be counted towards the new
		 * mlock increment.  Check and adjust the locked count if
		 * necessary.
		 */
		locked -= count_mm_mlocked_page_nr(current->mm,
				start, len);
	}

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(current->mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	if (error)
		return __mlock_posix_error_return(error);
	return 0;
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	return do_mlock(start, len, VM_LOCKED);
}

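/*
 * mlock2(2) behaves like mlock(2) but takes a flags argument.  With
 * MLOCK_ONFAULT the VMAs are marked VM_LOCKONFAULT, so pages are locked as
 * they are faulted in rather than being populated up front.
 *
 * Illustrative userspace usage (a sketch, not part of this file; assumes a
 * libc that provides the mlock2() wrapper):
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (mlock2(buf, len, MLOCK_ONFAULT))
 *		perror("mlock2");
 */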
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	vm_flags_t vm_flags = VM_LOCKED;

	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		vm_flags |= VM_LOCKONFAULT;

	return do_mlock(start, len, vm_flags);
}

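/*
 * munlock(2): clear VM_LOCKED and VM_LOCKONFAULT from the VMAs covering the
 * range, so its pages become evictable again.
 */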
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	start = untagged_addr(start);

	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(current->mm);

	return ret;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;
	vm_flags_t to_add = 0;

	current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
	if (flags & MCL_FUTURE) {
		current->mm->def_flags |= VM_LOCKED;

		if (flags & MCL_ONFAULT)
			current->mm->def_flags |= VM_LOCKONFAULT;

		if (!(flags & MCL_CURRENT))
			goto out;
	}

	if (flags & MCL_CURRENT) {
		to_add |= VM_LOCKED;
		if (flags & MCL_ONFAULT)
			to_add |= VM_LOCKONFAULT;
	}

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
		newflags |= to_add;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
		cond_resched();
	}
out:
	return 0;
}

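/*
 * mlockall(2): lock all current mappings (MCL_CURRENT) and/or all future
 * mappings (MCL_FUTURE) of the calling process; MCL_ONFAULT defers locking
 * of each page until it is first faulted in.  MCL_ONFAULT on its own is
 * rejected below.
 *
 * Illustrative userspace usage (a sketch, not part of this file):
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT);
 */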
SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
	    flags == MCL_ONFAULT)
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(current->mm);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}

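/*
 * munlockall(2): undo mlockall(); clear VM_LOCKED and VM_LOCKONFAULT from
 * mm->def_flags and from every current VMA.
 */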
SYSCALL_DEFINE0(munlockall)
{
	int ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	ret = apply_mlockall_flags(0);
	mmap_write_unlock(current->mm);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the ucounts (UCOUNT_RLIMIT_MEMLOCK)
 * instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

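/*
 * user_shm_lock() charges @size bytes (rounded up to whole pages) to the
 * ucounts' RLIMIT_MEMLOCK and pins the ucounts; user_shm_unlock() undoes
 * both.  A non-zero return means the SHM lock is permitted.
 */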
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
	unsigned long lock_limit, locked;
	long memlock;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);

	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	if (!get_ucounts(ucounts)) {
		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
		goto out;
	}
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

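/*
 * Drop the RLIMIT_MEMLOCK charge taken by user_shm_lock() and put the
 * ucounts reference.
 */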
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}