// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code       <[email protected]>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

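/*
 * Change the protection of the ptes mapped by @pmd in [@addr, @end) to
 * @newprot, applying the MM_CP_* modifiers passed in @cp_flags.  Returns
 * the number of pages whose protection was actually changed.
 */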
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                unsigned long cp_flags)
{
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;
        int target_node = NUMA_NO_NODE;
        bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

        /*
         * Can be called with only the mmap_sem for reading by
         * prot_numa so we must check the pmd isn't constantly
         * changing from under us from pmd_none to pmd_trans_huge
         * and/or the other way around.
         */
        if (pmd_trans_unstable(pmd))
                return 0;

        /*
         * The pmd points to a regular pte so the pmd can't change
         * from under us even if the mmap_sem is only held for
         * reading.
         */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);

        /* Get target node for single threaded private VMAs */
        if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
            atomic_read(&vma->vm_mm->mm_users) == 1)
                target_node = numa_node_id();

        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();
        do {
                oldpte = *pte;
                if (pte_present(oldpte)) {
                        pte_t ptent;
                        bool preserve_write = prot_numa && pte_write(oldpte);

                        /*
                         * Avoid trapping faults against the zero or KSM
                         * pages. See similar comment in change_huge_pmd.
                         */
                        if (prot_numa) {
                                struct page *page;

                                /* Avoid TLB flush if possible */
                                if (pte_protnone(oldpte))
                                        continue;

                                page = vm_normal_page(vma, addr, oldpte);
                                if (!page || PageKsm(page))
                                        continue;

                                /* Also skip shared copy-on-write pages */
                                if (is_cow_mapping(vma->vm_flags) &&
                                    page_mapcount(page) != 1)
                                        continue;

                                /*
                                 * While migration can move some dirty pages,
                                 * it cannot move them all from MIGRATE_ASYNC
                                 * context.
                                 */
                                if (page_is_file_lru(page) && PageDirty(page))
                                        continue;

                                /*
                                 * Don't mess with PTEs if page is already on the node
                                 * a single-threaded process is running on.
                                 */
                                if (target_node == page_to_nid(page))
                                        continue;
                        }

                        oldpte = ptep_modify_prot_start(vma, addr, pte);
                        ptent = pte_modify(oldpte, newprot);
                        if (preserve_write)
                                ptent = pte_mk_savedwrite(ptent);

                        if (uffd_wp) {
                                ptent = pte_wrprotect(ptent);
                                ptent = pte_mkuffd_wp(ptent);
                        } else if (uffd_wp_resolve) {
                                /*
                                 * Leave the write bit to be handled by the
                                 * page fault handler, so that things like
                                 * COW can be properly handled.
                                 */
                                ptent = pte_clear_uffd_wp(ptent);
                        }

                        /* Avoid taking write faults for known dirty pages */
                        if (dirty_accountable && pte_dirty(ptent) &&
                                        (pte_soft_dirty(ptent) ||
                                         !(vma->vm_flags & VM_SOFTDIRTY))) {
                                ptent = pte_mkwrite(ptent);
                        }
                        ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
                        pages++;
                } else if (IS_ENABLED(CONFIG_MIGRATION)) {
                        swp_entry_t entry = pte_to_swp_entry(oldpte);

                        if (is_write_migration_entry(entry)) {
                                pte_t newpte;
                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
                                 */
                                make_migration_entry_read(&entry);
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
                                set_pte_at(vma->vm_mm, addr, pte, newpte);

                                pages++;
                        }

                        if (is_write_device_private_entry(entry)) {
                                pte_t newpte;

                                /*
                                 * We do not preserve soft-dirtiness. See
                                 * copy_one_pte() for explanation.
                                 */
                                make_device_private_entry_read(&entry);
                                newpte = swp_entry_to_pte(entry);
                                set_pte_at(vma->vm_mm, addr, pte, newpte);

                                pages++;
                        }
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);

        return pages;
}

/*
 * Used when setting automatic NUMA hinting protection where it is
 * critical that a numa hinting PMD is not confused with a bad PMD.
 */
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
        pmd_t pmdval = pmd_read_atomic(pmd);

        /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        barrier();
#endif

        if (pmd_none(pmdval))
                return 1;
        if (pmd_trans_huge(pmdval))
                return 0;
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                return 1;
        }

        return 0;
}

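/*
 * Walk the pmds under @pud for [@addr, @end), splitting or updating
 * transparent huge pmds as needed and calling change_pte_range() for
 * pte-mapped ranges.  The mmu notifier range is only initialized and
 * invalidated once a populated pmd is actually found.
 */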
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
                pud_t *pud, unsigned long addr, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long pages = 0;
        unsigned long nr_huge_updates = 0;
        struct mmu_notifier_range range;

        range.start = 0;

        pmd = pmd_offset(pud, addr);
        do {
                unsigned long this_pages;

                next = pmd_addr_end(addr, end);

                /*
                 * Automatic NUMA balancing walks the tables with mmap_sem
                 * held for read. It's possible for a parallel update to
                 * occur between pmd_trans_huge() and a pmd_none_or_clear_bad()
                 * check, leading to a false positive and clearing.
                 * Hence, it's necessary to atomically read the PMD value
                 * for all the checks.
                 */
                if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
                     pmd_none_or_clear_bad_unless_trans_huge(pmd))
                        goto next;

                /* invoke the mmu notifier if the pmd is populated */
                if (!range.start) {
                        mmu_notifier_range_init(&range,
                                MMU_NOTIFY_PROTECTION_VMA, 0,
                                vma, vma->vm_mm, addr, end);
                        mmu_notifier_invalidate_range_start(&range);
                }

                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        } else {
                                int nr_ptes = change_huge_pmd(vma, pmd, addr,
                                                              newprot, cp_flags);

                                if (nr_ptes) {
                                        if (nr_ptes == HPAGE_PMD_NR) {
                                                pages += HPAGE_PMD_NR;
                                                nr_huge_updates++;
                                        }

                                        /* huge pmd was handled */
                                        goto next;
                                }
                        }
                        /* fall through, the trans huge pmd just split */
                }
                this_pages = change_pte_range(vma, pmd, addr, next, newprot,
                                              cp_flags);
                pages += this_pages;
next:
                cond_resched();
        } while (pmd++, addr = next, addr != end);

        if (range.start)
                mmu_notifier_invalidate_range_end(&range);

        if (nr_huge_updates)
                count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
        return pages;
}

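/*
 * change_pud_range() and change_p4d_range() simply walk the intermediate
 * page-table levels, skipping empty or bad entries and delegating to the
 * level below; both return the running count of pages updated.
 */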
static inline unsigned long change_pud_range(struct vm_area_struct *vma,
                p4d_t *p4d, unsigned long addr, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
{
        pud_t *pud;
        unsigned long next;
        unsigned long pages = 0;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                pages += change_pmd_range(vma, pud, addr, next, newprot,
                                          cp_flags);
        } while (pud++, addr = next, addr != end);

        return pages;
}

static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
                pgd_t *pgd, unsigned long addr, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
{
        p4d_t *p4d;
        unsigned long next;
        unsigned long pages = 0;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d))
                        continue;
                pages += change_pud_range(vma, p4d, addr, next, newprot,
                                          cp_flags);
        } while (p4d++, addr = next, addr != end);

        return pages;
}

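/*
 * Walk the whole page-table hierarchy for [@addr, @end) under @vma,
 * flushing the cache up front and the TLB afterwards, the latter only
 * if at least one entry was actually modified.
 */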
static unsigned long change_protection_range(struct vm_area_struct *vma,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        unsigned long next;
        unsigned long start = addr;
        unsigned long pages = 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
        inc_tlb_flush_pending(mm);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                pages += change_p4d_range(vma, pgd, addr, next, newprot,
                                          cp_flags);
        } while (pgd++, addr = next, addr != end);

        /* Only flush the TLB if we actually modified any entries: */
        if (pages)
                flush_tlb_range(vma, start, end);
        dec_tlb_flush_pending(mm);

        return pages;
}

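/*
 * Entry point used by mprotect_fixup() and the userfaultfd write-protect
 * code: hugetlb VMAs take their own path, everything else goes through
 * change_protection_range().  MM_CP_UFFD_WP and MM_CP_UFFD_WP_RESOLVE are
 * mutually exclusive, hence the BUG_ON() below.
 */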
unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                       unsigned long end, pgprot_t newprot,
                       unsigned long cp_flags)
{
        unsigned long pages;

        BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);

        if (is_vm_hugetlb_page(vma))
                pages = hugetlb_change_protection(vma, start, end, newprot);
        else
                pages = change_protection_range(vma, start, end, newprot,
                                                cp_flags);

        return pages;
}

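/*
 * The prot_none_walk_ops below are used by mprotect_fixup() to check,
 * via pfn_modify_allowed(), whether every pfn in a VM_PFNMAP/VM_MIXEDMAP
 * mapping may be switched to PROT_NONE before any state is modified.
 */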
static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
                               unsigned long next, struct mm_walk *walk)
{
        return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
                0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                   unsigned long addr, unsigned long next,
                                   struct mm_walk *walk)
{
        return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
                0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
                          struct mm_walk *walk)
{
        return 0;
}

static const struct mm_walk_ops prot_none_walk_ops = {
        .pte_entry              = prot_none_pte_entry,
        .hugetlb_entry          = prot_none_hugetlb_entry,
        .test_walk              = prot_none_test,
};

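/*
 * Apply @newflags to the part of @vma covering [@start, @end): merge with
 * neighbouring VMAs or split @vma as needed, update vm_flags and
 * vm_page_prot, then rewrite the page tables via change_protection().
 * Called with mmap_sem held for write; *pprev is set to the VMA that now
 * covers the range.
 */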
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
        unsigned long start, unsigned long end, unsigned long newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long oldflags = vma->vm_flags;
        long nrpages = (end - start) >> PAGE_SHIFT;
        unsigned long charged = 0;
        pgoff_t pgoff;
        int error;
        int dirty_accountable = 0;

        if (newflags == oldflags) {
                *pprev = vma;
                return 0;
        }

        /*
         * Do PROT_NONE PFN permission checks here when we can still
         * bail out without undoing a lot of state. This is a rather
         * uncommon case, so doesn't need to be very optimized.
         */
        if (arch_has_pfn_modify_check() &&
            (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
            (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
                pgprot_t new_pgprot = vm_get_page_prot(newflags);

                error = walk_page_range(current->mm, start, end,
                                &prot_none_walk_ops, &new_pgprot);
                if (error)
                        return error;
        }

        /*
         * If we make a private mapping writable we increase our commit;
         * but (without finer accounting) cannot reduce our commit if we
         * make it unwritable again. hugetlb mappings were accounted for
         * even if read-only, so there is no need to account for them here.
         */
        if (newflags & VM_WRITE) {
                /* Check space limits when area turns into data. */
                if (!may_expand_vm(mm, newflags, nrpages) &&
                                may_expand_vm(mm, oldflags, nrpages))
                        return -ENOMEM;
                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                                VM_SHARED|VM_NORESERVE))) {
                        charged = nrpages;
                        if (security_vm_enough_memory_mm(mm, charged))
                                return -ENOMEM;
                        newflags |= VM_ACCOUNT;
                }
        }

        /*
         * First try to merge with previous and/or next vma.
         */
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *pprev = vma_merge(mm, *pprev, start, end, newflags,
                           vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
                           vma->vm_userfaultfd_ctx);
        if (*pprev) {
                vma = *pprev;
                VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
                goto success;
        }

        *pprev = vma;

        if (start != vma->vm_start) {
                error = split_vma(mm, vma, start, 1);
                if (error)
                        goto fail;
        }

        if (end != vma->vm_end) {
                error = split_vma(mm, vma, end, 0);
                if (error)
                        goto fail;
        }

success:
        /*
         * vm_flags and vm_page_prot are protected by the mmap_sem
         * held in write mode.
         */
        vma->vm_flags = newflags;
        dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
        vma_set_page_prot(vma);

        change_protection(vma, start, end, vma->vm_page_prot,
                          dirty_accountable ? MM_CP_DIRTY_ACCT : 0);

        /*
         * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
         * fault on access.
         */
        if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
                        (newflags & VM_WRITE)) {
                populate_vma_page_range(vma, start, end, NULL);
        }

        vm_stat_account(mm, oldflags, -nrpages);
        vm_stat_account(mm, newflags, nrpages);
        perf_event_mmap(vma);
        return 0;

fail:
        vm_unacct_memory(charged);
        return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
                unsigned long prot, int pkey)
{
        unsigned long nstart, end, tmp, reqprot;
        struct vm_area_struct *vma, *prev;
        int error = -EINVAL;
        const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
        const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
                                (prot & PROT_READ);

        start = untagged_addr(start);

        prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
        if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
                return -EINVAL;

        if (start & ~PAGE_MASK)
                return -EINVAL;
        if (!len)
                return 0;
        len = PAGE_ALIGN(len);
        end = start + len;
        if (end <= start)
                return -ENOMEM;
        if (!arch_validate_prot(prot, start))
                return -EINVAL;

        reqprot = prot;

        if (down_write_killable(&current->mm->mmap_sem))
                return -EINTR;

        /*
         * If userspace did not allocate the pkey, do not let
         * them use it here.
         */
        error = -EINVAL;
        if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
                goto out;

        vma = find_vma(current->mm, start);
        error = -ENOMEM;
        if (!vma)
                goto out;
        prev = vma->vm_prev;
        if (unlikely(grows & PROT_GROWSDOWN)) {
                if (vma->vm_start >= end)
                        goto out;
                start = vma->vm_start;
                error = -EINVAL;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto out;
        } else {
                if (vma->vm_start > start)
                        goto out;
                if (unlikely(grows & PROT_GROWSUP)) {
                        end = vma->vm_end;
                        error = -EINVAL;
                        if (!(vma->vm_flags & VM_GROWSUP))
                                goto out;
                }
        }
        if (start > vma->vm_start)
                prev = vma;

        for (nstart = start ; ; ) {
                unsigned long mask_off_old_flags;
                unsigned long newflags;
                int new_vma_pkey;

                /* Here we know that vma->vm_start <= nstart < vma->vm_end. */

                /* Does the application expect PROT_READ to imply PROT_EXEC? */
                if (rier && (vma->vm_flags & VM_MAYEXEC))
                        prot |= PROT_EXEC;

                /*
                 * Each mprotect() call explicitly passes r/w/x permissions.
                 * If a permission is not passed to mprotect(), it must be
                 * cleared from the VMA.
                 */
                mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
                                        VM_FLAGS_CLEAR;

                new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
                newflags = calc_vm_prot_bits(prot, new_vma_pkey);
                newflags |= (vma->vm_flags & ~mask_off_old_flags);

                /* newflags >> 4 shifts VM_MAY* into the position of VM_* */
                if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
                        error = -EACCES;
                        goto out;
                }

                error = security_file_mprotect(vma, reqprot, prot);
                if (error)
                        goto out;

                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
                if (error)
                        goto out;
                nstart = tmp;

                if (nstart < prev->vm_end)
                        nstart = prev->vm_end;
                if (nstart >= end)
                        goto out;

                vma = prev->vm_next;
                if (!vma || vma->vm_start != nstart) {
                        error = -ENOMEM;
                        goto out;
                }
                prot = reqprot;
        }
out:
        up_write(&current->mm->mmap_sem);
        return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
                unsigned long, prot)
{
        return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
                unsigned long, prot, int, pkey)
{
        return do_mprotect_pkey(start, len, prot, pkey);
}

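/*
 * Allocate a protection key for the calling process and set its initial
 * access rights to @init_val.  Returns the new pkey on success, or a
 * negative error code (-EINVAL for unsupported flags or init values,
 * -ENOSPC when no key is available).
 */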
SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
        int pkey;
        int ret;

        /* No flags supported yet. */
        if (flags)
                return -EINVAL;
        /* check for unsupported init values */
        if (init_val & ~PKEY_ACCESS_MASK)
                return -EINVAL;

        down_write(&current->mm->mmap_sem);
        pkey = mm_pkey_alloc(current->mm);

        ret = -ENOSPC;
        if (pkey == -1)
                goto out;

        ret = arch_set_user_pkey_access(current, pkey, init_val);
        if (ret) {
                mm_pkey_free(current->mm, pkey);
                goto out;
        }
        ret = pkey;
out:
        up_write(&current->mm->mmap_sem);
        return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
        int ret;

        down_write(&current->mm->mmap_sem);
        ret = mm_pkey_free(current->mm, pkey);
        up_write(&current->mm->mmap_sem);

        /*
         * We could provide warnings or errors if any VMA still
         * has the pkey set here.
         */
        return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */