// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_SEALED);
}

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}
/*
 * Check whether a vma is sealed against modification.
 * Return true if modification is allowed.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
	if (unlikely(vma_is_sealed(vma)))
		return false;

	return true;
}
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}

	return false;
}
static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* Check for an anonymous, private mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * Check for non-writable:
	 * PROT is read-only, or the protection key (PKRU) denies write access.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
	    !arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}
/*
 * Check if the vmas of a memory range are allowed to be modified.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	/* Going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (unlikely(!can_modify_vma(vma)))
			return false;
	}

	/* Allow by default. */
	return true;
}
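
/*
 * Illustrative sketch only, not a real call site: it shows how a path that
 * is about to unmap or remap memory might consult can_modify_mm() before
 * touching anything.  The function name and its placement here are
 * assumptions for the example; the actual callers live in the
 * munmap()/mremap() paths elsewhere in mm/.
 */
static int __maybe_unused mseal_example_check_unmap(struct mm_struct *mm,
		unsigned long start, unsigned long end)
{
	int ret = 0;

	/* The check must run under the mmap write lock. */
	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/* Refuse the operation if any vma in [start, end) is sealed. */
	if (unlikely(!can_modify_mm(mm, start, end)))
		ret = -EPERM;

	/* A real caller would perform the unmap here when ret == 0. */

	mmap_write_unlock(mm);
	return ret;
}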
/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
		int behavior)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	if (!is_madv_discard(behavior))
		return true;

	/* Going through each vma to check. */
	for_each_vma_range(vmi, vma, end)
		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
			return false;

	/* Allow by default. */
	return true;
}
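
/*
 * Illustrative sketch only, not a real call site: a discard-style madvise
 * path is expected to pass its behavior value to can_modify_mm_madv()
 * while holding the mmap lock, so that read-only anonymous mappings in a
 * sealed range are protected while non-discard behaviors pass through
 * unchanged.  The function name is an assumption for the example.
 */
static int __maybe_unused mseal_example_check_madvise(struct mm_struct *mm,
		unsigned long start, unsigned long end, int behavior)
{
	/* The caller is assumed to already hold the mmap lock. */
	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior)))
		return -EPERM;

	/* A real caller would go on to apply the madvise behavior. */
	return 0;
}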
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}
/*
 * Preflight check for do_mseal():
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* Going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* Unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}
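
/*
 * Worked example (illustrative, with assumed addresses): with VMAs at
 * [0x1000, 0x3000) and [0x4000, 0x5000) and a hole at [0x3000, 0x4000),
 * check_mm_seal(0x1000, 0x5000) returns -ENOMEM when the loop reaches the
 * second VMA, because vma->vm_start (0x4000) is above nstart (0x3000).
 * check_mm_seal(0x1000, 0x3000) returns 0, since the first VMA already
 * covers the requested end.
 */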
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() should have already checked the ENOMEM case,
	 * so vma should not be NULL; the same holds for the other ENOMEM
	 * cases.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
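
/*
 * Worked example (illustrative, with assumed addresses): sealing
 * [0x2000, 0x3000) inside a single VMA spanning [0x1000, 0x4000) clamps
 * tmp to end (0x3000) and lets mseal_fixup() split the VMA via
 * vma_modify_flags(), so that only the middle piece is marked VM_SEALED;
 * the [0x1000, 0x2000) and [0x3000, 0x4000) pieces keep their old flags.
 */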
/*
 * mseal(2) seals the VM's metadata from selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 *  start (addr) must be in a valid VMA.
 *  end (addr + len) must be in a valid VMA.
 *  no gap (unallocated memory) between start and end.
 *  start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * The following VMA operations are blocked after sealing:
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(); these can leave an empty space in the
 *    address range, which can therefore be replaced with a VMA carrying
 *    a new set of attributes.
 * 2> Moving or expanding a different vma into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion, via mremap(). This does not appear to pose any
 *    specific risk to sealed VMAs, but is included anyway because
 *    the use case is unclear. In any case, users can rely on
 *    merging to expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *    for anonymous memory, when users don't have write permission to
 *    the memory. Those behaviors can alter region contents by
 *    discarding pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * Return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   on 32-bit architectures, sealing is not supported.
 *
 * Note:
 *  A user can call mseal(2) multiple times; sealing memory that is
 *  already sealed has no effect (no error).
 *
 *  unseal() is not supported.
 *
 *  (An illustrative userspace sketch appears in a comment at the end of
 *  this file.)
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check whether len was rounded up from a small negative value to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the whole address range up front.  This avoids
	 * leaving the range partially sealed when the input address range
	 * is invalid, e.g. on an ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed, unless vma_modify_flags()
	 * returns an error, e.g. a merge/split failure or the process
	 * reaching the maximum supported number of VMAs; those cases
	 * should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}
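
/*
 * Illustrative userspace sketch (an assumption, not part of this syscall
 * implementation): how mseal(2) might be exercised on a kernel and headers
 * that provide __NR_mseal.  After sealing, attempts to change the
 * protection of, or unmap, the mapping fail with EPERM.
 *
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	void example(void)
 *	{
 *		size_t len = 4096;
 *		void *p = mmap(NULL, len, PROT_READ,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		syscall(__NR_mseal, p, len, 0);     // seal the mapping
 *		mprotect(p, len, PROT_WRITE);       // now fails with EPERM
 *		munmap(p, len);                     // now fails with EPERM
 *	}
 */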