mm/mseal.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Implement mseal() syscall.
 *
 * Copyright (c) 2023,2024 Google, Inc.
 *
 * Author: Jeff Xu <[email protected]>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"

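/* VM_SEALED is set once and never cleared; unsealing is not supported. */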
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_SEALED);
}

static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}

/*
 * Check if a vma is sealed against modification.
 * Return true if modification is allowed.
 */
static bool can_modify_vma(struct vm_area_struct *vma)
{
	if (unlikely(vma_is_sealed(vma)))
		return false;

	return true;
}

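/*
 * madvise() behaviors that discard page contents. Note: the MADV_*
 * values are enumerated constants, not bit flags, so they must be
 * matched by equality rather than tested with '&'.
 */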
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
		return true;
	}
	return false;
}

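/*
 * Check for a non-writable anonymous mapping, i.e. memory whose
 * contents the user cannot restore once it has been discarded.
 */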
static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO, or the protection key (PKRU) does not permit write.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
	    !arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}

/*
 * Check if the vmas of a memory range are allowed to be modified.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (unlikely(!can_modify_vma(vma)))
			return false;
	}

	/* Allow by default. */
	return true;
}

/*
 * Check if the vmas of a memory range are allowed to be modified by madvise.
 * The memory range can have a gap (unallocated memory).
 * Return true if modification is allowed.
 */
bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
		int behavior)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, start);

	if (!is_madv_discard(behavior))
		return true;

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end)
		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
			return false;

	/* Allow by default. */
	return true;
}

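/*
 * Apply VM_SEALED to the [start, end) part of @vma: vma_modify_flags()
 * splits and/or merges the vma as needed so that exactly this range
 * carries the new flags; the resulting vma is then marked sealed.
 */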
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}

/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 * 4> map is sealable.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}

/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM cases,
	 * so vma should not be NULL here; the same holds for the other
	 * ENOMEM cases below.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}

/*
 * mseal(2) seals the VMA's metadata from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 * The address range given by addr/len must meet:
 * start (addr) must be in a valid VMA.
 * end (addr + len) must be in a valid VMA.
 * no gap (unallocated memory) between start and end.
 * start (addr) must be page aligned.
 *
 * len: len will be page aligned implicitly.
 *
 * The following VMA operations are blocked after sealing:
 * 1> Unmapping, moving to another location, and shrinking the size,
 *    via munmap() and mremap(); these can leave an empty space which
 *    can then be replaced with a VMA carrying a new set of attributes.
 * 2> Moving or expanding a different vma into the current location,
 *    via mremap().
 * 3> Modifying a VMA via mmap(MAP_FIXED).
 * 4> Size expansion, via mremap(). It does not appear to pose any
 *    specific risks to sealed VMAs, but is included anyway because
 *    the use case is unclear. In any case, users can rely on
 *    merging to expand a sealed VMA.
 * 5> mprotect() and pkey_mprotect().
 * 6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *    for anonymous memory, when users don't have write permission to
 *    the memory. Those behaviors can alter region contents by
 *    discarding pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   address range (start + len) overflows.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) exists between start and end.
 *  -EPERM:
 *   - on 32-bit architectures, sealing is not supported.
 * Note:
 *  users can call mseal(2) multiple times; sealing already-sealed
 *  memory is a no-op (no error).
 *
 *  unseal() is not supported.
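 *
 * Example (an illustrative sketch, not taken from this file; it assumes
 * the libc has no mseal() wrapper yet, hence the raw syscall):
 *
 *	ptr = mmap(NULL, size, PROT_READ,
 *		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 *	syscall(__NR_mseal, (unsigned long)ptr, size, 0);
 *
 * Afterwards, munmap(), mremap(), mprotect()/pkey_mprotect() and
 * mmap(MAP_FIXED) on the range fail with -EPERM.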
 */
static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: check the address range; this helps to avoid
	 * partial sealing in case of an error in the input range,
	 * e.g. an ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed, unless there are errors
	 * from vma_modify_flags(), e.g. a merge/split error, or the
	 * process reaching the maximum supported number of VMAs;
	 * such cases should be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(mm);
	return ret;
}

SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}