// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2002 Richard Henderson
 * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
 * Copyright (C) 2023 Luis Chamberlain <[email protected]>
 * Copyright (C) 2024 Mike Rapoport IBM.
 */

#define pr_fmt(fmt) "execmem: " fmt

#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/execmem.h>
#include <linux/maple_tree.h>
#include <linux/set_memory.h>
#include <linux/moduleloader.h>
#include <linux/text-patching.h>

#include <asm/tlbflush.h>

#include "internal.h"

static struct execmem_info *execmem_info __ro_after_init;
static struct execmem_info default_execmem_info __ro_after_init;

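/*
 * Allocate @size bytes with __vmalloc_node_range() from the range's
 * preferred window [range->start, range->end), retrying in the fallback
 * window if the first attempt fails, and allocate KASAN shadow for the
 * mapping when the range requests it.
 */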
#ifdef CONFIG_MMU
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
{
	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
	unsigned int align = range->alignment;
	unsigned long start = range->start;
	unsigned long end = range->end;
	void *p;

	if (kasan)
		vm_flags |= VM_DEFER_KMEMLEAK;

	if (vm_flags & VM_ALLOW_HUGE_VMAP)
		align = PMD_SIZE;

	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
				 pgprot, vm_flags, NUMA_NO_NODE,
				 __builtin_return_address(0));
	if (!p && range->fallback_start) {
		start = range->fallback_start;
		end = range->fallback_end;
		p = __vmalloc_node_range(size, align, start, end, gfp_flags,
					 pgprot, vm_flags, NUMA_NO_NODE,
					 __builtin_return_address(0));
	}

	if (!p) {
		pr_warn_ratelimited("unable to allocate memory\n");
		return NULL;
	}

	if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
		vfree(p);
		return NULL;
	}

	return p;
}

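/*
 * Reserve (but do not populate) a virtually contiguous area from the
 * EXECMEM_MODULE_DATA range, trying the fallback window when the primary
 * window cannot satisfy the request.
 */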
struct vm_struct *execmem_vmap(size_t size)
{
	struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA];
	struct vm_struct *area;

	area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
				  range->start, range->end, NUMA_NO_NODE,
				  GFP_KERNEL, __builtin_return_address(0));
	if (!area && range->fallback_start)
		area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC,
					  range->fallback_start, range->fallback_end,
					  NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0));

	return area;
}
#else
static void *execmem_vmalloc(struct execmem_range *range, size_t size,
			     pgprot_t pgprot, unsigned long vm_flags)
{
	return vmalloc(size);
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
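/*
 * Cache of large ROX (read-only, executable) mappings: free_areas tracks
 * ranges available for allocation, busy_areas tracks ranges handed out
 * by execmem_cache_alloc(). Both maple trees share the cache mutex.
 */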
struct execmem_cache {
	struct mutex mutex;
	struct maple_tree busy_areas;
	struct maple_tree free_areas;
};

static struct execmem_cache execmem_cache = {
	.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
	.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
				     execmem_cache.mutex),
	.free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN,
				     execmem_cache.mutex),
};

static inline unsigned long mas_range_len(struct ma_state *mas)
{
	return mas->last - mas->index + 1;
}

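/*
 * Set or clear the "valid" state of the direct map aliases of all pages
 * backing @vm; on failure roll back the pages already updated and return
 * the error.
 */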
static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid)
{
	unsigned int nr = (1 << get_vm_area_page_order(vm));
	unsigned int updated = 0;
	int err = 0;

	for (int i = 0; i < vm->nr_pages; i += nr) {
		err = set_direct_map_valid_noflush(vm->pages[i], nr, valid);
		if (err)
			goto err_restore;
		updated += nr;
	}

	return 0;

err_restore:
	for (int i = 0; i < updated; i += nr)
		set_direct_map_valid_noflush(vm->pages[i], nr, !valid);

	return err;
}

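/*
 * Deferred cleanup: return PMD-aligned, PMD-sized free areas to vmalloc,
 * restoring their direct map entries before freeing.
 */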
static void execmem_cache_clean(struct work_struct *work)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	MA_STATE(mas, free_areas, 0, ULONG_MAX);
	void *area;

	mutex_lock(mutex);
	mas_for_each(&mas, area, ULONG_MAX) {
		size_t size = mas_range_len(&mas);

		if (IS_ALIGNED(size, PMD_SIZE) &&
		    IS_ALIGNED(mas.index, PMD_SIZE)) {
			struct vm_struct *vm = find_vm_area(area);

			execmem_set_direct_map_valid(vm, true);
			mas_store_gfp(&mas, NULL, GFP_KERNEL);
			vfree(area);
		}
	}
	mutex_unlock(mutex);
}

static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);

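/*
 * Return [ptr, ptr + size) to free_areas, merging it with adjacent free
 * ranges on either side when they are contiguous.
 */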
static int execmem_cache_add(void *ptr, size_t size)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, free_areas, addr - 1, addr + 1);
	unsigned long lower, upper;
	void *area = NULL;
	int err;

	lower = addr;
	upper = addr + size - 1;

	mutex_lock(mutex);
	area = mas_walk(&mas);
	if (area && mas.last == addr - 1)
		lower = mas.index;

	area = mas_next(&mas, ULONG_MAX);
	if (area && mas.index == addr + size)
		upper = mas.last;

	mas_set_range(&mas, lower, upper);
	err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
	mutex_unlock(mutex);
	if (err)
		return err;

	return 0;
}

static bool within_range(struct execmem_range *range, struct ma_state *mas,
			 size_t size)
{
	unsigned long addr = mas->index;

	if (addr >= range->start && addr + size < range->end)
		return true;

	if (range->fallback_start &&
	    addr >= range->fallback_start && addr + size < range->fallback_end)
		return true;

	return false;
}

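/*
 * Take @size bytes from the first free area that is large enough and lies
 * within @range, move the allocated part to busy_areas and re-insert any
 * leftover tail back into free_areas.
 */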
static void *__execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	struct maple_tree *free_areas = &execmem_cache.free_areas;
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr, last, area_size = 0;
	void *area, *ptr = NULL;
	int err;

	mutex_lock(mutex);
	mas_for_each(&mas_free, area, ULONG_MAX) {
		area_size = mas_range_len(&mas_free);

		if (area_size >= size && within_range(range, &mas_free, size))
			break;
	}

	if (area_size < size)
		goto out_unlock;

	addr = mas_free.index;
	last = mas_free.last;

	/* insert allocated size to busy_areas at range [addr, addr + size) */
	mas_set_range(&mas_busy, addr, addr + size - 1);
	err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL);
	if (err)
		goto out_unlock;

	mas_store_gfp(&mas_free, NULL, GFP_KERNEL);
	if (area_size > size) {
		void *ptr = (void *)(addr + size);

		/*
		 * re-insert remaining free size to free_areas at range
		 * [addr + size, last]
		 */
		mas_set_range(&mas_free, addr + size, last);
		err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL);
		if (err) {
			mas_store_gfp(&mas_busy, NULL, GFP_KERNEL);
			goto out_unlock;
		}
	}
	ptr = (void *)addr;

out_unlock:
	mutex_unlock(mutex);
	return ptr;
}

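/*
 * Grow the cache: vmalloc a PMD-aligned chunk, fill it with trapping
 * instructions, invalidate its direct map aliases, remap it with the
 * range's protections using PMD-sized pages, and add it to free_areas.
 */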
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
	unsigned long start, end;
	struct vm_struct *vm;
	size_t alloc_size;
	int err = -ENOMEM;
	void *p;

	alloc_size = round_up(size, PMD_SIZE);
	p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags);
	if (!p)
		return err;

	vm = find_vm_area(p);
	if (!vm)
		goto err_free_mem;

	/* fill memory with instructions that will trap */
	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);

	start = (unsigned long)p;
	end = start + alloc_size;

	vunmap_range(start, end);

	err = execmem_set_direct_map_valid(vm, false);
	if (err)
		goto err_free_mem;

	err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
				       PMD_SHIFT);
	if (err)
		goto err_free_mem;

	err = execmem_cache_add(p, alloc_size);
	if (err)
		goto err_free_mem;

	return 0;

err_free_mem:
	vfree(p);
	return err;
}

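/*
 * Try to satisfy the request from the cache; if that fails, populate the
 * cache and retry once.
 */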
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	void *p;
	int err;

	p = __execmem_cache_alloc(range, size);
	if (p)
		return p;

	err = execmem_cache_populate(range, size);
	if (err)
		return NULL;

	return __execmem_cache_alloc(range, size);
}

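/*
 * If @ptr was handed out by the ROX cache, poison it with trapping
 * instructions, return it to free_areas and schedule the cleanup work.
 * Returns false when @ptr is not a cache allocation.
 */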
static bool execmem_cache_free(void *ptr)
{
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	struct mutex *mutex = &execmem_cache.mutex;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, busy_areas, addr, addr);
	size_t size;
	void *area;

	mutex_lock(mutex);
	area = mas_walk(&mas);
	if (!area) {
		mutex_unlock(mutex);
		return false;
	}
	size = mas_range_len(&mas);

	mas_store_gfp(&mas, NULL, GFP_KERNEL);
	mutex_unlock(mutex);

	execmem_fill_trapping_insns(ptr, size, /* writable = */ false);

	execmem_cache_add(ptr, size);

	schedule_work(&execmem_cache_clean_work);

	return true;
}
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
	return NULL;
}

static bool execmem_cache_free(void *ptr)
{
	return false;
}
#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */

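/*
 * Allocate executable memory of the given @type: from the ROX cache when
 * the range is marked EXECMEM_ROX_CACHE, otherwise straight from vmalloc
 * with VM_FLUSH_RESET_PERMS.
 */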
void *execmem_alloc(enum execmem_type type, size_t size)
{
	struct execmem_range *range = &execmem_info->ranges[type];
	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
	pgprot_t pgprot = range->pgprot;
	void *p;

	if (use_cache)
		p = execmem_cache_alloc(range, size);
	else
		p = execmem_vmalloc(range, size, pgprot, vm_flags);

	return kasan_reset_tag(p);
}

void execmem_free(void *ptr)
{
	/*
	 * This memory may be RO, and freeing RO memory in an interrupt is not
	 * supported by vmalloc.
	 */
	WARN_ON(in_interrupt());

	if (!execmem_cache_free(ptr))
		vfree(ptr);
}

void *execmem_update_copy(void *dst, const void *src, size_t size)
{
	return text_poke_copy(dst, src, size);
}

bool execmem_is_rox(enum execmem_type type)
{
	return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
}

static bool execmem_validate(struct execmem_info *info)
{
	struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT];

	if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) {
		pr_crit("Invalid parameters for execmem allocator, module loading will fail");
		return false;
	}

	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) {
		for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) {
			r = &info->ranges[i];

			if (r->flags & EXECMEM_ROX_CACHE) {
				pr_warn_once("ROX cache is not supported\n");
				r->flags &= ~EXECMEM_ROX_CACHE;
			}
		}
	}

	return true;
}

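/*
 * Fill in every range the architecture left unset by cloning the
 * EXECMEM_DEFAULT range; EXECMEM_MODULE_DATA uses PAGE_KERNEL rather than
 * the default range's protections.
 */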
static void execmem_init_missing(struct execmem_info *info)
{
	struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT];

	for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) {
		struct execmem_range *r = &info->ranges[i];

		if (!r->start) {
			if (i == EXECMEM_MODULE_DATA)
				r->pgprot = PAGE_KERNEL;
			else
				r->pgprot = default_range->pgprot;
			r->alignment = default_range->alignment;
			r->start = default_range->start;
			r->end = default_range->end;
			r->flags = default_range->flags;
			r->fallback_start = default_range->fallback_start;
			r->fallback_end = default_range->fallback_end;
		}
	}
}

struct execmem_info * __weak execmem_arch_setup(void)
{
	return NULL;
}

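/*
 * Initialize execmem_info from the architecture hook, or fall back to a
 * default range covering the whole vmalloc area; architectures selecting
 * CONFIG_ARCH_WANTS_EXECMEM_LATE run this from an initcall instead.
 */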
static void __init __execmem_init(void)
{
	struct execmem_info *info = execmem_arch_setup();

	if (!info) {
		info = execmem_info = &default_execmem_info;
		info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START;
		info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END;
		info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC;
		info->ranges[EXECMEM_DEFAULT].alignment = 1;
	}

	if (!execmem_validate(info))
		return;

	execmem_init_missing(info);

	execmem_info = info;
}

#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE
static int __init execmem_late_init(void)
{
	__execmem_init();
	return 0;
}
core_initcall(execmem_late_init);
#else
void __init execmem_init(void)
{
	__execmem_init();
}
#endif