1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  linux/mm/nommu.c
4  *
5  *  Replacement code for mm functions to support CPUs that don't
6  *  have any form of memory management unit (thus no virtual memory).
7  *
8  *  See Documentation/admin-guide/mm/nommu-mmap.rst
9  *
10  *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
11  *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
12  *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
13  *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
14  *  Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
15  */
16
17 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19 #include <linux/export.h>
20 #include <linux/mm.h>
21 #include <linux/sched/mm.h>
22 #include <linux/mman.h>
23 #include <linux/swap.h>
24 #include <linux/file.h>
25 #include <linux/highmem.h>
26 #include <linux/pagemap.h>
27 #include <linux/slab.h>
28 #include <linux/vmalloc.h>
29 #include <linux/backing-dev.h>
30 #include <linux/compiler.h>
31 #include <linux/mount.h>
32 #include <linux/personality.h>
33 #include <linux/security.h>
34 #include <linux/syscalls.h>
35 #include <linux/audit.h>
36 #include <linux/printk.h>
37
38 #include <linux/uaccess.h>
39 #include <linux/uio.h>
40 #include <asm/tlb.h>
41 #include <asm/tlbflush.h>
42 #include <asm/mmu_context.h>
43 #include "internal.h"
44
45 void *high_memory;
46 EXPORT_SYMBOL(high_memory);
47 struct page *mem_map;
48 unsigned long max_mapnr;
49 EXPORT_SYMBOL(max_mapnr);
50 unsigned long highest_memmap_pfn;
51 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
52 int heap_stack_gap = 0;
53
54 atomic_long_t mmap_pages_allocated;
55
56 EXPORT_SYMBOL(mem_map);
57
58 /* list of mapped, potentially shareable regions */
59 static struct kmem_cache *vm_region_jar;
60 struct rb_root nommu_region_tree = RB_ROOT;
61 DECLARE_RWSEM(nommu_region_sem);
62
63 const struct vm_operations_struct generic_file_vm_ops = {
64 };
65
66 /*
67  * Return the total memory allocated for this pointer, not
68  * just what the caller asked for.
69  *
70  * Doesn't have to be accurate, i.e. may have races.
71  */
72 unsigned int kobjsize(const void *objp)
73 {
74         struct page *page;
75
76         /*
77          * If the object we have should not have ksize performed on it,
78          * return size of 0
79          */
80         if (!objp || !virt_addr_valid(objp))
81                 return 0;
82
83         page = virt_to_head_page(objp);
84
85         /*
86          * If the allocator sets PageSlab, we know the pointer came from
87          * kmalloc().
88          */
89         if (PageSlab(page))
90                 return ksize(objp);
91
92         /*
93          * If it's not a compound page, see if we have a matching VMA
94          * region. This test is intentionally done in reverse order,
95          * so if there's no VMA, we still fall through and hand back
96          * PAGE_SIZE for 0-order pages.
97          */
98         if (!PageCompound(page)) {
99                 struct vm_area_struct *vma;
100
101                 vma = find_vma(current->mm, (unsigned long)objp);
102                 if (vma)
103                         return vma->vm_end - vma->vm_start;
104         }
105
106         /*
107          * The ksize() function is only guaranteed to work for pointers
108          * returned by kmalloc(). So handle arbitrary pointers here.
109          */
110         return page_size(page);
111 }
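/*
 * For example: kobjsize() on a pointer returned by kmalloc(24, GFP_KERNEL)
 * reports the slab object size via ksize() (the bucket size, not 24), while
 * a pointer into an mmap()ed region reports the size of the enclosing VMA
 * rather than the length the caller originally asked for.
 */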
112
113 void vfree(const void *addr)
114 {
115         kfree(addr);
116 }
117 EXPORT_SYMBOL(vfree);
118
119 void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
120 {
121         /*
122          * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
123          * returns only a logical address.
124          */
125         return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
126 }
127 EXPORT_SYMBOL(__vmalloc_noprof);
128
129 void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
130                 unsigned long start, unsigned long end, gfp_t gfp_mask,
131                 pgprot_t prot, unsigned long vm_flags, int node,
132                 const void *caller)
133 {
134         return __vmalloc_noprof(size, gfp_mask);
135 }
136
137 void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
138                 int node, const void *caller)
139 {
140         return __vmalloc_noprof(size, gfp_mask);
141 }
142
143 static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
144 {
145         void *ret;
146
147         ret = __vmalloc(size, flags);
148         if (ret) {
149                 struct vm_area_struct *vma;
150
151                 mmap_write_lock(current->mm);
152                 vma = find_vma(current->mm, (unsigned long)ret);
153                 if (vma)
154                         vm_flags_set(vma, VM_USERMAP);
155                 mmap_write_unlock(current->mm);
156         }
157
158         return ret;
159 }
160
161 void *vmalloc_user_noprof(unsigned long size)
162 {
163         return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
164 }
165 EXPORT_SYMBOL(vmalloc_user_noprof);
166
167 struct page *vmalloc_to_page(const void *addr)
168 {
169         return virt_to_page(addr);
170 }
171 EXPORT_SYMBOL(vmalloc_to_page);
172
173 unsigned long vmalloc_to_pfn(const void *addr)
174 {
175         return page_to_pfn(virt_to_page(addr));
176 }
177 EXPORT_SYMBOL(vmalloc_to_pfn);
178
179 long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
180 {
181         /* Don't allow overflow */
182         if ((unsigned long) addr + count < count)
183                 count = -(unsigned long) addr;
184
185         return copy_to_iter(addr, count, iter);
186 }
187
188 /*
189  *      vmalloc  -  allocate virtually contiguous memory
190  *
191  *      @size:          allocation size
192  *
193  *      Allocate enough pages to cover @size from the page level
194  *      allocator and map them into contiguous kernel virtual space.
195  *
196  *      For tight control over page level allocator and protection flags
197  *      use __vmalloc() instead.
198  */
199 void *vmalloc_noprof(unsigned long size)
200 {
201         return __vmalloc_noprof(size, GFP_KERNEL);
202 }
203 EXPORT_SYMBOL(vmalloc_noprof);
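/*
 * Note that on !MMU kernels vmalloc() is just a kmalloc() wrapper (see
 * __vmalloc_noprof() above), so the memory is physically contiguous and a
 * large allocation can fail under fragmentation where an MMU kernel's
 * vmalloc() would still succeed.
 */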
204
205 void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
206
207 /*
208  *      vzalloc - allocate virtually contiguous memory with zero fill
209  *
210  *      @size:          allocation size
211  *
212  *      Allocate enough pages to cover @size from the page level
213  *      allocator and map them into contiguous kernel virtual space.
214  *      The memory allocated is set to zero.
215  *
216  *      For tight control over page level allocator and protection flags
217  *      use __vmalloc() instead.
218  */
219 void *vzalloc_noprof(unsigned long size)
220 {
221         return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO);
222 }
223 EXPORT_SYMBOL(vzalloc_noprof);
224
225 /**
226  * vmalloc_node - allocate memory on a specific node
227  * @size:       allocation size
228  * @node:       numa node
229  *
230  * Allocate enough pages to cover @size from the page level
231  * allocator and map them into contiguous kernel virtual space.
232  *
233  * For tight control over page level allocator and protection flags
234  * use __vmalloc() instead.
235  */
236 void *vmalloc_node_noprof(unsigned long size, int node)
237 {
238         return vmalloc_noprof(size);
239 }
240 EXPORT_SYMBOL(vmalloc_node_noprof);
241
242 /**
243  * vzalloc_node - allocate memory on a specific node with zero fill
244  * @size:       allocation size
245  * @node:       numa node
246  *
247  * Allocate enough pages to cover @size from the page level
248  * allocator and map them into contiguous kernel virtual space.
249  * The memory allocated is set to zero.
250  *
251  * For tight control over page level allocator and protection flags
252  * use __vmalloc() instead.
253  */
254 void *vzalloc_node_noprof(unsigned long size, int node)
255 {
256         return vzalloc_noprof(size);
257 }
258 EXPORT_SYMBOL(vzalloc_node_noprof);
259
260 /**
261  * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
262  *      @size:          allocation size
263  *
264  *      Allocate enough 32bit PA addressable pages to cover @size from the
265  *      page level allocator and map them into contiguous kernel virtual space.
266  */
267 void *vmalloc_32_noprof(unsigned long size)
268 {
269         return __vmalloc_noprof(size, GFP_KERNEL);
270 }
271 EXPORT_SYMBOL(vmalloc_32_noprof);
272
273 /**
274  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
275  *      @size:          allocation size
276  *
277  * The resulting memory area is 32bit addressable and zeroed so it can be
278  * mapped to userspace without leaking data.
279  *
280  * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
281  * remap_vmalloc_range() are permissible.
282  */
283 void *vmalloc_32_user_noprof(unsigned long size)
284 {
285         /*
286          * We'll have to sort out the ZONE_DMA bits for 64-bit,
287          * but for now this can simply use vmalloc_user() directly.
288          */
289         return vmalloc_user_noprof(size);
290 }
291 EXPORT_SYMBOL(vmalloc_32_user_noprof);
292
293 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
294 {
295         BUG();
296         return NULL;
297 }
298 EXPORT_SYMBOL(vmap);
299
300 void vunmap(const void *addr)
301 {
302         BUG();
303 }
304 EXPORT_SYMBOL(vunmap);
305
306 void *vm_map_ram(struct page **pages, unsigned int count, int node)
307 {
308         BUG();
309         return NULL;
310 }
311 EXPORT_SYMBOL(vm_map_ram);
312
313 void vm_unmap_ram(const void *mem, unsigned int count)
314 {
315         BUG();
316 }
317 EXPORT_SYMBOL(vm_unmap_ram);
318
319 void vm_unmap_aliases(void)
320 {
321 }
322 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
323
324 void free_vm_area(struct vm_struct *area)
325 {
326         BUG();
327 }
328 EXPORT_SYMBOL_GPL(free_vm_area);
329
330 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
331                    struct page *page)
332 {
333         return -EINVAL;
334 }
335 EXPORT_SYMBOL(vm_insert_page);
336
337 int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
338                         unsigned long num)
339 {
340         return -EINVAL;
341 }
342 EXPORT_SYMBOL(vm_map_pages);
343
344 int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
345                                 unsigned long num)
346 {
347         return -EINVAL;
348 }
349 EXPORT_SYMBOL(vm_map_pages_zero);
350
351 /*
352  *  sys_brk() for the most part doesn't need the global kernel
353  *  lock, except when an application is doing something nasty
354  *  like trying to un-brk an area that has already been mapped
355  *  to a regular file.  in this case, the unmapping will need
356  *  to invoke file system routines that need the global lock.
357  */
358 SYSCALL_DEFINE1(brk, unsigned long, brk)
359 {
360         struct mm_struct *mm = current->mm;
361
362         if (brk < mm->start_brk || brk > mm->context.end_brk)
363                 return mm->brk;
364
365         if (mm->brk == brk)
366                 return mm->brk;
367
368         /*
369          * Always allow shrinking brk
370          */
371         if (brk <= mm->brk) {
372                 mm->brk = brk;
373                 return brk;
374         }
375
376         /*
377          * Ok, looks good - let it rip.
378          */
379         flush_icache_user_range(mm->brk, brk);
380         return mm->brk = brk;
381 }
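/*
 * Example of the semantics above: the break can never be moved past
 * mm->context.end_brk, which was fixed when the binary was loaded.  A
 * request beyond that limit is refused by returning the current break
 * rather than an error code, so callers must compare the return value
 * against the address they asked for.
 */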
382
383 /*
384  * initialise the percpu counter for VM and region record slabs
385  */
386 void __init mmap_init(void)
387 {
388         int ret;
389
390         ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
391         VM_BUG_ON(ret);
392         vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
393 }
394
395 /*
396  * validate the region tree
397  * - the caller must hold the region lock
398  */
399 #ifdef CONFIG_DEBUG_NOMMU_REGIONS
400 static noinline void validate_nommu_regions(void)
401 {
402         struct vm_region *region, *last;
403         struct rb_node *p, *lastp;
404
405         lastp = rb_first(&nommu_region_tree);
406         if (!lastp)
407                 return;
408
409         last = rb_entry(lastp, struct vm_region, vm_rb);
410         BUG_ON(last->vm_end <= last->vm_start);
411         BUG_ON(last->vm_top < last->vm_end);
412
413         while ((p = rb_next(lastp))) {
414                 region = rb_entry(p, struct vm_region, vm_rb);
415                 last = rb_entry(lastp, struct vm_region, vm_rb);
416
417                 BUG_ON(region->vm_end <= region->vm_start);
418                 BUG_ON(region->vm_top < region->vm_end);
419                 BUG_ON(region->vm_start < last->vm_top);
420
421                 lastp = p;
422         }
423 }
424 #else
425 static void validate_nommu_regions(void)
426 {
427 }
428 #endif
429
430 /*
431  * add a region into the global tree
432  */
433 static void add_nommu_region(struct vm_region *region)
434 {
435         struct vm_region *pregion;
436         struct rb_node **p, *parent;
437
438         validate_nommu_regions();
439
440         parent = NULL;
441         p = &nommu_region_tree.rb_node;
442         while (*p) {
443                 parent = *p;
444                 pregion = rb_entry(parent, struct vm_region, vm_rb);
445                 if (region->vm_start < pregion->vm_start)
446                         p = &(*p)->rb_left;
447                 else if (region->vm_start > pregion->vm_start)
448                         p = &(*p)->rb_right;
449                 else if (pregion == region)
450                         return;
451                 else
452                         BUG();
453         }
454
455         rb_link_node(&region->vm_rb, parent, p);
456         rb_insert_color(&region->vm_rb, &nommu_region_tree);
457
458         validate_nommu_regions();
459 }
460
461 /*
462  * delete a region from the global tree
463  */
464 static void delete_nommu_region(struct vm_region *region)
465 {
466         BUG_ON(!nommu_region_tree.rb_node);
467
468         validate_nommu_regions();
469         rb_erase(&region->vm_rb, &nommu_region_tree);
470         validate_nommu_regions();
471 }
472
473 /*
474  * free a contiguous series of pages
475  */
476 static void free_page_series(unsigned long from, unsigned long to)
477 {
478         for (; from < to; from += PAGE_SIZE) {
479                 struct page *page = virt_to_page((void *)from);
480
481                 atomic_long_dec(&mmap_pages_allocated);
482                 put_page(page);
483         }
484 }
485
486 /*
487  * release a reference to a region
488  * - the caller must hold the region semaphore for writing, which this releases
489  * - the region may not have been added to the tree yet, in which case vm_top
490  *   will equal vm_start
491  */
492 static void __put_nommu_region(struct vm_region *region)
493         __releases(nommu_region_sem)
494 {
495         BUG_ON(!nommu_region_tree.rb_node);
496
497         if (--region->vm_usage == 0) {
498                 if (region->vm_top > region->vm_start)
499                         delete_nommu_region(region);
500                 up_write(&nommu_region_sem);
501
502                 if (region->vm_file)
503                         fput(region->vm_file);
504
505                 /* IO memory and memory shared directly out of the pagecache
506                  * from ramfs/tmpfs mustn't be released here */
507                 if (region->vm_flags & VM_MAPPED_COPY)
508                         free_page_series(region->vm_start, region->vm_top);
509                 kmem_cache_free(vm_region_jar, region);
510         } else {
511                 up_write(&nommu_region_sem);
512         }
513 }
514
515 /*
516  * release a reference to a region
517  */
518 static void put_nommu_region(struct vm_region *region)
519 {
520         down_write(&nommu_region_sem);
521         __put_nommu_region(region);
522 }
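/*
 * A vm_region describes the chunk of memory (or span of device/file pages)
 * backing one or more VMAs: overlapping shared mmap() calls on the same
 * object bump vm_usage and reuse the same region, and the final put is
 * what returns VM_MAPPED_COPY pages to the allocator.
 */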
523
524 static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
525 {
526         vma->vm_mm = mm;
527
528         /* add the VMA to the mapping */
529         if (vma->vm_file) {
530                 struct address_space *mapping = vma->vm_file->f_mapping;
531
532                 i_mmap_lock_write(mapping);
533                 flush_dcache_mmap_lock(mapping);
534                 vma_interval_tree_insert(vma, &mapping->i_mmap);
535                 flush_dcache_mmap_unlock(mapping);
536                 i_mmap_unlock_write(mapping);
537         }
538 }
539
540 static void cleanup_vma_from_mm(struct vm_area_struct *vma)
541 {
542         vma->vm_mm->map_count--;
543         /* remove the VMA from the mapping */
544         if (vma->vm_file) {
545                 struct address_space *mapping;
546                 mapping = vma->vm_file->f_mapping;
547
548                 i_mmap_lock_write(mapping);
549                 flush_dcache_mmap_lock(mapping);
550                 vma_interval_tree_remove(vma, &mapping->i_mmap);
551                 flush_dcache_mmap_unlock(mapping);
552                 i_mmap_unlock_write(mapping);
553         }
554 }
555
556 /*
557  * delete a VMA from its owning mm_struct and address space
558  */
559 static int delete_vma_from_mm(struct vm_area_struct *vma)
560 {
561         VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
562
563         vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
564         if (vma_iter_prealloc(&vmi, vma)) {
565                 pr_warn("Allocation of vma tree for process %d failed\n",
566                        current->pid);
567                 return -ENOMEM;
568         }
569         cleanup_vma_from_mm(vma);
570
571         /* remove from the MM's tree and list */
572         vma_iter_clear(&vmi);
573         return 0;
574 }
575 /*
576  * destroy a VMA record
577  */
578 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
579 {
580         if (vma->vm_ops && vma->vm_ops->close)
581                 vma->vm_ops->close(vma);
582         if (vma->vm_file)
583                 fput(vma->vm_file);
584         put_nommu_region(vma->vm_region);
585         vm_area_free(vma);
586 }
587
588 struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
589                                              unsigned long start_addr,
590                                              unsigned long end_addr)
591 {
592         unsigned long index = start_addr;
593
594         mmap_assert_locked(mm);
595         return mt_find(&mm->mm_mt, &index, end_addr - 1);
596 }
597 EXPORT_SYMBOL(find_vma_intersection);
598
599 /*
600  * look up the first VMA in which addr resides, NULL if none
601  * - should be called with mm->mmap_lock at least held readlocked
602  */
603 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
604 {
605         VMA_ITERATOR(vmi, mm, addr);
606
607         return vma_iter_load(&vmi);
608 }
609 EXPORT_SYMBOL(find_vma);
610
611 /*
612  * At least xtensa ends up having protection faults even with no
613  * MMU.. No stack expansion, at least.
614  */
615 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
616                         unsigned long addr, struct pt_regs *regs)
617 {
618         struct vm_area_struct *vma;
619
620         mmap_read_lock(mm);
621         vma = vma_lookup(mm, addr);
622         if (!vma)
623                 mmap_read_unlock(mm);
624         return vma;
625 }
626
627 /*
628  * expand a stack to a given address
629  * - not supported under NOMMU conditions
630  */
631 int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr)
632 {
633         return -ENOMEM;
634 }
635
636 struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
637 {
638         mmap_read_unlock(mm);
639         return NULL;
640 }
641
642 /*
643  * look up the first VMA that exactly matches addr
644  * - should be called with mm->mmap_lock at least held readlocked
645  */
646 static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
647                                              unsigned long addr,
648                                              unsigned long len)
649 {
650         struct vm_area_struct *vma;
651         unsigned long end = addr + len;
652         VMA_ITERATOR(vmi, mm, addr);
653
654         vma = vma_iter_load(&vmi);
655         if (!vma)
656                 return NULL;
657         if (vma->vm_start != addr)
658                 return NULL;
659         if (vma->vm_end != end)
660                 return NULL;
661
662         return vma;
663 }
664
665 /*
666  * determine whether a mapping should be permitted and, if so, what sort of
667  * mapping we're capable of supporting
668  */
669 static int validate_mmap_request(struct file *file,
670                                  unsigned long addr,
671                                  unsigned long len,
672                                  unsigned long prot,
673                                  unsigned long flags,
674                                  unsigned long pgoff,
675                                  unsigned long *_capabilities)
676 {
677         unsigned long capabilities, rlen;
678         int ret;
679
680         /* do the simple checks first */
681         if (flags & MAP_FIXED)
682                 return -EINVAL;
683
684         if ((flags & MAP_TYPE) != MAP_PRIVATE &&
685             (flags & MAP_TYPE) != MAP_SHARED)
686                 return -EINVAL;
687
688         if (!len)
689                 return -EINVAL;
690
691         /* Careful about overflows.. */
692         rlen = PAGE_ALIGN(len);
693         if (!rlen || rlen > TASK_SIZE)
694                 return -ENOMEM;
695
696         /* offset overflow? */
697         if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
698                 return -EOVERFLOW;
699
700         if (file) {
701                 /* files must support mmap */
702                 if (!file->f_op->mmap)
703                         return -ENODEV;
704
705                 /* work out if what we've got could possibly be shared
706                  * - we support chardevs that provide their own "memory"
707                  * - we support files/blockdevs that are memory backed
708                  */
709                 if (file->f_op->mmap_capabilities) {
710                         capabilities = file->f_op->mmap_capabilities(file);
711                 } else {
712                         /* no explicit capabilities set, so assume some
713                          * defaults */
714                         switch (file_inode(file)->i_mode & S_IFMT) {
715                         case S_IFREG:
716                         case S_IFBLK:
717                                 capabilities = NOMMU_MAP_COPY;
718                                 break;
719
720                         case S_IFCHR:
721                                 capabilities =
722                                         NOMMU_MAP_DIRECT |
723                                         NOMMU_MAP_READ |
724                                         NOMMU_MAP_WRITE;
725                                 break;
726
727                         default:
728                                 return -EINVAL;
729                         }
730                 }
731
732                 /* eliminate any capabilities that we can't support on this
733                  * device */
734                 if (!file->f_op->get_unmapped_area)
735                         capabilities &= ~NOMMU_MAP_DIRECT;
736                 if (!(file->f_mode & FMODE_CAN_READ))
737                         capabilities &= ~NOMMU_MAP_COPY;
738
739                 /* The file shall have been opened with read permission. */
740                 if (!(file->f_mode & FMODE_READ))
741                         return -EACCES;
742
743                 if (flags & MAP_SHARED) {
744                         /* do checks for writing, appending and locking */
745                         if ((prot & PROT_WRITE) &&
746                             !(file->f_mode & FMODE_WRITE))
747                                 return -EACCES;
748
749                         if (IS_APPEND(file_inode(file)) &&
750                             (file->f_mode & FMODE_WRITE))
751                                 return -EACCES;
752
753                         if (!(capabilities & NOMMU_MAP_DIRECT))
754                                 return -ENODEV;
755
756                         /* we mustn't privatise shared mappings */
757                         capabilities &= ~NOMMU_MAP_COPY;
758                 } else {
759                         /* we're going to read the file into private memory we
760                          * allocate */
761                         if (!(capabilities & NOMMU_MAP_COPY))
762                                 return -ENODEV;
763
764                         /* we don't permit a private writable mapping to be
765                          * shared with the backing device */
766                         if (prot & PROT_WRITE)
767                                 capabilities &= ~NOMMU_MAP_DIRECT;
768                 }
769
770                 if (capabilities & NOMMU_MAP_DIRECT) {
771                         if (((prot & PROT_READ)  && !(capabilities & NOMMU_MAP_READ))  ||
772                             ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
773                             ((prot & PROT_EXEC)  && !(capabilities & NOMMU_MAP_EXEC))
774                             ) {
775                                 capabilities &= ~NOMMU_MAP_DIRECT;
776                                 if (flags & MAP_SHARED) {
777                                         pr_warn("MAP_SHARED not completely supported on !MMU\n");
778                                         return -EINVAL;
779                                 }
780                         }
781                 }
782
783                 /* handle executable mappings and implied executable
784                  * mappings */
785                 if (path_noexec(&file->f_path)) {
786                         if (prot & PROT_EXEC)
787                                 return -EPERM;
788                 } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
789                         /* handle implication of PROT_EXEC by PROT_READ */
790                         if (current->personality & READ_IMPLIES_EXEC) {
791                                 if (capabilities & NOMMU_MAP_EXEC)
792                                         prot |= PROT_EXEC;
793                         }
794                 } else if ((prot & PROT_READ) &&
795                          (prot & PROT_EXEC) &&
796                          !(capabilities & NOMMU_MAP_EXEC)
797                          ) {
798                         /* backing file is not executable, try to copy */
799                         capabilities &= ~NOMMU_MAP_DIRECT;
800                 }
801         } else {
802                 /* anonymous mappings are always memory backed and can be
803                  * privately mapped
804                  */
805                 capabilities = NOMMU_MAP_COPY;
806
807                 /* handle PROT_EXEC implication by PROT_READ */
808                 if ((prot & PROT_READ) &&
809                     (current->personality & READ_IMPLIES_EXEC))
810                         prot |= PROT_EXEC;
811         }
812
813         /* allow the security API to have its say */
814         ret = security_mmap_addr(addr);
815         if (ret < 0)
816                 return ret;
817
818         /* looks okay */
819         *_capabilities = capabilities;
820         return 0;
821 }
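/*
 * Illustrative sketch only: a memory-backed character device that wants its
 * pages mapped directly rather than copied might report its capabilities as
 *
 *	static unsigned my_mmap_capabilities(struct file *file)
 *	{
 *		return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
 *	}
 *
 * from its file_operations; validate_mmap_request() then only keeps
 * NOMMU_MAP_DIRECT if the driver also implements ->get_unmapped_area().
 */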
822
823 /*
824  * we've determined that we can make the mapping, now translate what we
825  * now know into VMA flags
826  */
827 static unsigned long determine_vm_flags(struct file *file,
828                                         unsigned long prot,
829                                         unsigned long flags,
830                                         unsigned long capabilities)
831 {
832         unsigned long vm_flags;
833
834         vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
835
836         if (!file) {
837                 /*
838                  * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
839                  * there is no fork().
840                  */
841                 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
842         } else if (flags & MAP_PRIVATE) {
843                 /* MAP_PRIVATE file mapping */
844                 if (capabilities & NOMMU_MAP_DIRECT)
845                         vm_flags |= (capabilities & NOMMU_VMFLAGS);
846                 else
847                         vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
848
849                 if (!(prot & PROT_WRITE) && !current->ptrace)
850                         /*
851                          * R/O private file mapping which cannot be used to
852                          * modify memory, not even via active ptrace
853                          * (e.g., to set breakpoints) or later by upgrading
854                          * permissions (no mprotect()). We can try overlaying
855                          * the file mapping, which will work e.g., on chardevs,
856                          * ramfs/tmpfs/shmfs and romfs/cramfs.
857                          */
858                         vm_flags |= VM_MAYOVERLAY;
859         } else {
860                 /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
861                 vm_flags |= VM_SHARED | VM_MAYSHARE |
862                             (capabilities & NOMMU_VMFLAGS);
863         }
864
865         return vm_flags;
866 }
867
868 /*
869  * set up a shared mapping on a file (the driver or filesystem provides and
870  * pins the storage)
871  */
872 static int do_mmap_shared_file(struct vm_area_struct *vma)
873 {
874         int ret;
875
876         ret = call_mmap(vma->vm_file, vma);
877         if (ret == 0) {
878                 vma->vm_region->vm_top = vma->vm_region->vm_end;
879                 return 0;
880         }
881         if (ret != -ENOSYS)
882                 return ret;
883
884         /* getting -ENOSYS indicates that direct mmap isn't possible (as
885          * opposed to tried but failed) so we can only give a suitable error as
886          * it's not possible to make a private copy if MAP_SHARED was given */
887         return -ENODEV;
888 }
889
890 /*
891  * set up a private mapping or an anonymous shared mapping
892  */
893 static int do_mmap_private(struct vm_area_struct *vma,
894                            struct vm_region *region,
895                            unsigned long len,
896                            unsigned long capabilities)
897 {
898         unsigned long total, point;
899         void *base;
900         int ret, order;
901
902         /*
903          * Invoke the file's mapping function so that it can keep track of
904          * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
905          * it may attempt to share, which will make is_nommu_shared_mapping()
906          * happy.
907          */
908         if (capabilities & NOMMU_MAP_DIRECT) {
909                 ret = call_mmap(vma->vm_file, vma);
910                 /* shouldn't return success if we're not sharing */
911                 if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
912                         ret = -ENOSYS;
913                 if (ret == 0) {
914                         vma->vm_region->vm_top = vma->vm_region->vm_end;
915                         return 0;
916                 }
917                 if (ret != -ENOSYS)
918                         return ret;
919
920                 /* getting an ENOSYS error indicates that direct mmap isn't
921                  * possible (as opposed to tried but failed) so we'll try to
922                  * make a private copy of the data and map that instead */
923         }
924
925
926         /* allocate some memory to hold the mapping
927          * - note that this may not return a page-aligned address if the object
928          *   we're allocating is smaller than a page
929          */
930         order = get_order(len);
931         total = 1 << order;
932         point = len >> PAGE_SHIFT;
933
934         /* we don't want to allocate a power-of-2 sized page set */
935         if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
936                 total = point;
937
938         base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
939         if (!base)
940                 goto enomem;
941
942         atomic_long_add(total, &mmap_pages_allocated);
943
944         vm_flags_set(vma, VM_MAPPED_COPY);
945         region->vm_flags = vma->vm_flags;
946         region->vm_start = (unsigned long) base;
947         region->vm_end   = region->vm_start + len;
948         region->vm_top   = region->vm_start + (total << PAGE_SHIFT);
949
950         vma->vm_start = region->vm_start;
951         vma->vm_end   = region->vm_start + len;
952
953         if (vma->vm_file) {
954                 /* read the contents of a file into the copy */
955                 loff_t fpos;
956
957                 fpos = vma->vm_pgoff;
958                 fpos <<= PAGE_SHIFT;
959
960                 ret = kernel_read(vma->vm_file, base, len, &fpos);
961                 if (ret < 0)
962                         goto error_free;
963
964                 /* clear the last little bit */
965                 if (ret < len)
966                         memset(base + ret, 0, len - ret);
967
968         } else {
969                 vma_set_anonymous(vma);
970         }
971
972         return 0;
973
974 error_free:
975         free_page_series(region->vm_start, region->vm_top);
976         region->vm_start = vma->vm_start = 0;
977         region->vm_end   = vma->vm_end = 0;
978         region->vm_top   = 0;
979         return ret;
980
981 enomem:
982         pr_err("Allocation of length %lu from process %d (%s) failed\n",
983                len, current->pid, current->comm);
984         show_mem();
985         return -ENOMEM;
986 }
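/*
 * Worked example of the trimming above (4KiB pages, default
 * CONFIG_NOMMU_INITIAL_TRIM_EXCESS of 1): a 12KiB private mapping gives
 * order = 2, so total = 4 pages against point = 3 pages.  Since
 * total - point = 1 >= sysctl_nr_trim_pages, the request is trimmed and
 * alloc_pages_exact() returns exactly 3 pages, with region->vm_top marking
 * the real top of the allocation.
 */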
987
988 /*
989  * handle mapping creation for uClinux
990  */
991 unsigned long do_mmap(struct file *file,
992                         unsigned long addr,
993                         unsigned long len,
994                         unsigned long prot,
995                         unsigned long flags,
996                         vm_flags_t vm_flags,
997                         unsigned long pgoff,
998                         unsigned long *populate,
999                         struct list_head *uf)
1000 {
1001         struct vm_area_struct *vma;
1002         struct vm_region *region;
1003         struct rb_node *rb;
1004         unsigned long capabilities, result;
1005         int ret;
1006         VMA_ITERATOR(vmi, current->mm, 0);
1007
1008         *populate = 0;
1009
1010         /* decide whether we should attempt the mapping, and if so what sort of
1011          * mapping */
1012         ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1013                                     &capabilities);
1014         if (ret < 0)
1015                 return ret;
1016
1017         /* we ignore the address hint */
1018         addr = 0;
1019         len = PAGE_ALIGN(len);
1020
1021         /* we've determined that we can make the mapping, now translate what we
1022          * now know into VMA flags */
1023         vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
1024
1025
1026         /* we're going to need to record the mapping */
1027         region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
1028         if (!region)
1029                 goto error_getting_region;
1030
1031         vma = vm_area_alloc(current->mm);
1032         if (!vma)
1033                 goto error_getting_vma;
1034
1035         region->vm_usage = 1;
1036         region->vm_flags = vm_flags;
1037         region->vm_pgoff = pgoff;
1038
1039         vm_flags_init(vma, vm_flags);
1040         vma->vm_pgoff = pgoff;
1041
1042         if (file) {
1043                 region->vm_file = get_file(file);
1044                 vma->vm_file = get_file(file);
1045         }
1046
1047         down_write(&nommu_region_sem);
1048
1049         /* if we want to share, we need to check for regions created by other
1050          * mmap() calls that overlap with our proposed mapping
1051          * - we can only share with a superset match on most regular files
1052          * - shared mappings on character devices and memory backed files are
1053          *   permitted to overlap inexactly as far as we are concerned, for in
1054          *   these cases sharing is handled in the driver or filesystem rather
1055          *   than here
1056          */
1057         if (is_nommu_shared_mapping(vm_flags)) {
1058                 struct vm_region *pregion;
1059                 unsigned long pglen, rpglen, pgend, rpgend, start;
1060
1061                 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1062                 pgend = pgoff + pglen;
1063
1064                 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
1065                         pregion = rb_entry(rb, struct vm_region, vm_rb);
1066
1067                         if (!is_nommu_shared_mapping(pregion->vm_flags))
1068                                 continue;
1069
1070                         /* search for overlapping mappings on the same file */
1071                         if (file_inode(pregion->vm_file) !=
1072                             file_inode(file))
1073                                 continue;
1074
1075                         if (pregion->vm_pgoff >= pgend)
1076                                 continue;
1077
1078                         rpglen = pregion->vm_end - pregion->vm_start;
1079                         rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1080                         rpgend = pregion->vm_pgoff + rpglen;
1081                         if (pgoff >= rpgend)
1082                                 continue;
1083
1084                         /* handle inexactly overlapping matches between
1085                          * mappings */
1086                         if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1087                             !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1088                                 /* new mapping is not a subset of the region */
1089                                 if (!(capabilities & NOMMU_MAP_DIRECT))
1090                                         goto sharing_violation;
1091                                 continue;
1092                         }
1093
1094                         /* we've found a region we can share */
1095                         pregion->vm_usage++;
1096                         vma->vm_region = pregion;
1097                         start = pregion->vm_start;
1098                         start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
1099                         vma->vm_start = start;
1100                         vma->vm_end = start + len;
1101
1102                         if (pregion->vm_flags & VM_MAPPED_COPY)
1103                                 vm_flags_set(vma, VM_MAPPED_COPY);
1104                         else {
1105                                 ret = do_mmap_shared_file(vma);
1106                                 if (ret < 0) {
1107                                         vma->vm_region = NULL;
1108                                         vma->vm_start = 0;
1109                                         vma->vm_end = 0;
1110                                         pregion->vm_usage--;
1111                                         pregion = NULL;
1112                                         goto error_just_free;
1113                                 }
1114                         }
1115                         fput(region->vm_file);
1116                         kmem_cache_free(vm_region_jar, region);
1117                         region = pregion;
1118                         result = start;
1119                         goto share;
1120                 }
1121
1122                 /* obtain the address at which to make a shared mapping
1123                  * - this is the hook for quasi-memory character devices to
1124                  *   tell us the location of a shared mapping
1125                  */
1126                 if (capabilities & NOMMU_MAP_DIRECT) {
1127                         addr = file->f_op->get_unmapped_area(file, addr, len,
1128                                                              pgoff, flags);
1129                         if (IS_ERR_VALUE(addr)) {
1130                                 ret = addr;
1131                                 if (ret != -ENOSYS)
1132                                         goto error_just_free;
1133
1134                                 /* the driver refused to tell us where to site
1135                                  * the mapping so we'll have to attempt to copy
1136                                  * it */
1137                                 ret = -ENODEV;
1138                                 if (!(capabilities & NOMMU_MAP_COPY))
1139                                         goto error_just_free;
1140
1141                                 capabilities &= ~NOMMU_MAP_DIRECT;
1142                         } else {
1143                                 vma->vm_start = region->vm_start = addr;
1144                                 vma->vm_end = region->vm_end = addr + len;
1145                         }
1146                 }
1147         }
1148
1149         vma->vm_region = region;
1150
1151         /* set up the mapping
1152          * - the region is filled in if NOMMU_MAP_DIRECT is still set
1153          */
1154         if (file && vma->vm_flags & VM_SHARED)
1155                 ret = do_mmap_shared_file(vma);
1156         else
1157                 ret = do_mmap_private(vma, region, len, capabilities);
1158         if (ret < 0)
1159                 goto error_just_free;
1160         add_nommu_region(region);
1161
1162         /* clear anonymous mappings that don't ask for uninitialized data */
1163         if (!vma->vm_file &&
1164             (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
1165              !(flags & MAP_UNINITIALIZED)))
1166                 memset((void *)region->vm_start, 0,
1167                        region->vm_end - region->vm_start);
1168
1169         /* okay... we have a mapping; now we have to register it */
1170         result = vma->vm_start;
1171
1172         current->mm->total_vm += len >> PAGE_SHIFT;
1173
1174 share:
1175         BUG_ON(!vma->vm_region);
1176         vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
1177         if (vma_iter_prealloc(&vmi, vma))
1178                 goto error_just_free;
1179
1180         setup_vma_to_mm(vma, current->mm);
1181         current->mm->map_count++;
1182         /* add the VMA to the tree */
1183         vma_iter_store(&vmi, vma);
1184
1185         /* we flush the region from the icache only when the first executable
1186          * mapping of it is made  */
1187         if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1188                 flush_icache_user_range(region->vm_start, region->vm_end);
1189                 region->vm_icache_flushed = true;
1190         }
1191
1192         up_write(&nommu_region_sem);
1193
1194         return result;
1195
1196 error_just_free:
1197         up_write(&nommu_region_sem);
1198 error:
1199         vma_iter_free(&vmi);
1200         if (region->vm_file)
1201                 fput(region->vm_file);
1202         kmem_cache_free(vm_region_jar, region);
1203         if (vma->vm_file)
1204                 fput(vma->vm_file);
1205         vm_area_free(vma);
1206         return ret;
1207
1208 sharing_violation:
1209         up_write(&nommu_region_sem);
1210         pr_warn("Attempt to share mismatched mappings\n");
1211         ret = -EINVAL;
1212         goto error;
1213
1214 error_getting_vma:
1215         kmem_cache_free(vm_region_jar, region);
1216         pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
1217                         len, current->pid);
1218         show_mem();
1219         return -ENOMEM;
1220
1221 error_getting_region:
1222         pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
1223                         len, current->pid);
1224         show_mem();
1225         return -ENOMEM;
1226 }
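/*
 * Userspace-visible behaviour of the above: the address hint is always
 * ignored and MAP_FIXED is rejected outright, so a typical nommu mapping is
 * simply
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 * and the caller must use whatever address comes back.  Read-only private
 * file mappings may be overlaid directly on the backing object where the
 * filesystem permits it; otherwise the data is copied into freshly
 * allocated, physically contiguous memory.
 */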
1227
1228 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
1229                               unsigned long prot, unsigned long flags,
1230                               unsigned long fd, unsigned long pgoff)
1231 {
1232         struct file *file = NULL;
1233         unsigned long retval = -EBADF;
1234
1235         audit_mmap_fd(fd, flags);
1236         if (!(flags & MAP_ANONYMOUS)) {
1237                 file = fget(fd);
1238                 if (!file)
1239                         goto out;
1240         }
1241
1242         retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1243
1244         if (file)
1245                 fput(file);
1246 out:
1247         return retval;
1248 }
1249
1250 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1251                 unsigned long, prot, unsigned long, flags,
1252                 unsigned long, fd, unsigned long, pgoff)
1253 {
1254         return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
1255 }
1256
1257 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1258 struct mmap_arg_struct {
1259         unsigned long addr;
1260         unsigned long len;
1261         unsigned long prot;
1262         unsigned long flags;
1263         unsigned long fd;
1264         unsigned long offset;
1265 };
1266
1267 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1268 {
1269         struct mmap_arg_struct a;
1270
1271         if (copy_from_user(&a, arg, sizeof(a)))
1272                 return -EFAULT;
1273         if (offset_in_page(a.offset))
1274                 return -EINVAL;
1275
1276         return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1277                                a.offset >> PAGE_SHIFT);
1278 }
1279 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
1280
1281 /*
1282  * split a vma into two pieces at address 'addr', a new vma is allocated either
1283  * for the first part or the tail.
1284  */
1285 static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
1286                      unsigned long addr, int new_below)
1287 {
1288         struct vm_area_struct *new;
1289         struct vm_region *region;
1290         unsigned long npages;
1291         struct mm_struct *mm;
1292
1293         /* we're only permitted to split anonymous regions (these should have
1294          * only a single usage on the region) */
1295         if (vma->vm_file)
1296                 return -ENOMEM;
1297
1298         mm = vma->vm_mm;
1299         if (mm->map_count >= sysctl_max_map_count)
1300                 return -ENOMEM;
1301
1302         region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1303         if (!region)
1304                 return -ENOMEM;
1305
1306         new = vm_area_dup(vma);
1307         if (!new)
1308                 goto err_vma_dup;
1309
1310         /* most fields are the same, copy all, and then fixup */
1311         *region = *vma->vm_region;
1312         new->vm_region = region;
1313
1314         npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1315
1316         if (new_below) {
1317                 region->vm_top = region->vm_end = new->vm_end = addr;
1318         } else {
1319                 region->vm_start = new->vm_start = addr;
1320                 region->vm_pgoff = new->vm_pgoff += npages;
1321         }
1322
1323         vma_iter_config(vmi, new->vm_start, new->vm_end);
1324         if (vma_iter_prealloc(vmi, vma)) {
1325                 pr_warn("Allocation of vma tree for process %d failed\n",
1326                         current->pid);
1327                 goto err_vmi_preallocate;
1328         }
1329
1330         if (new->vm_ops && new->vm_ops->open)
1331                 new->vm_ops->open(new);
1332
1333         down_write(&nommu_region_sem);
1334         delete_nommu_region(vma->vm_region);
1335         if (new_below) {
1336                 vma->vm_region->vm_start = vma->vm_start = addr;
1337                 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1338         } else {
1339                 vma->vm_region->vm_end = vma->vm_end = addr;
1340                 vma->vm_region->vm_top = addr;
1341         }
1342         add_nommu_region(vma->vm_region);
1343         add_nommu_region(new->vm_region);
1344         up_write(&nommu_region_sem);
1345
1346         setup_vma_to_mm(vma, mm);
1347         setup_vma_to_mm(new, mm);
1348         vma_iter_store(vmi, new);
1349         mm->map_count++;
1350         return 0;
1351
1352 err_vmi_preallocate:
1353         vm_area_free(new);
1354 err_vma_dup:
1355         kmem_cache_free(vm_region_jar, region);
1356         return -ENOMEM;
1357 }
1358
1359 /*
1360  * shrink a VMA by removing the specified chunk from either the beginning or
1361  * the end
1362  */
1363 static int vmi_shrink_vma(struct vma_iterator *vmi,
1364                       struct vm_area_struct *vma,
1365                       unsigned long from, unsigned long to)
1366 {
1367         struct vm_region *region;
1368
1369         /* adjust the VMA's pointers, which may reposition it in the MM's tree
1370          * and list */
1371         if (from > vma->vm_start) {
1372                 if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL))
1373                         return -ENOMEM;
1374                 vma->vm_end = from;
1375         } else {
1376                 if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL))
1377                         return -ENOMEM;
1378                 vma->vm_start = to;
1379         }
1380
1381         /* cut the backing region down to size */
1382         region = vma->vm_region;
1383         BUG_ON(region->vm_usage != 1);
1384
1385         down_write(&nommu_region_sem);
1386         delete_nommu_region(region);
1387         if (from > region->vm_start) {
1388                 to = region->vm_top;
1389                 region->vm_top = region->vm_end = from;
1390         } else {
1391                 region->vm_start = to;
1392         }
1393         add_nommu_region(region);
1394         up_write(&nommu_region_sem);
1395
1396         free_page_series(from, to);
1397         return 0;
1398 }
1399
1400 /*
1401  * release a mapping
1402  * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1403  *   VMA, though it need not cover the whole VMA
1404  */
1405 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
1406 {
1407         VMA_ITERATOR(vmi, mm, start);
1408         struct vm_area_struct *vma;
1409         unsigned long end;
1410         int ret = 0;
1411
1412         len = PAGE_ALIGN(len);
1413         if (len == 0)
1414                 return -EINVAL;
1415
1416         end = start + len;
1417
1418         /* find the first potentially overlapping VMA */
1419         vma = vma_find(&vmi, end);
1420         if (!vma) {
1421                 static int limit;
1422                 if (limit < 5) {
1423                         pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
1424                                         current->pid, current->comm,
1425                                         start, start + len - 1);
1426                         limit++;
1427                 }
1428                 return -EINVAL;
1429         }
1430
1431         /* we're allowed to split an anonymous VMA but not a file-backed one */
1432         if (vma->vm_file) {
1433                 do {
1434                         if (start > vma->vm_start)
1435                                 return -EINVAL;
1436                         if (end == vma->vm_end)
1437                                 goto erase_whole_vma;
1438                         vma = vma_find(&vmi, end);
1439                 } while (vma);
1440                 return -EINVAL;
1441         } else {
1442                 /* the chunk must be a subset of the VMA found */
1443                 if (start == vma->vm_start && end == vma->vm_end)
1444                         goto erase_whole_vma;
1445                 if (start < vma->vm_start || end > vma->vm_end)
1446                         return -EINVAL;
1447                 if (offset_in_page(start))
1448                         return -EINVAL;
1449                 if (end != vma->vm_end && offset_in_page(end))
1450                         return -EINVAL;
1451                 if (start != vma->vm_start && end != vma->vm_end) {
1452                         ret = split_vma(&vmi, vma, start, 1);
1453                         if (ret < 0)
1454                                 return ret;
1455                 }
1456                 return vmi_shrink_vma(&vmi, vma, start, end);
1457         }
1458
1459 erase_whole_vma:
1460         if (delete_vma_from_mm(vma))
1461                 ret = -ENOMEM;
1462         else
1463                 delete_vma(mm, vma);
1464         return ret;
1465 }
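/*
 * In practice this means a file-backed mapping can only be taken down
 * whole, whereas an anonymous mapping may also be shrunk from either end
 * or have a hole punched in the middle via the split_vma() and
 * vmi_shrink_vma() paths above.
 */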
1466
1467 int vm_munmap(unsigned long addr, size_t len)
1468 {
1469         struct mm_struct *mm = current->mm;
1470         int ret;
1471
1472         mmap_write_lock(mm);
1473         ret = do_munmap(mm, addr, len, NULL);
1474         mmap_write_unlock(mm);
1475         return ret;
1476 }
1477 EXPORT_SYMBOL(vm_munmap);
1478
1479 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1480 {
1481         return vm_munmap(addr, len);
1482 }
1483
1484 /*
1485  * release all the mappings made in a process's VM space
1486  */
1487 void exit_mmap(struct mm_struct *mm)
1488 {
1489         VMA_ITERATOR(vmi, mm, 0);
1490         struct vm_area_struct *vma;
1491
1492         if (!mm)
1493                 return;
1494
1495         mm->total_vm = 0;
1496
1497         /*
1498          * Lock the mm so that locking assertions don't complain, even though
1499          * this is the only user of the mm
1500          */
1501         mmap_write_lock(mm);
1502         for_each_vma(vmi, vma) {
1503                 cleanup_vma_from_mm(vma);
1504                 delete_vma(mm, vma);
1505                 cond_resched();
1506         }
1507         __mt_destroy(&mm->mm_mt);
1508         mmap_write_unlock(mm);
1509 }
1510
1511 /*
1512  * expand (or shrink) an existing mapping, potentially moving it at the same
1513  * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1514  *
1515  * under NOMMU conditions, we only permit changing a mapping's size, and only
1516  * as long as it stays within the region allocated by do_mmap_private() and the
1517  * block is not shareable
1518  *
1519  * MREMAP_FIXED is not supported under NOMMU conditions
1520  */
1521 static unsigned long do_mremap(unsigned long addr,
1522                         unsigned long old_len, unsigned long new_len,
1523                         unsigned long flags, unsigned long new_addr)
1524 {
1525         struct vm_area_struct *vma;
1526
1527         /* insanity checks first */
1528         old_len = PAGE_ALIGN(old_len);
1529         new_len = PAGE_ALIGN(new_len);
1530         if (old_len == 0 || new_len == 0)
1531                 return (unsigned long) -EINVAL;
1532
1533         if (offset_in_page(addr))
1534                 return -EINVAL;
1535
1536         if (flags & MREMAP_FIXED && new_addr != addr)
1537                 return (unsigned long) -EINVAL;
1538
1539         vma = find_vma_exact(current->mm, addr, old_len);
1540         if (!vma)
1541                 return (unsigned long) -EINVAL;
1542
1543         if (vma->vm_end != vma->vm_start + old_len)
1544                 return (unsigned long) -EFAULT;
1545
1546         if (is_nommu_shared_mapping(vma->vm_flags))
1547                 return (unsigned long) -EPERM;
1548
1549         if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1550                 return (unsigned long) -ENOMEM;
1551
1552         /* all checks complete - do it */
1553         vma->vm_end = vma->vm_start + new_len;
1554         return vma->vm_start;
1555 }
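/*
 * For example, a mapping shrunk by an earlier mremap() call can be grown
 * back again, but never beyond the length recorded in its vm_region when
 * it was first mapped: only vma->vm_end is adjusted here, the backing
 * region and its pages are left untouched.
 */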
1556
1557 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1558                 unsigned long, new_len, unsigned long, flags,
1559                 unsigned long, new_addr)
1560 {
1561         unsigned long ret;
1562
1563         mmap_write_lock(current->mm);
1564         ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1565         mmap_write_unlock(current->mm);
1566         return ret;
1567 }
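/*
 * Example (hedged user-space sketch, not part of this file's code): under
 * NOMMU, mremap() can only resize a mapping in place.  Shrinking works, but
 * growing past the region originally set up by do_mmap_private() typically
 * fails with ENOMEM even if MREMAP_MAYMOVE is passed, since the mapping
 * cannot be moved.  The resize_demo() helper below is hypothetical.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *
 *	static void resize_demo(void)
 *	{
 *		// 8 KiB anonymous private mapping
 *		void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (p == MAP_FAILED)
 *			return;
 *
 *		// Shrinking in place is permitted and keeps the same address.
 *		void *q = mremap(p, 8192, 4096, 0);
 *		if (q == MAP_FAILED)
 *			return;
 *
 *		// Growing beyond the backing region is refused; expect
 *		// MAP_FAILED with errno usually set to ENOMEM on NOMMU.
 *		void *r = mremap(q, 4096, 65536, MREMAP_MAYMOVE);
 *		if (r == MAP_FAILED)
 *			perror("mremap");
 *	}
 */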
1568
1569 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1570                          unsigned int foll_flags)
1571 {
1572         return NULL;
1573 }
1574
1575 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1576                 unsigned long pfn, unsigned long size, pgprot_t prot)
1577 {
1578         if (addr != (pfn << PAGE_SHIFT))
1579                 return -EINVAL;
1580
1581         vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
1582         return 0;
1583 }
1584 EXPORT_SYMBOL(remap_pfn_range);
1585
1586 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
1587 {
1588         unsigned long pfn = start >> PAGE_SHIFT;
1589         unsigned long vm_len = vma->vm_end - vma->vm_start;
1590
1591         pfn += vma->vm_pgoff;
1592         return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
1593 }
1594 EXPORT_SYMBOL(vm_iomap_memory);
1595
1596 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1597                         unsigned long pgoff)
1598 {
1599         unsigned int size = vma->vm_end - vma->vm_start;
1600
1601         if (!(vma->vm_flags & VM_USERMAP))
1602                 return -EINVAL;
1603
1604         vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1605         vma->vm_end = vma->vm_start + size;
1606
1607         return 0;
1608 }
1609 EXPORT_SYMBOL(remap_vmalloc_range);
1610
1611 vm_fault_t filemap_fault(struct vm_fault *vmf)
1612 {
1613         BUG();
1614         return 0;
1615 }
1616 EXPORT_SYMBOL(filemap_fault);
1617
1618 vm_fault_t filemap_map_pages(struct vm_fault *vmf,
1619                 pgoff_t start_pgoff, pgoff_t end_pgoff)
1620 {
1621         BUG();
1622         return 0;
1623 }
1624 EXPORT_SYMBOL(filemap_map_pages);
1625
1626 static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
1627                               void *buf, int len, unsigned int gup_flags)
1628 {
1629         struct vm_area_struct *vma;
1630         int write = gup_flags & FOLL_WRITE;
1631
1632         if (mmap_read_lock_killable(mm))
1633                 return 0;
1634
1635         /* the access must start within one of the target process's mappings */
1636         vma = find_vma(mm, addr);
1637         if (vma) {
1638                 /* don't overrun this mapping */
1639                 if (addr + len >= vma->vm_end)
1640                         len = vma->vm_end - addr;
1641
1642                 /* only read or write mappings where it is permitted */
1643                 if (write && vma->vm_flags & VM_MAYWRITE)
1644                         copy_to_user_page(vma, NULL, addr,
1645                                          (void *) addr, buf, len);
1646                 else if (!write && vma->vm_flags & VM_MAYREAD)
1647                         copy_from_user_page(vma, NULL, addr,
1648                                             buf, (void *) addr, len);
1649                 else
1650                         len = 0;
1651         } else {
1652                 len = 0;
1653         }
1654
1655         mmap_read_unlock(mm);
1656
1657         return len;
1658 }
1659
1660 /**
1661  * access_remote_vm - access another process' address space
1662  * @mm:         the mm_struct of the target address space
1663  * @addr:       start address to access
1664  * @buf:        source or destination buffer
1665  * @len:        number of bytes to transfer
1666  * @gup_flags:  flags modifying lookup behaviour
1667  *
1668  * The caller must hold a reference on @mm.
1669  */
1670 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
1671                 void *buf, int len, unsigned int gup_flags)
1672 {
1673         return __access_remote_vm(mm, addr, buf, len, gup_flags);
1674 }
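/*
 * Example (hedged sketch): a caller that already holds a reference on the
 * target mm - e.g. one obtained via get_task_mm() - can copy a small chunk
 * of the remote address space like this.  peek_remote() is a hypothetical
 * helper shown only to illustrate the calling convention.
 *
 *	static int peek_remote(struct mm_struct *mm, unsigned long addr,
 *			       void *kbuf, int len)
 *	{
 *		// gup_flags of 0 requests a read; pass FOLL_WRITE to copy
 *		// kbuf into the target mapping instead.  The return value is
 *		// the number of bytes transferred, which may be short if the
 *		// mapping ends before addr + len.
 *		return access_remote_vm(mm, addr, kbuf, len, 0);
 *	}
 */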
1675
1676 /*
1677  * Access another process' address space.
1678  * - source/target buffer must be kernel space
1679  */
1680 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
1681                 unsigned int gup_flags)
1682 {
1683         struct mm_struct *mm;
1684
1685         if (addr + len < addr)
1686                 return 0;
1687
1688         mm = get_task_mm(tsk);
1689         if (!mm)
1690                 return 0;
1691
1692         len = __access_remote_vm(mm, addr, buf, len, gup_flags);
1693
1694         mmput(mm);
1695         return len;
1696 }
1697 EXPORT_SYMBOL_GPL(access_process_vm);
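/*
 * Example (hedged sketch): access_process_vm() is the task-based wrapper -
 * it takes and drops the mm reference itself, so the caller only needs a
 * task_struct.  read_task_bytes() below is a hypothetical helper.
 *
 *	static int read_task_bytes(struct task_struct *tsk, unsigned long addr,
 *				   void *kbuf, int len)
 *	{
 *		// Returns the number of bytes copied into kbuf; 0 means the
 *		// range wrapped, the task has no mm, or the area was not
 *		// readable.
 *		return access_process_vm(tsk, addr, kbuf, len, 0);
 *	}
 */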
1698
1699 /**
1700  * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1701  * @inode: The inode to check
1702  * @size: The current filesize of the inode
1703  * @newsize: The proposed filesize of the inode
1704  *
1705  * Check the shared mappings on an inode on behalf of a shrinking truncate to
1706  * make sure that any outstanding VMAs aren't broken and then shrink the
1707  * vm_regions that extend beyond the new size so that do_mmap() doesn't
1708  * automatically grant mappings that are too large.
1709  */
1710 int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1711                                 size_t newsize)
1712 {
1713         struct vm_area_struct *vma;
1714         struct vm_region *region;
1715         pgoff_t low, high;
1716         size_t r_size, r_top;
1717
1718         low = newsize >> PAGE_SHIFT;
1719         high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1720
1721         down_write(&nommu_region_sem);
1722         i_mmap_lock_read(inode->i_mapping);
1723
1724         /* search for VMAs that fall within the dead zone */
1725         vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
1726                 /* found one - only interested if it's shared out of the page
1727                  * cache */
1728                 if (vma->vm_flags & VM_SHARED) {
1729                         i_mmap_unlock_read(inode->i_mapping);
1730                         up_write(&nommu_region_sem);
1731                         return -ETXTBSY; /* not quite true, but near enough */
1732                 }
1733         }
1734
1735         /* reduce any regions that overlap the dead zone - if any exist, they
1736          * will be pointed to by VMAs that don't overlap the dead zone
1737          *
1738          * we don't check for any regions that start beyond the EOF as there
1739          * shouldn't be any
1740          */
1741         vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) {
1742                 if (!(vma->vm_flags & VM_SHARED))
1743                         continue;
1744
1745                 region = vma->vm_region;
1746                 r_size = region->vm_top - region->vm_start;
1747                 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1748
1749                 if (r_top > newsize) {
1750                         region->vm_top -= r_top - newsize;
1751                         if (region->vm_end > region->vm_top)
1752                                 region->vm_end = region->vm_top;
1753                 }
1754         }
1755
1756         i_mmap_unlock_read(inode->i_mapping);
1757         up_write(&nommu_region_sem);
1758         return 0;
1759 }
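/*
 * Example (hedged sketch of a caller): a NOMMU filesystem shrinking a file
 * would be expected to consult this helper before truncating the page
 * cache, roughly as below.  shrink_backing_file() is hypothetical and
 * omits the filesystem's own locking.
 *
 *	static int shrink_backing_file(struct inode *inode, loff_t newsize)
 *	{
 *		int ret;
 *
 *		// Refuse the truncate if a shared mapping still covers the
 *		// doomed tail, otherwise trim the overlapping vm_regions.
 *		ret = nommu_shrink_inode_mappings(inode, i_size_read(inode),
 *						  newsize);
 *		if (ret < 0)
 *			return ret;
 *
 *		truncate_setsize(inode, newsize);
 *		return 0;
 *	}
 */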
1760
1761 /*
1762  * Initialise sysctl_user_reserve_kbytes.
1763  *
1764  * This is intended to prevent a user from starting a single memory hogging
1765  * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
1766  * mode.
1767  *
1768  * The default value is min(3% of free memory, 128MB)
1769  * 128MB is enough to recover with sshd/login, bash, and top/kill.
1770  */
1771 static int __meminit init_user_reserve(void)
1772 {
1773         unsigned long free_kbytes;
1774
1775         free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
1776
1777         sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
1778         return 0;
1779 }
1780 subsys_initcall(init_user_reserve);
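/*
 * Worked example: free_kbytes / 32 is roughly 3% (3.125%) of free memory,
 * and 1UL << 17 kB is 131072 kB = 128 MB.  A system with 2 GB free
 * (2097152 kB) therefore reserves min(65536 kB, 131072 kB) = 64 MB, while
 * anything with more than 4 GB free is capped at the 128 MB ceiling.
 */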
1781
1782 /*
1783  * Initialise sysctl_admin_reserve_kbytes.
1784  *
1785  * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
1786  * to log in and kill a memory hogging process.
1787  *
1788  * Systems with more than 256MB will reserve 8MB, enough to recover
1789  * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
1790  * only reserve 3% of free pages by default.
1791  */
1792 static int __meminit init_admin_reserve(void)
1793 {
1794         unsigned long free_kbytes;
1795
1796         free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));
1797
1798         sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
1799         return 0;
1800 }
1801 subsys_initcall(init_admin_reserve);
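/*
 * Worked example: 1UL << 13 kB is 8192 kB = 8 MB, and free_kbytes / 32
 * reaches that cap exactly when 256 MB (262144 kB) is free.  A board with
 * 64 MB free (65536 kB) reserves 65536 / 32 = 2048 kB = 2 MB; anything
 * above 256 MB sticks to the 8 MB ceiling.
 */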