/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  This work is licensed under the terms of the GNU GPL, version 2. See
 *  the COPYING file in the top-level directory.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"

static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	void *page_kaddr;
	int ret;
	struct page *page;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			ret = -EFAULT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	put_page(page);
	goto out;
}

static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that the pmd is not necessarily missing here: *pmd may
	 * already be established, and in turn it may also be a
	 * trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_sem held, it will release mmap_sem before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we cannot reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;
		/*
		 * Only allow __mcopy_atomic_hugetlb on userfaultfd
		 * registered ranges.
		 */
		if (!dst_vma->vm_userfaultfd_ctx.ctx)
			goto out_unlock;

		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	h = hstate_vma(dst_vma);

	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
						idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -EFAULT)) {
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * cannot call restore_reserve_on_error now as it would
		 * require holding mmap_sem.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation a reservation entry is added
		 * to the reservation map, and PagePrivate will not be set.
		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked
		 * a reserved page.  In this case, set PagePrivate so that the
		 * global reserve count will be incremented to match the
		 * reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		goto out_unlock;
	/*
	 * Be strict and only allow __mcopy_atomic on userfaultfd
	 * registered ranges to prevent userland errors going
	 * unnoticed. As far as the VM consistency is concerned, it
	 * would be perfectly safe to remove this check, but there's
	 * no useful usage for __mcopy_atomic outside of userfaultfd
	 * registered ranges. This is after all why these are ioctls
	 * belonging to the userfaultfd and not syscalls.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
					      src_start, len, zeropage);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved in the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (vma_is_anonymous(dst_vma) && unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		if (vma_is_anonymous(dst_vma)) {
			if (!zeropage)
				err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
						       dst_addr, src_addr,
						       &page);
			else
				err = mfill_zeropage_pte(dst_mm, dst_pmd,
							 dst_vma, dst_addr);
		} else {
			err = -EINVAL; /* if zeropage is true return -EINVAL */
			if (likely(!zeropage))
				err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
							     dst_vma, dst_addr,
							     src_addr, &page);
		}

		cond_resched();

		if (unlikely(err == -EFAULT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len)
{
	return __mcopy_atomic(dst_mm, start, 0, len, true);
}
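
/*
 * Illustrative usage sketch: mcopy_atomic() and mfill_zeropage() are normally
 * reached from fs/userfaultfd.c via the UFFDIO_COPY and UFFDIO_ZEROPAGE
 * ioctls.  A minimal userspace caller, assuming a userfaultfd descriptor has
 * already been created with userfaultfd(2) and the destination range has been
 * registered with UFFDIO_REGISTER, looks roughly like:
 *
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *
 *	int uffd_copy(int uffd, void *src_page, unsigned long dst_addr,
 *		      unsigned long page_size)
 *	{
 *		struct uffdio_copy copy = {
 *			.dst  = dst_addr,
 *			.src  = (unsigned long) src_page,
 *			.len  = page_size,
 *			.mode = 0,
 *		};
 *
 *		// resolved in the kernel by mcopy_atomic()
 *		return ioctl(uffd, UFFDIO_COPY, &copy);
 *	}
 *
 *	int uffd_zeropage(int uffd, unsigned long dst_addr,
 *			  unsigned long page_size)
 *	{
 *		struct uffdio_zeropage zero = {
 *			.range = { .start = dst_addr, .len = page_size },
 *			.mode  = 0,
 *		};
 *
 *		// resolved in the kernel by mfill_zeropage()
 *		return ioctl(uffd, UFFDIO_ZEROPAGE, &zero);
 *	}
 *
 * The helper names uffd_copy and uffd_zeropage above are hypothetical; only
 * the uffdio_copy/uffdio_zeropage structures and the ioctl numbers come from
 * the userfaultfd ABI.
 */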