#include <linux/spinlock.h>
#include <linux/mm.h>
+#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
}
page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
+ if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+ /*
+ * Only return device mapping pages in the FOLL_GET case since
+ * they are only valid while holding the pgmap reference.
+ */
+ pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+ if (pgmap)
+ page = pte_page(pte);
+ else
+ goto no_page;
+ } else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
}
}
- if (flags & FOLL_GET)
- get_page_foll(page);
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ int ret;
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
+ if (flags & FOLL_GET) {
+ get_page(page);
+
+ /* drop the pgmap reference now that we hold the page */
+ if (pgmap) {
+ put_dev_pagemap(pgmap);
+ pgmap = NULL;
+ }
+ }
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
+ if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
+ page = follow_devmap_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (likely(!pmd_trans_huge(*pmd)))
+ return follow_page_pte(vma, address, pmd, flags);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ if (flags & FOLL_SPLIT) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ } else {
+ get_page(page);
spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags);
}
- return follow_page_pte(vma, address, pmd, flags);
+
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
+ * does not allow retry
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
* The main difference with get_user_pages() is that this function will
* unconditionally call handle_mm_fault() which will in turn perform all the
* necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
*
* This is important for some architectures where those bits also gate the
* access permission to the page because they are maintained in software. On
* such architectures, gup() will not be enough to make a subsequent access
* succeed.
*
- * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_sem. So it has not the
+ * same semantics wrt the @mm->mmap_sem as does filemap_fault().
*/
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long address, unsigned int fault_flags)
+ unsigned long address, unsigned int fault_flags,
+ bool *unlocked)
{
struct vm_area_struct *vma;
vm_flags_t vm_flags;
- int ret;
+ int ret, major = 0;
+ if (unlocked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+
+retry:
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
+ major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return -ENOMEM;
return -EFAULT;
BUG();
}
+
+ if (ret & VM_FAULT_RETRY) {
+ down_read(&mm->mmap_sem);
+ if (!(fault_flags & FAULT_FLAG_TRIED)) {
+ *unlocked = true;
+ fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ fault_flags |= FAULT_FLAG_TRIED;
+ goto retry;
+ }
+ }
+
if (tsk) {
- if (ret & VM_FAULT_MAJOR)
+ if (major)
tsk->maj_flt++;
else
tsk->min_flt++;
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
gup_flags &= ~FOLL_POPULATE;
-
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
* pages containing page tables.
*
- * *) THP splits will broadcast an IPI, this can be achieved by overriding
- * pmdp_splitting_flush.
- *
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
- struct page *page;
+ struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ head = compound_head(page);
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
+ put_page(head);
goto pte_unmap;
}
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
return 0;
}
- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pud_write(orig))
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page, *tail;
+ struct page *head, *page;
if (write && !pgd_write(orig))
return 0;
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {