Merge tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm...
author Linus Torvalds <[email protected]>
Sun, 26 Aug 2018 01:43:59 +0000 (18:43 -0700)
committer Linus Torvalds <[email protected]>
Sun, 26 Aug 2018 01:43:59 +0000 (18:43 -0700)
Pull libnvdimm memory-failure update from Dave Jiang:
 "As it stands, memory_failure() gets thoroughly confused by dev_pagemap
  backed mappings. The recovery code has specific enabling for several
  possible page states and needs new enabling to handle poison in dax
  mappings.

  In order to support reliable reverse mapping of user space addresses:

   1/ Add new locking in the memory_failure() rmap path to prevent races
      that would typically be handled by the page lock.

   2/ Since dev_pagemap pages are hidden from the page allocator and the
      "compound page" accounting machinery, add a mechanism to determine
      the size of the mapping that encompasses a given poisoned pfn.

   3/ Given pmem errors can be repaired, change the speculatively
      accessed poison protection, mce_unmap_kpfn(), to be reversible and
      otherwise allow ongoing access from the kernel.

  A side effect of this enabling is that MADV_HWPOISON becomes usable
  for dax mappings, however the primary motivation is to allow the
  system to survive userspace consumption of hardware-poison via dax.
  Specifically the current behavior is:

     mce: Uncorrected hardware memory error in user-access at af34214200
     {1}[Hardware Error]: It has been corrected by h/w and requires no further action
     mce: [Hardware Error]: Machine check events logged
     {1}[Hardware Error]: event severity: corrected
     Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
     [..]
     Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
     mce: Memory error not recovered
     <reboot>

  ...and with these changes:

     Injecting memory failure for pfn 0x20cb00 at process virtual address 0x7f763dd00000
     Memory failure: 0x20cb00: Killing dax-pmd:5421 due to hardware memory corruption
     Memory failure: 0x20cb00: recovery action for dax page: Recovered

  Given all the cross dependencies I propose taking this through
  nvdimm.git with acks from Naoya, x86/core, x86/RAS, and of course dax
  folks"
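
The survivable outcome shown in that log can be approximated from user space via MADV_HWPOISON once these patches are in place. The program below is a minimal sketch, not part of this series: it assumes root privileges, CONFIG_MEMORY_FAILURE=y, and a file on a DAX-mounted filesystem at the hypothetical path /mnt/pmem/victim, and it only illustrates that the blast radius is now a SIGBUS to the consuming process rather than the reboot in the "before" log.

/*
 * Hedged illustration only: poison one page of a dax mapping and let
 * memory_failure() deal with it.  Assumes root privileges,
 * CONFIG_MEMORY_FAILURE=y, and a file on a DAX-mounted filesystem at
 * the hypothetical path /mnt/pmem/victim.  Depending on kernel policy
 * the SIGBUS may arrive during madvise() or on the later access; the
 * handler below covers both cases.
 */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
        (void)sig; (void)ctx;
        fprintf(stderr, "SIGBUS at %p (si_code %d), process survives\n",
                si->si_addr, si->si_code);
        _exit(0);
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = sigbus_handler,
                                .sa_flags = SA_SIGINFO };
        size_t len = 2UL << 20;         /* assume one 2 MiB dax extent */
        int fd = open("/mnt/pmem/victim", O_RDWR | O_CREAT, 0600);
        char *p;

        if (fd < 0 || sigaction(SIGBUS, &sa, NULL) || ftruncate(fd, len))
                return 1;
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, len);              /* populate the dax mapping */

        /* hand the first page to memory_failure() as if hardware flagged it */
        if (madvise(p, getpagesize(), MADV_HWPOISON))
                perror("madvise(MADV_HWPOISON)");

        p[0] = 1;       /* touching poison should now SIGBUS, not reboot */
        fprintf(stderr, "no SIGBUS delivered; is poison handling enabled?\n");
        return 0;
}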

* tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm:
  libnvdimm, pmem: Restore page attributes when clearing errors
  x86/memory_failure: Introduce {set, clear}_mce_nospec()
  x86/mm/pat: Prepare {reserve, free}_memtype() for "decoy" addresses
  mm, memory_failure: Teach memory_failure() about dev_pagemap pages
  filesystem-dax: Introduce dax_lock_mapping_entry()
  mm, memory_failure: Collect mapping size in collect_procs()
  mm, madvise_inject_error: Let memory_failure() optionally take a page reference
  mm, dev_pagemap: Do not clear ->mapping on final put
  mm, madvise_inject_error: Disable MADV_SOFT_OFFLINE for ZONE_DEVICE pages
  filesystem-dax: Set page->index
  device-dax: Set page->index
  device-dax: Enable page_mapping()
  device-dax: Convert to vmf_insert_mixed and vm_fault_t

arch/x86/include/asm/set_memory.h
arch/x86/kernel/cpu/mcheck/mce.c
drivers/dax/device.c
drivers/nvdimm/pmem.c
fs/dax.c
include/linux/huge_mm.h
include/linux/mm.h
kernel/memremap.c
mm/hmm.c
mm/huge_memory.c
mm/memory-failure.c

diff --combined arch/x86/include/asm/set_memory.h
index 34cffcef7375dfa15cb30832972aa3d71e86d678,cf5e9124b45ea17258d0654cbdeb47371cda6a7b..07a25753e85c5cd53b2613a71db91862fa31684f
@@@ -46,7 -46,6 +46,7 @@@ int set_memory_np(unsigned long addr, i
  int set_memory_4k(unsigned long addr, int numpages);
  int set_memory_encrypted(unsigned long addr, int numpages);
  int set_memory_decrypted(unsigned long addr, int numpages);
 +int set_memory_np_noalias(unsigned long addr, int numpages);
  
  int set_memory_array_uc(unsigned long *addr, int addrinarray);
  int set_memory_array_wc(unsigned long *addr, int addrinarray);
@@@ -89,4 -88,46 +89,46 @@@ extern int kernel_set_to_readonly
  void set_kernel_text_rw(void);
  void set_kernel_text_ro(void);
  
+ #ifdef CONFIG_X86_64
+ static inline int set_mce_nospec(unsigned long pfn)
+ {
+       unsigned long decoy_addr;
+       int rc;
+       /*
+        * Mark the linear address as UC to make sure we don't log more
+        * errors because of speculative access to the page.
+        * We would like to just call:
+        *      set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+        * but doing that would radically increase the odds of a
+        * speculative access to the poison page because we'd have
+        * the virtual address of the kernel 1:1 mapping sitting
+        * around in registers.
+        * Instead we get tricky.  We create a non-canonical address
+        * that looks just like the one we want, but has bit 63 flipped.
+        * This relies on set_memory_uc() properly sanitizing any __pa()
+        * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+        */
+       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+       rc = set_memory_uc(decoy_addr, 1);
+       if (rc)
+               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+       return rc;
+ }
+ #define set_mce_nospec set_mce_nospec
+ /* Restore full speculative operation to the pfn. */
+ static inline int clear_mce_nospec(unsigned long pfn)
+ {
+       return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+ }
+ #define clear_mce_nospec clear_mce_nospec
+ #else
+ /*
+  * Few people would run a 32-bit kernel on a machine that supports
+  * recoverable errors because they have too much memory to boot 32-bit.
+  */
+ #endif
  #endif /* _ASM_X86_SET_MEMORY_H */
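
The decoy-address comment above deserves a small worked example. The program below is a hedged, user-space-only illustration of the arithmetic: the direct-map base and the 52-bit physical mask are assumptions (non-KASLR x86_64 defaults), since the kernel derives the real values at boot.

/*
 * Hedged, user-space-only illustration of the decoy-address arithmetic
 * described in the comment above.  PAGE_OFFSET and the physical mask
 * below are assumptions (non-KASLR x86_64 defaults); the kernel derives
 * the real values at boot, so treat the constants as examples only.
 */
#include <inttypes.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define BIT63           (1ULL << 63)
#define PAGE_OFFSET     0xffff888000000000ULL   /* assumed direct-map base */
#define PHYSICAL_MASK   ((1ULL << 52) - 1)      /* assumed __PHYSICAL_MASK */

int main(void)
{
        uint64_t pfn = 0x20cb00;        /* pfn from the injection log above */
        uint64_t kaddr = PAGE_OFFSET + (pfn << PAGE_SHIFT);
        uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT63);
        /* __pa() on the direct map is "addr - PAGE_OFFSET", then masked */
        uint64_t pa = (decoy - PAGE_OFFSET) & PHYSICAL_MASK;

        printf("1:1 address we avoid holding in registers: %#" PRIx64 "\n", kaddr);
        printf("non-canonical decoy address:               %#" PRIx64 "\n", decoy);
        printf("physical address recovered from the decoy: %#" PRIx64 "\n", pa);
        printf("expected physical address (pfn << 12):     %#" PRIx64 "\n",
               pfn << PAGE_SHIFT);
        return 0;
}

Flipping bit 63 keeps the real 1:1 virtual address of the poisoned page out of registers, reducing the odds of speculative access, while the __PHYSICAL_MASK sanitization noted in the comment still resolves the decoy to the intended pfn. Because set_memory_uc()/set_memory_wb() only change page attributes, the protection is reversible, unlike the old set_memory_np() unmapping.
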
diff --combined arch/x86/kernel/cpu/mcheck/mce.c
index 4b767284b7f5e59e529c5c7e1ae90174d7d23654,42a061ce1f5d350b625ac19aec5bc5dcd8aa1771..953b3ce92dccf0f684ce90e3a27015c99e692470
@@@ -42,6 -42,7 +42,7 @@@
  #include <linux/irq_work.h>
  #include <linux/export.h>
  #include <linux/jump_label.h>
+ #include <linux/set_memory.h>
  
  #include <asm/intel-family.h>
  #include <asm/processor.h>
@@@ -50,7 -51,6 +51,6 @@@
  #include <asm/mce.h>
  #include <asm/msr.h>
  #include <asm/reboot.h>
- #include <asm/set_memory.h>
  
  #include "mce-internal.h"
  
@@@ -108,10 -108,6 +108,6 @@@ static struct irq_work mce_irq_work
  
  static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  
- #ifndef mce_unmap_kpfn
- static void mce_unmap_kpfn(unsigned long pfn);
- #endif
  /*
   * CPU/chipset specific EDAC code can register a notifier call here to print
   * MCE errors in a human-readable form.
@@@ -123,8 -119,8 +119,8 @@@ void mce_setup(struct mce *m
  {
        memset(m, 0, sizeof(struct mce));
        m->cpu = m->extcpu = smp_processor_id();
 -      /* We hope get_seconds stays lockless */
 -      m->time = get_seconds();
 +      /* need the internal __ version to avoid deadlocks */
 +      m->time = __ktime_get_real_seconds();
        m->cpuvendor = boot_cpu_data.x86_vendor;
        m->cpuid = cpuid_eax(1);
        m->socketid = cpu_data(m->extcpu).phys_proc_id;
@@@ -602,7 -598,7 +598,7 @@@ static int srao_decode_notifier(struct 
        if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
                pfn = mce->addr >> PAGE_SHIFT;
                if (!memory_failure(pfn, 0))
-                       mce_unmap_kpfn(pfn);
+                       set_mce_nospec(pfn);
        }
  
        return NOTIFY_OK;
@@@ -1072,133 -1068,10 +1068,105 @@@ static int do_memory_failure(struct mc
        if (ret)
                pr_err("Memory error not recovered");
        else
-               mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
+               set_mce_nospec(m->addr >> PAGE_SHIFT);
        return ret;
  }
  
- #ifndef mce_unmap_kpfn
- static void mce_unmap_kpfn(unsigned long pfn)
- {
-       unsigned long decoy_addr;
-       /*
-        * Unmap this page from the kernel 1:1 mappings to make sure
-        * we don't log more errors because of speculative access to
-        * the page.
-        * We would like to just call:
-        *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
-        * but doing that would radically increase the odds of a
-        * speculative access to the poison page because we'd have
-        * the virtual address of the kernel 1:1 mapping sitting
-        * around in registers.
-        * Instead we get tricky.  We create a non-canonical address
-        * that looks just like the one we want, but has bit 63 flipped.
-        * This relies on set_memory_np() not checking whether we passed
-        * a legal address.
-        */
-       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-       if (set_memory_np(decoy_addr, 1))
-               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
- }
- #endif
 +
 +/*
 + * Cases where we avoid rendezvous handler timeout:
 + * 1) If this CPU is offline.
 + *
 + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 + *  skip those CPUs which remain looping in the 1st kernel - see
 + *  crash_nmi_callback().
 + *
 + * Note: there still is a small window between kexec-ing and the new,
 + * kdump kernel establishing a new #MC handler where a broadcasted MCE
 + * might not get handled properly.
 + */
 +static bool __mc_check_crashing_cpu(int cpu)
 +{
 +      if (cpu_is_offline(cpu) ||
 +          (crashing_cpu != -1 && crashing_cpu != cpu)) {
 +              u64 mcgstatus;
 +
 +              mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 +              if (mcgstatus & MCG_STATUS_RIPV) {
 +                      mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 +                      return true;
 +              }
 +      }
 +      return false;
 +}
 +
 +static void __mc_scan_banks(struct mce *m, struct mce *final,
 +                          unsigned long *toclear, unsigned long *valid_banks,
 +                          int no_way_out, int *worst)
 +{
 +      struct mca_config *cfg = &mca_cfg;
 +      int severity, i;
 +
 +      for (i = 0; i < cfg->banks; i++) {
 +              __clear_bit(i, toclear);
 +              if (!test_bit(i, valid_banks))
 +                      continue;
 +
 +              if (!mce_banks[i].ctl)
 +                      continue;
 +
 +              m->misc = 0;
 +              m->addr = 0;
 +              m->bank = i;
 +
 +              m->status = mce_rdmsrl(msr_ops.status(i));
 +              if (!(m->status & MCI_STATUS_VAL))
 +                      continue;
 +
 +              /*
 +               * Corrected or non-signaled errors are handled by
 +               * machine_check_poll(). Leave them alone, unless this panics.
 +               */
 +              if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 +                      !no_way_out)
 +                      continue;
 +
 +              /* Set taint even when machine check was not enabled. */
 +              add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 +
 +              severity = mce_severity(m, cfg->tolerant, NULL, true);
 +
 +              /*
 +               * When machine check was for corrected/deferred handler don't
 +               * touch, unless we're panicking.
 +               */
 +              if ((severity == MCE_KEEP_SEVERITY ||
 +                   severity == MCE_UCNA_SEVERITY) && !no_way_out)
 +                      continue;
 +
 +              __set_bit(i, toclear);
 +
 +              /* Machine check event was not enabled. Clear, but ignore. */
 +              if (severity == MCE_NO_SEVERITY)
 +                      continue;
 +
 +              mce_read_aux(m, i);
 +
 +              /* assuming valid severity level != 0 */
 +              m->severity = severity;
 +
 +              mce_log(m);
 +
 +              if (severity > *worst) {
 +                      *final = *m;
 +                      *worst = severity;
 +              }
 +      }
 +
 +      /* mce_clear_state will clear *final, save locally for use later */
 +      *m = *final;
 +}
 +
  /*
   * The actual machine check handler. This only handles real
   * exceptions when something got corrupted coming in through int 18.
   */
  void do_machine_check(struct pt_regs *regs, long error_code)
  {
 +      DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 +      DECLARE_BITMAP(toclear, MAX_NR_BANKS);
        struct mca_config *cfg = &mca_cfg;
 +      int cpu = smp_processor_id();
 +      char *msg = "Unknown";
        struct mce m, *final;
 -      int i;
        int worst = 0;
 -      int severity;
  
        /*
         * Establish sequential order between the CPUs entering the machine
         * check handler.
         */
        int order = -1;
 +
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
 +
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
 -      DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 -      DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 -      char *msg = "Unknown";
  
        /*
         * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
         * on Intel.
         */
        int lmce = 1;
 -      int cpu = smp_processor_id();
 -
 -      /*
 -       * Cases where we avoid rendezvous handler timeout:
 -       * 1) If this CPU is offline.
 -       *
 -       * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 -       *  skip those CPUs which remain looping in the 1st kernel - see
 -       *  crash_nmi_callback().
 -       *
 -       * Note: there still is a small window between kexec-ing and the new,
 -       * kdump kernel establishing a new #MC handler where a broadcasted MCE
 -       * might not get handled properly.
 -       */
 -      if (cpu_is_offline(cpu) ||
 -          (crashing_cpu != -1 && crashing_cpu != cpu)) {
 -              u64 mcgstatus;
  
 -              mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 -              if (mcgstatus & MCG_STATUS_RIPV) {
 -                      mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 -                      return;
 -              }
 -      }
 +      if (__mc_check_crashing_cpu(cpu))
 +              return;
  
        ist_enter(regs);
  
        this_cpu_inc(mce_exception_count);
  
 -      if (!cfg->banks)
 -              goto out;
 -
        mce_gather_info(&m, regs);
        m.tsc = rdtsc();
  
                order = mce_start(&no_way_out);
        }
  
 -      for (i = 0; i < cfg->banks; i++) {
 -              __clear_bit(i, toclear);
 -              if (!test_bit(i, valid_banks))
 -                      continue;
 -              if (!mce_banks[i].ctl)
 -                      continue;
 -
 -              m.misc = 0;
 -              m.addr = 0;
 -              m.bank = i;
 -
 -              m.status = mce_rdmsrl(msr_ops.status(i));
 -              if ((m.status & MCI_STATUS_VAL) == 0)
 -                      continue;
 -
 -              /*
 -               * Non uncorrected or non signaled errors are handled by
 -               * machine_check_poll. Leave them alone, unless this panics.
 -               */
 -              if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 -                      !no_way_out)
 -                      continue;
 -
 -              /*
 -               * Set taint even when machine check was not enabled.
 -               */
 -              add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 -
 -              severity = mce_severity(&m, cfg->tolerant, NULL, true);
 -
 -              /*
 -               * When machine check was for corrected/deferred handler don't
 -               * touch, unless we're panicing.
 -               */
 -              if ((severity == MCE_KEEP_SEVERITY ||
 -                   severity == MCE_UCNA_SEVERITY) && !no_way_out)
 -                      continue;
 -              __set_bit(i, toclear);
 -              if (severity == MCE_NO_SEVERITY) {
 -                      /*
 -                       * Machine check event was not enabled. Clear, but
 -                       * ignore.
 -                       */
 -                      continue;
 -              }
 -
 -              mce_read_aux(&m, i);
 -
 -              /* assuming valid severity level != 0 */
 -              m.severity = severity;
 -
 -              mce_log(&m);
 -
 -              if (severity > worst) {
 -                      *final = m;
 -                      worst = severity;
 -              }
 -      }
 -
 -      /* mce_clear_state will clear *final, save locally for use later */
 -      m = *final;
 +      __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
  
        if (!no_way_out)
                mce_clear_state(toclear);
        if (worst > 0)
                mce_report_event(regs);
        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 -out:
 +
        sync_core();
  
        if (worst != MCE_AR_SEVERITY && !kill_it)
@@@ -2177,6 -2133,9 +2145,6 @@@ static ssize_t store_int_with_restart(s
        if (check_interval == old_check_interval)
                return ret;
  
 -      if (check_interval < 1)
 -              check_interval = 1;
 -
        mutex_lock(&mce_sysfs_mutex);
        mce_restart();
        mutex_unlock(&mce_sysfs_mutex);
diff --combined drivers/dax/device.c
index 0a2acd7993f0b3a561cf91beb0b4200838f601f4,361a1108959168b13adb0bd08e90ce61581a43df..6fd46083e62958eea61716225cd9f6c7fe11e748
@@@ -189,16 -189,14 +189,16 @@@ static int check_vma(struct dev_dax *de
  
        /* prevent private mappings from being established */
        if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
 -              dev_info(dev, "%s: %s: fail, attempted private mapping\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, attempted private mapping\n",
                                current->comm, func);
                return -EINVAL;
        }
  
        mask = dax_region->align - 1;
        if (vma->vm_start & mask || vma->vm_end & mask) {
 -              dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
                                current->comm, func, vma->vm_start, vma->vm_end,
                                mask);
                return -EINVAL;
  
        if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
                        && (vma->vm_flags & VM_DONTCOPY) == 0) {
 -              dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, dax range requires MADV_DONTFORK\n",
                                current->comm, func);
                return -EINVAL;
        }
  
        if (!vma_is_dax(vma)) {
 -              dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, vma is not DAX capable\n",
                                current->comm, func);
                return -EINVAL;
        }
@@@ -248,13 -244,12 +248,12 @@@ __weak phys_addr_t dax_pgoff_to_phys(st
        return -1;
  }
  
- static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        struct device *dev = &dev_dax->dev;
        struct dax_region *dax_region;
-       int rc = VM_FAULT_SIGBUS;
        phys_addr_t phys;
-       pfn_t pfn;
        unsigned int fault_size = PAGE_SIZE;
  
        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;
        }
  
-       pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-       rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
-       if (rc == -ENOMEM)
-               return VM_FAULT_OOM;
-       if (rc < 0 && rc != -EBUSY)
-               return VM_FAULT_SIGBUS;
+       *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
  
-       return VM_FAULT_NOPAGE;
+       return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
  }
  
- static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        struct device *dev = &dev_dax->dev;
        struct dax_region *dax_region;
        phys_addr_t phys;
        pgoff_t pgoff;
-       pfn_t pfn;
        unsigned int fault_size = PMD_SIZE;
  
        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;
        }
  
-       pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+       *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
  
-       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
+       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
                        vmf->flags & FAULT_FLAG_WRITE);
  }
  
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        unsigned long pud_addr = vmf->address & PUD_MASK;
        struct device *dev = &dev_dax->dev;
        struct dax_region *dax_region;
        phys_addr_t phys;
        pgoff_t pgoff;
-       pfn_t pfn;
        unsigned int fault_size = PUD_SIZE;
  
  
                return VM_FAULT_SIGBUS;
        }
  
-       pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+       *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
  
-       return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
+       return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
                        vmf->flags & FAULT_FLAG_WRITE);
  }
  #else
- static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        return VM_FAULT_FALLBACK;
  }
  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  
- static int dev_dax_huge_fault(struct vm_fault *vmf,
+ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size)
  {
-       int rc, id;
        struct file *filp = vmf->vma->vm_file;
+       unsigned long fault_size;
+       int rc, id;
+       pfn_t pfn;
        struct dev_dax *dev_dax = filp->private_data;
  
        dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
        id = dax_read_lock();
        switch (pe_size) {
        case PE_SIZE_PTE:
-               rc = __dev_dax_pte_fault(dev_dax, vmf);
+               fault_size = PAGE_SIZE;
+               rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
                break;
        case PE_SIZE_PMD:
-               rc = __dev_dax_pmd_fault(dev_dax, vmf);
+               fault_size = PMD_SIZE;
+               rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
                break;
        case PE_SIZE_PUD:
-               rc = __dev_dax_pud_fault(dev_dax, vmf);
+               fault_size = PUD_SIZE;
+               rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
                break;
        default:
                rc = VM_FAULT_SIGBUS;
        }
+       if (rc == VM_FAULT_NOPAGE) {
+               unsigned long i;
+               pgoff_t pgoff;
+               /*
+                * In the device-dax case the only possibility for a
+                * VM_FAULT_NOPAGE result is when device-dax capacity is
+                * mapped. No need to consider the zero page, or racing
+                * conflicting mappings.
+                */
+               pgoff = linear_page_index(vmf->vma, vmf->address
+                               & ~(fault_size - 1));
+               for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+                       struct page *page;
+                       page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+                       if (page->mapping)
+                               continue;
+                       page->mapping = filp->f_mapping;
+                       page->index = pgoff + i;
+               }
+       }
        dax_read_unlock(id);
  
        return rc;
  }
  
- static int dev_dax_fault(struct vm_fault *vmf)
+ static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
  {
        return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
  }
@@@ -474,7 -491,7 +495,7 @@@ static int dax_mmap(struct file *filp, 
                return rc;
  
        vma->vm_ops = &dax_vm_ops;
 -      vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
 +      vma->vm_flags |= VM_HUGEPAGE;
        return 0;
  }
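
The fault-handler change above is exercised by any ordinary device-dax mapping; the sketch below shows roughly what such a mapping looks like from user space. The device path /dev/dax0.0 and the 2 MiB alignment are assumptions, not part of this series.

/*
 * Minimal device-dax mapping sketch, illustration only.  The device
 * path /dev/dax0.0 and the 2 MiB region alignment are assumptions;
 * check_vma() above rejects private or unaligned mappings, so the
 * mapping must be MAP_SHARED with an alignment-sized length.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 2UL << 20;         /* one assumed 2 MiB alignment unit */
        int fd = open("/dev/dax0.0", O_RDWR);
        char *p;

        if (fd < 0) {
                perror("open /dev/dax0.0");
                return 1;
        }
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /*
         * First touch faults in device-dax capacity; with the change
         * above the fault handler also records page->mapping and
         * page->index for every page in the faulted range, which is
         * what lets memory_failure() later attribute poison to this
         * process.
         */
        memset(p, 0, len);
        munmap(p, len);
        close(fd);
        return 0;
}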
  
diff --combined drivers/nvdimm/pmem.c
index c236498676964fd147df293b0cec30880b07cf93,55c7a69751d39306659e906af9b0c49fb344a042..6071e2942053c903564d6f08f278d3735a619308
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/hdreg.h>
  #include <linux/init.h>
  #include <linux/platform_device.h>
+ #include <linux/set_memory.h>
  #include <linux/module.h>
  #include <linux/moduleparam.h>
  #include <linux/badblocks.h>
@@@ -51,6 -52,30 +52,30 @@@ static struct nd_region *to_region(stru
        return to_nd_region(to_dev(pmem)->parent);
  }
  
+ static void hwpoison_clear(struct pmem_device *pmem,
+               phys_addr_t phys, unsigned int len)
+ {
+       unsigned long pfn_start, pfn_end, pfn;
+       /* only pmem in the linear map supports HWPoison */
+       if (is_vmalloc_addr(pmem->virt_addr))
+               return;
+       pfn_start = PHYS_PFN(phys);
+       pfn_end = pfn_start + PHYS_PFN(len);
+       for (pfn = pfn_start; pfn < pfn_end; pfn++) {
+               struct page *page = pfn_to_page(pfn);
+               /*
+                * Note, no need to hold a get_dev_pagemap() reference
+                * here since we're in the driver I/O path and
+                * outstanding I/O requests pin the dev_pagemap.
+                */
+               if (test_and_clear_pmem_poison(page))
+                       clear_mce_nospec(pfn);
+       }
+ }
  static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
                phys_addr_t offset, unsigned int len)
  {
@@@ -65,6 -90,7 +90,7 @@@
        if (cleared < len)
                rc = BLK_STS_IOERR;
        if (cleared > 0 && cleared / 512) {
+               hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
                cleared /= 512;
                dev_dbg(dev, "%#llx clear %ld sector%s\n",
                                (unsigned long long) sector, cleared,
@@@ -120,7 -146,7 +146,7 @@@ static blk_status_t read_pmem(struct pa
  }
  
  static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 -                      unsigned int len, unsigned int off, bool is_write,
 +                      unsigned int len, unsigned int off, unsigned int op,
                        sector_t sector)
  {
        blk_status_t rc = BLK_STS_OK;
        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                bad_pmem = true;
  
 -      if (!is_write) {
 +      if (!op_is_write(op)) {
                if (unlikely(bad_pmem))
                        rc = BLK_STS_IOERR;
                else {
@@@ -180,7 -206,8 +206,7 @@@ static blk_qc_t pmem_make_request(struc
        do_acct = nd_iostat_start(bio, &start);
        bio_for_each_segment(bvec, bio, iter) {
                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
 -                              bvec.bv_offset, op_is_write(bio_op(bio)),
 -                              iter.bi_sector);
 +                              bvec.bv_offset, bio_op(bio), iter.bi_sector);
                if (rc) {
                        bio->bi_status = rc;
                        break;
  }
  
  static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 -                     struct page *page, bool is_write)
 +                     struct page *page, unsigned int op)
  {
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        blk_status_t rc;
  
        rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
 -                        0, is_write, sector);
 +                        0, op, sector);
  
        /*
         * The ->rw_page interface is subtle and tricky.  The core
         * caused by double completion.
         */
        if (rc == 0)
 -              page_endio(page, is_write, 0);
 +              page_endio(page, op_is_write(op), 0);
  
        return blk_status_to_errno(rc);
  }
@@@ -226,11 -253,8 +252,11 @@@ __weak long __pmem_direct_access(struc
        if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
                                        PFN_PHYS(nr_pages))))
                return -EIO;
 -      *kaddr = pmem->virt_addr + offset;
 -      *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 +
 +      if (kaddr)
 +              *kaddr = pmem->virt_addr + offset;
 +      if (pfn)
 +              *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
  
        /*
         * If badblocks are present, limit known good range to the
diff --combined fs/dax.c
index f76724139f80c50da39e37e6b95058b7095eabbc,57ec272038da698b7053d4cc362a3533b2a87f11..f32d7125ad0f237d61173cd72383683ac380c4e4
+++ b/fs/dax.c
@@@ -226,8 -226,8 +226,8 @@@ static inline void *unlock_slot(struct 
   *
   * Must be called with the i_pages lock held.
   */
- static void *get_unlocked_mapping_entry(struct address_space *mapping,
-                                       pgoff_t index, void ***slotp)
+ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
+               pgoff_t index, void ***slotp, bool (*wait_fn)(void))
  {
        void *entry, **slot;
        struct wait_exceptional_entry_queue ewait;
        ewait.wait.func = wake_exceptional_entry_func;
  
        for (;;) {
+               bool revalidate;
                entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
                                          &slot);
                if (!entry ||
                prepare_to_wait_exclusive(wq, &ewait.wait,
                                          TASK_UNINTERRUPTIBLE);
                xa_unlock_irq(&mapping->i_pages);
-               schedule();
+               revalidate = wait_fn();
                finish_wait(wq, &ewait.wait);
                xa_lock_irq(&mapping->i_pages);
+               if (revalidate)
+                       return ERR_PTR(-EAGAIN);
        }
  }
  
- static void dax_unlock_mapping_entry(struct address_space *mapping,
-                                    pgoff_t index)
+ static bool entry_wait(void)
+ {
+       schedule();
+       /*
+        * Never return an ERR_PTR() from
+        * __get_unlocked_mapping_entry(), just keep looping.
+        */
+       return false;
+ }
+ static void *get_unlocked_mapping_entry(struct address_space *mapping,
+               pgoff_t index, void ***slotp)
+ {
+       return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
+ }
+ static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
        void *entry, **slot;
  
  static void put_locked_mapping_entry(struct address_space *mapping,
                pgoff_t index)
  {
-       dax_unlock_mapping_entry(mapping, index);
+       unlock_mapping_entry(mapping, index);
  }
  
  /*
@@@ -319,18 -338,27 +338,27 @@@ static unsigned long dax_radix_end_pfn(
        for (pfn = dax_radix_pfn(entry); \
                        pfn < dax_radix_end_pfn(entry); pfn++)
  
- static void dax_associate_entry(void *entry, struct address_space *mapping)
+ /*
+  * TODO: for reflink+dax we need a way to associate a single page with
+  * multiple address_space instances at different linear_page_index()
+  * offsets.
+  */
+ static void dax_associate_entry(void *entry, struct address_space *mapping,
+               struct vm_area_struct *vma, unsigned long address)
  {
-       unsigned long pfn;
+       unsigned long size = dax_entry_size(entry), pfn, index;
+       int i = 0;
  
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;
  
+       index = linear_page_index(vma, address & ~(size - 1));
        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);
  
                WARN_ON_ONCE(page->mapping);
                page->mapping = mapping;
+               page->index = index + i++;
        }
  }
  
@@@ -348,6 -376,7 +376,7 @@@ static void dax_disassociate_entry(voi
                WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
                WARN_ON_ONCE(page->mapping && page->mapping != mapping);
                page->mapping = NULL;
+               page->index = 0;
        }
  }
  
@@@ -364,6 -393,84 +393,84 @@@ static struct page *dax_busy_page(void 
        return NULL;
  }
  
+ static bool entry_wait_revalidate(void)
+ {
+       rcu_read_unlock();
+       schedule();
+       rcu_read_lock();
+       /*
+        * Tell __get_unlocked_mapping_entry() to take a break, we need
+        * to revalidate page->mapping after dropping locks
+        */
+       return true;
+ }
+ bool dax_lock_mapping_entry(struct page *page)
+ {
+       pgoff_t index;
+       struct inode *inode;
+       bool did_lock = false;
+       void *entry = NULL, **slot;
+       struct address_space *mapping;
+       rcu_read_lock();
+       for (;;) {
+               mapping = READ_ONCE(page->mapping);
+               if (!dax_mapping(mapping))
+                       break;
+               /*
+                * In the device-dax case there's no need to lock, a
+                * struct dev_pagemap pin is sufficient to keep the
+                * inode alive, and we assume we have dev_pagemap pin
+                * otherwise we would not have a valid pfn_to_page()
+                * translation.
+                */
+               inode = mapping->host;
+               if (S_ISCHR(inode->i_mode)) {
+                       did_lock = true;
+                       break;
+               }
+               xa_lock_irq(&mapping->i_pages);
+               if (mapping != page->mapping) {
+                       xa_unlock_irq(&mapping->i_pages);
+                       continue;
+               }
+               index = page->index;
+               entry = __get_unlocked_mapping_entry(mapping, index, &slot,
+                               entry_wait_revalidate);
+               if (!entry) {
+                       xa_unlock_irq(&mapping->i_pages);
+                       break;
+               } else if (IS_ERR(entry)) {
+                       WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+                       continue;
+               }
+               lock_slot(mapping, slot);
+               did_lock = true;
+               xa_unlock_irq(&mapping->i_pages);
+               break;
+       }
+       rcu_read_unlock();
+       return did_lock;
+ }
+ void dax_unlock_mapping_entry(struct page *page)
+ {
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
+       if (S_ISCHR(inode->i_mode))
+               return;
+       unlock_mapping_entry(mapping, page->index);
+ }
  /*
   * Find radix tree entry at given index. If it points to an exceptional entry,
   * return it with the radix tree entry locked. If the radix tree doesn't
@@@ -566,8 -673,7 +673,8 @@@ struct page *dax_layout_busy_page(struc
                        if (index >= end)
                                break;
  
 -                      if (!radix_tree_exceptional_entry(pvec_ent))
 +                      if (WARN_ON_ONCE(
 +                           !radix_tree_exceptional_entry(pvec_ent)))
                                continue;
  
                        xa_lock_irq(&mapping->i_pages);
                        if (page)
                                break;
                }
 +
 +              /*
 +               * We don't expect normal struct page entries to exist in our
 +               * tree, but we keep these pagevec calls so that this code is
 +               * consistent with the common pattern for handling pagevecs
 +               * throughout the kernel.
 +               */
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                index++;
@@@ -655,6 -754,7 +762,6 @@@ static int copy_user_dax(struct block_d
  {
        void *vto, *kaddr;
        pgoff_t pgoff;
 -      pfn_t pfn;
        long rc;
        int id;
  
                return rc;
  
        id = dax_read_lock();
 -      rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
 +      rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
        if (rc < 0) {
                dax_read_unlock(id);
                return rc;
@@@ -708,7 -808,7 +815,7 @@@ static void *dax_insert_mapping_entry(s
        new_entry = dax_radix_locked_entry(pfn, flags);
        if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
                dax_disassociate_entry(entry, mapping, false);
-               dax_associate_entry(new_entry, mapping);
+               dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
        }
  
        if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
@@@ -974,6 -1074,7 +1081,6 @@@ static int dax_iomap_pfn(struct iomap *
  {
        const sector_t sector = dax_iomap_sector(iomap, pos);
        pgoff_t pgoff;
 -      void *kaddr;
        int id, rc;
        long length;
  
                return rc;
        id = dax_read_lock();
        length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
 -                                 &kaddr, pfnp);
 +                                 NULL, pfnp);
        if (length < 0) {
                rc = length;
                goto out;
@@@ -1058,13 -1159,15 +1165,13 @@@ int __dax_zero_page_range(struct block_
                pgoff_t pgoff;
                long rc, id;
                void *kaddr;
 -              pfn_t pfn;
  
                rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
                if (rc)
                        return rc;
  
                id = dax_read_lock();
 -              rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
 -                              &pfn);
 +              rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
                if (rc < 0) {
                        dax_read_unlock(id);
                        return rc;
@@@ -1120,6 -1223,7 +1227,6 @@@ dax_iomap_actor(struct inode *inode, lo
                ssize_t map_len;
                pgoff_t pgoff;
                void *kaddr;
 -              pfn_t pfn;
  
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
  
                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
 -                              &kaddr, &pfn);
 +                              &kaddr, NULL);
                if (map_len < 0) {
                        ret = map_len;
                        break;
diff --combined include/linux/huge_mm.h
index 27e3e32135a84de8d9c9834ae195f653c6ed2b78,d3bbf6bea9e91c055b6232d0dec4d070140b6be3..99c19b06d9a46d2cebf20ad5f21a1612a94b5855
@@@ -3,10 -3,11 +3,11 @@@
  #define _LINUX_HUGE_MM_H
  
  #include <linux/sched/coredump.h>
+ #include <linux/mm_types.h>
  
  #include <linux/fs.h> /* only for vma_is_dax() */
  
 -extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 +extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
  extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                         pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                         struct vm_area_struct *vma);
@@@ -23,7 -24,7 +24,7 @@@ static inline void huge_pud_set_accesse
  }
  #endif
  
 -extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
 +extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
  extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          unsigned long addr,
                                          pmd_t *pmd,
@@@ -46,9 -47,9 +47,9 @@@ extern bool move_huge_pmd(struct vm_are
  extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
                        int prot_numa);
- int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmd, pfn_t pfn, bool write);
- int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                        pud_t *pud, pfn_t pfn, bool write);
  enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_FLAG,
@@@ -216,7 -217,7 +217,7 @@@ struct page *follow_devmap_pmd(struct v
  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, int flags);
  
 -extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 +extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
  
  extern struct page *huge_zero_page;
  
@@@ -321,8 -322,7 +322,8 @@@ static inline spinlock_t *pud_trans_hug
        return NULL;
  }
  
 -static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
 +static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
 +              pmd_t orig_pmd)
  {
        return 0;
  }
diff --combined include/linux/mm.h
index 8fcc36660de672c84fce606ba0a1fff22cd5a537,374e5e9284f7a17796751405e7e08a4e2664abc5..a61ebe8ad4ca92e72e23855c17f8e7c9ad059a54
@@@ -155,9 -155,7 +155,9 @@@ extern int overcommit_kbytes_handler(st
   * mmap() functions).
   */
  
 -extern struct kmem_cache *vm_area_cachep;
 +struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 +struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 +void vm_area_free(struct vm_area_struct *);
  
  #ifndef CONFIG_MMU
  extern struct rb_root nommu_region_tree;
@@@ -452,24 -450,6 +452,24 @@@ struct vm_operations_struct 
                                          unsigned long addr);
  };
  
 +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 +{
 +      static const struct vm_operations_struct dummy_vm_ops = {};
 +
 +      memset(vma, 0, sizeof(*vma));
 +      vma->vm_mm = mm;
 +      vma->vm_ops = &dummy_vm_ops;
 +      INIT_LIST_HEAD(&vma->anon_vma_chain);
 +}
 +
 +static inline void vma_set_anonymous(struct vm_area_struct *vma)
 +{
 +      vma->vm_ops = NULL;
 +}
 +
 +/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
 +#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
 +
  struct mmu_gather;
  struct inode;
  
@@@ -728,10 -708,10 +728,10 @@@ static inline pte_t maybe_mkwrite(pte_
        return pte;
  }
  
 -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
                struct page *page);
 -int finish_fault(struct vm_fault *vmf);
 -int finish_mkwrite_fault(struct vm_fault *vmf);
 +vm_fault_t finish_fault(struct vm_fault *vmf);
 +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  #endif
  
  /*
@@@ -960,6 -940,15 +960,6 @@@ static inline int page_zone_id(struct p
        return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
  }
  
 -static inline int zone_to_nid(struct zone *zone)
 -{
 -#ifdef CONFIG_NUMA
 -      return zone->node;
 -#else
 -      return 0;
 -#endif
 -}
 -
  #ifdef NODE_NOT_IN_PAGE_FLAGS
  extern int page_to_nid(const struct page *page);
  #else
@@@ -1403,8 -1392,8 +1403,8 @@@ int generic_error_remove_page(struct ad
  int invalidate_inode_page(struct page *page);
  
  #ifdef CONFIG_MMU
 -extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 -              unsigned int flags);
 +extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 +                      unsigned long address, unsigned int flags);
  extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
@@@ -1413,7 -1402,7 +1413,7 @@@ void unmap_mapping_pages(struct address
  void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
  #else
 -static inline int handle_mm_fault(struct vm_area_struct *vma,
 +static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
  {
        /* should never happen if there's no MMU */
@@@ -2015,7 -2004,7 +2015,7 @@@ static inline spinlock_t *pud_lock(stru
  
  extern void __init pagecache_init(void);
  extern void free_area_init(unsigned long * zones_size);
 -extern void free_area_init_node(int nid, unsigned long * zones_size,
 +extern void __init free_area_init_node(int nid, unsigned long * zones_size,
                unsigned long zone_start_pfn, unsigned long *zholes_size);
  extern void free_initmem(void);
  
@@@ -2143,7 -2132,7 +2143,7 @@@ extern int __meminit __early_pfn_to_nid
                                        struct mminit_pfnnid_cache *state);
  #endif
  
 -#ifdef CONFIG_HAVE_MEMBLOCK
 +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
  void zero_resv_unavail(void);
  #else
  static inline void zero_resv_unavail(void) {}
@@@ -2563,7 -2552,7 +2563,7 @@@ static inline struct page *follow_page(
  #define FOLL_COW      0x4000  /* internal GUP flag */
  #define FOLL_ANON     0x8000  /* don't do file mappings */
  
 -static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
 +static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
  {
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;
@@@ -2657,7 -2646,12 +2657,7 @@@ extern int randomize_va_space
  const char * arch_vma_name(struct vm_area_struct *vma);
  void print_vma_addr(char *prefix, unsigned long rip);
  
 -void sparse_mem_maps_populate_node(struct page **map_map,
 -                                 unsigned long pnum_begin,
 -                                 unsigned long pnum_end,
 -                                 unsigned long map_count,
 -                                 int nodeid);
 -
 +void *sparse_buffer_alloc(unsigned long size);
  struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
                struct vmem_altmap *altmap);
  pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
@@@ -2731,6 -2725,7 +2731,7 @@@ enum mf_action_page_type 
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_BUDDY_2ND,
+       MF_MSG_DAX,
        MF_MSG_UNKNOWN,
  };
  
@@@ -2739,8 -2734,7 +2740,8 @@@ extern void clear_huge_page(struct pag
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
  extern void copy_user_huge_page(struct page *dst, struct page *src,
 -                              unsigned long addr, struct vm_area_struct *vma,
 +                              unsigned long addr_hint,
 +                              struct vm_area_struct *vma,
                                unsigned int pages_per_huge_page);
  extern long copy_huge_page_from_user(struct page *dst_page,
                                const void __user *usr_src,
diff --combined kernel/memremap.c
index d57d58f77409214cf93ece9454354242ffd8dd85,62603634a1d235366c6eb19cb4d8f6062b7a70fa..5b8600d39931964adcef966b9660aceac5918124
@@@ -5,7 -5,6 +5,7 @@@
  #include <linux/types.h>
  #include <linux/pfn_t.h>
  #include <linux/io.h>
 +#include <linux/kasan.h>
  #include <linux/mm.h>
  #include <linux/memory_hotplug.h>
  #include <linux/swap.h>
@@@ -43,7 -42,7 +43,7 @@@ static unsigned long order_at(struct re
                        pgoff += 1UL << order, order = order_at((res), pgoff))
  
  #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 -int device_private_entry_fault(struct vm_area_struct *vma,
 +vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
                       unsigned long addr,
                       swp_entry_t entry,
                       unsigned int flags,
@@@ -138,7 -137,6 +138,7 @@@ static void devm_memremap_pages_release
        mem_hotplug_begin();
        arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
                        &pgmap->altmap : NULL);
 +      kasan_remove_zero_shadow(__va(align_start), align_size);
        mem_hotplug_done();
  
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@@ -178,27 -176,10 +178,27 @@@ void *devm_memremap_pages(struct devic
        unsigned long pfn, pgoff, order;
        pgprot_t pgprot = PAGE_KERNEL;
        int error, nid, is_ram;
 +      struct dev_pagemap *conflict_pgmap;
  
        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
                - align_start;
 +      align_end = align_start + align_size - 1;
 +
 +      conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
 +      if (conflict_pgmap) {
 +              dev_WARN(dev, "Conflicting mapping in same section\n");
 +              put_dev_pagemap(conflict_pgmap);
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
 +      conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
 +      if (conflict_pgmap) {
 +              dev_WARN(dev, "Conflicting mapping in same section\n");
 +              put_dev_pagemap(conflict_pgmap);
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
        is_ram = region_intersects(align_start, align_size,
                IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
  
  
        mutex_lock(&pgmap_lock);
        error = 0;
 -      align_end = align_start + align_size - 1;
  
        foreach_order_pgoff(res, order, pgoff) {
                error = __radix_tree_insert(&pgmap_radix,
                goto err_pfn_remap;
  
        mem_hotplug_begin();
 +      error = kasan_add_zero_shadow(__va(align_start), align_size);
 +      if (error) {
 +              mem_hotplug_done();
 +              goto err_kasan;
 +      }
 +
        error = arch_add_memory(nid, align_start, align_size, altmap, false);
        if (!error)
                move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
        return __va(res->start);
  
   err_add_memory:
 +      kasan_remove_zero_shadow(__va(align_start), align_size);
 + err_kasan:
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
   err_pfn_remap:
   err_radix:
@@@ -331,7 -305,7 +331,7 @@@ EXPORT_SYMBOL_GPL(get_dev_pagemap)
  
  #ifdef CONFIG_DEV_PAGEMAP_OPS
  DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
 -EXPORT_SYMBOL_GPL(devmap_managed_key);
 +EXPORT_SYMBOL(devmap_managed_key);
  static atomic_t devmap_enable;
  
  /*
@@@ -365,12 -339,11 +365,11 @@@ void __put_devmap_managed_page(struct p
                __ClearPageActive(page);
                __ClearPageWaiters(page);
  
-               page->mapping = NULL;
                mem_cgroup_uncharge(page);
  
                page->pgmap->page_free(page, page->pgmap->data);
        } else if (!count)
                __put_page(page);
  }
 -EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
 +EXPORT_SYMBOL(__put_devmap_managed_page);
  #endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --combined mm/hmm.c
index 0b05545916106cad2f5bd490af19514107a6b9c5,f9d1d89dec4d66d4948c2c4de3602e0878efa7ca..c968e49f7a0c527258a85b8c0259467e3b2924de
+++ b/mm/hmm.c
@@@ -177,19 -177,16 +177,19 @@@ static void hmm_release(struct mmu_noti
        up_write(&hmm->mirrors_sem);
  }
  
 -static void hmm_invalidate_range_start(struct mmu_notifier *mn,
 +static int hmm_invalidate_range_start(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long start,
 -                                     unsigned long end)
 +                                     unsigned long end,
 +                                     bool blockable)
  {
        struct hmm *hmm = mm->hmm;
  
        VM_BUG_ON(!hmm);
  
        atomic_inc(&hmm->sequence);
 +
 +      return 0;
  }
  
  static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@@ -302,14 -299,14 +302,14 @@@ static int hmm_vma_do_fault(struct mm_w
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
 -      int r;
 +      vm_fault_t ret;
  
        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= write_fault ? FAULT_FLAG_WRITE : 0;
 -      r = handle_mm_fault(vma, addr, flags);
 -      if (r & VM_FAULT_RETRY)
 +      ret = handle_mm_fault(vma, addr, flags);
 +      if (ret & VM_FAULT_RETRY)
                return -EBUSY;
 -      if (r & VM_FAULT_ERROR) {
 +      if (ret & VM_FAULT_ERROR) {
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }
@@@ -679,8 -676,7 +679,8 @@@ int hmm_vma_get_pfns(struct hmm_range *
                return -EINVAL;
  
        /* FIXME support hugetlb fs */
 -      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
 +      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
 +                      vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }
@@@ -853,8 -849,7 +853,8 @@@ int hmm_vma_fault(struct hmm_range *ran
                return -EINVAL;
  
        /* FIXME support hugetlb fs */
 -      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
 +      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
 +                      vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }
@@@ -968,6 -963,8 +968,8 @@@ static void hmm_devmem_free(struct pag
  {
        struct hmm_devmem *devmem = data;
  
+       page->mapping = NULL;
        devmem->ops->free(devmem, page);
  }
  
@@@ -976,7 -973,10 +978,7 @@@ static RADIX_TREE(hmm_devmem_radix, GFP
  
  static void hmm_devmem_radix_release(struct resource *resource)
  {
 -      resource_size_t key, align_start, align_size;
 -
 -      align_start = resource->start & ~(PA_SECTION_SIZE - 1);
 -      align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
 +      resource_size_t key;
  
        mutex_lock(&hmm_devmem_lock);
        for (key = resource->start;
diff --combined mm/huge_memory.c
index 08b544383d7467df273eac5b97c7c4e07d2e7e4a,feba371169ca4c516f71ef7ee9d240f22d32deac..c3bc7e9c9a2acc550ea8aeb68720a8c5a611c610
@@@ -541,18 -541,18 +541,18 @@@ unsigned long thp_get_unmapped_area(str
  }
  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
  
 -static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 -              gfp_t gfp)
 +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 +                      struct page *page, gfp_t gfp)
  {
        struct vm_area_struct *vma = vmf->vma;
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 -      int ret = 0;
 +      vm_fault_t ret = 0;
  
        VM_BUG_ON_PAGE(!PageCompound(page), page);
  
 -      if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
 +      if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
  
                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
 -                      int ret;
 +                      vm_fault_t ret2;
  
                        spin_unlock(vmf->ptl);
                        mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(vma->vm_mm, pgtable);
 -                      ret = handle_userfault(vmf, VM_UFFD_MISSING);
 -                      VM_BUG_ON(ret & VM_FAULT_FALLBACK);
 -                      return ret;
 +                      ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
 +                      VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
 +                      return ret2;
                }
  
                entry = mk_huge_pmd(page, vma->vm_page_prot);
@@@ -663,7 -663,7 +663,7 @@@ static bool set_huge_zero_page(pgtable_
        return true;
  }
  
 -int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  {
        struct vm_area_struct *vma = vmf->vma;
        gfp_t gfp;
                pgtable_t pgtable;
                struct page *zero_page;
                bool set;
 -              int ret;
 +              vm_fault_t ret;
                pgtable = pte_alloc_one(vma->vm_mm, haddr);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
@@@ -752,7 -752,7 +752,7 @@@ static void insert_pfn_pmd(struct vm_ar
        spin_unlock(ptl);
  }
  
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmd, pfn_t pfn, bool write)
  {
        pgprot_t pgprot = vma->vm_page_prot;
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
 -      BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
 +      BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 +                      !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 -      BUG_ON(!pfn_t_devmap(pfn));
  
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
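The reworked BUG_ON() lets a caller insert a devmap pfn even when the vma carries neither VM_PFNMAP nor VM_MIXEDMAP, the standalone !pfn_t_devmap() check is gone, and the function now returns vm_fault_t instead of a plain int. A minimal, hypothetical caller could look like the sketch below; my_phys_for_fault() is an assumed driver-specific helper, not an API from this series:

/* Hypothetical PMD-sized fault handler built on the vm_fault_t signature.
 * my_phys_for_fault() is an assumption standing in for the driver's own
 * physical-address lookup. */
static vm_fault_t my_dev_huge_fault(struct vm_fault *vmf)
{
	phys_addr_t phys = my_phys_for_fault(vmf);
	pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
				  vmf->flags & FAULT_FLAG_WRITE);
}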
@@@ -812,7 -812,7 +812,7 @@@ static void insert_pfn_pud(struct vm_ar
        spin_unlock(ptl);
  }
  
int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                        pud_t *pud, pfn_t pfn, bool write)
  {
        pgprot_t pgprot = vma->vm_page_prot;
@@@ -1118,16 -1118,15 +1118,16 @@@ unlock
        spin_unlock(vmf->ptl);
  }
  
 -static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
 -              struct page *page)
 +static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
 +                      pmd_t orig_pmd, struct page *page)
  {
        struct vm_area_struct *vma = vmf->vma;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
        pmd_t _pmd;
 -      int ret = 0, i;
 +      int i;
 +      vm_fault_t ret = 0;
        struct page **pages;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
                                               vmf->address, page_to_nid(page));
                if (unlikely(!pages[i] ||
 -                           mem_cgroup_try_charge(pages[i], vma->vm_mm,
 +                           mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
                                     GFP_KERNEL, &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
@@@ -1237,7 -1236,7 +1237,7 @@@ out_free_pages
        goto out;
  }
  
 -int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
  {
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL, *new_page;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
        gfp_t huge_gfp;                 /* for allocation and charge */
 -      int ret = 0;
 +      vm_fault_t ret = 0;
  
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@@ -1313,7 -1312,7 +1313,7 @@@ alloc
                goto out;
        }
  
 -      if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
 +      if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
                                        huge_gfp, &memcg, true))) {
                put_page(new_page);
                split_huge_pmd(vma, vmf->pmd, vmf->address);
        if (!page)
                clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
        else
 -              copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 +              copy_user_huge_page(new_page, page, vmf->address,
 +                                  vma, HPAGE_PMD_NR);
        __SetPageUptodate(new_page);
  
        mmun_start = haddr;
@@@ -1458,7 -1456,7 +1458,7 @@@ out
  }
  
  /* NUMA hinting page fault entry point for trans huge pmds */
 -int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
  {
        struct vm_area_struct *vma = vmf->vma;
        struct anon_vma *anon_vma = NULL;
@@@ -1742,7 -1740,7 +1742,7 @@@ int zap_huge_pmd(struct mmu_gather *tlb
                } else {
                        if (arch_needs_pgtable_deposit())
                                zap_deposited_table(tlb->mm, pmd);
 -                      add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 +                      add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
                }
  
                spin_unlock(ptl);
@@@ -2086,13 -2084,11 +2086,13 @@@ static void __split_huge_pmd_locked(str
                if (vma_is_dax(vma))
                        return;
                page = pmd_page(_pmd);
 +              if (!PageDirty(page) && pmd_dirty(_pmd))
 +                      set_page_dirty(page);
                if (!PageReferenced(page) && pmd_young(_pmd))
                        SetPageReferenced(page);
                page_remove_rmap(page, true);
                put_page(page);
 -              add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 +              add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
                return;
        } else if (is_huge_zero_pmd(*pmd)) {
                /*
diff --combined mm/memory-failure.c
index 192d0bbfc9ea58823b41f14e7a5a515932398eba,32a644d9c2eeb8a179f6cfe72021a7e6c21fdaf8..0cd3de3550f0830f507d286b0499789d7961171e
  #include <linux/hugetlb.h>
  #include <linux/memory_hotplug.h>
  #include <linux/mm_inline.h>
+ #include <linux/memremap.h>
  #include <linux/kfifo.h>
  #include <linux/ratelimit.h>
 +#include <linux/page-isolation.h>
  #include "internal.h"
  #include "ras/ras_event.h"
  
@@@ -174,23 -174,52 +175,52 @@@ int hwpoison_filter(struct page *p
  
  EXPORT_SYMBOL_GPL(hwpoison_filter);
  
+ /*
+  * Kill all processes that have a poisoned page mapped and then isolate
+  * the page.
+  *
+  * General strategy:
+  * Find all processes having the page mapped and kill them.
+  * But we keep a page reference around so that the page is not
+  * actually freed yet.
+  * Then stash the page away
+  *
+  * There's no convenient way to get back to mapped processes
+  * from the VMAs. So do a brute-force search over all
+  * running processes.
+  *
+  * Remember that machine checks are not common (or rather
+  * if they are common you have other problems), so this shouldn't
+  * be a performance issue.
+  *
+  * Also there are some races possible while we get from the
+  * error detection to actually handle it.
+  */
+ struct to_kill {
+       struct list_head nd;
+       struct task_struct *tsk;
+       unsigned long addr;
+       short size_shift;
+       char addr_valid;
+ };
  /*
   * Send all the processes who have the page mapped a signal.
   * ``action optional'' if they are not immediately affected by the error
   * ``action required'' if error happened in current execution context
   */
- static int kill_proc(struct task_struct *t, unsigned long addr,
-                       unsigned long pfn, struct page *page, int flags)
+ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
  {
-       short addr_lsb;
+       struct task_struct *t = tk->tsk;
+       short addr_lsb = tk->size_shift;
        int ret;
  
        pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
                pfn, t->comm, t->pid);
-       addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
  
        if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
-               ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr,
+               ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
                                       addr_lsb, current);
        } else {
                /*
                 * This could cause a loop when the user sets SIGBUS
                 * to SIG_IGN, but hopefully no one will do that?
                 */
-               ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr,
+               ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
                                      addr_lsb, t);  /* synchronous? */
        }
        if (ret < 0)
@@@ -235,34 -264,39 +265,39 @@@ void shake_page(struct page *p, int acc
  }
  EXPORT_SYMBOL_GPL(shake_page);
  
- /*
-  * Kill all processes that have a poisoned page mapped and then isolate
-  * the page.
-  *
-  * General strategy:
-  * Find all processes having the page mapped and kill them.
-  * But we keep a page reference around so that the page is not
-  * actually freed yet.
-  * Then stash the page away
-  *
-  * There's no convenient way to get back to mapped processes
-  * from the VMAs. So do a brute-force search over all
-  * running processes.
-  *
-  * Remember that machine checks are not common (or rather
-  * if they are common you have other problems), so this shouldn't
-  * be a performance issue.
-  *
-  * Also there are some races possible while we get from the
-  * error detection to actually handle it.
-  */
- struct to_kill {
-       struct list_head nd;
-       struct task_struct *tsk;
-       unsigned long addr;
-       char addr_valid;
- };
+ static unsigned long dev_pagemap_mapping_shift(struct page *page,
+               struct vm_area_struct *vma)
+ {
+       unsigned long address = vma_address(page, vma);
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return 0;
+       p4d = p4d_offset(pgd, address);
+       if (!p4d_present(*p4d))
+               return 0;
+       pud = pud_offset(p4d, address);
+       if (!pud_present(*pud))
+               return 0;
+       if (pud_devmap(*pud))
+               return PUD_SHIFT;
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+       if (pmd_devmap(*pmd))
+               return PMD_SHIFT;
+       pte = pte_offset_map(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+       if (pte_devmap(*pte))
+               return PAGE_SHIFT;
+       return 0;
+ }
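The shift returned here lands in tk->size_shift and is passed as si_addr_lsb in the SIGBUS that kill_proc() sends, so userspace learns whether the poisoned dax mapping was PTE-, PMD- or PUD-sized. An illustrative userspace consumer (not part of the patch; fprintf() in a signal handler is not async-signal-safe but keeps the demo short):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Print the "blast radius" the kernel reports for a memory error. */
static void mce_sigbus(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code == BUS_MCEERR_AR || info->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "poison near %p, mapping size %zu bytes\n",
			info->si_addr, (size_t)1 << info->si_addr_lsb);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = mce_sigbus;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
	pause();	/* wait for an injected or real memory error */
	return 0;
}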
  
  /*
   * Failure handling: if we can't find or can't kill a process there's
@@@ -293,6 -327,10 +328,10 @@@ static void add_to_kill(struct task_str
        }
        tk->addr = page_address_in_vma(p, vma);
        tk->addr_valid = 1;
+       if (is_zone_device_page(p))
+               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+       else
+               tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
  
        /*
         * In theory we don't have to kill when the page was
         * likely very rare kill anyways just out of paranoia, but use
         * a SIGKILL because the error is not contained anymore.
         */
-       if (tk->addr == -EFAULT) {
+       if (tk->addr == -EFAULT || tk->size_shift == 0) {
                pr_info("Memory failure: Unable to find user space address %lx in %s\n",
                        page_to_pfn(p), tsk->comm);
                tk->addr_valid = 0;
   * Also when FAIL is set do a force kill because something went
   * wrong earlier.
   */
- static void kill_procs(struct list_head *to_kill, int forcekill,
-                         bool fail, struct page *page, unsigned long pfn,
-                         int flags)
+ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
+               unsigned long pfn, int flags)
  {
        struct to_kill *tk, *next;
  
                         * check for that, but we need to tell the
                         * process anyways.
                         */
-                       else if (kill_proc(tk->tsk, tk->addr,
-                                             pfn, page, flags) < 0)
+                       else if (kill_proc(tk, pfn, flags) < 0)
                                pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
                                       pfn, tk->tsk->comm, tk->tsk->pid);
                }
@@@ -516,6 -552,7 +553,7 @@@ static const char * const action_page_t
        [MF_MSG_TRUNCATED_LRU]          = "already truncated LRU page",
        [MF_MSG_BUDDY]                  = "free buddy page",
        [MF_MSG_BUDDY_2ND]              = "free buddy page (2nd try)",
+       [MF_MSG_DAX]                    = "dax page",
        [MF_MSG_UNKNOWN]                = "unknown page",
  };
  
@@@ -1013,7 -1050,7 +1051,7 @@@ static bool hwpoison_user_mappings(stru
         * any accesses to the poisoned memory.
         */
        forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
-       kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);
+       kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
  
        return unmap_success;
  }
        return res;
  }
  
+ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+               struct dev_pagemap *pgmap)
+ {
+       struct page *page = pfn_to_page(pfn);
+       const bool unmap_success = true;
+       unsigned long size = 0;
+       struct to_kill *tk;
+       LIST_HEAD(tokill);
+       int rc = -EBUSY;
+       loff_t start;
+
+       /*
+        * Prevent the inode from being freed while we are interrogating
+        * the address_space, typically this would be handled by
+        * lock_page(), but dax pages do not use the page lock. This
+        * also prevents changes to the mapping of this pfn until
+        * poison signaling is complete.
+        */
+       if (!dax_lock_mapping_entry(page))
+               goto out;
+       if (hwpoison_filter(page)) {
+               rc = 0;
+               goto unlock;
+       }
+       switch (pgmap->type) {
+       case MEMORY_DEVICE_PRIVATE:
+       case MEMORY_DEVICE_PUBLIC:
+               /*
+                * TODO: Handle HMM pages which may need coordination
+                * with device-side memory.
+                */
+               goto unlock;
+       default:
+               break;
+       }
+       /*
+        * Use this flag as an indication that the dax page has been
+        * remapped UC to prevent speculative consumption of poison.
+        */
+       SetPageHWPoison(page);
+       /*
+        * Unlike System-RAM there is no possibility to swap in a
+        * different physical page at a given virtual address, so all
+        * userspace consumption of ZONE_DEVICE memory necessitates
+        * SIGBUS (i.e. MF_MUST_KILL)
+        */
+       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+       collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+       list_for_each_entry(tk, &tokill, nd)
+               if (tk->size_shift)
+                       size = max(size, 1UL << tk->size_shift);
+       if (size) {
+               /*
+                * Unmap the largest mapping to avoid breaking up
+                * device-dax mappings which are constant size. The
+                * actual size of the mapping being torn down is
+                * communicated in siginfo, see kill_proc()
+                */
+               start = (page->index << PAGE_SHIFT) & ~(size - 1);
+               unmap_mapping_range(page->mapping, start, start + size, 0);
+       }
+       kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+       rc = 0;
+ unlock:
+       dax_unlock_mapping_entry(page);
+ out:
+       /* drop pgmap ref acquired in caller */
+       put_dev_pagemap(pgmap);
+       action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+       return rc;
+ }
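When memory_failure() below sees a dev_pagemap-backed pfn it hands off to this helper, which pins the dax mapping entry, marks the page HWPoison, unmaps the largest mapping around the pfn, SIGBUSes the owners, and finally reports MF_MSG_DAX as recovered or failed. One hypothetical way to exercise the path is poison injection with madvise(MADV_HWPOISON) on a dax-mapped file; the file path, the 2 MiB mapping size and the required privileges below are assumptions, not taken from this series:

/* Hypothetical injection test: needs CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE
 * and an fsdax file of at least 2 MiB at the path below (all assumptions). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/dax/testfile", O_RDWR);
	if (fd < 0)
		return 1;

	void *p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Ask the kernel to treat the first page as hardware-poisoned; with
	 * this series the dax page should be reported as recovered and any
	 * later access to the mapping raises SIGBUS instead of going
	 * unhandled. */
	if (madvise(p, 4096, MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");
	return 0;
}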
  /**
   * memory_failure - Handle memory failure of a page.
   * @pfn: Page Number of the corrupted page
@@@ -1135,6 -1249,7 +1250,7 @@@ int memory_failure(unsigned long pfn, i
        struct page *p;
        struct page *hpage;
        struct page *orig_head;
+       struct dev_pagemap *pgmap;
        int res;
        unsigned long page_flags;
  
                return -ENXIO;
        }
  
+       pgmap = get_dev_pagemap(pfn, NULL);
+       if (pgmap)
+               return memory_failure_dev_pagemap(pfn, flags, pgmap);
        p = pfn_to_page(pfn);
        if (PageHuge(p))
                return memory_failure_hugetlb(pfn, flags);
         *    R/W the page; let's pray that the page has been
         *    used and will be freed some time later.
         * In fact it's dangerous to directly bump up page count from 0,
 -       * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
 +       * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
         */
        if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
                if (is_free_buddy_page(p)) {
@@@ -1599,18 -1718,8 +1719,18 @@@ static int soft_offline_huge_page(struc
                if (ret > 0)
                        ret = -EIO;
        } else {
 -              if (PageHuge(page))
 -                      dissolve_free_huge_page(page);
 +              /*
 +               * We set PG_hwpoison only when the migration source hugepage
 +               * was successfully dissolved, because otherwise hwpoisoned
 +               * hugepage remains on free hugepage list, then userspace will
 +               * find it as SIGBUS by allocation failure. That's not expected
 +               * in soft-offlining.
 +               */
 +              ret = dissolve_free_huge_page(page);
 +              if (!ret) {
 +                      if (set_hwpoison_free_buddy_page(page))
 +                              num_poisoned_pages_inc();
 +              }
        }
        return ret;
  }
@@@ -1698,7 -1807,6 +1818,7 @@@ static int __soft_offline_page(struct p
  static int soft_offline_in_use_page(struct page *page, int flags)
  {
        int ret;
 +      int mt;
        struct page *hpage = compound_head(page);
  
        if (!PageHuge(page) && PageTransHuge(hpage)) {
                put_hwpoison_page(hpage);
        }
  
 +      /*
 +       * Setting MIGRATE_ISOLATE here ensures that the page will be linked
 +       * to free list immediately (not via pcplist) when released after
 +       * successful page migration. Otherwise we can't guarantee that the
 +       * page is really free after put_page() returns, so
 +       * set_hwpoison_free_buddy_page() highly likely fails.
 +       */
 +      mt = get_pageblock_migratetype(page);
 +      set_pageblock_migratetype(page, MIGRATE_ISOLATE);
        if (PageHuge(page))
                ret = soft_offline_huge_page(page, flags);
        else
                ret = __soft_offline_page(page, flags);
 -
 +      set_pageblock_migratetype(page, mt);
        return ret;
  }
  
 -static void soft_offline_free_page(struct page *page)
 +static int soft_offline_free_page(struct page *page)
  {
 +      int rc = 0;
        struct page *head = compound_head(page);
  
 -      if (!TestSetPageHWPoison(head)) {
 -              num_poisoned_pages_inc();
 -              if (PageHuge(head))
 -                      dissolve_free_huge_page(page);
 +      if (PageHuge(head))
 +              rc = dissolve_free_huge_page(page);
 +      if (!rc) {
 +              if (set_hwpoison_free_buddy_page(page))
 +                      num_poisoned_pages_inc();
 +              else
 +                      rc = -EBUSY;
        }
 +      return rc;
  }
  
  /**
@@@ -1777,6 -1871,14 +1897,14 @@@ int soft_offline_page(struct page *page
        int ret;
        unsigned long pfn = page_to_pfn(page);
  
+       if (is_zone_device_page(page)) {
+               pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
+                               pfn);
+               if (flags & MF_COUNT_INCREASED)
+                       put_page(page);
+               return -EIO;
+       }
        if (PageHWPoison(page)) {
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
                if (flags & MF_COUNT_INCREASED)
        if (ret > 0)
                ret = soft_offline_in_use_page(page, flags);
        else if (ret == 0)
 -              soft_offline_free_page(page);
 +              ret = soft_offline_free_page(page);
  
        return ret;
  }