Merge tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm...
author Linus Torvalds <[email protected]>
Sun, 26 Aug 2018 01:43:59 +0000 (18:43 -0700)
committer Linus Torvalds <[email protected]>
Sun, 26 Aug 2018 01:43:59 +0000 (18:43 -0700)
Pull libnvdimm memory-failure update from Dave Jiang:
 "As it stands, memory_failure() gets thoroughly confused by dev_pagemap
  backed mappings. The recovery code has specific enabling for several
  possible page states and needs new enabling to handle poison in dax
  mappings.

  In order to support reliable reverse mapping of user space addresses:

   1/ Add new locking in the memory_failure() rmap path to prevent races
      that would typically be handled by the page lock.

   2/ Since dev_pagemap pages are hidden from the page allocator and the
      "compound page" accounting machinery, add a mechanism to determine
      the size of the mapping that encompasses a given poisoned pfn.

   3/ Given pmem errors can be repaired, change the speculatively
      accessed poison protection, mce_unmap_kpfn(), to be reversible and
      otherwise allow ongoing access from the kernel.

  A side effect of this enabling is that MADV_HWPOISON becomes usable
  for dax mappings, however the primary motivation is to allow the
  system to survive userspace consumption of hardware-poison via dax.
  Specifically the current behavior is:

     mce: Uncorrected hardware memory error in user-access at af34214200
     {1}[Hardware Error]: It has been corrected by h/w and requires no further action
     mce: [Hardware Error]: Machine check events logged
     {1}[Hardware Error]: event severity: corrected
     Memory failure: 0xaf34214: reserved kernel page still referenced by 1 users
     [..]
     Memory failure: 0xaf34214: recovery action for reserved kernel page: Failed
     mce: Memory error not recovered
     <reboot>

  ...and with these changes:

     Injecting memory failure for pfn 0x20cb00 at process virtual address 0x7f763dd00000
     Memory failure: 0x20cb00: Killing dax-pmd:5421 due to hardware memory corruption
     Memory failure: 0x20cb00: recovery action for dax page: Recovered

  Given all the cross dependencies I propose taking this through
  nvdimm.git with acks from Naoya, x86/core, x86/RAS, and of course dax
  folks"
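
The survivable outcome shown in that log can be approximated from user space via MADV_HWPOISON once these patches are in place. The program below is a minimal sketch, not part of this series: it assumes root privileges, CONFIG_MEMORY_FAILURE=y, and a file on a DAX-mounted filesystem at the hypothetical path /mnt/pmem/victim, and it only illustrates that the blast radius is now a SIGBUS to the consuming process rather than the reboot in the "before" log.

/*
 * Hedged illustration only: poison one page of a dax mapping and let
 * memory_failure() deal with it.  Assumes root privileges,
 * CONFIG_MEMORY_FAILURE=y, and a file on a DAX-mounted filesystem at
 * the hypothetical path /mnt/pmem/victim.  Depending on kernel policy
 * the SIGBUS may arrive during madvise() or on the later access; the
 * handler below covers both cases.
 */
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
        (void)sig; (void)ctx;
        fprintf(stderr, "SIGBUS at %p (si_code %d), process survives\n",
                si->si_addr, si->si_code);
        _exit(0);
}

int main(void)
{
        struct sigaction sa = { .sa_sigaction = sigbus_handler,
                                .sa_flags = SA_SIGINFO };
        size_t len = 2UL << 20;         /* assume one 2 MiB dax extent */
        int fd = open("/mnt/pmem/victim", O_RDWR | O_CREAT, 0600);
        char *p;

        if (fd < 0 || sigaction(SIGBUS, &sa, NULL) || ftruncate(fd, len))
                return 1;
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, len);              /* populate the dax mapping */

        /* hand the first page to memory_failure() as if hardware flagged it */
        if (madvise(p, getpagesize(), MADV_HWPOISON))
                perror("madvise(MADV_HWPOISON)");

        p[0] = 1;       /* touching poison should now SIGBUS, not reboot */
        fprintf(stderr, "no SIGBUS delivered; is poison handling enabled?\n");
        return 0;
}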

* tag 'libnvdimm-for-4.19_dax-memory-failure' of gitolite.kernel.org:pub/scm/linux/kernel/git/nvdimm/nvdimm:
  libnvdimm, pmem: Restore page attributes when clearing errors
  x86/memory_failure: Introduce {set, clear}_mce_nospec()
  x86/mm/pat: Prepare {reserve, free}_memtype() for "decoy" addresses
  mm, memory_failure: Teach memory_failure() about dev_pagemap pages
  filesystem-dax: Introduce dax_lock_mapping_entry()
  mm, memory_failure: Collect mapping size in collect_procs()
  mm, madvise_inject_error: Let memory_failure() optionally take a page reference
  mm, dev_pagemap: Do not clear ->mapping on final put
  mm, madvise_inject_error: Disable MADV_SOFT_OFFLINE for ZONE_DEVICE pages
  filesystem-dax: Set page->index
  device-dax: Set page->index
  device-dax: Enable page_mapping()
  device-dax: Convert to vmf_insert_mixed and vm_fault_t

arch/x86/include/asm/set_memory.h
arch/x86/kernel/cpu/mcheck/mce.c
drivers/dax/device.c
drivers/nvdimm/pmem.c
fs/dax.c
include/linux/huge_mm.h
include/linux/mm.h
kernel/memremap.c
mm/hmm.c
mm/huge_memory.c
mm/memory-failure.c

diff --combined arch/x86/include/asm/set_memory.h
index 34cffcef7375dfa15cb30832972aa3d71e86d678,cf5e9124b45ea17258d0654cbdeb47371cda6a7b..07a25753e85c5cd53b2613a71db91862fa31684f
@@@ -46,7 -46,6 +46,7 @@@ int set_memory_np(unsigned long addr, i
  int set_memory_4k(unsigned long addr, int numpages);
  int set_memory_encrypted(unsigned long addr, int numpages);
  int set_memory_decrypted(unsigned long addr, int numpages);
 +int set_memory_np_noalias(unsigned long addr, int numpages);
  
  int set_memory_array_uc(unsigned long *addr, int addrinarray);
  int set_memory_array_wc(unsigned long *addr, int addrinarray);
@@@ -89,4 -88,46 +89,46 @@@ extern int kernel_set_to_readonly
  void set_kernel_text_rw(void);
  void set_kernel_text_ro(void);
  
+ #ifdef CONFIG_X86_64
+ static inline int set_mce_nospec(unsigned long pfn)
+ {
+       unsigned long decoy_addr;
+       int rc;
+       /*
+        * Mark the linear address as UC to make sure we don't log more
+        * errors because of speculative access to the page.
+        * We would like to just call:
+        *      set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+        * but doing that would radically increase the odds of a
+        * speculative access to the poison page because we'd have
+        * the virtual address of the kernel 1:1 mapping sitting
+        * around in registers.
+        * Instead we get tricky.  We create a non-canonical address
+        * that looks just like the one we want, but has bit 63 flipped.
+        * This relies on set_memory_uc() properly sanitizing any __pa()
+        * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+        */
+       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+       rc = set_memory_uc(decoy_addr, 1);
+       if (rc)
+               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+       return rc;
+ }
+ #define set_mce_nospec set_mce_nospec
+ /* Restore full speculative operation to the pfn. */
+ static inline int clear_mce_nospec(unsigned long pfn)
+ {
+       return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+ }
+ #define clear_mce_nospec clear_mce_nospec
+ #else
+ /*
+  * Few people would run a 32-bit kernel on a machine that supports
+  * recoverable errors because they have too much memory to boot 32-bit.
+  */
+ #endif
  #endif /* _ASM_X86_SET_MEMORY_H */
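
The decoy-address comment above deserves a small worked example. The program below is a hedged, user-space-only illustration of the arithmetic: the direct-map base and the 52-bit physical mask are assumptions (non-KASLR x86_64 defaults), since the kernel derives the real values at boot.

/*
 * Hedged, user-space-only illustration of the decoy-address arithmetic
 * described in the comment above.  PAGE_OFFSET and the physical mask
 * below are assumptions (non-KASLR x86_64 defaults); the kernel derives
 * the real values at boot, so treat the constants as examples only.
 */
#include <inttypes.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define BIT63           (1ULL << 63)
#define PAGE_OFFSET     0xffff888000000000ULL   /* assumed direct-map base */
#define PHYSICAL_MASK   ((1ULL << 52) - 1)      /* assumed __PHYSICAL_MASK */

int main(void)
{
        uint64_t pfn = 0x20cb00;        /* pfn from the injection log above */
        uint64_t kaddr = PAGE_OFFSET + (pfn << PAGE_SHIFT);
        uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT63);
        /* __pa() on the direct map is "addr - PAGE_OFFSET", then masked */
        uint64_t pa = (decoy - PAGE_OFFSET) & PHYSICAL_MASK;

        printf("1:1 address we avoid holding in registers: %#" PRIx64 "\n", kaddr);
        printf("non-canonical decoy address:               %#" PRIx64 "\n", decoy);
        printf("physical address recovered from the decoy: %#" PRIx64 "\n", pa);
        printf("expected physical address (pfn << 12):     %#" PRIx64 "\n",
               pfn << PAGE_SHIFT);
        return 0;
}

Flipping bit 63 keeps the real 1:1 virtual address of the poisoned page out of registers, reducing the odds of speculative access, while the __PHYSICAL_MASK sanitization noted in the comment still resolves the decoy to the intended pfn. Because set_memory_uc()/set_memory_wb() only change page attributes, the protection is reversible, unlike the old set_memory_np() unmapping.
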
diff --combined arch/x86/kernel/cpu/mcheck/mce.c
index 4b767284b7f5e59e529c5c7e1ae90174d7d23654,42a061ce1f5d350b625ac19aec5bc5dcd8aa1771..953b3ce92dccf0f684ce90e3a27015c99e692470
@@@ -42,6 -42,7 +42,7 @@@
  #include <linux/irq_work.h>
  #include <linux/export.h>
  #include <linux/jump_label.h>
+ #include <linux/set_memory.h>
  
  #include <asm/intel-family.h>
  #include <asm/processor.h>
@@@ -50,7 -51,6 +51,6 @@@
  #include <asm/mce.h>
  #include <asm/msr.h>
  #include <asm/reboot.h>
- #include <asm/set_memory.h>
  
  #include "mce-internal.h"
  
@@@ -108,10 -108,6 +108,6 @@@ static struct irq_work mce_irq_work
  
  static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  
- #ifndef mce_unmap_kpfn
- static void mce_unmap_kpfn(unsigned long pfn);
- #endif
  /*
   * CPU/chipset specific EDAC code can register a notifier call here to print
   * MCE errors in a human-readable form.
@@@ -123,8 -119,8 +119,8 @@@ void mce_setup(struct mce *m
  {
        memset(m, 0, sizeof(struct mce));
        m->cpu = m->extcpu = smp_processor_id();
 -      /* We hope get_seconds stays lockless */
 -      m->time = get_seconds();
 +      /* need the internal __ version to avoid deadlocks */
 +      m->time = __ktime_get_real_seconds();
        m->cpuvendor = boot_cpu_data.x86_vendor;
        m->cpuid = cpuid_eax(1);
        m->socketid = cpu_data(m->extcpu).phys_proc_id;
@@@ -602,7 -598,7 +598,7 @@@ static int srao_decode_notifier(struct 
        if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
                pfn = mce->addr >> PAGE_SHIFT;
                if (!memory_failure(pfn, 0))
-                       mce_unmap_kpfn(pfn);
+                       set_mce_nospec(pfn);
        }
  
        return NOTIFY_OK;
@@@ -1072,133 -1068,10 +1068,105 @@@ static int do_memory_failure(struct mc
        if (ret)
                pr_err("Memory error not recovered");
        else
-               mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
+               set_mce_nospec(m->addr >> PAGE_SHIFT);
        return ret;
  }
  
- #ifndef mce_unmap_kpfn
- static void mce_unmap_kpfn(unsigned long pfn)
- {
-       unsigned long decoy_addr;
-       /*
-        * Unmap this page from the kernel 1:1 mappings to make sure
-        * we don't log more errors because of speculative access to
-        * the page.
-        * We would like to just call:
-        *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
-        * but doing that would radically increase the odds of a
-        * speculative access to the poison page because we'd have
-        * the virtual address of the kernel 1:1 mapping sitting
-        * around in registers.
-        * Instead we get tricky.  We create a non-canonical address
-        * that looks just like the one we want, but has bit 63 flipped.
-        * This relies on set_memory_np() not checking whether we passed
-        * a legal address.
-        */
-       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
-       if (set_memory_np(decoy_addr, 1))
-               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
- }
- #endif
 +
 +/*
 + * Cases where we avoid rendezvous handler timeout:
 + * 1) If this CPU is offline.
 + *
 + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 + *  skip those CPUs which remain looping in the 1st kernel - see
 + *  crash_nmi_callback().
 + *
 + * Note: there still is a small window between kexec-ing and the new,
 + * kdump kernel establishing a new #MC handler where a broadcasted MCE
 + * might not get handled properly.
 + */
 +static bool __mc_check_crashing_cpu(int cpu)
 +{
 +      if (cpu_is_offline(cpu) ||
 +          (crashing_cpu != -1 && crashing_cpu != cpu)) {
 +              u64 mcgstatus;
 +
 +              mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 +              if (mcgstatus & MCG_STATUS_RIPV) {
 +                      mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 +                      return true;
 +              }
 +      }
 +      return false;
 +}
 +
 +static void __mc_scan_banks(struct mce *m, struct mce *final,
 +                          unsigned long *toclear, unsigned long *valid_banks,
 +                          int no_way_out, int *worst)
 +{
 +      struct mca_config *cfg = &mca_cfg;
 +      int severity, i;
 +
 +      for (i = 0; i < cfg->banks; i++) {
 +              __clear_bit(i, toclear);
 +              if (!test_bit(i, valid_banks))
 +                      continue;
 +
 +              if (!mce_banks[i].ctl)
 +                      continue;
 +
 +              m->misc = 0;
 +              m->addr = 0;
 +              m->bank = i;
 +
 +              m->status = mce_rdmsrl(msr_ops.status(i));
 +              if (!(m->status & MCI_STATUS_VAL))
 +                      continue;
 +
 +              /*
 +               * Corrected or non-signaled errors are handled by
 +               * machine_check_poll(). Leave them alone, unless this panics.
 +               */
 +              if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 +                      !no_way_out)
 +                      continue;
 +
 +              /* Set taint even when machine check was not enabled. */
 +              add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 +
 +              severity = mce_severity(m, cfg->tolerant, NULL, true);
 +
 +              /*
 +               * When machine check was for corrected/deferred handler don't
 +               * touch, unless we're panicking.
 +               */
 +              if ((severity == MCE_KEEP_SEVERITY ||
 +                   severity == MCE_UCNA_SEVERITY) && !no_way_out)
 +                      continue;
 +
 +              __set_bit(i, toclear);
 +
 +              /* Machine check event was not enabled. Clear, but ignore. */
 +              if (severity == MCE_NO_SEVERITY)
 +                      continue;
 +
 +              mce_read_aux(m, i);
 +
 +              /* assuming valid severity level != 0 */
 +              m->severity = severity;
 +
 +              mce_log(m);
 +
 +              if (severity > *worst) {
 +                      *final = *m;
 +                      *worst = severity;
 +              }
 +      }
 +
 +      /* mce_clear_state will clear *final, save locally for use later */
 +      *m = *final;
 +}
 +
  /*
   * The actual machine check handler. This only handles real
   * exceptions when something got corrupted coming in through int 18.
   */
  void do_machine_check(struct pt_regs *regs, long error_code)
  {
 +      DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 +      DECLARE_BITMAP(toclear, MAX_NR_BANKS);
        struct mca_config *cfg = &mca_cfg;
 +      int cpu = smp_processor_id();
 +      char *msg = "Unknown";
        struct mce m, *final;
 -      int i;
        int worst = 0;
 -      int severity;
  
        /*
         * Establish sequential order between the CPUs entering the machine
         * check handler.
         */
        int order = -1;
 +
        /*
         * If no_way_out gets set, there is no safe way to recover from this
         * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
         */
        int no_way_out = 0;
 +
        /*
         * If kill_it gets set, there might be a way to recover from this
         * error.
         */
        int kill_it = 0;
 -      DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 -      DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 -      char *msg = "Unknown";
  
        /*
         * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
         * on Intel.
         */
        int lmce = 1;
 -      int cpu = smp_processor_id();
 -
 -      /*
 -       * Cases where we avoid rendezvous handler timeout:
 -       * 1) If this CPU is offline.
 -       *
 -       * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 -       *  skip those CPUs which remain looping in the 1st kernel - see
 -       *  crash_nmi_callback().
 -       *
 -       * Note: there still is a small window between kexec-ing and the new,
 -       * kdump kernel establishing a new #MC handler where a broadcasted MCE
 -       * might not get handled properly.
 -       */
 -      if (cpu_is_offline(cpu) ||
 -          (crashing_cpu != -1 && crashing_cpu != cpu)) {
 -              u64 mcgstatus;
  
 -              mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 -              if (mcgstatus & MCG_STATUS_RIPV) {
 -                      mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 -                      return;
 -              }
 -      }
 +      if (__mc_check_crashing_cpu(cpu))
 +              return;
  
        ist_enter(regs);
  
        this_cpu_inc(mce_exception_count);
  
 -      if (!cfg->banks)
 -              goto out;
 -
        mce_gather_info(&m, regs);
        m.tsc = rdtsc();
  
                order = mce_start(&no_way_out);
        }
  
 -      for (i = 0; i < cfg->banks; i++) {
 -              __clear_bit(i, toclear);
 -              if (!test_bit(i, valid_banks))
 -                      continue;
 -              if (!mce_banks[i].ctl)
 -                      continue;
 -
 -              m.misc = 0;
 -              m.addr = 0;
 -              m.bank = i;
 -
 -              m.status = mce_rdmsrl(msr_ops.status(i));
 -              if ((m.status & MCI_STATUS_VAL) == 0)
 -                      continue;
 -
 -              /*
 -               * Non uncorrected or non signaled errors are handled by
 -               * machine_check_poll. Leave them alone, unless this panics.
 -               */
 -              if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 -                      !no_way_out)
 -                      continue;
 -
 -              /*
 -               * Set taint even when machine check was not enabled.
 -               */
 -              add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 -
 -              severity = mce_severity(&m, cfg->tolerant, NULL, true);
 -
 -              /*
 -               * When machine check was for corrected/deferred handler don't
 -               * touch, unless we're panicing.
 -               */
 -              if ((severity == MCE_KEEP_SEVERITY ||
 -                   severity == MCE_UCNA_SEVERITY) && !no_way_out)
 -                      continue;
 -              __set_bit(i, toclear);
 -              if (severity == MCE_NO_SEVERITY) {
 -                      /*
 -                       * Machine check event was not enabled. Clear, but
 -                       * ignore.
 -                       */
 -                      continue;
 -              }
 -
 -              mce_read_aux(&m, i);
 -
 -              /* assuming valid severity level != 0 */
 -              m.severity = severity;
 -
 -              mce_log(&m);
 -
 -              if (severity > worst) {
 -                      *final = m;
 -                      worst = severity;
 -              }
 -      }
 -
 -      /* mce_clear_state will clear *final, save locally for use later */
 -      m = *final;
 +      __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
  
        if (!no_way_out)
                mce_clear_state(toclear);
        if (worst > 0)
                mce_report_event(regs);
        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 -out:
 +
        sync_core();
  
        if (worst != MCE_AR_SEVERITY && !kill_it)
@@@ -2177,6 -2133,9 +2145,6 @@@ static ssize_t store_int_with_restart(s
        if (check_interval == old_check_interval)
                return ret;
  
 -      if (check_interval < 1)
 -              check_interval = 1;
 -
        mutex_lock(&mce_sysfs_mutex);
        mce_restart();
        mutex_unlock(&mce_sysfs_mutex);
diff --combined drivers/dax/device.c
index 0a2acd7993f0b3a561cf91beb0b4200838f601f4,361a1108959168b13adb0bd08e90ce61581a43df..6fd46083e62958eea61716225cd9f6c7fe11e748
@@@ -189,16 -189,14 +189,16 @@@ static int check_vma(struct dev_dax *de
  
        /* prevent private mappings from being established */
        if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
 -              dev_info(dev, "%s: %s: fail, attempted private mapping\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, attempted private mapping\n",
                                current->comm, func);
                return -EINVAL;
        }
  
        mask = dax_region->align - 1;
        if (vma->vm_start & mask || vma->vm_end & mask) {
 -              dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
                                current->comm, func, vma->vm_start, vma->vm_end,
                                mask);
                return -EINVAL;
  
        if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
                        && (vma->vm_flags & VM_DONTCOPY) == 0) {
 -              dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, dax range requires MADV_DONTFORK\n",
                                current->comm, func);
                return -EINVAL;
        }
  
        if (!vma_is_dax(vma)) {
 -              dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
 +              dev_info_ratelimited(dev,
 +                              "%s: %s: fail, vma is not DAX capable\n",
                                current->comm, func);
                return -EINVAL;
        }
@@@ -248,13 -244,12 +248,12 @@@ __weak phys_addr_t dax_pgoff_to_phys(st
        return -1;
  }
  
- static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        struct device *dev = &dev_dax->dev;
        struct dax_region *dax_region;
-       int rc = VM_FAULT_SIGBUS;
        phys_addr_t phys;
-       pfn_t pfn;
        unsigned int fault_size = PAGE_SIZE;
  
        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;
        }
  
-       pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
-       rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
-       if (rc == -ENOMEM)
-               return VM_FAULT_OOM;
-       if (rc < 0 && rc != -EBUSY)
-               return VM_FAULT_SIGBUS;
+       *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
  
-       return VM_FAULT_NOPAGE;
+       return vmf_insert_mixed(vmf->vma, vmf->address, *pfn);
  }
  
- static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        unsigned long pmd_addr = vmf->address & PMD_MASK;
        struct device *dev = &dev_dax->dev;
        struct dax_region *dax_region;
        phys_addr_t phys;
        pgoff_t pgoff;
-       pfn_t pfn;
        unsigned int fault_size = PMD_SIZE;
  
        if (check_vma(dev_dax, vmf->vma, __func__))
                return VM_FAULT_SIGBUS;
        }
  
-       pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+       *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
  
-       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
+       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
                        vmf->flags & FAULT_FLAG_WRITE);
  }
  
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        unsigned long pud_addr = vmf->address & PUD_MASK;
        struct device *dev = &dev_dax->dev;
        struct dax_region *dax_region;
        phys_addr_t phys;
        pgoff_t pgoff;
-       pfn_t pfn;
        unsigned int fault_size = PUD_SIZE;
  
  
                return VM_FAULT_SIGBUS;
        }
  
-       pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+       *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
  
-       return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
+       return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
                        vmf->flags & FAULT_FLAG_WRITE);
  }
  #else
- static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
+ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
+                               struct vm_fault *vmf, pfn_t *pfn)
  {
        return VM_FAULT_FALLBACK;
  }
  #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  
- static int dev_dax_huge_fault(struct vm_fault *vmf,
+ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf,
                enum page_entry_size pe_size)
  {
-       int rc, id;
        struct file *filp = vmf->vma->vm_file;
+       unsigned long fault_size;
+       int rc, id;
+       pfn_t pfn;
        struct dev_dax *dev_dax = filp->private_data;
  
        dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm,
        id = dax_read_lock();
        switch (pe_size) {
        case PE_SIZE_PTE:
-               rc = __dev_dax_pte_fault(dev_dax, vmf);
+               fault_size = PAGE_SIZE;
+               rc = __dev_dax_pte_fault(dev_dax, vmf, &pfn);
                break;
        case PE_SIZE_PMD:
-               rc = __dev_dax_pmd_fault(dev_dax, vmf);
+               fault_size = PMD_SIZE;
+               rc = __dev_dax_pmd_fault(dev_dax, vmf, &pfn);
                break;
        case PE_SIZE_PUD:
-               rc = __dev_dax_pud_fault(dev_dax, vmf);
+               fault_size = PUD_SIZE;
+               rc = __dev_dax_pud_fault(dev_dax, vmf, &pfn);
                break;
        default:
                rc = VM_FAULT_SIGBUS;
        }
+       if (rc == VM_FAULT_NOPAGE) {
+               unsigned long i;
+               pgoff_t pgoff;
+               /*
+                * In the device-dax case the only possibility for a
+                * VM_FAULT_NOPAGE result is when device-dax capacity is
+                * mapped. No need to consider the zero page, or racing
+                * conflicting mappings.
+                */
+               pgoff = linear_page_index(vmf->vma, vmf->address
+                               & ~(fault_size - 1));
+               for (i = 0; i < fault_size / PAGE_SIZE; i++) {
+                       struct page *page;
+                       page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+                       if (page->mapping)
+                               continue;
+                       page->mapping = filp->f_mapping;
+                       page->index = pgoff + i;
+               }
+       }
        dax_read_unlock(id);
  
        return rc;
  }
  
- static int dev_dax_fault(struct vm_fault *vmf)
+ static vm_fault_t dev_dax_fault(struct vm_fault *vmf)
  {
        return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
  }
@@@ -474,7 -491,7 +495,7 @@@ static int dax_mmap(struct file *filp, 
                return rc;
  
        vma->vm_ops = &dax_vm_ops;
 -      vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
 +      vma->vm_flags |= VM_HUGEPAGE;
        return 0;
  }
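
The fault-handler change above is exercised by any ordinary device-dax mapping; the sketch below shows roughly what such a mapping looks like from user space. The device path /dev/dax0.0 and the 2 MiB alignment are assumptions, not part of this series.

/*
 * Minimal device-dax mapping sketch, illustration only.  The device
 * path /dev/dax0.0 and the 2 MiB region alignment are assumptions;
 * check_vma() above rejects private or unaligned mappings, so the
 * mapping must be MAP_SHARED with an alignment-sized length.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 2UL << 20;         /* one assumed 2 MiB alignment unit */
        int fd = open("/dev/dax0.0", O_RDWR);
        char *p;

        if (fd < 0) {
                perror("open /dev/dax0.0");
                return 1;
        }
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /*
         * First touch faults in device-dax capacity; with the change
         * above the fault handler also records page->mapping and
         * page->index for every page in the faulted range, which is
         * what lets memory_failure() later attribute poison to this
         * process.
         */
        memset(p, 0, len);
        munmap(p, len);
        close(fd);
        return 0;
}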
  
diff --combined drivers/nvdimm/pmem.c
index c236498676964fd147df293b0cec30880b07cf93,55c7a69751d39306659e906af9b0c49fb344a042..6071e2942053c903564d6f08f278d3735a619308
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/hdreg.h>
  #include <linux/init.h>
  #include <linux/platform_device.h>
+ #include <linux/set_memory.h>
  #include <linux/module.h>
  #include <linux/moduleparam.h>
  #include <linux/badblocks.h>
@@@ -51,6 -52,30 +52,30 @@@ static struct nd_region *to_region(stru
        return to_nd_region(to_dev(pmem)->parent);
  }
  
+ static void hwpoison_clear(struct pmem_device *pmem,
+               phys_addr_t phys, unsigned int len)
+ {
+       unsigned long pfn_start, pfn_end, pfn;
+       /* only pmem in the linear map supports HWPoison */
+       if (is_vmalloc_addr(pmem->virt_addr))
+               return;
+       pfn_start = PHYS_PFN(phys);
+       pfn_end = pfn_start + PHYS_PFN(len);
+       for (pfn = pfn_start; pfn < pfn_end; pfn++) {
+               struct page *page = pfn_to_page(pfn);
+               /*
+                * Note, no need to hold a get_dev_pagemap() reference
+                * here since we're in the driver I/O path and
+                * outstanding I/O requests pin the dev_pagemap.
+                */
+               if (test_and_clear_pmem_poison(page))
+                       clear_mce_nospec(pfn);
+       }
+ }
  static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
                phys_addr_t offset, unsigned int len)
  {
@@@ -65,6 -90,7 +90,7 @@@
        if (cleared < len)
                rc = BLK_STS_IOERR;
        if (cleared > 0 && cleared / 512) {
+               hwpoison_clear(pmem, pmem->phys_addr + offset, cleared);
                cleared /= 512;
                dev_dbg(dev, "%#llx clear %ld sector%s\n",
                                (unsigned long long) sector, cleared,
@@@ -120,7 -146,7 +146,7 @@@ static blk_status_t read_pmem(struct pa
  }
  
  static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 -                      unsigned int len, unsigned int off, bool is_write,
 +                      unsigned int len, unsigned int off, unsigned int op,
                        sector_t sector)
  {
        blk_status_t rc = BLK_STS_OK;
        if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                bad_pmem = true;
  
 -      if (!is_write) {
 +      if (!op_is_write(op)) {
                if (unlikely(bad_pmem))
                        rc = BLK_STS_IOERR;
                else {
@@@ -180,7 -206,8 +206,7 @@@ static blk_qc_t pmem_make_request(struc
        do_acct = nd_iostat_start(bio, &start);
        bio_for_each_segment(bvec, bio, iter) {
                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
 -                              bvec.bv_offset, op_is_write(bio_op(bio)),
 -                              iter.bi_sector);
 +                              bvec.bv_offset, bio_op(bio), iter.bi_sector);
                if (rc) {
                        bio->bi_status = rc;
                        break;
  }
  
  static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 -                     struct page *page, bool is_write)
 +                     struct page *page, unsigned int op)
  {
        struct pmem_device *pmem = bdev->bd_queue->queuedata;
        blk_status_t rc;
  
        rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE,
 -                        0, is_write, sector);
 +                        0, op, sector);
  
        /*
         * The ->rw_page interface is subtle and tricky.  The core
         * caused by double completion.
         */
        if (rc == 0)
 -              page_endio(page, is_write, 0);
 +              page_endio(page, op_is_write(op), 0);
  
        return blk_status_to_errno(rc);
  }
@@@ -226,11 -253,8 +252,11 @@@ __weak long __pmem_direct_access(struc
        if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512,
                                        PFN_PHYS(nr_pages))))
                return -EIO;
 -      *kaddr = pmem->virt_addr + offset;
 -      *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 +
 +      if (kaddr)
 +              *kaddr = pmem->virt_addr + offset;
 +      if (pfn)
 +              *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
  
        /*
         * If badblocks are present, limit known good range to the
diff --combined fs/dax.c
index f76724139f80c50da39e37e6b95058b7095eabbc,57ec272038da698b7053d4cc362a3533b2a87f11..f32d7125ad0f237d61173cd72383683ac380c4e4
+++ b/fs/dax.c
@@@ -226,8 -226,8 +226,8 @@@ static inline void *unlock_slot(struct 
   *
   * Must be called with the i_pages lock held.
   */
- static void *get_unlocked_mapping_entry(struct address_space *mapping,
-                                       pgoff_t index, void ***slotp)
+ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
+               pgoff_t index, void ***slotp, bool (*wait_fn)(void))
  {
        void *entry, **slot;
        struct wait_exceptional_entry_queue ewait;
        ewait.wait.func = wake_exceptional_entry_func;
  
        for (;;) {
+               bool revalidate;
                entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
                                          &slot);
                if (!entry ||
                prepare_to_wait_exclusive(wq, &ewait.wait,
                                          TASK_UNINTERRUPTIBLE);
                xa_unlock_irq(&mapping->i_pages);
-               schedule();
+               revalidate = wait_fn();
                finish_wait(wq, &ewait.wait);
                xa_lock_irq(&mapping->i_pages);
+               if (revalidate)
+                       return ERR_PTR(-EAGAIN);
        }
  }
  
- static void dax_unlock_mapping_entry(struct address_space *mapping,
-                                    pgoff_t index)
+ static bool entry_wait(void)
+ {
+       schedule();
+       /*
+        * Never return an ERR_PTR() from
+        * __get_unlocked_mapping_entry(), just keep looping.
+        */
+       return false;
+ }
+ static void *get_unlocked_mapping_entry(struct address_space *mapping,
+               pgoff_t index, void ***slotp)
+ {
+       return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
+ }
+ static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
  {
        void *entry, **slot;
  
  static void put_locked_mapping_entry(struct address_space *mapping,
                pgoff_t index)
  {
-       dax_unlock_mapping_entry(mapping, index);
+       unlock_mapping_entry(mapping, index);
  }
  
  /*
@@@ -319,18 -338,27 +338,27 @@@ static unsigned long dax_radix_end_pfn(
        for (pfn = dax_radix_pfn(entry); \
                        pfn < dax_radix_end_pfn(entry); pfn++)
  
- static void dax_associate_entry(void *entry, struct address_space *mapping)
+ /*
+  * TODO: for reflink+dax we need a way to associate a single page with
+  * multiple address_space instances at different linear_page_index()
+  * offsets.
+  */
+ static void dax_associate_entry(void *entry, struct address_space *mapping,
+               struct vm_area_struct *vma, unsigned long address)
  {
-       unsigned long pfn;
+       unsigned long size = dax_entry_size(entry), pfn, index;
+       int i = 0;
  
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;
  
+       index = linear_page_index(vma, address & ~(size - 1));
        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);
  
                WARN_ON_ONCE(page->mapping);
                page->mapping = mapping;
+               page->index = index + i++;
        }
  }
  
@@@ -348,6 -376,7 +376,7 @@@ static void dax_disassociate_entry(voi
                WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
                WARN_ON_ONCE(page->mapping && page->mapping != mapping);
                page->mapping = NULL;
+               page->index = 0;
        }
  }
  
@@@ -364,6 -393,84 +393,84 @@@ static struct page *dax_busy_page(void 
        return NULL;
  }
  
+ static bool entry_wait_revalidate(void)
+ {
+       rcu_read_unlock();
+       schedule();
+       rcu_read_lock();
+       /*
+        * Tell __get_unlocked_mapping_entry() to take a break, we need
+        * to revalidate page->mapping after dropping locks
+        */
+       return true;
+ }
+ bool dax_lock_mapping_entry(struct page *page)
+ {
+       pgoff_t index;
+       struct inode *inode;
+       bool did_lock = false;
+       void *entry = NULL, **slot;
+       struct address_space *mapping;
+       rcu_read_lock();
+       for (;;) {
+               mapping = READ_ONCE(page->mapping);
+               if (!dax_mapping(mapping))
+                       break;
+               /*
+                * In the device-dax case there's no need to lock, a
+                * struct dev_pagemap pin is sufficient to keep the
+                * inode alive, and we assume we have dev_pagemap pin
+                * otherwise we would not have a valid pfn_to_page()
+                * translation.
+                */
+               inode = mapping->host;
+               if (S_ISCHR(inode->i_mode)) {
+                       did_lock = true;
+                       break;
+               }
+               xa_lock_irq(&mapping->i_pages);
+               if (mapping != page->mapping) {
+                       xa_unlock_irq(&mapping->i_pages);
+                       continue;
+               }
+               index = page->index;
+               entry = __get_unlocked_mapping_entry(mapping, index, &slot,
+                               entry_wait_revalidate);
+               if (!entry) {
+                       xa_unlock_irq(&mapping->i_pages);
+                       break;
+               } else if (IS_ERR(entry)) {
+                       WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+                       continue;
+               }
+               lock_slot(mapping, slot);
+               did_lock = true;
+               xa_unlock_irq(&mapping->i_pages);
+               break;
+       }
+       rcu_read_unlock();
+       return did_lock;
+ }
+ void dax_unlock_mapping_entry(struct page *page)
+ {
+       struct address_space *mapping = page->mapping;
+       struct inode *inode = mapping->host;
+       if (S_ISCHR(inode->i_mode))
+               return;
+       unlock_mapping_entry(mapping, page->index);
+ }
  /*
   * Find radix tree entry at given index. If it points to an exceptional entry,
   * return it with the radix tree entry locked. If the radix tree doesn't
@@@ -566,8 -673,7 +673,8 @@@ struct page *dax_layout_busy_page(struc
                        if (index >= end)
                                break;
  
 -                      if (!radix_tree_exceptional_entry(pvec_ent))
 +                      if (WARN_ON_ONCE(
 +                           !radix_tree_exceptional_entry(pvec_ent)))
                                continue;
  
                        xa_lock_irq(&mapping->i_pages);
                        if (page)
                                break;
                }
 +
 +              /*
 +               * We don't expect normal struct page entries to exist in our
 +               * tree, but we keep these pagevec calls so that this code is
 +               * consistent with the common pattern for handling pagevecs
 +               * throughout the kernel.
 +               */
                pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                index++;
@@@ -655,6 -754,7 +762,6 @@@ static int copy_user_dax(struct block_d
  {
        void *vto, *kaddr;
        pgoff_t pgoff;
 -      pfn_t pfn;
        long rc;
        int id;
  
                return rc;
  
        id = dax_read_lock();
 -      rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
 +      rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
        if (rc < 0) {
                dax_read_unlock(id);
                return rc;
@@@ -708,7 -808,7 +815,7 @@@ static void *dax_insert_mapping_entry(s
        new_entry = dax_radix_locked_entry(pfn, flags);
        if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
                dax_disassociate_entry(entry, mapping, false);
-               dax_associate_entry(new_entry, mapping);
+               dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
        }
  
        if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
@@@ -974,6 -1074,7 +1081,6 @@@ static int dax_iomap_pfn(struct iomap *
  {
        const sector_t sector = dax_iomap_sector(iomap, pos);
        pgoff_t pgoff;
 -      void *kaddr;
        int id, rc;
        long length;
  
                return rc;
        id = dax_read_lock();
        length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
 -                                 &kaddr, pfnp);
 +                                 NULL, pfnp);
        if (length < 0) {
                rc = length;
                goto out;
@@@ -1058,13 -1159,15 +1165,13 @@@ int __dax_zero_page_range(struct block_
                pgoff_t pgoff;
                long rc, id;
                void *kaddr;
 -              pfn_t pfn;
  
                rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
                if (rc)
                        return rc;
  
                id = dax_read_lock();
 -              rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
 -                              &pfn);
 +              rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
                if (rc < 0) {
                        dax_read_unlock(id);
                        return rc;
@@@ -1120,6 -1223,7 +1227,6 @@@ dax_iomap_actor(struct inode *inode, lo
                ssize_t map_len;
                pgoff_t pgoff;
                void *kaddr;
 -              pfn_t pfn;
  
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
  
                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
 -                              &kaddr, &pfn);
 +                              &kaddr, NULL);
                if (map_len < 0) {
                        ret = map_len;
                        break;
diff --combined include/linux/huge_mm.h
index 27e3e32135a84de8d9c9834ae195f653c6ed2b78,d3bbf6bea9e91c055b6232d0dec4d070140b6be3..99c19b06d9a46d2cebf20ad5f21a1612a94b5855
@@@ -3,10 -3,11 +3,11 @@@
  #define _LINUX_HUGE_MM_H
  
  #include <linux/sched/coredump.h>
+ #include <linux/mm_types.h>
  
  #include <linux/fs.h> /* only for vma_is_dax() */
  
 -extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 +extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
  extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                         pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                         struct vm_area_struct *vma);
@@@ -23,7 -24,7 +24,7 @@@ static inline void huge_pud_set_accesse
  }
  #endif
  
 -extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
 +extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
  extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          unsigned long addr,
                                          pmd_t *pmd,
@@@ -46,9 -47,9 +47,9 @@@ extern bool move_huge_pmd(struct vm_are
  extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
                        int prot_numa);
- int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmd, pfn_t pfn, bool write);
- int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                        pud_t *pud, pfn_t pfn, bool write);
  enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_FLAG,
@@@ -216,7 -217,7 +217,7 @@@ struct page *follow_devmap_pmd(struct v
  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, int flags);
  
 -extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 +extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
  
  extern struct page *huge_zero_page;
  
@@@ -321,8 -322,7 +322,8 @@@ static inline spinlock_t *pud_trans_hug
        return NULL;
  }
  
 -static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
 +static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
 +              pmd_t orig_pmd)
  {
        return 0;
  }
diff --combined include/linux/mm.h
index 8fcc36660de672c84fce606ba0a1fff22cd5a537,374e5e9284f7a17796751405e7e08a4e2664abc5..a61ebe8ad4ca92e72e23855c17f8e7c9ad059a54
@@@ -155,9 -155,7 +155,9 @@@ extern int overcommit_kbytes_handler(st
   * mmap() functions).
   */
  
 -extern struct kmem_cache *vm_area_cachep;
 +struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 +struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 +void vm_area_free(struct vm_area_struct *);
  
  #ifndef CONFIG_MMU
  extern struct rb_root nommu_region_tree;
@@@ -452,24 -450,6 +452,24 @@@ struct vm_operations_struct 
                                          unsigned long addr);
  };
  
 +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 +{
 +      static const struct vm_operations_struct dummy_vm_ops = {};
 +
 +      memset(vma, 0, sizeof(*vma));
 +      vma->vm_mm = mm;
 +      vma->vm_ops = &dummy_vm_ops;
 +      INIT_LIST_HEAD(&vma->anon_vma_chain);
 +}
 +
 +static inline void vma_set_anonymous(struct vm_area_struct *vma)
 +{
 +      vma->vm_ops = NULL;
 +}
 +
 +/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
 +#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }
 +
  struct mmu_gather;
  struct inode;
  
@@@ -728,10 -708,10 +728,10 @@@ static inline pte_t maybe_mkwrite(pte_
        return pte;
  }
  
 -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
                struct page *page);
 -int finish_fault(struct vm_fault *vmf);
 -int finish_mkwrite_fault(struct vm_fault *vmf);
 +vm_fault_t finish_fault(struct vm_fault *vmf);
 +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
  #endif
  
  /*
@@@ -960,6 -940,15 +960,6 @@@ static inline int page_zone_id(struct p
        return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
  }
  
 -static inline int zone_to_nid(struct zone *zone)
 -{
 -#ifdef CONFIG_NUMA
 -      return zone->node;
 -#else
 -      return 0;
 -#endif
 -}
 -
  #ifdef NODE_NOT_IN_PAGE_FLAGS
  extern int page_to_nid(const struct page *page);
  #else
@@@ -1403,8 -1392,8 +1403,8 @@@ int generic_error_remove_page(struct ad
  int invalidate_inode_page(struct page *page);
  
  #ifdef CONFIG_MMU
 -extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 -              unsigned int flags);
 +extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 +                      unsigned long address, unsigned int flags);
  extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
@@@ -1413,7 -1402,7 +1413,7 @@@ void unmap_mapping_pages(struct address
  void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
  #else
 -static inline int handle_mm_fault(struct vm_area_struct *vma,
 +static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
  {
        /* should never happen if there's no MMU */
@@@ -2015,7 -2004,7 +2015,7 @@@ static inline spinlock_t *pud_lock(stru
  
  extern void __init pagecache_init(void);
  extern void free_area_init(unsigned long * zones_size);
 -extern void free_area_init_node(int nid, unsigned long * zones_size,
 +extern void __init free_area_init_node(int nid, unsigned long * zones_size,
                unsigned long zone_start_pfn, unsigned long *zholes_size);
  extern void free_initmem(void);
  
@@@ -2143,7 -2132,7 +2143,7 @@@ extern int __meminit __early_pfn_to_nid
                                        struct mminit_pfnnid_cache *state);
  #endif
  
 -#ifdef CONFIG_HAVE_MEMBLOCK
 +#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
  void zero_resv_unavail(void);
  #else
  static inline void zero_resv_unavail(void) {}
@@@ -2563,7 -2552,7 +2563,7 @@@ static inline struct page *follow_page(
  #define FOLL_COW      0x4000  /* internal GUP flag */
  #define FOLL_ANON     0x8000  /* don't do file mappings */
  
 -static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
 +static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
  {
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;
@@@ -2657,7 -2646,12 +2657,7 @@@ extern int randomize_va_space
  const char * arch_vma_name(struct vm_area_struct *vma);
  void print_vma_addr(char *prefix, unsigned long rip);
  
 -void sparse_mem_maps_populate_node(struct page **map_map,
 -                                 unsigned long pnum_begin,
 -                                 unsigned long pnum_end,
 -                                 unsigned long map_count,
 -                                 int nodeid);
 -
 +void *sparse_buffer_alloc(unsigned long size);
  struct page *sparse_mem_map_populate(unsigned long pnum, int nid,
                struct vmem_altmap *altmap);
  pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
@@@ -2731,6 -2725,7 +2731,7 @@@ enum mf_action_page_type 
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_BUDDY_2ND,
+       MF_MSG_DAX,
        MF_MSG_UNKNOWN,
  };
  
@@@ -2739,8 -2734,7 +2740,8 @@@ extern void clear_huge_page(struct pag
                            unsigned long addr_hint,
                            unsigned int pages_per_huge_page);
  extern void copy_user_huge_page(struct page *dst, struct page *src,
 -                              unsigned long addr, struct vm_area_struct *vma,
 +                              unsigned long addr_hint,
 +                              struct vm_area_struct *vma,
                                unsigned int pages_per_huge_page);
  extern long copy_huge_page_from_user(struct page *dst_page,
                                const void __user *usr_src,
diff --combined kernel/memremap.c
index d57d58f77409214cf93ece9454354242ffd8dd85,62603634a1d235366c6eb19cb4d8f6062b7a70fa..5b8600d39931964adcef966b9660aceac5918124
@@@ -5,7 -5,6 +5,7 @@@
  #include <linux/types.h>
  #include <linux/pfn_t.h>
  #include <linux/io.h>
 +#include <linux/kasan.h>
  #include <linux/mm.h>
  #include <linux/memory_hotplug.h>
  #include <linux/swap.h>
@@@ -43,7 -42,7 +43,7 @@@ static unsigned long order_at(struct re
                        pgoff += 1UL << order, order = order_at((res), pgoff))
  
  #if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
 -int device_private_entry_fault(struct vm_area_struct *vma,
 +vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
                       unsigned long addr,
                       swp_entry_t entry,
                       unsigned int flags,
@@@ -138,7 -137,6 +138,7 @@@ static void devm_memremap_pages_release
        mem_hotplug_begin();
        arch_remove_memory(align_start, align_size, pgmap->altmap_valid ?
                        &pgmap->altmap : NULL);
 +      kasan_remove_zero_shadow(__va(align_start), align_size);
        mem_hotplug_done();
  
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
@@@ -178,27 -176,10 +178,27 @@@ void *devm_memremap_pages(struct devic
        unsigned long pfn, pgoff, order;
        pgprot_t pgprot = PAGE_KERNEL;
        int error, nid, is_ram;
 +      struct dev_pagemap *conflict_pgmap;
  
        align_start = res->start & ~(SECTION_SIZE - 1);
        align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
                - align_start;
 +      align_end = align_start + align_size - 1;
 +
 +      conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
 +      if (conflict_pgmap) {
 +              dev_WARN(dev, "Conflicting mapping in same section\n");
 +              put_dev_pagemap(conflict_pgmap);
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
 +      conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
 +      if (conflict_pgmap) {
 +              dev_WARN(dev, "Conflicting mapping in same section\n");
 +              put_dev_pagemap(conflict_pgmap);
 +              return ERR_PTR(-ENOMEM);
 +      }
 +
        is_ram = region_intersects(align_start, align_size,
                IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
  
  
        mutex_lock(&pgmap_lock);
        error = 0;
 -      align_end = align_start + align_size - 1;
  
        foreach_order_pgoff(res, order, pgoff) {
                error = __radix_tree_insert(&pgmap_radix,
                goto err_pfn_remap;
  
        mem_hotplug_begin();
 +      error = kasan_add_zero_shadow(__va(align_start), align_size);
 +      if (error) {
 +              mem_hotplug_done();
 +              goto err_kasan;
 +      }
 +
        error = arch_add_memory(nid, align_start, align_size, altmap, false);
        if (!error)
                move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
        return __va(res->start);
  
   err_add_memory:
 +      kasan_remove_zero_shadow(__va(align_start), align_size);
 + err_kasan:
        untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
   err_pfn_remap:
   err_radix:
@@@ -331,7 -305,7 +331,7 @@@ EXPORT_SYMBOL_GPL(get_dev_pagemap)
  
  #ifdef CONFIG_DEV_PAGEMAP_OPS
  DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
 -EXPORT_SYMBOL_GPL(devmap_managed_key);
 +EXPORT_SYMBOL(devmap_managed_key);
  static atomic_t devmap_enable;
  
  /*
@@@ -365,12 -339,11 +365,11 @@@ void __put_devmap_managed_page(struct p
                __ClearPageActive(page);
                __ClearPageWaiters(page);
  
-               page->mapping = NULL;
                mem_cgroup_uncharge(page);
  
                page->pgmap->page_free(page, page->pgmap->data);
        } else if (!count)
                __put_page(page);
  }
 -EXPORT_SYMBOL_GPL(__put_devmap_managed_page);
 +EXPORT_SYMBOL(__put_devmap_managed_page);
  #endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --combined mm/hmm.c
index 0b05545916106cad2f5bd490af19514107a6b9c5,f9d1d89dec4d66d4948c2c4de3602e0878efa7ca..c968e49f7a0c527258a85b8c0259467e3b2924de
+++ b/mm/hmm.c
@@@ -177,19 -177,16 +177,19 @@@ static void hmm_release(struct mmu_noti
        up_write(&hmm->mirrors_sem);
  }
  
 -static void hmm_invalidate_range_start(struct mmu_notifier *mn,
 +static int hmm_invalidate_range_start(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long start,
 -                                     unsigned long end)
 +                                     unsigned long end,
 +                                     bool blockable)
  {
        struct hmm *hmm = mm->hmm;
  
        VM_BUG_ON(!hmm);
  
        atomic_inc(&hmm->sequence);
 +
 +      return 0;
  }
  
  static void hmm_invalidate_range_end(struct mmu_notifier *mn,
@@@ -302,14 -299,14 +302,14 @@@ static int hmm_vma_do_fault(struct mm_w
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
 -      int r;
 +      vm_fault_t ret;
  
        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= write_fault ? FAULT_FLAG_WRITE : 0;
 -      r = handle_mm_fault(vma, addr, flags);
 -      if (r & VM_FAULT_RETRY)
 +      ret = handle_mm_fault(vma, addr, flags);
 +      if (ret & VM_FAULT_RETRY)
                return -EBUSY;
 -      if (r & VM_FAULT_ERROR) {
 +      if (ret & VM_FAULT_ERROR) {
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }
@@@ -679,8 -676,7 +679,8 @@@ int hmm_vma_get_pfns(struct hmm_range *
                return -EINVAL;
  
        /* FIXME support hugetlb fs */
 -      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
 +      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
 +                      vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }
@@@ -853,8 -849,7 +853,8 @@@ int hmm_vma_fault(struct hmm_range *ran
                return -EINVAL;
  
        /* FIXME support hugetlb fs */
 -      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
 +      if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
 +                      vma_is_dax(vma)) {
                hmm_pfns_special(range);
                return -EINVAL;
        }
@@@ -968,6 -963,8 +968,8 @@@ static void hmm_devmem_free(struct pag
  {
        struct hmm_devmem *devmem = data;
  
+       page->mapping = NULL;
        devmem->ops->free(devmem, page);
  }
  
@@@ -976,7 -973,10 +978,7 @@@ static RADIX_TREE(hmm_devmem_radix, GFP
  
  static void hmm_devmem_radix_release(struct resource *resource)
  {
 -      resource_size_t key, align_start, align_size;
 -
 -      align_start = resource->start & ~(PA_SECTION_SIZE - 1);
 -      align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
 +      resource_size_t key;
  
        mutex_lock(&hmm_devmem_lock);
        for (key = resource->start;
diff --combined mm/huge_memory.c
index 08b544383d7467df273eac5b97c7c4e07d2e7e4a,feba371169ca4c516f71ef7ee9d240f22d32deac..c3bc7e9c9a2acc550ea8aeb68720a8c5a611c610
@@@ -541,18 -541,18 +541,18 @@@ unsigned long thp_get_unmapped_area(str
  }
  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
  
 -static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 -              gfp_t gfp)
 +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 +                      struct page *page, gfp_t gfp)
  {
        struct vm_area_struct *vma = vmf->vma;
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 -      int ret = 0;
 +      vm_fault_t ret = 0;
  
        VM_BUG_ON_PAGE(!PageCompound(page), page);
  
 -      if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
 +      if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
  
                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
 -                      int ret;
 +                      vm_fault_t ret2;
  
                        spin_unlock(vmf->ptl);
                        mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(vma->vm_mm, pgtable);
 -                      ret = handle_userfault(vmf, VM_UFFD_MISSING);
 -                      VM_BUG_ON(ret & VM_FAULT_FALLBACK);
 -                      return ret;
 +                      ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
 +                      VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
 +                      return ret2;
                }
  
                entry = mk_huge_pmd(page, vma->vm_page_prot);
@@@ -663,7 -663,7 +663,7 @@@ static bool set_huge_zero_page(pgtable_
        return true;
  }
  
 -int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  {
        struct vm_area_struct *vma = vmf->vma;
        gfp_t gfp;
                pgtable_t pgtable;
                struct page *zero_page;
                bool set;
 -              int ret;
 +              vm_fault_t ret;
                pgtable = pte_alloc_one(vma->vm_mm, haddr);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
@@@ -752,7 -752,7 +752,7 @@@ static void insert_pfn_pmd(struct vm_ar
        spin_unlock(ptl);
  }
  
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmd, pfn_t pfn, bool write)
  {
        pgprot_t pgprot = vma->vm_page_prot;
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
 -      BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
 +      BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 +                      !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 -      BUG_ON(!pfn_t_devmap(pfn));
  
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;
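The reworked BUG_ON() lets a caller insert a devmap pfn even when the vma carries neither VM_PFNMAP nor VM_MIXEDMAP, the standalone !pfn_t_devmap() check is gone, and the function now returns vm_fault_t instead of a plain int. A minimal, hypothetical caller could look like the sketch below; my_phys_for_fault() is an assumed driver-specific helper, not an API from this series:

/* Hypothetical PMD-sized fault handler built on the vm_fault_t signature.
 * my_phys_for_fault() is an assumption standing in for the driver's own
 * physical-address lookup. */
static vm_fault_t my_dev_huge_fault(struct vm_fault *vmf)
{
	phys_addr_t phys = my_phys_for_fault(vmf);
	pfn_t pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
				  vmf->flags & FAULT_FLAG_WRITE);
}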
@@@ -812,7 -812,7 +812,7 @@@ static void insert_pfn_pud(struct vm_ar
        spin_unlock(ptl);
  }
  
int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                        pud_t *pud, pfn_t pfn, bool write)
  {
        pgprot_t pgprot = vma->vm_page_prot;
@@@ -1118,16 -1118,15 +1118,16 @@@ unlock
        spin_unlock(vmf->ptl);
  }
  
 -static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
 -              struct page *page)
 +static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
 +                      pmd_t orig_pmd, struct page *page)
  {
        struct vm_area_struct *vma = vmf->vma;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
        pmd_t _pmd;
 -      int ret = 0, i;
 +      int i;
 +      vm_fault_t ret = 0;
        struct page **pages;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
                                               vmf->address, page_to_nid(page));
                if (unlikely(!pages[i] ||
 -                           mem_cgroup_try_charge(pages[i], vma->vm_mm,
 +                           mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
                                     GFP_KERNEL, &memcg, false))) {
                        if (pages[i])
                                put_page(pages[i]);
@@@ -1237,7 -1236,7 +1237,7 @@@ out_free_pages
        goto out;
  }
  
 -int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
  {
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL, *new_page;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
        gfp_t huge_gfp;                 /* for allocation and charge */
 -      int ret = 0;
 +      vm_fault_t ret = 0;
  
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@@ -1313,7 -1312,7 +1313,7 @@@ alloc
                goto out;
        }
  
 -      if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
 +      if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
                                        huge_gfp, &memcg, true))) {
                put_page(new_page);
                split_huge_pmd(vma, vmf->pmd, vmf->address);
        if (!page)
                clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
        else
 -              copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 +              copy_user_huge_page(new_page, page, vmf->address,
 +                                  vma, HPAGE_PMD_NR);
        __SetPageUptodate(new_page);
  
        mmun_start = haddr;
@@@ -1458,7 -1456,7 +1458,7 @@@ out
  }
  
  /* NUMA hinting page fault entry point for trans huge pmds */
 -int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
  {
        struct vm_area_struct *vma = vmf->vma;
        struct anon_vma *anon_vma = NULL;
@@@ -1742,7 -1740,7 +1742,7 @@@ int zap_huge_pmd(struct mmu_gather *tlb
                } else {
                        if (arch_needs_pgtable_deposit())
                                zap_deposited_table(tlb->mm, pmd);
 -                      add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 +                      add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
                }
  
                spin_unlock(ptl);
@@@ -2086,13 -2084,11 +2086,13 @@@ static void __split_huge_pmd_locked(str
                if (vma_is_dax(vma))
                        return;
                page = pmd_page(_pmd);
 +              if (!PageDirty(page) && pmd_dirty(_pmd))
 +                      set_page_dirty(page);
                if (!PageReferenced(page) && pmd_young(_pmd))
                        SetPageReferenced(page);
                page_remove_rmap(page, true);
                put_page(page);
 -              add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
 +              add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
                return;
        } else if (is_huge_zero_pmd(*pmd)) {
                /*
diff --combined mm/memory-failure.c
index 192d0bbfc9ea58823b41f14e7a5a515932398eba,32a644d9c2eeb8a179f6cfe72021a7e6c21fdaf8..0cd3de3550f0830f507d286b0499789d7961171e
  #include <linux/hugetlb.h>
  #include <linux/memory_hotplug.h>
  #include <linux/mm_inline.h>
+ #include <linux/memremap.h>
  #include <linux/kfifo.h>
  #include <linux/ratelimit.h>
 +#include <linux/page-isolation.h>
  #include "internal.h"
  #include "ras/ras_event.h"
  
@@@ -174,23 -174,52 +175,52 @@@ int hwpoison_filter(struct page *p
  
  EXPORT_SYMBOL_GPL(hwpoison_filter);
  
+ /*
+  * Kill all processes that have a poisoned page mapped and then isolate
+  * the page.
+  *
+  * General strategy:
+  * Find all processes having the page mapped and kill them.
+  * But we keep a page reference around so that the page is not
+  * actually freed yet.
+  * Then stash the page away
+  *
+  * There's no convenient way to get back to mapped processes
+  * from the VMAs. So do a brute-force search over all
+  * running processes.
+  *
+  * Remember that machine checks are not common (or rather
+  * if they are common you have other problems), so this shouldn't
+  * be a performance issue.
+  *
+  * Also there are some races possible while we get from the
+  * error detection to actually handle it.
+  */
+ struct to_kill {
+       struct list_head nd;
+       struct task_struct *tsk;
+       unsigned long addr;
+       short size_shift;
+       char addr_valid;
+ };
  /*
   * Send all the processes who have the page mapped a signal.
   * ``action optional'' if they are not immediately affected by the error
   * ``action required'' if error happened in current execution context
   */
- static int kill_proc(struct task_struct *t, unsigned long addr,
-                       unsigned long pfn, struct page *page, int flags)
+ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
  {
-       short addr_lsb;
+       struct task_struct *t = tk->tsk;
+       short addr_lsb = tk->size_shift;
        int ret;
  
        pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
                pfn, t->comm, t->pid);
-       addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
  
        if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
-               ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr,
+               ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
                                       addr_lsb, current);
        } else {
                /*
                 * This could cause a loop when the user sets SIGBUS
                 * to SIG_IGN, but hopefully no one will do that?
                 */
-               ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr,
+               ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
                                      addr_lsb, t);  /* synchronous? */
        }
        if (ret < 0)
@@@ -235,34 -264,39 +265,39 @@@ void shake_page(struct page *p, int acc
  }
  EXPORT_SYMBOL_GPL(shake_page);
  
- /*
-  * Kill all processes that have a poisoned page mapped and then isolate
-  * the page.
-  *
-  * General strategy:
-  * Find all processes having the page mapped and kill them.
-  * But we keep a page reference around so that the page is not
-  * actually freed yet.
-  * Then stash the page away
-  *
-  * There's no convenient way to get back to mapped processes
-  * from the VMAs. So do a brute-force search over all
-  * running processes.
-  *
-  * Remember that machine checks are not common (or rather
-  * if they are common you have other problems), so this shouldn't
-  * be a performance issue.
-  *
-  * Also there are some races possible while we get from the
-  * error detection to actually handle it.
-  */
- struct to_kill {
-       struct list_head nd;
-       struct task_struct *tsk;
-       unsigned long addr;
-       char addr_valid;
- };
+ static unsigned long dev_pagemap_mapping_shift(struct page *page,
+               struct vm_area_struct *vma)
+ {
+       unsigned long address = vma_address(page, vma);
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset(vma->vm_mm, address);
+       if (!pgd_present(*pgd))
+               return 0;
+       p4d = p4d_offset(pgd, address);
+       if (!p4d_present(*p4d))
+               return 0;
+       pud = pud_offset(p4d, address);
+       if (!pud_present(*pud))
+               return 0;
+       if (pud_devmap(*pud))
+               return PUD_SHIFT;
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+       if (pmd_devmap(*pmd))
+               return PMD_SHIFT;
+       pte = pte_offset_map(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+       if (pte_devmap(*pte))
+               return PAGE_SHIFT;
+       return 0;
+ }
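The shift returned here lands in tk->size_shift and is passed as si_addr_lsb in the SIGBUS that kill_proc() sends, so userspace learns whether the poisoned dax mapping was PTE-, PMD- or PUD-sized. An illustrative userspace consumer (not part of the patch; fprintf() in a signal handler is not async-signal-safe but keeps the demo short):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Print the "blast radius" the kernel reports for a memory error. */
static void mce_sigbus(int sig, siginfo_t *info, void *ucontext)
{
	if (info->si_code == BUS_MCEERR_AR || info->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "poison near %p, mapping size %zu bytes\n",
			info->si_addr, (size_t)1 << info->si_addr_lsb);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = mce_sigbus;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);
	pause();	/* wait for an injected or real memory error */
	return 0;
}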
  
  /*
   * Failure handling: if we can't find or can't kill a process there's
@@@ -293,6 -327,10 +328,10 @@@ static void add_to_kill(struct task_str
        }
        tk->addr = page_address_in_vma(p, vma);
        tk->addr_valid = 1;
+       if (is_zone_device_page(p))
+               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+       else
+               tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
  
        /*
         * In theory we don't have to kill when the page was
         * likely very rare kill anyways just out of paranoia, but use
         * a SIGKILL because the error is not contained anymore.
         */
-       if (tk->addr == -EFAULT) {
+       if (tk->addr == -EFAULT || tk->size_shift == 0) {
                pr_info("Memory failure: Unable to find user space address %lx in %s\n",
                        page_to_pfn(p), tsk->comm);
                tk->addr_valid = 0;
   * Also when FAIL is set do a force kill because something went
   * wrong earlier.
   */
- static void kill_procs(struct list_head *to_kill, int forcekill,
-                         bool fail, struct page *page, unsigned long pfn,
-                         int flags)
+ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
+               unsigned long pfn, int flags)
  {
        struct to_kill *tk, *next;
  
                         * check for that, but we need to tell the
                         * process anyways.
                         */
-                       else if (kill_proc(tk->tsk, tk->addr,
-                                             pfn, page, flags) < 0)
+                       else if (kill_proc(tk, pfn, flags) < 0)
                                pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
                                       pfn, tk->tsk->comm, tk->tsk->pid);
                }
@@@ -516,6 -552,7 +553,7 @@@ static const char * const action_page_t
        [MF_MSG_TRUNCATED_LRU]          = "already truncated LRU page",
        [MF_MSG_BUDDY]                  = "free buddy page",
        [MF_MSG_BUDDY_2ND]              = "free buddy page (2nd try)",
+       [MF_MSG_DAX]                    = "dax page",
        [MF_MSG_UNKNOWN]                = "unknown page",
  };
  
@@@ -1013,7 -1050,7 +1051,7 @@@ static bool hwpoison_user_mappings(stru
         * any accesses to the poisoned memory.
         */
        forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
-       kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);
+       kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
  
        return unmap_success;
  }
        return res;
  }
  
+ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+               struct dev_pagemap *pgmap)
+ {
+       struct page *page = pfn_to_page(pfn);
+       const bool unmap_success = true;
+       unsigned long size = 0;
+       struct to_kill *tk;
+       LIST_HEAD(tokill);
+       int rc = -EBUSY;
+       loff_t start;
+
+       /*
+        * Prevent the inode from being freed while we are interrogating
+        * the address_space, typically this would be handled by
+        * lock_page(), but dax pages do not use the page lock. This
+        * also prevents changes to the mapping of this pfn until
+        * poison signaling is complete.
+        */
+       if (!dax_lock_mapping_entry(page))
+               goto out;
+       if (hwpoison_filter(page)) {
+               rc = 0;
+               goto unlock;
+       }
+       switch (pgmap->type) {
+       case MEMORY_DEVICE_PRIVATE:
+       case MEMORY_DEVICE_PUBLIC:
+               /*
+                * TODO: Handle HMM pages which may need coordination
+                * with device-side memory.
+                */
+               goto unlock;
+       default:
+               break;
+       }
+       /*
+        * Use this flag as an indication that the dax page has been
+        * remapped UC to prevent speculative consumption of poison.
+        */
+       SetPageHWPoison(page);
+       /*
+        * Unlike System-RAM there is no possibility to swap in a
+        * different physical page at a given virtual address, so all
+        * userspace consumption of ZONE_DEVICE memory necessitates
+        * SIGBUS (i.e. MF_MUST_KILL)
+        */
+       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+       collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+       list_for_each_entry(tk, &tokill, nd)
+               if (tk->size_shift)
+                       size = max(size, 1UL << tk->size_shift);
+       if (size) {
+               /*
+                * Unmap the largest mapping to avoid breaking up
+                * device-dax mappings which are constant size. The
+                * actual size of the mapping being torn down is
+                * communicated in siginfo, see kill_proc()
+                */
+               start = (page->index << PAGE_SHIFT) & ~(size - 1);
+               unmap_mapping_range(page->mapping, start, start + size, 0);
+       }
+       kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+       rc = 0;
+ unlock:
+       dax_unlock_mapping_entry(page);
+ out:
+       /* drop pgmap ref acquired in caller */
+       put_dev_pagemap(pgmap);
+       action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+       return rc;
+ }
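When memory_failure() below sees a dev_pagemap-backed pfn it hands off to this helper, which pins the dax mapping entry, marks the page HWPoison, unmaps the largest mapping around the pfn, SIGBUSes the owners, and finally reports MF_MSG_DAX as recovered or failed. One hypothetical way to exercise the path is poison injection with madvise(MADV_HWPOISON) on a dax-mapped file; the file path, the 2 MiB mapping size and the required privileges below are assumptions, not taken from this series:

/* Hypothetical injection test: needs CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE
 * and an fsdax file of at least 2 MiB at the path below (all assumptions). */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/dax/testfile", O_RDWR);
	if (fd < 0)
		return 1;

	void *p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Ask the kernel to treat the first page as hardware-poisoned; with
	 * this series the dax page should be reported as recovered and any
	 * later access to the mapping raises SIGBUS instead of going
	 * unhandled. */
	if (madvise(p, 4096, MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");
	return 0;
}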
  /**
   * memory_failure - Handle memory failure of a page.
   * @pfn: Page Number of the corrupted page
@@@ -1135,6 -1249,7 +1250,7 @@@ int memory_failure(unsigned long pfn, i
        struct page *p;
        struct page *hpage;
        struct page *orig_head;
+       struct dev_pagemap *pgmap;
        int res;
        unsigned long page_flags;
  
                return -ENXIO;
        }
  
+       pgmap = get_dev_pagemap(pfn, NULL);
+       if (pgmap)
+               return memory_failure_dev_pagemap(pfn, flags, pgmap);
        p = pfn_to_page(pfn);
        if (PageHuge(p))
                return memory_failure_hugetlb(pfn, flags);
         *    R/W the page; let's pray that the page has been
         *    used and will be freed some time later.
         * In fact it's dangerous to directly bump up page count from 0,
 -       * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
 +       * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
         */
        if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
                if (is_free_buddy_page(p)) {
@@@ -1599,18 -1718,8 +1719,18 @@@ static int soft_offline_huge_page(struc
                if (ret > 0)
                        ret = -EIO;
        } else {
 -              if (PageHuge(page))
 -                      dissolve_free_huge_page(page);
 +              /*
 +               * We set PG_hwpoison only when the migration source hugepage
 +               * was successfully dissolved, because otherwise hwpoisoned
 +               * hugepage remains on free hugepage list, then userspace will
 +               * find it as SIGBUS by allocation failure. That's not expected
 +               * in soft-offlining.
 +               */
 +              ret = dissolve_free_huge_page(page);
 +              if (!ret) {
 +                      if (set_hwpoison_free_buddy_page(page))
 +                              num_poisoned_pages_inc();
 +              }
        }
        return ret;
  }
@@@ -1698,7 -1807,6 +1818,7 @@@ static int __soft_offline_page(struct p
  static int soft_offline_in_use_page(struct page *page, int flags)
  {
        int ret;
 +      int mt;
        struct page *hpage = compound_head(page);
  
        if (!PageHuge(page) && PageTransHuge(hpage)) {
                put_hwpoison_page(hpage);
        }
  
 +      /*
 +       * Setting MIGRATE_ISOLATE here ensures that the page will be linked
 +       * to free list immediately (not via pcplist) when released after
 +       * successful page migration. Otherwise we can't guarantee that the
 +       * page is really free after put_page() returns, so
 +       * set_hwpoison_free_buddy_page() highly likely fails.
 +       */
 +      mt = get_pageblock_migratetype(page);
 +      set_pageblock_migratetype(page, MIGRATE_ISOLATE);
        if (PageHuge(page))
                ret = soft_offline_huge_page(page, flags);
        else
                ret = __soft_offline_page(page, flags);
 -
 +      set_pageblock_migratetype(page, mt);
        return ret;
  }
  
 -static void soft_offline_free_page(struct page *page)
 +static int soft_offline_free_page(struct page *page)
  {
 +      int rc = 0;
        struct page *head = compound_head(page);
  
 -      if (!TestSetPageHWPoison(head)) {
 -              num_poisoned_pages_inc();
 -              if (PageHuge(head))
 -                      dissolve_free_huge_page(page);
 +      if (PageHuge(head))
 +              rc = dissolve_free_huge_page(page);
 +      if (!rc) {
 +              if (set_hwpoison_free_buddy_page(page))
 +                      num_poisoned_pages_inc();
 +              else
 +                      rc = -EBUSY;
        }
 +      return rc;
  }
  
  /**
@@@ -1777,6 -1871,14 +1897,14 @@@ int soft_offline_page(struct page *page
        int ret;
        unsigned long pfn = page_to_pfn(page);
  
+       if (is_zone_device_page(page)) {
+               pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
+                               pfn);
+               if (flags & MF_COUNT_INCREASED)
+                       put_page(page);
+               return -EIO;
+       }
        if (PageHWPoison(page)) {
                pr_info("soft offline: %#lx page already poisoned\n", pfn);
                if (flags & MF_COUNT_INCREASED)
        if (ret > 0)
                ret = soft_offline_in_use_page(page, flags);
        else if (ret == 0)
 -              soft_offline_free_page(page);
 +              ret = soft_offline_free_page(page);
  
        return ret;
  }