Merge tag 'libnvdimm-for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdim...
author Linus Torvalds <[email protected]>
Mon, 11 Sep 2017 20:10:57 +0000 (13:10 -0700)
committer Linus Torvalds <[email protected]>
Mon, 11 Sep 2017 20:10:57 +0000 (13:10 -0700)
Pull libnvdimm from Dan Williams:
 "A rework of media error handling in the BTT driver and other updates.
  It has appeared in a few -next releases and collected some late-
  breaking build-error and warning fixups as a result.

  Summary:

   - Media error handling support in the Block Translation Table (BTT)
     driver is reworked to address sleeping-while-atomic locking and
     memory-allocation-context conflicts.

   - The dax_device lookup overhead for xfs and ext4 is moved out of the
     iomap hot-path to a mount-time lookup.

   - A new 'ecc_unit_size' sysfs attribute is added to advertise the
     read-modify-write boundary property of a persistent memory range.

   - Preparatory fix-ups for arm and powerpc pmem support are included
     along with other miscellaneous fixes"
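
The core of the BTT rework is taking error clearing out of the atomic map-update path: a lane whose free block is known bad is only flagged (has_err), and the actual clearing happens later, in process context, as a zero-write under a per-arena mutex. A condensed, non-compilable sketch of that flow, paraphrasing the arena_clear_freelist_error() hunk in drivers/nvdimm/btt.c further down:

static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
{
	int ret = 0;

	if (!arena->freelist[lane].has_err)
		return 0;

	/* blocking context: safe to sleep here, unlike under the map locks */
	mutex_lock(&arena->err_lock);
	{
		void *zero_page = page_address(ZERO_PAGE(0));
		u64 nsoff = to_namespace_offset(arena, arena->freelist[lane].block);
		unsigned long len = arena->sector_size;

		while (len) {
			unsigned long chunk = min(len, PAGE_SIZE);

			/* a zero-write through rw_bytes clears poison in the range */
			ret = arena_write_bytes(arena, nsoff, zero_page, chunk, 0);
			if (ret)
				break;
			len -= chunk;
			nsoff += chunk;
			if (!len)
				arena->freelist[lane].has_err = 0;
		}
	}
	mutex_unlock(&arena->err_lock);
	return ret;
}

A writer that finds err_lock held or the lane flagged releases the lane, clears the error, and retries with a different free block, as the btt_write_pg() hunk below shows.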

* tag 'libnvdimm-for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (26 commits)
  libnvdimm, btt: fix format string warnings
  libnvdimm, btt: clean up warning and error messages
  ext4: fix null pointer dereference on sbi
  libnvdimm, nfit: move the check on nd_reserved2 to the endpoint
  dax: fix FS_DAX=n BLOCK=y compilation
  libnvdimm: fix integer overflow static analysis warning
  libnvdimm, nd_blk: remove mmio_flush_range()
  libnvdimm, btt: rework error clearing
  libnvdimm: fix potential deadlock while clearing errors
  libnvdimm, btt: cache sector_size in arena_info
  libnvdimm, btt: ensure that flags were also unchanged during a map_read
  libnvdimm, btt: refactor map entry operations with macros
  libnvdimm, btt: fix a missed NVDIMM_IO_ATOMIC case in the write path
  libnvdimm, nfit: export an 'ecc_unit_size' sysfs attribute
  ext4: perform dax_device lookup at mount
  ext2: perform dax_device lookup at mount
  xfs: perform dax_device lookup at mount
  dax: introduce a fs_dax_get_by_bdev() helper
  libnvdimm, btt: check memory allocation failure
  libnvdimm, label: fix index block size calculation
  ...
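
The dax_device change replaces the fs_dax_get_by_host() lookup that previously ran on every ->iomap_begin call with a single fs_dax_get_by_bdev() at mount time; the pointer is cached in the filesystem's superblock info and released only at unmount. A condensed, non-compilable sketch of the pattern as the ext4 hunks below apply it (elisions marked with ...):

static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
	...
	sbi->s_daxdev = dax_dev;	/* reference held for the lifetime of the mount */
	...
}

static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			    unsigned flags, struct iomap *iomap)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	...
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->dax_dev = sbi->s_daxdev;	/* no per-I/O lookup, no fs_put_dax() in ->iomap_end */
	...
}

static void ext4_put_super(struct super_block *sb)
{
	...
	fs_put_dax(sbi->s_daxdev);	/* drop the mount-time reference */
	...
}

xfs follows the same pattern through xfs_alloc_buftarg(), whose buffer targets now carry a bt_daxdev alongside bt_bdev, and ext2 mirrors the ext4 change.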

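The new 'ecc_unit_size' attribute exported by the nfit driver (see the acpi_nfit_region_attributes hunk in drivers/acpi/nfit/core.c below) reports the region's clear-error unit, i.e. its read-modify-write granularity. A minimal userspace reader; the sysfs path is an assumption based on the "nfit" attribute group in that hunk, not something stated in this pull:

/* Assumed path: /sys/bus/nd/devices/<region>/nfit/ecc_unit_size */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	char path[256];
	long ecc_unit = 0;
	FILE *f;

	/* e.g. argv[1] = "region0" */
	snprintf(path, sizeof(path),
		 "/sys/bus/nd/devices/%s/nfit/ecc_unit_size",
		 argc > 1 ? argv[1] : "region0");

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	if (fscanf(f, "%ld", &ecc_unit) == 1)
		printf("%s: %ld bytes\n", path, ecc_unit);
	fclose(f);
	return EXIT_SUCCESS;
}

A caller that wants its writes to clear latent media errors would size and align them to this value.
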
13 files changed:
arch/x86/Kconfig
drivers/acpi/nfit/core.c
drivers/nvdimm/btt.c
drivers/nvdimm/nd.h
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/super.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_super.c
include/linux/dax.h
lib/Kconfig

diff --combined arch/x86/Kconfig
index a3e6e6136a47ad9ad01917c8f244dd2dd9d0b0ed,87602cef7aba07ce1b459356c3a554102417a949..971feac135060d371e130680b54034f9dd39556a
@@@ -53,10 -53,7 +53,9 @@@ config X8
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_KCOV                    if X86_64
-       select ARCH_HAS_MMIO_FLUSH
        select ARCH_HAS_PMEM_API                if X86_64
 +      # Causing hangs/crashes, see the commit that added this change for details.
 +      select ARCH_HAS_REFCOUNT                if BROKEN
        select ARCH_HAS_UACCESS_FLUSHCACHE      if X86_64
        select ARCH_HAS_SET_MEMORY
        select ARCH_HAS_SG_CHAIN
@@@ -75,6 -72,7 +74,6 @@@
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
        select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 -      select ARCH_WANT_FRAME_POINTERS
        select ARCH_WANTS_DYNAMIC_TASK_STRUCT
        select ARCH_WANTS_THP_SWAP              if X86_64
        select BUILDTIME_EXTABLE_SORT
        select HAVE_MEMBLOCK
        select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_MIXED_BREAKPOINTS_REGS
 +      select HAVE_MOD_ARCH_SPECIFIC
        select HAVE_NMI
        select HAVE_OPROFILE
        select HAVE_OPTPROBES
        select HAVE_HARDLOCKUP_DETECTOR_PERF    if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
        select HAVE_PERF_REGS
        select HAVE_PERF_USER_STACK_DUMP
 +      select HAVE_RCU_TABLE_FREE
        select HAVE_REGS_AND_STACK_ACCESS_API
 -      select HAVE_RELIABLE_STACKTRACE         if X86_64 && FRAME_POINTER && STACK_VALIDATION
 +      select HAVE_RELIABLE_STACKTRACE         if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
        select HAVE_STACK_VALIDATION            if X86_64
        select HAVE_SYSCALL_TRACEPOINTS
        select HAVE_UNSTABLE_SCHED_CLOCK
@@@ -330,7 -326,6 +329,7 @@@ config FIX_EARLYCON_ME
  
  config PGTABLE_LEVELS
        int
 +      default 5 if X86_5LEVEL
        default 4 if X86_64
        default 3 if X86_PAE
        default 2
@@@ -429,16 -424,16 +428,16 @@@ config GOLDFIS
         def_bool y
         depends on X86_GOLDFISH
  
 -config INTEL_RDT_A
 -      bool "Intel Resource Director Technology Allocation support"
 +config INTEL_RDT
 +      bool "Intel Resource Director Technology support"
        default n
        depends on X86 && CPU_SUP_INTEL
        select KERNFS
        help
 -        Select to enable resource allocation which is a sub-feature of
 -        Intel Resource Director Technology(RDT). More information about
 -        RDT can be found in the Intel x86 Architecture Software
 -        Developer Manual.
 +        Select to enable resource allocation and monitoring which are
 +        sub-features of Intel Resource Director Technology(RDT). More
 +        information about RDT can be found in the Intel x86
 +        Architecture Software Developer Manual.
  
          Say N if unsure.
  
@@@ -782,6 -777,8 +781,6 @@@ config KVM_DEBUG_F
          Statistics are displayed in debugfs filesystem. Enabling this option
          may incur significant overhead.
  
 -source "arch/x86/lguest/Kconfig"
 -
  config PARAVIRT_TIME_ACCOUNTING
        bool "Paravirtual steal time accounting"
        depends on PARAVIRT
@@@ -1401,24 -1398,6 +1400,24 @@@ config X86_PA
          has the cost of more pagetable lookup overhead, and also
          consumes more pagetable space per process.
  
 +config X86_5LEVEL
 +      bool "Enable 5-level page tables support"
 +      depends on X86_64
 +      ---help---
 +        5-level paging enables access to larger address space:
 +        upto 128 PiB of virtual address space and 4 PiB of
 +        physical address space.
 +
 +        It will be supported by future Intel CPUs.
 +
 +        Note: a kernel with this option enabled can only be booted
 +        on machines that support the feature.
 +
 +        See Documentation/x86/x86_64/5level-paging.txt for more
 +        information.
 +
 +        Say N if unsure.
 +
  config ARCH_PHYS_ADDR_T_64BIT
        def_bool y
        depends on X86_64 || X86_PAE
@@@ -1436,35 -1415,6 +1435,35 @@@ config X86_DIRECT_GBPAGE
          supports them), so don't confuse the user by printing
          that we have them enabled.
  
 +config ARCH_HAS_MEM_ENCRYPT
 +      def_bool y
 +
 +config AMD_MEM_ENCRYPT
 +      bool "AMD Secure Memory Encryption (SME) support"
 +      depends on X86_64 && CPU_SUP_AMD
 +      ---help---
 +        Say yes to enable support for the encryption of system memory.
 +        This requires an AMD processor that supports Secure Memory
 +        Encryption (SME).
 +
 +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
 +      bool "Activate AMD Secure Memory Encryption (SME) by default"
 +      default y
 +      depends on AMD_MEM_ENCRYPT
 +      ---help---
 +        Say yes to have system memory encrypted by default if running on
 +        an AMD processor that supports Secure Memory Encryption (SME).
 +
 +        If set to Y, then the encryption of system memory can be
 +        deactivated with the mem_encrypt=off command line option.
 +
 +        If set to N, then the encryption of system memory can be
 +        activated with the mem_encrypt=on command line option.
 +
 +config ARCH_USE_MEMREMAP_PROT
 +      def_bool y
 +      depends on AMD_MEM_ENCRYPT
 +
  # Common NUMA Features
  config NUMA
        bool "Numa Memory Allocation and Scheduler Support"
@@@ -1806,9 -1756,7 +1805,9 @@@ config X86_SMA
  config X86_INTEL_MPX
        prompt "Intel MPX (Memory Protection Extensions)"
        def_bool n
 -      depends on CPU_SUP_INTEL
 +      # Note: only available in 64-bit mode due to VMA flags shortage
 +      depends on CPU_SUP_INTEL && X86_64
 +      select ARCH_USES_HIGH_VMA_FLAGS
        ---help---
          MPX provides hardware features that can be used in
          conjunction with compiler-instrumented code to check
@@@ -2323,10 -2271,6 +2322,10 @@@ source "kernel/livepatch/Kconfig
  
  endmenu
  
 +config ARCH_HAS_ADD_PAGES
 +      def_bool y
 +      depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
 +
  config ARCH_ENABLE_MEMORY_HOTPLUG
        def_bool y
        depends on X86_64 || (X86_32 && HIGHMEM)
@@@ -2347,10 -2291,6 +2346,10 @@@ config ARCH_ENABLE_HUGEPAGE_MIGRATIO
        def_bool y
        depends on X86_64 && HUGETLB_PAGE && MIGRATION
  
 +config ARCH_ENABLE_THP_MIGRATION
 +      def_bool y
 +      depends on X86_64 && TRANSPARENT_HUGEPAGE
 +
  menu "Power management and ACPI options"
  
  config ARCH_HIBERNATION_HEADER
diff --combined drivers/acpi/nfit/core.c
index 1893e416e7c0d95a88854670ddef631590bdb8bc,42221e785c47b9f52a51572214e41973a9b96cd0..9c2c49b6a240d55164ee8c39ce64e1d595072c62
@@@ -228,6 -228,10 +228,10 @@@ int acpi_nfit_ctl(struct nvdimm_bus_des
        if (cmd == ND_CMD_CALL) {
                call_pkg = buf;
                func = call_pkg->nd_command;
+               for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++)
+                       if (call_pkg->nd_reserved2[i])
+                               return -EINVAL;
        }
  
        if (nvdimm) {
@@@ -1674,8 -1678,19 +1678,19 @@@ static ssize_t range_index_show(struct 
  }
  static DEVICE_ATTR_RO(range_index);
  
+ static ssize_t ecc_unit_size_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+ {
+       struct nd_region *nd_region = to_nd_region(dev);
+       struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
+       return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
+ }
+ static DEVICE_ATTR_RO(ecc_unit_size);
  static struct attribute *acpi_nfit_region_attributes[] = {
        &dev_attr_range_index.attr,
+       &dev_attr_ecc_unit_size.attr,
        NULL,
  };
  
@@@ -1804,6 -1819,7 +1819,7 @@@ static int acpi_nfit_init_interleave_se
                struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
                struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
                                spa->range_index, i);
+               struct acpi_nfit_control_region *dcr = nfit_mem->dcr;
  
                if (!memdev || !nfit_mem->dcr) {
                        dev_err(dev, "%s: failed to find DCR\n", __func__);
                }
  
                map->region_offset = memdev->region_offset;
-               map->serial_number = nfit_mem->dcr->serial_number;
+               map->serial_number = dcr->serial_number;
  
                map2->region_offset = memdev->region_offset;
-               map2->serial_number = nfit_mem->dcr->serial_number;
-               map2->vendor_id = nfit_mem->dcr->vendor_id;
-               map2->manufacturing_date = nfit_mem->dcr->manufacturing_date;
-               map2->manufacturing_location = nfit_mem->dcr->manufacturing_location;
+               map2->serial_number = dcr->serial_number;
+               map2->vendor_id = dcr->vendor_id;
+               map2->manufacturing_date = dcr->manufacturing_date;
+               map2->manufacturing_location = dcr->manufacturing_location;
        }
  
        /* v1.1 namespaces */
                        cmp_map_compat, NULL);
        nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
  
+       /* record the result of the sort for the mapping position */
+       for (i = 0; i < nr; i++) {
+               struct nfit_set_info_map2 *map2 = &info2->mapping[i];
+               int j;
+               for (j = 0; j < nr; j++) {
+                       struct nd_mapping_desc *mapping = &ndr_desc->mapping[j];
+                       struct nvdimm *nvdimm = mapping->nvdimm;
+                       struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+                       struct acpi_nfit_control_region *dcr = nfit_mem->dcr;
+                       if (map2->serial_number == dcr->serial_number &&
+                           map2->vendor_id == dcr->vendor_id &&
+                           map2->manufacturing_date == dcr->manufacturing_date &&
+                           map2->manufacturing_location
+                                   == dcr->manufacturing_location) {
+                               mapping->position = i;
+                               break;
+                       }
+               }
+       }
        ndr_desc->nd_set = nd_set;
        devm_kfree(dev, info);
        devm_kfree(dev, info2);
@@@ -1930,7 -1968,7 +1968,7 @@@ static int acpi_nfit_blk_single_io(stru
                        memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
                else {
                        if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
-                               mmio_flush_range((void __force *)
+                               arch_invalidate_pmem((void __force *)
                                        mmio->addr.aperture + offset, c);
  
                        memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
@@@ -2884,7 -2922,7 +2922,7 @@@ static int acpi_nfit_flush_probe(struc
         * need to be interruptible while waiting.
         */
        INIT_WORK_ONSTACK(&flush.work, flush_probe);
 -      COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
 +      init_completion(&flush.cmp);
        queue_work(nfit_wq, &flush.work);
        mutex_unlock(&acpi_desc->init_mutex);
  
diff --combined drivers/nvdimm/btt.c
index 60491641a8d67c8e05d7484f929ac897ac511a58,b9008c3f0d1789766d1268d58f9316575240fd32..d5612bd1cc81cc4306f383ed7d1448cd3b487293
@@@ -31,6 -31,16 +31,16 @@@ enum log_ent_request 
        LOG_OLD_ENT
  };
  
+ static struct device *to_dev(struct arena_info *arena)
+ {
+       return &arena->nd_btt->dev;
+ }
+ static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
+ {
+       return offset + nd_btt->initial_offset;
+ }
  static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
                void *buf, size_t n, unsigned long flags)
  {
@@@ -38,7 -48,7 +48,7 @@@
        struct nd_namespace_common *ndns = nd_btt->ndns;
  
        /* arena offsets may be shifted from the base of the device */
-       offset += arena->nd_btt->initial_offset;
+       offset = adjust_initial_offset(nd_btt, offset);
        return nvdimm_read_bytes(ndns, offset, buf, n, flags);
  }
  
@@@ -49,7 -59,7 +59,7 @@@ static int arena_write_bytes(struct are
        struct nd_namespace_common *ndns = nd_btt->ndns;
  
        /* arena offsets may be shifted from the base of the device */
-       offset += arena->nd_btt->initial_offset;
+       offset = adjust_initial_offset(nd_btt, offset);
        return nvdimm_write_bytes(ndns, offset, buf, n, flags);
  }
  
@@@ -62,8 -72,10 +72,10 @@@ static int btt_info_write(struct arena_
         * We rely on that to make sure rw_bytes does error clearing
         * correctly, so make sure that is the case.
         */
-       WARN_ON_ONCE(!IS_ALIGNED(arena->infooff, 512));
-       WARN_ON_ONCE(!IS_ALIGNED(arena->info2off, 512));
+       dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
+               "arena->infooff: %#llx is unaligned\n", arena->infooff);
+       dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
+               "arena->info2off: %#llx is unaligned\n", arena->info2off);
  
        ret = arena_write_bytes(arena, arena->info2off, super,
                        sizeof(struct btt_sb), 0);
@@@ -76,7 -88,6 +88,6 @@@
  
  static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
  {
-       WARN_ON(!super);
        return arena_read_bytes(arena, arena->infooff, super,
                        sizeof(struct btt_sb), 0);
  }
@@@ -92,7 -103,10 +103,10 @@@ static int __btt_map_write(struct arena
  {
        u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
  
-       WARN_ON(lba >= arena->external_nlba);
+       if (unlikely(lba >= arena->external_nlba))
+               dev_err_ratelimited(to_dev(arena),
+                       "%s: lba %#x out of range (max: %#x)\n",
+                       __func__, lba, arena->external_nlba);
        return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
  }
  
@@@ -106,7 -120,7 +120,7 @@@ static int btt_map_write(struct arena_i
         * This 'mapping' is supposed to be just the LBA mapping, without
         * any flags set, so strip the flag bits.
         */
-       mapping &= MAP_LBA_MASK;
+       mapping = ent_lba(mapping);
  
        ze = (z_flag << 1) + e_flag;
        switch (ze) {
                 * construed as a valid 'normal' case, but we decide not to,
                 * to avoid confusion
                 */
-               WARN_ONCE(1, "Invalid use of Z and E flags\n");
+               dev_err_ratelimited(to_dev(arena),
+                       "Invalid use of Z and E flags\n");
                return -EIO;
        }
  
@@@ -147,7 -162,10 +162,10 @@@ static int btt_map_read(struct arena_in
        u32 raw_mapping, postmap, ze, z_flag, e_flag;
        u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
  
-       WARN_ON(lba >= arena->external_nlba);
+       if (unlikely(lba >= arena->external_nlba))
+               dev_err_ratelimited(to_dev(arena),
+                       "%s: lba %#x out of range (max: %#x)\n",
+                       __func__, lba, arena->external_nlba);
  
        ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
        if (ret)
  
        raw_mapping = le32_to_cpu(in);
  
-       z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
-       e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
+       z_flag = ent_z_flag(raw_mapping);
+       e_flag = ent_e_flag(raw_mapping);
        ze = (z_flag << 1) + e_flag;
-       postmap = raw_mapping & MAP_LBA_MASK;
+       postmap = ent_lba(raw_mapping);
  
        /* Reuse the {z,e}_flag variables for *trim and *error */
        z_flag = 0;
  static int btt_log_read_pair(struct arena_info *arena, u32 lane,
                        struct log_entry *ent)
  {
-       WARN_ON(!ent);
        return arena_read_bytes(arena,
                        arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
                        2 * LOG_ENT_SIZE, 0);
@@@ -299,11 -316,6 +316,6 @@@ static int btt_log_get_old(struct log_e
        return old;
  }
  
- static struct device *to_dev(struct arena_info *arena)
- {
-       return &arena->nd_btt->dev;
- }
  /*
   * This function copies the desired (old/new) log entry into ent if
   * it is not NULL. It returns the sub-slot number (0 or 1)
@@@ -381,7 -393,9 +393,9 @@@ static int btt_flog_write(struct arena_
        arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
        if (++(arena->freelist[lane].seq) == 4)
                arena->freelist[lane].seq = 1;
-       arena->freelist[lane].block = le32_to_cpu(ent->old_map);
+       if (ent_e_flag(ent->old_map))
+               arena->freelist[lane].has_err = 1;
+       arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
  
        return ret;
  }
@@@ -407,12 -421,14 +421,14 @@@ static int btt_map_init(struct arena_in
         * make sure rw_bytes does error clearing correctly, so make sure that
         * is the case.
         */
-       WARN_ON_ONCE(!IS_ALIGNED(arena->mapoff, 512));
+       dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
+               "arena->mapoff: %#llx is unaligned\n", arena->mapoff);
  
        while (mapsize) {
                size_t size = min(mapsize, chunk_size);
  
-               WARN_ON_ONCE(size < 512);
+               dev_WARN_ONCE(to_dev(arena), size < 512,
+                       "chunk size: %#zx is unaligned\n", size);
                ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
                                size, 0);
                if (ret)
@@@ -449,12 -465,14 +465,14 @@@ static int btt_log_init(struct arena_in
         * make sure rw_bytes does error clearing correctly, so make sure that
         * is the case.
         */
-       WARN_ON_ONCE(!IS_ALIGNED(arena->logoff, 512));
+       dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
+               "arena->logoff: %#llx is unaligned\n", arena->logoff);
  
        while (logsize) {
                size_t size = min(logsize, chunk_size);
  
-               WARN_ON_ONCE(size < 512);
+               dev_WARN_ONCE(to_dev(arena), size < 512,
+                       "chunk size: %#zx is unaligned\n", size);
                ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
                                size, 0);
                if (ret)
        return ret;
  }
  
+ static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
+ {
+       return arena->dataoff + ((u64)lba * arena->internal_lbasize);
+ }
+ static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
+ {
+       int ret = 0;
+       if (arena->freelist[lane].has_err) {
+               void *zero_page = page_address(ZERO_PAGE(0));
+               u32 lba = arena->freelist[lane].block;
+               u64 nsoff = to_namespace_offset(arena, lba);
+               unsigned long len = arena->sector_size;
+               mutex_lock(&arena->err_lock);
+               while (len) {
+                       unsigned long chunk = min(len, PAGE_SIZE);
+                       ret = arena_write_bytes(arena, nsoff, zero_page,
+                               chunk, 0);
+                       if (ret)
+                               break;
+                       len -= chunk;
+                       nsoff += chunk;
+                       if (len == 0)
+                               arena->freelist[lane].has_err = 0;
+               }
+               mutex_unlock(&arena->err_lock);
+       }
+       return ret;
+ }
  static int btt_freelist_init(struct arena_info *arena)
  {
        int old, new, ret;
                arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
                arena->freelist[i].block = le32_to_cpu(log_new.old_map);
  
+               /*
+                * FIXME: if error clearing fails during init, we want to make
+                * the BTT read-only
+                */
+               if (ent_e_flag(log_new.old_map)) {
+                       ret = arena_clear_freelist_error(arena, i);
+                       if (ret)
+                               dev_err_ratelimited(to_dev(arena),
+                                       "Unable to clear known errors\n");
+               }
                /* This implies a newly created or untouched flog entry */
                if (log_new.old_map == log_new.new_map)
                        continue;
                        if (ret)
                                return ret;
                }
        }
  
        return 0;
@@@ -566,6 -628,7 +628,7 @@@ static struct arena_info *alloc_arena(s
        if (!arena)
                return NULL;
        arena->nd_btt = btt->nd_btt;
+       arena->sector_size = btt->sector_size;
  
        if (!size)
                return arena;
@@@ -694,6 -757,7 +757,7 @@@ static int discover_arenas(struct btt *
                arena->external_lba_start = cur_nlba;
                parse_arena_meta(arena, super, cur_off);
  
+               mutex_init(&arena->err_lock);
                ret = btt_freelist_init(arena);
                if (ret)
                        goto out;
@@@ -904,11 -968,6 +968,6 @@@ static void unlock_map(struct arena_inf
        spin_unlock(&arena->map_locks[idx].lock);
  }
  
- static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
- {
-       return arena->dataoff + ((u64)lba * arena->internal_lbasize);
- }
  static int btt_data_read(struct arena_info *arena, struct page *page,
                        unsigned int off, u32 lba, u32 len)
  {
@@@ -1032,6 -1091,7 +1091,7 @@@ static int btt_read_pg(struct btt *btt
                 */
                while (1) {
                        u32 new_map;
+                       int new_t, new_e;
  
                        if (t_flag) {
                                zero_fill_data(page, off, cur_len);
                         */
                        barrier();
  
-                       ret = btt_map_read(arena, premap, &new_map, &t_flag,
-                                               &e_flag, NVDIMM_IO_ATOMIC);
+                       ret = btt_map_read(arena, premap, &new_map, &new_t,
+                                               &new_e, NVDIMM_IO_ATOMIC);
                        if (ret)
                                goto out_rtt;
  
-                       if (postmap == new_map)
+                       if ((postmap == new_map) && (t_flag == new_t) &&
+                                       (e_flag == new_e))
                                break;
  
                        postmap = new_map;
+                       t_flag = new_t;
+                       e_flag = new_e;
                }
  
                ret = btt_data_read(arena, page, off, postmap, cur_len);
-               if (ret)
+               if (ret) {
+                       int rc;
+                       /* Media error - set the e_flag */
+                       rc = btt_map_write(arena, premap, postmap, 0, 1,
+                               NVDIMM_IO_ATOMIC);
                        goto out_rtt;
+               }
  
                if (bip) {
                        ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
        return ret;
  }
  
+ /*
+  * Normally, arena_{read,write}_bytes will take care of the initial offset
+  * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
+  * we need the final, raw namespace offset here
+  */
+ static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
+               u32 postmap)
+ {
+       u64 nsoff = adjust_initial_offset(arena->nd_btt,
+                       to_namespace_offset(arena, postmap));
+       sector_t phys_sector = nsoff >> 9;
+       return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
+ }
  static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
                        sector_t sector, struct page *page, unsigned int off,
                        unsigned int len)
  
        while (len) {
                u32 cur_len;
+               int e_flag;
  
+  retry:
                lane = nd_region_acquire_lane(btt->nd_region);
  
                ret = lba_to_arena(btt, sector, &premap, &arena);
                        goto out_lane;
                }
  
+               if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
+                       arena->freelist[lane].has_err = 1;
+               if (mutex_is_locked(&arena->err_lock)
+                               || arena->freelist[lane].has_err) {
+                       nd_region_release_lane(btt->nd_region, lane);
+                       ret = arena_clear_freelist_error(arena, lane);
+                       if (ret)
+                               return ret;
+                       /* OK to acquire a different lane/free block */
+                       goto retry;
+               }
                new_postmap = arena->freelist[lane].block;
  
                /* Wait if the new block is being read from */
                }
  
                lock_map(arena, premap);
-               ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL,
+               ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
                                NVDIMM_IO_ATOMIC);
                if (ret)
                        goto out_map;
                        ret = -EIO;
                        goto out_map;
                }
+               if (e_flag)
+                       set_e_flag(old_postmap);
  
                log.lba = cpu_to_le32(premap);
                log.old_map = cpu_to_le32(old_postmap);
                if (ret)
                        goto out_map;
  
-               ret = btt_map_write(arena, premap, new_postmap, 0, 0, 0);
+               ret = btt_map_write(arena, premap, new_postmap, 0, 0,
+                       NVDIMM_IO_ATOMIC);
                if (ret)
                        goto out_map;
  
                unlock_map(arena, premap);
                nd_region_release_lane(btt->nd_region, lane);
  
+               if (e_flag) {
+                       ret = arena_clear_freelist_error(arena, lane);
+                       if (ret)
+                               return ret;
+               }
                len -= cur_len;
                off += cur_len;
                sector += btt->sector_size >> SECTOR_SHIFT;
@@@ -1211,11 -1321,13 +1321,13 @@@ static blk_qc_t btt_make_request(struc
        bio_for_each_segment(bvec, bio, iter) {
                unsigned int len = bvec.bv_len;
  
-               BUG_ON(len > PAGE_SIZE);
-               /* Make sure len is in multiples of sector size. */
-               /* XXX is this right? */
-               BUG_ON(len < btt->sector_size);
-               BUG_ON(len % btt->sector_size);
+               if (len > PAGE_SIZE || len < btt->sector_size ||
+                               len % btt->sector_size) {
+                       dev_err_ratelimited(&btt->nd_btt->dev,
+                               "unaligned bio segment (len: %d)\n", len);
+                       bio->bi_status = BLK_STS_IOERR;
+                       break;
+               }
  
                err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
                                  op_is_write(bio_op(bio)), iter.bi_sector);
@@@ -1241,10 -1353,8 +1353,10 @@@ static int btt_rw_page(struct block_dev
  {
        struct btt *btt = bdev->bd_disk->private_data;
        int rc;
 +      unsigned int len;
  
 -      rc = btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, is_write, sector);
 +      len = hpage_nr_pages(page) * PAGE_SIZE;
 +      rc = btt_do_bvec(btt, NULL, page, len, 0, is_write, sector);
        if (rc == 0)
                page_endio(page, is_write, 0);
  
@@@ -1345,6 -1455,7 +1457,7 @@@ static struct btt *btt_init(struct nd_b
  {
        int ret;
        struct btt *btt;
+       struct nd_namespace_io *nsio;
        struct device *dev = &nd_btt->dev;
  
        btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
        INIT_LIST_HEAD(&btt->arena_list);
        mutex_init(&btt->init_lock);
        btt->nd_region = nd_region;
+       nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
+       btt->phys_bb = &nsio->bb;
  
        ret = discover_arenas(btt);
        if (ret) {
@@@ -1431,6 -1544,8 +1546,8 @@@ int nvdimm_namespace_attach_btt(struct 
        }
  
        btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
+       if (!btt_sb)
+               return -ENOMEM;
  
        /*
         * If this returns < 0, that is ok as it just means there wasn't
diff --combined drivers/nvdimm/nd.h
index a87f793f2945ec4b46a283fa482f746b94996c15,023fc93e21a5f23fbd8c7d847a3060ae0862cd35..9c758a91372bbf6c72f9ca9a9af99615ec965426
@@@ -42,7 -42,7 +42,7 @@@ struct nd_poison 
  
  struct nvdimm_drvdata {
        struct device *dev;
-       int nsindex_size, nslabel_size;
+       int nslabel_size;
        struct nd_cmd_get_config_size nsarea;
        void *data;
        int ns_current, ns_next;
@@@ -134,6 -134,7 +134,7 @@@ struct nd_mapping 
        struct nvdimm *nvdimm;
        u64 start;
        u64 size;
+       int position;
        struct list_head labels;
        struct mutex lock;
        /*
@@@ -233,10 -234,10 +234,10 @@@ void nd_device_unregister(struct devic
  void nd_device_notify(struct device *dev, enum nvdimm_event event);
  int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
                size_t len);
- ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ ssize_t nd_size_select_show(unsigned long current_size,
                const unsigned long *supported, char *buf);
- ssize_t nd_sector_size_store(struct device *dev, const char *buf,
-               unsigned long *current_lbasize, const unsigned long *supported);
+ ssize_t nd_size_select_store(struct device *dev, const char *buf,
+               unsigned long *current_size, const unsigned long *supported);
  int __init nvdimm_init(void);
  int __init nd_region_init(void);
  int __init nd_label_init(void);
@@@ -285,6 -286,13 +286,13 @@@ static inline struct device *nd_btt_cre
  
  struct nd_pfn *to_nd_pfn(struct device *dev);
  #if IS_ENABLED(CONFIG_NVDIMM_PFN)
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ #define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
+ #else
+ #define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
+ #endif
  int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
  bool is_nd_pfn(struct device *dev);
  struct device *nd_pfn_create(struct nd_region *nd_region);
@@@ -390,22 -398,21 +398,22 @@@ int nd_region_activate(struct nd_regio
  void __nd_iostat_start(struct bio *bio, unsigned long *start);
  static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
  {
 -      struct gendisk *disk = bio->bi_bdev->bd_disk;
 +      struct gendisk *disk = bio->bi_disk;
  
        if (!blk_queue_io_stat(disk->queue))
                return false;
  
        *start = jiffies;
 -      generic_start_io_acct(bio_data_dir(bio),
 +      generic_start_io_acct(disk->queue, bio_data_dir(bio),
                              bio_sectors(bio), &disk->part0);
        return true;
  }
  static inline void nd_iostat_end(struct bio *bio, unsigned long start)
  {
 -      struct gendisk *disk = bio->bi_bdev->bd_disk;
 +      struct gendisk *disk = bio->bi_disk;
  
 -      generic_end_io_acct(bio_data_dir(bio), &disk->part0, start);
 +      generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0,
 +                              start);
  }
  static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
                unsigned int len)
diff --combined fs/ext4/ext4.h
index 84b9da1922387999d315cbd442b4a7e6dc92ab49,194e622dc3dd942b37b460d4c49405aa64083c3c..e2abe01c8c6bac60e04ef26fd4559c854fbef280
@@@ -838,11 -838,13 +838,11 @@@ static inline void ext4_decode_extra_ti
  {
        if (unlikely(sizeof(time->tv_sec) > 4 &&
                        (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
 -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)
 +
 +#if 1
                /* Handle legacy encoding of pre-1970 dates with epoch
 -               * bits 1,1.  We assume that by kernel version 4.20,
 -               * everyone will have run fsck over the affected
 -               * filesystems to correct the problem.  (This
 -               * backwards compatibility may be removed before this
 -               * time, at the discretion of the ext4 developers.)
 +               * bits 1,1. (This backwards compatibility may be removed
 +               * at the discretion of the ext4 developers.)
                 */
                u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
                if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
@@@ -1526,6 -1528,7 +1526,7 @@@ struct ext4_sb_info 
  
        /* Barrier between changing inodes' journal flags and writepages ops. */
        struct percpu_rw_semaphore s_journal_flag_rwsem;
+       struct dax_device *s_daxdev;
  };
  
  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@@ -1565,7 -1568,6 +1566,7 @@@ enum 
                                           nolocking */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
 +      EXT4_STATE_LUSTRE_EA_INODE,     /* Lustre-style ea_inode */
  };
  
  #define EXT4_INODE_BIT_FNS(name, field, offset)                               \
diff --combined fs/ext4/inode.c
index e963508ea35ffae472a1cd0800b104fe23fc0b52,16424b5c4e885b7d9f075cd4c850e8bda0cf5394..31db875bc7a13dde67b9a6b17ce498b4c16bd17f
@@@ -1720,12 -1720,13 +1720,12 @@@ static void mpage_release_unused_pages(
  
        pagevec_init(&pvec, 0);
        while (index <= end) {
 -              nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 +              nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 -                      if (page->index > end)
 -                              break;
 +
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
                        if (invalidate) {
                        }
                        unlock_page(page);
                }
 -              index = pvec.pages[nr_pages - 1]->index + 1;
                pagevec_release(&pvec);
        }
  }
@@@ -2346,13 -2348,17 +2346,13 @@@ static int mpage_map_and_submit_buffers
  
        pagevec_init(&pvec, 0);
        while (start <= end) {
 -              nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
 -                                        PAGEVEC_SIZE);
 +              nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
 +                                              &start, end);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
  
 -                      if (page->index > end)
 -                              break;
 -                      /* Up to 'end' pages must be contiguous */
 -                      BUG_ON(page->index != start);
                        bh = head = page_buffers(page);
                        do {
                                if (lblk < mpd->map.m_lblk)
                                pagevec_release(&pvec);
                                return err;
                        }
 -                      start++;
                }
                pagevec_release(&pvec);
        }
@@@ -3397,7 -3404,7 +3397,7 @@@ static int ext4_releasepage(struct pag
  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                            unsigned flags, struct iomap *iomap)
  {
-       struct block_device *bdev;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned int blkbits = inode->i_blkbits;
        unsigned long first_block = offset >> blkbits;
        unsigned long last_block = (offset + length - 1) >> blkbits;
@@@ -3466,12 -3473,8 +3466,8 @@@ retry
        }
  
        iomap->flags = 0;
-       bdev = inode->i_sb->s_bdev;
-       iomap->bdev = bdev;
-       if (blk_queue_dax(bdev->bd_queue))
-               iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
-       else
-               iomap->dax_dev = NULL;
+       iomap->bdev = inode->i_sb->s_bdev;
+       iomap->dax_dev = sbi->s_daxdev;
        iomap->offset = first_block << blkbits;
  
        if (ret == 0) {
@@@ -3504,7 -3507,6 +3500,6 @@@ static int ext4_iomap_end(struct inode 
        int blkbits = inode->i_blkbits;
        bool truncate = false;
  
-       fs_put_dax(iomap->dax_dev);
        if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
                return 0;
  
@@@ -4890,6 -4892,14 +4885,6 @@@ struct inode *ext4_iget(struct super_bl
        brelse(iloc.bh);
        ext4_set_inode_flags(inode);
  
 -      if (ei->i_flags & EXT4_EA_INODE_FL) {
 -              ext4_xattr_inode_set_class(inode);
 -
 -              inode_lock(inode);
 -              inode->i_flags |= S_NOQUOTA;
 -              inode_unlock(inode);
 -      }
 -
        unlock_new_inode(inode);
        return inode;
  
diff --combined fs/ext4/super.c
index 93aece6891f296b4ffa7f571f1cb84625e900311,55772b2d05ee5a19d1bf5357c7288fc1925a338f..71b9a667e1bc2281c701231a461538080a64ad23
@@@ -951,6 -951,7 +951,7 @@@ static void ext4_put_super(struct super
        if (sbi->s_chksum_driver)
                crypto_free_shash(sbi->s_chksum_driver);
        kfree(sbi->s_blockgroup_lock);
+       fs_put_dax(sbi->s_daxdev);
        kfree(sbi);
  }
  
@@@ -2404,7 -2405,6 +2405,7 @@@ static void ext4_orphan_cleanup(struct 
        unsigned int s_flags = sb->s_flags;
        int ret, nr_orphans = 0, nr_truncates = 0;
  #ifdef CONFIG_QUOTA
 +      int quota_update = 0;
        int i;
  #endif
        if (!es->s_last_orphan) {
  #ifdef CONFIG_QUOTA
        /* Needed for iput() to work correctly and not trash data */
        sb->s_flags |= MS_ACTIVE;
 -      /* Turn on quotas so that they are updated correctly */
 +
 +      /*
 +       * Turn on quotas which were not enabled for read-only mounts if
 +       * filesystem has quota feature, so that they are updated correctly.
 +       */
 +      if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) {
 +              int ret = ext4_enable_quotas(sb);
 +
 +              if (!ret)
 +                      quota_update = 1;
 +              else
 +                      ext4_msg(sb, KERN_ERR,
 +                              "Cannot turn on quotas: error %d", ret);
 +      }
 +
 +      /* Turn on journaled quotas used for old sytle */
        for (i = 0; i < EXT4_MAXQUOTAS; i++) {
                if (EXT4_SB(sb)->s_qf_names[i]) {
                        int ret = ext4_quota_on_mount(sb, i);
 -                      if (ret < 0)
 +
 +                      if (!ret)
 +                              quota_update = 1;
 +                      else
                                ext4_msg(sb, KERN_ERR,
                                        "Cannot turn on journaled "
 -                                      "quota: error %d", ret);
 +                                      "quota: type %d: error %d", i, ret);
                }
        }
  #endif
                ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
                       PLURAL(nr_truncates));
  #ifdef CONFIG_QUOTA
 -      /* Turn quotas off */
 -      for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 -              if (sb_dqopt(sb)->files[i])
 -                      dquot_quota_off(sb, i);
 +      /* Turn off quotas if they were enabled for orphan cleanup */
 +      if (quota_update) {
 +              for (i = 0; i < EXT4_MAXQUOTAS; i++) {
 +                      if (sb_dqopt(sb)->files[i])
 +                              dquot_quota_off(sb, i);
 +              }
        }
  #endif
        sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@@ -3398,6 -3378,7 +3399,7 @@@ static void ext4_set_resv_clusters(stru
  
  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
  {
+       struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
        if ((data && !orig_data) || !sbi)
                goto out_free_base;
  
+       sbi->s_daxdev = dax_dev;
        sbi->s_blockgroup_lock =
                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
        if (!sbi->s_blockgroup_lock)
@@@ -4399,6 -4381,7 +4402,7 @@@ out_fail
  out_free_base:
        kfree(sbi);
        kfree(orig_data);
+       fs_put_dax(dax_dev);
        return err ? err : ret;
  }
  
@@@ -5215,7 -5198,7 +5219,7 @@@ static int ext4_statfs_project(struct s
        dquot = dqget(sb, qid);
        if (IS_ERR(dquot))
                return PTR_ERR(dquot);
 -      spin_lock(&dq_data_lock);
 +      spin_lock(&dquot->dq_dqb_lock);
  
        limit = (dquot->dq_dqb.dqb_bsoftlimit ?
                 dquot->dq_dqb.dqb_bsoftlimit :
                         (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
        }
  
 -      spin_unlock(&dq_data_lock);
 +      spin_unlock(&dquot->dq_dqb_lock);
        dqput(dquot);
        return 0;
  }
@@@ -5284,13 -5267,18 +5288,13 @@@ static int ext4_statfs(struct dentry *d
        return 0;
  }
  
 -/* Helper function for writing quotas on sync - we need to start transaction
 - * before quota file is locked for write. Otherwise the are possible deadlocks:
 - * Process 1                         Process 2
 - * ext4_create()                     quota_sync()
 - *   jbd2_journal_start()                  write_dquot()
 - *   dquot_initialize()                         down(dqio_mutex)
 - *     down(dqio_mutex)                    jbd2_journal_start()
 - *
 - */
  
  #ifdef CONFIG_QUOTA
  
 +/*
 + * Helper functions so that transaction is started before we acquire dqio_sem
 + * to keep correct lock ordering of transaction > dqio_sem
 + */
  static inline struct inode *dquot_to_inode(struct dquot *dquot)
  {
        return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
@@@ -5425,13 -5413,6 +5429,13 @@@ static int ext4_quota_on(struct super_b
                        ext4_msg(sb, KERN_WARNING,
                                "Quota file not on filesystem root. "
                                "Journaled quota will not work");
 +              sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
 +      } else {
 +              /*
 +               * Clear the flag just in case mount options changed since
 +               * last time.
 +               */
 +              sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
        }
  
        /*
@@@ -5528,16 -5509,13 +5532,16 @@@ static int ext4_enable_quotas(struct su
                test_opt(sb, PRJQUOTA),
        };
  
 -      sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
 +      sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
        for (type = 0; type < EXT4_MAXQUOTAS; type++) {
                if (qf_inums[type]) {
                        err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
                                DQUOT_USAGE_ENABLED |
                                (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
                        if (err) {
 +                              for (type--; type >= 0; type--)
 +                                      dquot_quota_off(sb, type);
 +
                                ext4_warning(sb,
                                        "Failed to enable quota tracking "
                                        "(type=%d, err=%d). Please run "
diff --combined fs/xfs/xfs_aops.c
index fffae1390d7f493af03dfcc48d1a545190791d03,78185f3b10b2c3fd5a5702e7822b06ce1affb1d7..29172609f2a31b756cd40da7b42f288fe8b0915b
@@@ -80,16 -80,29 +80,29 @@@ xfs_find_bdev_for_inode
                return mp->m_ddev_targp->bt_bdev;
  }
  
+ struct dax_device *
+ xfs_find_daxdev_for_inode(
+       struct inode            *inode)
+ {
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       if (XFS_IS_REALTIME_INODE(ip))
+               return mp->m_rtdev_targp->bt_daxdev;
+       else
+               return mp->m_ddev_targp->bt_daxdev;
+ }
  /*
   * We're now finished for good with this page.  Update the page state via the
   * associated buffer_heads, paying attention to the start and end offsets that
   * we need to process on the page.
   *
 - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
 - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
 - * the page at all, as we may be racing with memory reclaim and it can free both
 - * the bufferhead chain and the page as it will see the page as clean and
 - * unused.
 + * Note that we open code the action in end_buffer_async_write here so that we
 + * only have to iterate over the buffers attached to the page once.  This is not
 + * only more efficient, but also ensures that we only calls end_page_writeback
 + * at the end of the iteration, and thus avoids the pitfall of having the page
 + * and buffers potentially freed after every call to end_buffer_async_write.
   */
  static void
  xfs_finish_page_writeback(
        struct bio_vec          *bvec,
        int                     error)
  {
 -      unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
 -      struct buffer_head      *head, *bh, *next;
 +      struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
 +      bool                    busy = false;
        unsigned int            off = 0;
 -      unsigned int            bsize;
 +      unsigned long           flags;
  
        ASSERT(bvec->bv_offset < PAGE_SIZE);
        ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
 -      ASSERT(end < PAGE_SIZE);
 +      ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
        ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
  
 -      bh = head = page_buffers(bvec->bv_page);
 -
 -      bsize = bh->b_size;
 +      local_irq_save(flags);
 +      bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
        do {
 -              if (off > end)
 -                      break;
 -              next = bh->b_this_page;
 -              if (off < bvec->bv_offset)
 -                      goto next_bh;
 -              bh->b_end_io(bh, !error);
 -next_bh:
 -              off += bsize;
 -      } while ((bh = next) != head);
 +              if (off >= bvec->bv_offset &&
 +                  off < bvec->bv_offset + bvec->bv_len) {
 +                      ASSERT(buffer_async_write(bh));
 +                      ASSERT(bh->b_end_io == NULL);
 +
 +                      if (error) {
 +                              mark_buffer_write_io_error(bh);
 +                              clear_buffer_uptodate(bh);
 +                              SetPageError(bvec->bv_page);
 +                      } else {
 +                              set_buffer_uptodate(bh);
 +                      }
 +                      clear_buffer_async_write(bh);
 +                      unlock_buffer(bh);
 +              } else if (buffer_async_write(bh)) {
 +                      ASSERT(buffer_locked(bh));
 +                      busy = true;
 +              }
 +              off += bh->b_size;
 +      } while ((bh = bh->b_this_page) != head);
 +      bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
 +      local_irq_restore(flags);
 +
 +      if (!busy)
 +              end_page_writeback(bvec->bv_page);
  }
  
  /*
@@@ -148,10 -146,8 +161,10 @@@ xfs_destroy_ioend
        int                     error)
  {
        struct inode            *inode = ioend->io_inode;
 -      struct bio              *last = ioend->io_bio;
 -      struct bio              *bio, *next;
 +      struct bio              *bio = &ioend->io_inline_bio;
 +      struct bio              *last = ioend->io_bio, *next;
 +      u64                     start = bio->bi_iter.bi_sector;
 +      bool                    quiet = bio_flagged(bio, BIO_QUIET);
  
        for (bio = &ioend->io_inline_bio; bio; bio = next) {
                struct bio_vec  *bvec;
  
                bio_put(bio);
        }
 +
 +      if (unlikely(error && !quiet)) {
 +              xfs_err_ratelimited(XFS_I(inode)->i_mount,
 +                      "writeback error on sector %llu", start);
 +      }
  }
  
  /*
@@@ -445,8 -436,7 +458,8 @@@ xfs_start_buffer_writeback
        ASSERT(!buffer_delay(bh));
        ASSERT(!buffer_unwritten(bh));
  
 -      mark_buffer_async_write(bh);
 +      bh->b_end_io = NULL;
 +      set_buffer_async_write(bh);
        set_buffer_uptodate(bh);
        clear_buffer_dirty(bh);
  }
@@@ -540,7 -530,7 +553,7 @@@ xfs_init_bio_from_bh
        struct buffer_head      *bh)
  {
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 -      bio->bi_bdev = bh->b_bdev;
 +      bio_set_dev(bio, bh->b_bdev);
  }
  
  static struct xfs_ioend *
diff --combined fs/xfs/xfs_buf.c
index b1c9711e79a46051801a0fe34e95dd0515ff4fef,6deb86c845d1567d32d9a6affadb30edc3712ddc..da14658da3103475940555600581f0bf12217d55
@@@ -1281,7 -1281,7 +1281,7 @@@ next_chunk
        nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
  
        bio = bio_alloc(GFP_NOIO, nr_pages);
 -      bio->bi_bdev = bp->b_target->bt_bdev;
 +      bio_set_dev(bio, bp->b_target->bt_bdev);
        bio->bi_iter.bi_sector = sector;
        bio->bi_end_io = xfs_buf_bio_end_io;
        bio->bi_private = bp;
@@@ -1802,7 -1802,8 +1802,8 @@@ xfs_setsize_buftarg_early
  xfs_buftarg_t *
  xfs_alloc_buftarg(
        struct xfs_mount        *mp,
-       struct block_device     *bdev)
+       struct block_device     *bdev,
+       struct dax_device       *dax_dev)
  {
        xfs_buftarg_t           *btp;
  
        btp->bt_mount = mp;
        btp->bt_dev =  bdev->bd_dev;
        btp->bt_bdev = bdev;
+       btp->bt_daxdev = dax_dev;
  
        if (xfs_setsize_buftarg_early(btp, bdev))
                goto error;
diff --combined fs/xfs/xfs_iomap.c
index 79cb5b3d140c522fd04f9a0f72666f1621c45155,7c934e40733253ae1ffa7468aa4f184d177221f2..a1909bc064e9e70c90ef6ced3017935951aefc2e
@@@ -69,6 -69,7 +69,7 @@@ xfs_bmbt_to_iomap
        iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
        iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
        iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+       iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
  }
  
  xfs_extlen_t
@@@ -274,7 -275,7 +275,7 @@@ xfs_iomap_write_direct
        /*
         * Complete the transaction
         */
 -      error = xfs_defer_finish(&tp, &dfops, NULL);
 +      error = xfs_defer_finish(&tp, &dfops);
        if (error)
                goto out_bmap_cancel;
  
@@@ -520,6 -521,7 +521,6 @@@ xfs_file_iomap_begin_delay
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  count,
 -      unsigned                flags,
        struct iomap            *iomap)
  {
        struct xfs_inode        *ip = XFS_I(inode);
@@@ -783,7 -785,7 +784,7 @@@ xfs_iomap_write_allocate
                        if (error)
                                goto trans_cancel;
  
 -                      error = xfs_defer_finish(&tp, &dfops, NULL);
 +                      error = xfs_defer_finish(&tp, &dfops);
                        if (error)
                                goto trans_cancel;
  
@@@ -905,7 -907,7 +906,7 @@@ xfs_iomap_write_unwritten
                        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                }
  
 -              error = xfs_defer_finish(&tp, &dfops, NULL);
 +              error = xfs_defer_finish(&tp, &dfops);
                if (error)
                        goto error_on_bmapi_transaction;
  
@@@ -975,7 -977,6 +976,6 @@@ xfs_file_iomap_begin
        int                     nimaps = 1, error = 0;
        bool                    shared = false, trimmed = false;
        unsigned                lockmode;
-       struct block_device     *bdev;
  
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
        if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
                        !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
                /* Reserve delalloc blocks for regular writeback. */
 -              return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 -                              iomap);
 +              return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
        }
  
        if (need_excl_ilock(ip, flags)) {
  
        xfs_bmbt_to_iomap(ip, iomap, &imap);
  
-       /* optionally associate a dax device with the iomap bdev */
-       bdev = iomap->bdev;
-       if (blk_queue_dax(bdev->bd_queue))
-               iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
-       else
-               iomap->dax_dev = NULL;
        if (shared)
                iomap->flags |= IOMAP_F_SHARED;
        return 0;
@@@ -1169,7 -1164,6 +1162,6 @@@ xfs_file_iomap_end
        unsigned                flags,
        struct iomap            *iomap)
  {
-       fs_put_dax(iomap->dax_dev);
        if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
                return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
                                length, written, iomap);
diff --combined fs/xfs/xfs_super.c
index c1c4c2ea1014a70f2723289769d795088b562fe2,ee4225c65f0ca85e7d5dbbc0b96250cba6c70c26..3008f31753dfc4e915be5cc96144a51a07658f2a
@@@ -714,17 -714,26 +714,26 @@@ STATIC voi
  xfs_close_devices(
        struct xfs_mount        *mp)
  {
+       struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
                struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+               struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
                xfs_free_buftarg(mp, mp->m_logdev_targp);
                xfs_blkdev_put(logdev);
+               fs_put_dax(dax_logdev);
        }
        if (mp->m_rtdev_targp) {
                struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+               struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
                xfs_free_buftarg(mp, mp->m_rtdev_targp);
                xfs_blkdev_put(rtdev);
+               fs_put_dax(dax_rtdev);
        }
        xfs_free_buftarg(mp, mp->m_ddev_targp);
+       fs_put_dax(dax_ddev);
  }
  
  /*
@@@ -742,6 -751,8 +751,8 @@@ xfs_open_devices
        struct xfs_mount        *mp)
  {
        struct block_device     *ddev = mp->m_super->s_bdev;
+       struct dax_device       *dax_ddev = fs_dax_get_by_bdev(ddev);
+       struct dax_device       *dax_logdev = NULL, *dax_rtdev = NULL;
        struct block_device     *logdev = NULL, *rtdev = NULL;
        int                     error;
  
                error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
                if (error)
                        goto out;
+               dax_logdev = fs_dax_get_by_bdev(logdev);
        }
  
        if (mp->m_rtname) {
                        error = -EINVAL;
                        goto out_close_rtdev;
                }
+               dax_rtdev = fs_dax_get_by_bdev(rtdev);
        }
  
        /*
         * Setup xfs_mount buffer target pointers
         */
        error = -ENOMEM;
-       mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+       mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
        if (!mp->m_ddev_targp)
                goto out_close_rtdev;
  
        if (rtdev) {
-               mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+               mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
                if (!mp->m_rtdev_targp)
                        goto out_free_ddev_targ;
        }
  
        if (logdev && logdev != ddev) {
-               mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+               mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
                if (!mp->m_logdev_targp)
                        goto out_free_rtdev_targ;
        } else {
        xfs_free_buftarg(mp, mp->m_ddev_targp);
   out_close_rtdev:
        xfs_blkdev_put(rtdev);
+       fs_put_dax(dax_rtdev);
   out_close_logdev:
-       if (logdev && logdev != ddev)
+       if (logdev && logdev != ddev) {
                xfs_blkdev_put(logdev);
+               fs_put_dax(dax_logdev);
+       }
   out:
+       fs_put_dax(dax_ddev);
        return error;
  }
  
@@@ -1220,7 -1237,7 +1237,7 @@@ xfs_test_remount_options
        tmp_mp->m_super = sb;
        error = xfs_parseargs(tmp_mp, options);
        xfs_free_fsname(tmp_mp);
 -      kfree(tmp_mp);
 +      kmem_free(tmp_mp);
  
        return error;
  }
diff --combined include/linux/dax.h
index eb0bff6f1eab2a1d5043d17c086ffd58a51d4278,ac8afa18f707410ec6b837051f7395650ac6f4dc..46cad1d0f12970e4c764374992c9432af431b7b3
@@@ -57,6 -57,7 +57,7 @@@ static inline void fs_put_dax(struct da
        put_dax(dax_dev);
  }
  
+ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
  #else
  static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
  {
@@@ -71,6 -72,11 +72,11 @@@ static inline struct dax_device *fs_dax
  static inline void fs_put_dax(struct dax_device *dax_dev)
  {
  }
+ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+ {
+       return NULL;
+ }
  #endif
  
  int dax_read_lock(void);
@@@ -89,6 -95,34 +95,6 @@@ void dax_flush(struct dax_device *dax_d
  void dax_write_cache(struct dax_device *dax_dev, bool wc);
  bool dax_write_cache_enabled(struct dax_device *dax_dev);
  
 -/*
 - * We use lowest available bit in exceptional entry for locking, one bit for
 - * the entry size (PMD) and two more to tell us if the entry is a huge zero
 - * page (HZP) or an empty entry that is just used for locking.  In total four
 - * special bits.
 - *
 - * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
 - * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 - * block allocation.
 - */
 -#define RADIX_DAX_SHIFT       (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
 -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
 -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
 -#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 -
 -static inline unsigned long dax_radix_sector(void *entry)
 -{
 -      return (unsigned long)entry >> RADIX_DAX_SHIFT;
 -}
 -
 -static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 -{
 -      return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
 -                      ((unsigned long)sector << RADIX_DAX_SHIFT) |
 -                      RADIX_DAX_ENTRY_LOCK);
 -}
 -
  ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops);
  int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index);
 -void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 -              pgoff_t index, void *entry, bool wake_all);
  
  #ifdef CONFIG_FS_DAX
  int __dax_zero_page_range(struct block_device *bdev,
@@@ -110,6 -146,21 +116,6 @@@ static inline int __dax_zero_page_range
  }
  #endif
  
 -#ifdef CONFIG_FS_DAX_PMD
 -static inline unsigned int dax_radix_order(void *entry)
 -{
 -      if ((unsigned long)entry & RADIX_DAX_PMD)
 -              return PMD_SHIFT - PAGE_SHIFT;
 -      return 0;
 -}
 -#else
 -static inline unsigned int dax_radix_order(void *entry)
 -{
 -      return 0;
 -}
 -#endif
 -int dax_pfn_mkwrite(struct vm_fault *vmf);
 -
  static inline bool dax_mapping(struct address_space *mapping)
  {
        return mapping->host && IS_DAX(mapping->host);
diff --combined lib/Kconfig
index 40b114a11d7cea28a6b93521fa58ad958d8ddbe4,527da69e3be1a1c192e10c787befcb82bad350cd..a85e6f76add5c9149c79b0787804b4b1774c0982
@@@ -559,9 -559,6 +559,6 @@@ config ARCH_HAS_PMEM_AP
  config ARCH_HAS_UACCESS_FLUSHCACHE
        bool
  
- config ARCH_HAS_MMIO_FLUSH
-       bool
  config STACKDEPOT
        bool
        select STACKTRACE
@@@ -575,7 -572,4 +572,7 @@@ config PARMA
  config PRIME_NUMBERS
        tristate
  
 +config STRING_SELFTEST
 +      bool "Test string functions"
 +
  endmenu