Merge tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

author Linus Torvalds <[email protected]>
Fri, 27 May 2022 22:49:30 +0000 (15:49 -0700)
committer Linus Torvalds <[email protected]>
Fri, 27 May 2022 22:49:30 +0000 (15:49 -0700)
diff --combined arch/x86/kernel/cpu/mce/core.c

index d775fcd74e98d268d4251402d10c1cbf5a52e66f,fa67bb9d1afedabfe8420a7313f54d833c4b4ab8..2c8ec5c71712191486000cf3d0e34ebe24f18eb2
--- 1/arch/x86/kernel/cpu/mce/core.c
--- 2/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@@ -69,9 -69,7 +69,9 @@@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int
   
   struct mce_bank {
         u64                     ctl;                    /* subevents to enable */
- -      bool                    init;                   /* initialise bank? */
+ +
+ +      __u64 init                      : 1,            /* initialise bank? */
+ +            __reserved_1              : 63;
   };
   static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
   
@@@ -581,7 -579,7 +581,7 @@@ static int uc_decode_notifier(struct no
   
         pfn = mce->addr >> PAGE_SHIFT;
         if (!memory_failure(pfn, 0)) {
-               set_mce_nospec(pfn, whole_page(mce));
+               set_mce_nospec(pfn);
                 mce->kflags |= MCE_HANDLED_UC;
         }
   
@@@ -1318,7 -1316,7 +1318,7 @@@ static void kill_me_maybe(struct callba
   
         ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
         if (!ret) {
-               set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+               set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
                 sync_core();
                 return;
         }
@@@ -1344,7 -1342,7 +1344,7 @@@ static void kill_me_never(struct callba
         p->mce_count = 0;
         pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
         if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
-               set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
+               set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
   }
   
   static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
diff --combined arch/x86/mm/pat/set_memory.c

index 0656db33574d390e73d2af31ab6815fad1b9922d,44f0d4260bd896752b00372044e8fe7d72eb1dae..1abd5438f1269553cbb5dd7a4a55425a5abfbd60
--- 1/arch/x86/mm/pat/set_memory.c
--- 2/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@@ -19,6 -19,7 +19,7 @@@
   #include <linux/vmstat.h>
   #include <linux/kernel.h>
   #include <linux/cc_platform.h>
+ #include <linux/set_memory.h>
   
   #include <asm/e820/api.h>
   #include <asm/processor.h>
@@@ -29,7 -30,6 +30,6 @@@
   #include <asm/pgalloc.h>
   #include <asm/proto.h>
   #include <asm/memtype.h>
- #include <asm/set_memory.h>
   #include <asm/hyperv-tlfs.h>
   #include <asm/mshyperv.h>
   
@@@ -638,6 -638,17 +638,6 @@@ pte_t *lookup_address(unsigned long add
   }
   EXPORT_SYMBOL_GPL(lookup_address);
   
- -/*
- - * Lookup the page table entry for a virtual address in a given mm. Return a
- - * pointer to the entry and the level of the mapping.
- - */
- -pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- -                          unsigned int *level)
- -{
- -      return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
- -}
- -EXPORT_SYMBOL_GPL(lookup_address_in_mm);
- -
   static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
                                   unsigned int *level)
   {
@@@ -1805,7 -1816,7 +1805,7 @@@ static inline int cpa_clear_pages_array
   }
   
   /*
-  * _set_memory_prot is an internal helper for callers that have been passed
+  * __set_memory_prot is an internal helper for callers that have been passed
    * a pgprot_t value from upper layers and a reservation has already been taken.
    * If you want to set the pgprot to a specific page protocol, use the
    * set_memory_xx() functions.
@@@ -1914,6 -1925,51 +1914,51 @@@ int set_memory_wb(unsigned long addr, i
   }
   EXPORT_SYMBOL(set_memory_wb);
   
+ /* Prevent speculative access to a page by marking it not-present */
+ #ifdef CONFIG_X86_64
+ int set_mce_nospec(unsigned long pfn)
+ {
+       unsigned long decoy_addr;
+       int rc;
+ 
+       /* SGX pages are not in the 1:1 map */
+       if (arch_is_platform_page(pfn << PAGE_SHIFT))
+               return 0;
+       /*
+        * We would like to just call:
+        *      set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
+        * but doing that would radically increase the odds of a
+        * speculative access to the poison page because we'd have
+        * the virtual address of the kernel 1:1 mapping sitting
+        * around in registers.
+        * Instead we get tricky.  We create a non-canonical address
+        * that looks just like the one we want, but has bit 63 flipped.
+        * This relies on set_memory_XX() properly sanitizing any __pa()
+        * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+        */
+       decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+ 
+       rc = set_memory_np(decoy_addr, 1);
+       if (rc)
+               pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+       return rc;
+ }
+ 
+ static int set_memory_present(unsigned long *addr, int numpages)
+ {
+       return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+ }
+ 
+ /* Restore full speculative operation to the pfn. */
+ int clear_mce_nospec(unsigned long pfn)
+ {
+       unsigned long addr = (unsigned long) pfn_to_kaddr(pfn);
+ 
+       return set_memory_present(&addr, 1);
+ }
+ EXPORT_SYMBOL_GPL(clear_mce_nospec);
+ #endif /* CONFIG_X86_64 */
+ 
   int set_memory_x(unsigned long addr, int numpages)
   {
         if (!(__supported_pte_mask & _PAGE_NX))
diff --combined drivers/md/dm-linear.c

index 0a6abbbe3745274e5118b0ee89fdce62b86247ab,cdf48bc8c5b05ff6c02b31101ac87ba57691dd51..3212ef6aa81bb515dac745f6fa349e472b5b42b0
--- 1/drivers/md/dm-linear.c
--- 2/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@@ -84,12 -84,19 +84,12 @@@ static sector_t linear_map_sector(struc
         return lc->start + dm_target_offset(ti, bi_sector);
   }
   
- -static void linear_map_bio(struct dm_target *ti, struct bio *bio)
+ +static int linear_map(struct dm_target *ti, struct bio *bio)
   {
         struct linear_c *lc = ti->private;
   
         bio_set_dev(bio, lc->dev->bdev);
- -      if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio)))
- -              bio->bi_iter.bi_sector =
- -                      linear_map_sector(ti, bio->bi_iter.bi_sector);
- -}
- -
- -static int linear_map(struct dm_target *ti, struct bio *bio)
- -{
- -      linear_map_bio(ti, bio);
+ +      bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector);
   
         return DM_MAPIO_REMAPPED;
   }
@@@ -165,11 -172,12 +165,12 @@@ static struct dax_device *linear_dax_pg
   }
   
   static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-               long nr_pages, void **kaddr, pfn_t *pfn)
+               long nr_pages, enum dax_access_mode mode, void **kaddr,
+               pfn_t *pfn)
   {
         struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
   
-       return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+       return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
   }
   
   static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
@@@ -180,9 -188,18 +181,18 @@@
         return dax_zero_page_range(dax_dev, pgoff, nr_pages);
   }
   
+ static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff,
+               void *addr, size_t bytes, struct iov_iter *i)
+ {
+       struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
+ 
+       return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+ }
+ 
   #else
   #define linear_dax_direct_access NULL
   #define linear_dax_zero_page_range NULL
+ #define linear_dax_recovery_write NULL
   #endif
   
   static struct target_type linear_target = {
@@@ -200,6 -217,7 +210,7 @@@
         .iterate_devices = linear_iterate_devices,
         .direct_access = linear_dax_direct_access,
         .dax_zero_page_range = linear_dax_zero_page_range,
+       .dax_recovery_write = linear_dax_recovery_write,
   };
   
   int __init dm_linear_init(void)
diff --combined drivers/md/dm-log-writes.c

index e194226c89e54082e9c010a395ff9b624aa365bb,22739dccdd173b6b394b97b70f190011cb22c3fb..20fd688f72e7c0cd9cabd92c740fedcfd1235730
--- 1/drivers/md/dm-log-writes.c
--- 2/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@@ -866,8 -866,9 +866,8 @@@ static int log_writes_message(struct dm
   static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
   {
         struct log_writes_c *lc = ti->private;
- -      struct request_queue *q = bdev_get_queue(lc->dev->bdev);
   
- -      if (!q || !blk_queue_discard(q)) {
+ +      if (!bdev_max_discard_sectors(lc->dev->bdev)) {
                 lc->device_supports_discard = false;
                 limits->discard_granularity = lc->sectorsize;
                 limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
@@@ -888,11 -889,12 +888,12 @@@ static struct dax_device *log_writes_da
   }
   
   static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
-                                        long nr_pages, void **kaddr, pfn_t *pfn)
+               long nr_pages, enum dax_access_mode mode, void **kaddr,
+               pfn_t *pfn)
   {
         struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
   
-       return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
+       return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn);
   }
   
   static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
@@@ -903,9 -905,18 +904,18 @@@
         return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT);
   }
   
+ static size_t log_writes_dax_recovery_write(struct dm_target *ti,
+               pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
+ {
+       struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
+ 
+       return dax_recovery_write(dax_dev, pgoff, addr, bytes, i);
+ }
+ 
   #else
   #define log_writes_dax_direct_access NULL
   #define log_writes_dax_zero_page_range NULL
+ #define log_writes_dax_recovery_write NULL
   #endif
   
   static struct target_type log_writes_target = {
@@@ -923,6 -934,7 +933,7 @@@
         .io_hints = log_writes_io_hints,
         .direct_access = log_writes_dax_direct_access,
         .dax_zero_page_range = log_writes_dax_zero_page_range,
+       .dax_recovery_write = log_writes_dax_recovery_write,
   };
   
   static int __init dm_log_writes_init(void)
diff --combined drivers/md/dm.c

index d62f1354ecbfed062acf9d464c6ccf93a89eee42,3fe76ab20069c048d4ef3da44da71b60e2f03ef2..dfb0a551bd880e9da428c8c28ee9a8822278ed09
--- 1/drivers/md/dm.c
--- 2/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@@ -71,10 -71,6 +71,10 @@@ void dm_issue_global_event(void
         wake_up(&dm_global_eventq);
   }
   
+ +DEFINE_STATIC_KEY_FALSE(stats_enabled);
+ +DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
+ +DEFINE_STATIC_KEY_FALSE(zoned_enabled);
+ +
   /*
    * One of these is allocated (on-stack) per original bio.
    */
@@@ -84,8 -80,7 +84,8 @@@ struct clone_info 
         struct dm_io *io;
         sector_t sector;
         unsigned sector_count;
- -      bool submit_as_polled;
+ +      bool is_abnormal_io:1;
+ +      bool submit_as_polled:1;
   };
   
   #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
@@@ -503,76 -498,69 +503,76 @@@ static bool bio_is_flush_with_data(stru
         return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
   }
   
- -static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
- -                     unsigned long start_time, struct dm_stats_aux *stats_aux)
+ +static void dm_io_acct(struct dm_io *io, bool end)
   {
- -      bool is_flush_with_data;
- -      unsigned int bi_size;
+ +      struct dm_stats_aux *stats_aux = &io->stats_aux;
+ +      unsigned long start_time = io->start_time;
+ +      struct mapped_device *md = io->md;
+ +      struct bio *bio = io->orig_bio;
+ +      unsigned int sectors;
   
- -      /* If REQ_PREFLUSH set save any payload but do not account it */
- -      is_flush_with_data = bio_is_flush_with_data(bio);
- -      if (is_flush_with_data) {
- -              bi_size = bio->bi_iter.bi_size;
- -              bio->bi_iter.bi_size = 0;
- -      }
+ +      /*
+ +       * If REQ_PREFLUSH set, don't account payload, it will be
+ +       * submitted (and accounted) after this flush completes.
+ +       */
+ +      if (bio_is_flush_with_data(bio))
+ +              sectors = 0;
+ +      else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
+ +              sectors = bio_sectors(bio);
+ +      else
+ +              sectors = io->sectors;
   
         if (!end)
- -              bio_start_io_acct_time(bio, start_time);
+ +              bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
+ +                                 start_time);
         else
- -              bio_end_io_acct(bio, start_time);
+ +              bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
+ +
+ +      if (static_branch_unlikely(&stats_enabled) &&
+ +          unlikely(dm_stats_used(&md->stats))) {
+ +              sector_t sector;
+ +
+ +              if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
+ +                      sector = bio->bi_iter.bi_sector;
+ +              else
+ +                      sector = bio_end_sector(bio) - io->sector_offset;
   
- -      if (unlikely(dm_stats_used(&md->stats)))
                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
- -                                  bio->bi_iter.bi_sector, bio_sectors(bio),
+ +                                  sector, sectors,
                                     end, start_time, stats_aux);
- -
- -      /* Restore bio's payload so it does get accounted upon requeue */
- -      if (is_flush_with_data)
- -              bio->bi_iter.bi_size = bi_size;
+ +      }
   }
   
- -static void __dm_start_io_acct(struct dm_io *io, struct bio *bio)
+ +static void __dm_start_io_acct(struct dm_io *io)
   {
- -      dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux);
+ +      dm_io_acct(io, false);
   }
   
   static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
   {
- -      /* Must account IO to DM device in terms of orig_bio */
- -      struct bio *bio = io->orig_bio;
- -
         /*
          * Ensure IO accounting is only ever started once.
- -       * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO.
          */
- -      if (!clone ||
- -          likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) {
- -              if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED)))
- -                      return;
+ +      if (dm_io_flagged(io, DM_IO_ACCOUNTED))
+ +              return;
+ +
+ +      /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
+ +      if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
                 dm_io_set_flag(io, DM_IO_ACCOUNTED);
         } else {
                 unsigned long flags;
- -              if (dm_io_flagged(io, DM_IO_ACCOUNTED))
- -                      return;
                 /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
                 spin_lock_irqsave(&io->lock, flags);
                 dm_io_set_flag(io, DM_IO_ACCOUNTED);
                 spin_unlock_irqrestore(&io->lock, flags);
         }
   
- -      __dm_start_io_acct(io, bio);
+ +      __dm_start_io_acct(io);
   }
   
- -static void dm_end_io_acct(struct dm_io *io, struct bio *bio)
+ +static void dm_end_io_acct(struct dm_io *io)
   {
- -      dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux);
+ +      dm_io_acct(io, true);
   }
   
   static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
@@@ -581,9 -569,7 +581,9 @@@
         struct dm_target_io *tio;
         struct bio *clone;
   
- -      clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
+ +      clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs);
+ +      /* Set default bdev, but target must bio_set_dev() before issuing IO */
+ +      clone->bi_bdev = md->disk->part0;
   
         tio = clone_to_tio(clone);
         tio->flags = 0;
@@@ -592,19 -578,17 +592,19 @@@
   
         io = container_of(tio, struct dm_io, tio);
         io->magic = DM_IO_MAGIC;
- -      io->status = 0;
- -      atomic_set(&io->io_count, 1);
+ +      io->status = BLK_STS_OK;
+ +
+ +      /* one ref is for submission, the other is for completion */
+ +      atomic_set(&io->io_count, 2);
         this_cpu_inc(*md->pending_io);
- -      io->orig_bio = NULL;
+ +      io->orig_bio = bio;
         io->md = md;
- -      io->map_task = current;
         spin_lock_init(&io->lock);
         io->start_time = jiffies;
         io->flags = 0;
   
- -      dm_stats_record_start(&md->stats, &io->stats_aux);
+ +      if (static_branch_unlikely(&stats_enabled))
+ +              dm_stats_record_start(&md->stats, &io->stats_aux);
   
         return io;
   }
@@@ -615,7 -599,7 +615,7 @@@ static void free_io(struct dm_io *io
   }
   
   static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
- -              unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
+ +                           unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
   {
         struct dm_target_io *tio;
         struct bio *clone;
@@@ -626,13 -610,10 +626,13 @@@
                 /* alloc_io() already initialized embedded clone */
                 clone = &tio->clone;
         } else {
- -              clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
- -                                      gfp_mask, &ci->io->md->bs);
+ +              struct mapped_device *md = ci->io->md;
+ +
+ +              clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs);
                 if (!clone)
                         return NULL;
+ +              /* Set default bdev, but target must bio_set_dev() before issuing IO */
+ +              clone->bi_bdev = md->disk->part0;
   
                 /* REQ_DM_POLL_LIST shouldn't be inherited */
                 clone->bi_opf &= ~REQ_DM_POLL_LIST;
@@@ -682,16 -663,14 +682,16 @@@ static void queue_io(struct mapped_devi
    * function to access the md->map field, and make sure they call
    * dm_put_live_table() when finished.
    */
- -struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
+ +struct dm_table *dm_get_live_table(struct mapped_device *md,
+ +                                 int *srcu_idx) __acquires(md->io_barrier)
   {
         *srcu_idx = srcu_read_lock(&md->io_barrier);
   
         return srcu_dereference(md->map, &md->io_barrier);
   }
   
- -void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
+ +void dm_put_live_table(struct mapped_device *md,
+ +                     int srcu_idx) __releases(md->io_barrier)
   {
         srcu_read_unlock(&md->io_barrier, srcu_idx);
   }
@@@ -717,24 -696,6 +717,24 @@@ static void dm_put_live_table_fast(stru
         rcu_read_unlock();
   }
   
+ +static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
+ +                                                   int *srcu_idx, struct bio *bio)
+ +{
+ +      if (bio->bi_opf & REQ_NOWAIT)
+ +              return dm_get_live_table_fast(md);
+ +      else
+ +              return dm_get_live_table(md, srcu_idx);
+ +}
+ +
+ +static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
+ +                                       struct bio *bio)
+ +{
+ +      if (bio->bi_opf & REQ_NOWAIT)
+ +              dm_put_live_table_fast(md);
+ +      else
+ +              dm_put_live_table(md, srcu_idx);
+ +}
+ +
   static char *_dm_claim_ptr = "I belong to device-mapper";
   
   /*
@@@ -914,14 -875,14 +914,14 @@@ static void dm_io_complete(struct dm_i
   
         io_error = io->status;
         if (dm_io_flagged(io, DM_IO_ACCOUNTED))
- -              dm_end_io_acct(io, bio);
+ +              dm_end_io_acct(io);
         else if (!io_error) {
                 /*
                  * Must handle target that DM_MAPIO_SUBMITTED only to
                  * then bio_endio() rather than dm_submit_bio_remap()
                  */
- -              __dm_start_io_acct(io, bio);
- -              dm_end_io_acct(io, bio);
+ +              __dm_start_io_acct(io);
+ +              dm_end_io_acct(io);
         }
         free_io(io);
         smp_wmb();
@@@ -938,7 -899,7 +938,7 @@@
                          * may only reflect a subset of the pre-split original)
                          * so clear REQ_POLLED in case of requeue.
                          */
- -                      bio->bi_opf &= ~REQ_POLLED;
+ +                      bio_clear_polled(bio);
                         if (io_error == BLK_STS_AGAIN) {
                                 /* io_uring doesn't handle BLK_STS_AGAIN (yet) */
                                 queue_io(md, bio);
@@@ -962,35 -923,30 +962,35 @@@
         }
   }
   
- -static inline bool dm_tio_is_normal(struct dm_target_io *tio)
- -{
- -      return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) &&
- -              !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
- -}
- -
   /*
    * Decrements the number of outstanding ios that a bio has been
    * cloned into, completing the original io if necc.
    */
- -void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
+ +static inline void __dm_io_dec_pending(struct dm_io *io)
   {
+ +      if (atomic_dec_and_test(&io->io_count))
+ +              dm_io_complete(io);
+ +}
+ +
+ +static void dm_io_set_error(struct dm_io *io, blk_status_t error)
+ +{
+ +      unsigned long flags;
+ +
         /* Push-back supersedes any I/O errors */
- -      if (unlikely(error)) {
- -              unsigned long flags;
- -              spin_lock_irqsave(&io->lock, flags);
- -              if (!(io->status == BLK_STS_DM_REQUEUE &&
- -                    __noflush_suspending(io->md)))
- -                      io->status = error;
- -              spin_unlock_irqrestore(&io->lock, flags);
+ +      spin_lock_irqsave(&io->lock, flags);
+ +      if (!(io->status == BLK_STS_DM_REQUEUE &&
+ +            __noflush_suspending(io->md))) {
+ +              io->status = error;
         }
+ +      spin_unlock_irqrestore(&io->lock, flags);
+ +}
   
- -      if (atomic_dec_and_test(&io->io_count))
- -              dm_io_complete(io);
+ +static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
+ +{
+ +      if (unlikely(error))
+ +              dm_io_set_error(io, error);
+ +
+ +      __dm_io_dec_pending(io);
   }
   
   void disable_discard(struct mapped_device *md)
@@@ -999,6 -955,7 +999,6 @@@
   
         /* device doesn't really support DISCARD, disable it */
         limits->max_discard_sectors = 0;
- -      blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
   }
   
   void disable_write_zeroes(struct mapped_device *md)
@@@ -1018,43 -975,35 +1018,43 @@@ static void clone_endio(struct bio *bio
   {
         blk_status_t error = bio->bi_status;
         struct dm_target_io *tio = clone_to_tio(bio);
+ +      struct dm_target *ti = tio->ti;
+ +      dm_endio_fn endio = ti->type->end_io;
         struct dm_io *io = tio->io;
- -      struct mapped_device *md = tio->io->md;
- -      dm_endio_fn endio = tio->ti->type->end_io;
- -      struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+ +      struct mapped_device *md = io->md;
   
- -      if (unlikely(error == BLK_STS_TARGET)) {
- -              if (bio_op(bio) == REQ_OP_DISCARD &&
- -                  !q->limits.max_discard_sectors)
- -                      disable_discard(md);
- -              else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
- -                       !q->limits.max_write_zeroes_sectors)
- -                      disable_write_zeroes(md);
- -      }
+ +      if (likely(bio->bi_bdev != md->disk->part0)) {
+ +              struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ +
+ +              if (unlikely(error == BLK_STS_TARGET)) {
+ +                      if (bio_op(bio) == REQ_OP_DISCARD &&
+ +                          !bdev_max_discard_sectors(bio->bi_bdev))
+ +                              disable_discard(md);
+ +                      else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ +                               !q->limits.max_write_zeroes_sectors)
+ +                              disable_write_zeroes(md);
+ +              }
   
- -      if (blk_queue_is_zoned(q))
- -              dm_zone_endio(io, bio);
+ +              if (static_branch_unlikely(&zoned_enabled) &&
+ +                  unlikely(blk_queue_is_zoned(q)))
+ +                      dm_zone_endio(io, bio);
+ +      }
   
         if (endio) {
- -              int r = endio(tio->ti, bio, &error);
+ +              int r = endio(ti, bio, &error);
                 switch (r) {
                 case DM_ENDIO_REQUEUE:
- -                      /*
- -                       * Requeuing writes to a sequential zone of a zoned
- -                       * target will break the sequential write pattern:
- -                       * fail such IO.
- -                       */
- -                      if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
- -                              error = BLK_STS_IOERR;
- -                      else
+ +                      if (static_branch_unlikely(&zoned_enabled)) {
+ +                              /*
+ +                               * Requeuing writes to a sequential zone of a zoned
+ +                               * target will break the sequential write pattern:
+ +                               * fail such IO.
+ +                               */
+ +                              if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
+ +                                      error = BLK_STS_IOERR;
+ +                              else
+ +                                      error = BLK_STS_DM_REQUEUE;
+ +                      } else
                                 error = BLK_STS_DM_REQUEUE;
                         fallthrough;
                 case DM_ENDIO_DONE:
@@@ -1068,9 -1017,10 +1068,9 @@@
                 }
         }
   
- -      if (unlikely(swap_bios_limit(tio->ti, bio))) {
- -              struct mapped_device *md = io->md;
+ +      if (static_branch_unlikely(&swap_bios_enabled) &&
+ +          unlikely(swap_bios_limit(ti, bio)))
                 up(&md->swap_bios_semaphore);
- -      }
   
         free_tio(bio);
         dm_io_dec_pending(io, error);
@@@ -1143,7 -1093,8 +1143,8 @@@ static struct dm_target *dm_dax_get_liv
   }
   
   static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
-                                long nr_pages, void **kaddr, pfn_t *pfn)
+               long nr_pages, enum dax_access_mode mode, void **kaddr,
+               pfn_t *pfn)
   {
         struct mapped_device *md = dax_get_private(dax_dev);
         sector_t sector = pgoff * PAGE_SECTORS;
@@@ -1161,7 -1112,7 +1162,7 @@@
         if (len < 1)
                 goto out;
         nr_pages = min(len, nr_pages);
-       ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
+       ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);
   
    out:
         dm_put_live_table(md, srcu_idx);
@@@ -1196,6 -1147,25 +1197,25 @@@ static int dm_dax_zero_page_range(struc
         return ret;
   }
   
+ static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
+               void *addr, size_t bytes, struct iov_iter *i)
+ {
+       struct mapped_device *md = dax_get_private(dax_dev);
+       sector_t sector = pgoff * PAGE_SECTORS;
+       struct dm_target *ti;
+       int srcu_idx;
+       long ret = 0;
+ 
+       ti = dm_dax_get_live_target(md, sector, &srcu_idx);
+       if (!ti || !ti->type->dax_recovery_write)
+               goto out;
+ 
+       ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
+ out:
+       dm_put_live_table(md, srcu_idx);
+       return ret;
+ }
+ 
   /*
    * A target may call dm_accept_partial_bio only from the map routine.  It is
    * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
@@@ -1212,7 -1182,7 +1232,7 @@@
    * +--------------------+---------------+-------+
    *
    * <-------------- *tio->len_ptr --------------->
- - *                      <------- bi_size ------->
+ + *                      <----- bio_sectors ----->
    *                      <-- n_sectors -->
    *
    * Region 1 was already iterated over with bio_advance or similar function.
@@@ -1229,25 -1199,25 +1249,25 @@@
   void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
   {
         struct dm_target_io *tio = clone_to_tio(bio);
- -      unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+ +      unsigned bio_sectors = bio_sectors(bio);
   
         BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
         BUG_ON(op_is_zone_mgmt(bio_op(bio)));
         BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
- -      BUG_ON(bi_size > *tio->len_ptr);
- -      BUG_ON(n_sectors > bi_size);
+ +      BUG_ON(bio_sectors > *tio->len_ptr);
+ +      BUG_ON(n_sectors > bio_sectors);
   
- -      *tio->len_ptr -= bi_size - n_sectors;
+ +      *tio->len_ptr -= bio_sectors - n_sectors;
         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
- -}
- -EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
   
- -static inline void __dm_submit_bio_remap(struct bio *clone,
- -                                       dev_t dev, sector_t old_sector)
- -{
- -      trace_block_bio_remap(clone, dev, old_sector);
- -      submit_bio_noacct(clone);
+ +      /*
+ +       * __split_and_process_bio() may have already saved mapped part
+ +       * for accounting but it is being reduced so update accordingly.
+ +       */
+ +      dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT);
+ +      tio->io->sectors = n_sectors;
   }
+ +EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
   
   /*
    * @clone: clone bio that DM core passed to target's .map function
@@@ -1263,6 -1233,8 +1283,6 @@@ void dm_submit_bio_remap(struct bio *cl
         struct dm_target_io *tio = clone_to_tio(clone);
         struct dm_io *io = tio->io;
   
- -      WARN_ON_ONCE(!tio->ti->accounts_remapped_io);
- -
         /* establish bio that will get submitted */
         if (!tgt_clone)
                 tgt_clone = clone;
@@@ -1271,11 -1243,22 +1291,11 @@@
          * Account io->origin_bio to DM dev on behalf of target
          * that took ownership of IO with DM_MAPIO_SUBMITTED.
          */
- -      if (io->map_task == current) {
- -              /* Still in target's map function */
- -              dm_io_set_flag(io, DM_IO_START_ACCT);
- -      } else {
- -              /*
- -               * Called by another thread, managed by DM target,
- -               * wait for dm_split_and_process_bio() to store
- -               * io->orig_bio
- -               */
- -              while (unlikely(!smp_load_acquire(&io->orig_bio)))
- -                      msleep(1);
- -              dm_start_io_acct(io, clone);
- -      }
+ +      dm_start_io_acct(io, clone);
   
- -      __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk),
+ +      trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
                               tio->old_sector);
+ +      submit_bio_noacct(tgt_clone);
   }
   EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
   
@@@ -1298,53 -1281,55 +1318,53 @@@ static noinline void __set_swap_bios_li
   static void __map_bio(struct bio *clone)
   {
         struct dm_target_io *tio = clone_to_tio(clone);
- -      int r;
- -      struct dm_io *io = tio->io;
         struct dm_target *ti = tio->ti;
+ +      struct dm_io *io = tio->io;
+ +      struct mapped_device *md = io->md;
+ +      int r;
   
         clone->bi_end_io = clone_endio;
   
         /*
          * Map the clone.
          */
- -      dm_io_inc_pending(io);
         tio->old_sector = clone->bi_iter.bi_sector;
   
- -      if (unlikely(swap_bios_limit(ti, clone))) {
- -              struct mapped_device *md = io->md;
+ +      if (static_branch_unlikely(&swap_bios_enabled) &&
+ +          unlikely(swap_bios_limit(ti, clone))) {
                 int latch = get_swap_bios();
                 if (unlikely(latch != md->swap_bios))
                         __set_swap_bios_limit(md, latch);
                 down(&md->swap_bios_semaphore);
         }
   
- -      /*
- -       * Check if the IO needs a special mapping due to zone append emulation
- -       * on zoned target. In this case, dm_zone_map_bio() calls the target
- -       * map operation.
- -       */
- -      if (dm_emulate_zone_append(io->md))
- -              r = dm_zone_map_bio(tio);
- -      else
+ +      if (static_branch_unlikely(&zoned_enabled)) {
+ +              /*
+ +               * Check if the IO needs a special mapping due to zone append
+ +               * emulation on zoned target. In this case, dm_zone_map_bio()
+ +               * calls the target map operation.
+ +               */
+ +              if (unlikely(dm_emulate_zone_append(md)))
+ +                      r = dm_zone_map_bio(tio);
+ +              else
+ +                      r = ti->type->map(ti, clone);
+ +      } else
                 r = ti->type->map(ti, clone);
   
         switch (r) {
         case DM_MAPIO_SUBMITTED:
                 /* target has assumed ownership of this io */
                 if (!ti->accounts_remapped_io)
- -                      dm_io_set_flag(io, DM_IO_START_ACCT);
+ +                      dm_start_io_acct(io, clone);
                 break;
         case DM_MAPIO_REMAPPED:
- -              /*
- -               * the bio has been remapped so dispatch it, but defer
- -               * dm_start_io_acct() until after possible bio_split().
- -               */
- -              __dm_submit_bio_remap(clone, disk_devt(io->md->disk),
- -                                    tio->old_sector);
- -              dm_io_set_flag(io, DM_IO_START_ACCT);
+ +              dm_submit_bio_remap(clone, NULL);
                 break;
         case DM_MAPIO_KILL:
         case DM_MAPIO_REQUEUE:
- -              if (unlikely(swap_bios_limit(ti, clone)))
- -                      up(&io->md->swap_bios_semaphore);
+ +              if (static_branch_unlikely(&swap_bios_enabled) &&
+ +                  unlikely(swap_bios_limit(ti, clone)))
+ +                      up(&md->swap_bios_semaphore);
                 free_tio(clone);
                 if (r == DM_MAPIO_KILL)
                         dm_io_dec_pending(io, BLK_STS_IOERR);
@@@ -1357,31 -1342,6 +1377,31 @@@
         }
   }
   
+ +static void setup_split_accounting(struct clone_info *ci, unsigned len)
+ +{
+ +      struct dm_io *io = ci->io;
+ +
+ +      if (ci->sector_count > len) {
+ +              /*
+ +               * Split needed, save the mapped part for accounting.
+ +               * NOTE: dm_accept_partial_bio() will update accordingly.
+ +               */
+ +              dm_io_set_flag(io, DM_IO_WAS_SPLIT);
+ +              io->sectors = len;
+ +      }
+ +
+ +      if (static_branch_unlikely(&stats_enabled) &&
+ +          unlikely(dm_stats_used(&io->md->stats))) {
+ +              /*
+ +               * Save bi_sector in terms of its offset from end of
+ +               * original bio, only needed for DM-stats' benefit.
+ +               * - saved regardless of whether split needed so that
+ +               *   dm_accept_partial_bio() doesn't need to.
+ +               */
+ +              io->sector_offset = bio_end_sector(ci->bio) - ci->sector;
+ +      }
+ +}
+ +
   static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
                                 struct dm_target *ti, unsigned num_bios)
   {
@@@ -1411,22 -1371,18 +1431,22 @@@
         }
   }
   
- -static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
+ +static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
                                   unsigned num_bios, unsigned *len)
   {
         struct bio_list blist = BIO_EMPTY_LIST;
         struct bio *clone;
+ +      int ret = 0;
   
         switch (num_bios) {
         case 0:
                 break;
         case 1:
+ +              if (len)
+ +                      setup_split_accounting(ci, *len);
                 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
                 __map_bio(clone);
+ +              ret = 1;
                 break;
         default:
                 /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
@@@ -1434,12 -1390,9 +1454,12 @@@
                 while ((clone = bio_list_pop(&blist))) {
                         dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
                         __map_bio(clone);
+ +                      ret += 1;
                 }
                 break;
         }
+ +
+ +      return ret;
   }
   
   static void __send_empty_flush(struct clone_info *ci)
@@@ -1460,19 -1413,8 +1480,19 @@@
         ci->sector_count = 0;
         ci->io->tio.clone.bi_iter.bi_size = 0;
   
- -      while ((ti = dm_table_get_target(ci->map, target_nr++)))
- -              __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ +      while ((ti = dm_table_get_target(ci->map, target_nr++))) {
+ +              int bios;
+ +
+ +              atomic_add(ti->num_flush_bios, &ci->io->io_count);
+ +              bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ +              atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
+ +      }
+ +
+ +      /*
+ +       * alloc_io() takes one extra reference for submission, so the
+ +       * reference won't reach 0 without the following subtraction
+ +       */
+ +      atomic_sub(1, &ci->io->io_count);
   
         bio_uninit(ci->bio);
   }
@@@ -1481,18 -1423,11 +1501,18 @@@ static void __send_changing_extent_only
                                         unsigned num_bios)
   {
         unsigned len;
+ +      int bios;
   
         len = min_t(sector_t, ci->sector_count,
                     max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
   
- -      __send_duplicate_bios(ci, ti, num_bios, &len);
+ +      atomic_add(num_bios, &ci->io->io_count);
+ +      bios = __send_duplicate_bios(ci, ti, num_bios, &len);
+ +      /*
+ +       * alloc_io() takes one extra reference for submission, so the
+ +       * reference won't reach 0 without the following (+1) subtraction
+ +       */
+ +      atomic_sub(num_bios - bios + 1, &ci->io->io_count);
   
         ci->sector += len;
         ci->sector_count -= len;
@@@ -1500,24 -1435,21 +1520,24 @@@
   
   static bool is_abnormal_io(struct bio *bio)
   {
- -      bool r = false;
+ +      unsigned int op = bio_op(bio);
   
- -      switch (bio_op(bio)) {
- -      case REQ_OP_DISCARD:
- -      case REQ_OP_SECURE_ERASE:
- -      case REQ_OP_WRITE_ZEROES:
- -              r = true;
- -              break;
+ +      if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
+ +              switch (op) {
+ +              case REQ_OP_DISCARD:
+ +              case REQ_OP_SECURE_ERASE:
+ +              case REQ_OP_WRITE_ZEROES:
+ +                      return true;
+ +              default:
+ +                      break;
+ +              }
         }
   
- -      return r;
+ +      return false;
   }
   
- -static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
- -                                int *result)
+ +static blk_status_t __process_abnormal_io(struct clone_info *ci,
+ +                                        struct dm_target *ti)
   {
         unsigned num_bios = 0;
   
@@@ -1531,6 -1463,8 +1551,6 @@@
         case REQ_OP_WRITE_ZEROES:
                 num_bios = ti->num_write_zeroes_bios;
                 break;
- -      default:
- -              return false;
         }
   
         /*
@@@ -1539,15 -1473,17 +1559,15 @@@
          * reconfiguration might also have changed that since the
          * check was performed.
          */
- -      if (!num_bios)
- -              *result = -EOPNOTSUPP;
- -      else {
- -              __send_changing_extent_only(ci, ti, num_bios);
- -              *result = 0;
- -      }
- -      return true;
+ +      if (unlikely(!num_bios))
+ +              return BLK_STS_NOTSUPP;
+ +
+ +      __send_changing_extent_only(ci, ti, num_bios);
+ +      return BLK_STS_OK;
   }
   
   /*
- - * Reuse ->bi_private as hlist head for storing all dm_io instances
+ + * Reuse ->bi_private as dm_io list head for storing all dm_io instances
    * associated with this bio, and this bio's bi_private needs to be
    * stored in dm_io->data before the reuse.
    *
@@@ -1555,53 -1491,54 +1575,53 @@@
    * touch it after splitting. Meantime it won't be changed by anyone after
    * bio is submitted. So this reuse is safe.
    */
- -static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
+ +static inline struct dm_io **dm_poll_list_head(struct bio *bio)
   {
- -      return (struct hlist_head *)&bio->bi_private;
+ +      return (struct dm_io **)&bio->bi_private;
   }
   
   static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
   {
- -      struct hlist_head *head = dm_get_bio_hlist_head(bio);
+ +      struct dm_io **head = dm_poll_list_head(bio);
   
         if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
                 bio->bi_opf |= REQ_DM_POLL_LIST;
                 /*
                  * Save .bi_private into dm_io, so that we can reuse
- -               * .bi_private as hlist head for storing dm_io list
+ +               * .bi_private as dm_io list head for storing dm_io list
                  */
                 io->data = bio->bi_private;
   
- -              INIT_HLIST_HEAD(head);
- -
                 /* tell block layer to poll for completion */
                 bio->bi_cookie = ~BLK_QC_T_NONE;
+ +
+ +              io->next = NULL;
         } else {
                 /*
                  * bio recursed due to split, reuse original poll list,
                  * and save bio->bi_private too.
                  */
- -              io->data = hlist_entry(head->first, struct dm_io, node)->data;
+ +              io->data = (*head)->data;
+ +              io->next = *head;
         }
   
- -      hlist_add_head(&io->node, head);
+ +      *head = io;
   }
   
   /*
    * Select the correct strategy for processing a non-flush bio.
    */
- -static int __split_and_process_bio(struct clone_info *ci)
+ +static blk_status_t __split_and_process_bio(struct clone_info *ci)
   {
         struct bio *clone;
         struct dm_target *ti;
         unsigned len;
- -      int r;
   
         ti = dm_table_find_target(ci->map, ci->sector);
- -      if (!ti)
- -              return -EIO;
- -
- -      if (__process_abnormal_io(ci, ti, &r))
- -              return r;
+ +      if (unlikely(!ti))
+ +              return BLK_STS_IOERR;
+ +      else if (unlikely(ci->is_abnormal_io))
+ +              return __process_abnormal_io(ci, ti);
   
         /*
          * Only support bio polling for normal IO, and the target io is
@@@ -1610,30 -1547,27 +1630,30 @@@
         ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
   
         len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
+ +      setup_split_accounting(ci, len);
         clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
         __map_bio(clone);
   
         ci->sector += len;
         ci->sector_count -= len;
   
- -      return 0;
+ +      return BLK_STS_OK;
   }
   
   static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
- -                          struct dm_table *map, struct bio *bio)
+ +                          struct dm_table *map, struct bio *bio, bool is_abnormal)
   {
         ci->map = map;
         ci->io = alloc_io(md, bio);
         ci->bio = bio;
+ +      ci->is_abnormal_io = is_abnormal;
         ci->submit_as_polled = false;
         ci->sector = bio->bi_iter.bi_sector;
         ci->sector_count = bio_sectors(bio);
   
         /* Shouldn't happen but sector_count was being set to 0 so... */
- -      if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
+ +      if (static_branch_unlikely(&zoned_enabled) &&
+ +          WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
                 ci->sector_count = 0;
   }
   
@@@ -1644,21 -1578,10 +1664,21 @@@ static void dm_split_and_process_bio(st
                                      struct dm_table *map, struct bio *bio)
   {
         struct clone_info ci;
- -      struct bio *orig_bio = NULL;
- -      int error = 0;
+ +      struct dm_io *io;
+ +      blk_status_t error = BLK_STS_OK;
+ +      bool is_abnormal;
   
- -      init_clone_info(&ci, md, map, bio);
+ +      is_abnormal = is_abnormal_io(bio);
+ +      if (unlikely(is_abnormal)) {
+ +              /*
+ +               * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+ +               * otherwise associated queue_limits won't be imposed.
+ +               */
+ +              blk_queue_split(&bio);
+ +      }
+ +
+ +      init_clone_info(&ci, md, map, bio, is_abnormal);
+ +      io = ci.io;
   
         if (bio->bi_opf & REQ_PREFLUSH) {
                 __send_empty_flush(&ci);
@@@ -1667,34 -1590,40 +1687,34 @@@
         }
   
         error = __split_and_process_bio(&ci);
- -      ci.io->map_task = NULL;
         if (error || !ci.sector_count)
                 goto out;
- -
         /*
          * Remainder must be passed to submit_bio_noacct() so it gets handled
          * *after* bios already submitted have been completely processed.
- -       * We take a clone of the original to store in ci.io->orig_bio to be
- -       * used by dm_end_io_acct() and for dm_io_complete() to use for
- -       * completion handling.
          */
- -      orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count,
- -                           GFP_NOIO, &md->queue->bio_split);
- -      bio_chain(orig_bio, bio);
- -      trace_block_split(orig_bio, bio->bi_iter.bi_sector);
+ +      bio_trim(bio, io->sectors, ci.sector_count);
+ +      trace_block_split(bio, bio->bi_iter.bi_sector);
+ +      bio_inc_remaining(bio);
         submit_bio_noacct(bio);
   out:
- -      if (!orig_bio)
- -              orig_bio = bio;
- -      smp_store_release(&ci.io->orig_bio, orig_bio);
- -      if (dm_io_flagged(ci.io, DM_IO_START_ACCT))
- -              dm_start_io_acct(ci.io, NULL);
- -
         /*
          * Drop the extra reference count for non-POLLED bio, and hold one
          * reference for POLLED bio, which will be released in dm_poll_bio
          *
- -       * Add every dm_io instance into the hlist_head which is stored in
- -       * bio->bi_private, so that dm_poll_bio can poll them all.
+ +       * Add every dm_io instance into the dm_io list head which is stored
+ +       * in bio->bi_private, so that dm_poll_bio can poll them all.
          */
- -      if (error || !ci.submit_as_polled)
- -              dm_io_dec_pending(ci.io, errno_to_blk_status(error));
- -      else
- -              dm_queue_poll_io(bio, ci.io);
+ +      if (error || !ci.submit_as_polled) {
+ +              /*
+ +               * In case of submission failure, the extra reference for
+ +               * submitting io isn't consumed yet
+ +               */
+ +              if (error)
+ +                      atomic_dec(&io->io_count);
+ +              dm_io_dec_pending(io, error);
+ +      } else
+ +              dm_queue_poll_io(bio, io);
   }
   
   static void dm_submit_bio(struct bio *bio)
@@@ -1703,7 -1632,7 +1723,7 @@@
         int srcu_idx;
         struct dm_table *map;
   
- -      map = dm_get_live_table(md, &srcu_idx);
+ +      map = dm_get_live_table_bio(md, &srcu_idx, bio);
   
         /* If suspended, or map not yet available, queue this IO for later */
         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
@@@ -1717,9 -1646,16 +1737,9 @@@
                 goto out;
         }
   
- -      /*
- -       * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
- -       * otherwise associated queue_limits won't be imposed.
- -       */
- -      if (is_abnormal_io(bio))
- -              blk_queue_split(&bio);
- -
         dm_split_and_process_bio(md, map, bio);
   out:
- -      dm_put_live_table(md, srcu_idx);
+ +      dm_put_live_table_bio(md, srcu_idx, bio);
   }
   
   static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
@@@ -1738,16 -1674,18 +1758,16 @@@
   static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
                        unsigned int flags)
   {
- -      struct hlist_head *head = dm_get_bio_hlist_head(bio);
- -      struct hlist_head tmp = HLIST_HEAD_INIT;
- -      struct hlist_node *next;
- -      struct dm_io *io;
+ +      struct dm_io **head = dm_poll_list_head(bio);
+ +      struct dm_io *list = *head;
+ +      struct dm_io *tmp = NULL;
+ +      struct dm_io *curr, *next;
   
         /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
         if (!(bio->bi_opf & REQ_DM_POLL_LIST))
                 return 0;
   
- -      WARN_ON_ONCE(hlist_empty(head));
- -
- -      hlist_move_list(head, &tmp);
+ +      WARN_ON_ONCE(!list);
   
         /*
          * Restore .bi_private before possibly completing dm_io.
@@@ -1758,27 -1696,24 +1778,27 @@@
          * clearing REQ_DM_POLL_LIST here.
          */
         bio->bi_opf &= ~REQ_DM_POLL_LIST;
- -      bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
+ +      bio->bi_private = list->data;
   
- -      hlist_for_each_entry_safe(io, next, &tmp, node) {
- -              if (dm_poll_dm_io(io, iob, flags)) {
- -                      hlist_del_init(&io->node);
+ +      for (curr = list, next = curr->next; curr; curr = next, next =
+ +                      curr ? curr->next : NULL) {
+ +              if (dm_poll_dm_io(curr, iob, flags)) {
                         /*
- -                       * clone_endio() has already occurred, so passing
- -                       * error as 0 here doesn't override io->status
+ +                       * clone_endio() has already occurred, so no
+ +                       * error handling is needed here.
                          */
- -                      dm_io_dec_pending(io, 0);
+ +                      __dm_io_dec_pending(curr);
+ +              } else {
+ +                      curr->next = tmp;
+ +                      tmp = curr;
                 }
         }
   
         /* Not done? */
- -      if (!hlist_empty(&tmp)) {
+ +      if (tmp) {
                 bio->bi_opf |= REQ_DM_POLL_LIST;
                 /* Reset bio->bi_private to dm_io list head */
- -              hlist_move_list(&tmp, head);
+ +              *head = tmp;
                 return 0;
         }
         return 1;
@@@ -2987,8 -2922,8 +3007,8 @@@ int dm_noflush_suspending(struct dm_tar
   EXPORT_SYMBOL_GPL(dm_noflush_suspending);
   
   struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
- -                                          unsigned integrity, unsigned per_io_data_size,
- -                                          unsigned min_pool_size)
+ +                                          unsigned per_io_data_size, unsigned min_pool_size,
+ +                                          bool integrity, bool poll)
   {
         struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
         unsigned int pool_size = 0;
@@@ -3004,7 -2939,7 +3024,7 @@@
                 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
                 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
                 io_front_pad = roundup(per_io_data_size,  __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
- -              ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
+ +              ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0);
                 if (ret)
                         goto out;
                 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
@@@ -3231,6 -3166,7 +3251,7 @@@ static const struct block_device_operat
   static const struct dax_operations dm_dax_ops = {
         .direct_access = dm_dax_direct_access,
         .zero_page_range = dm_dax_zero_page_range,
+       .recovery_write = dm_dax_recovery_write,
   };
   
   /*
diff --combined fs/dax.c

index 1ac12e877f4f3f84acd9afc715fdf67f416f5500,a1e4b45cbf55b16ace2b7eb77b7ace871a48e2aa..4155a6107fa10d20914446e5798110fcddddc08f
--- 1/fs/dax.c
--- 2/fs/dax.c
+++ b/fs/dax.c
@@@ -24,7 -24,6 +24,7 @@@
   #include <linux/sizes.h>
   #include <linux/mmu_notifier.h>
   #include <linux/iomap.h>
+ +#include <linux/rmap.h>
   #include <asm/pgalloc.h>
   
   #define CREATE_TRACE_POINTS
@@@ -722,7 -721,8 +722,8 @@@ static int copy_cow_page_dax(struct vm_
         int id;
   
         id = dax_read_lock();
-       rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL);
+       rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
+                               &kaddr, NULL);
         if (rc < 0) {
                 dax_read_unlock(id);
                 return rc;
@@@ -790,12 -790,95 +791,12 @@@ static void *dax_insert_entry(struct xa
         return entry;
   }
   
- -static inline
- -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
- -{
- -      unsigned long address;
- -
- -      address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- -      VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- -      return address;
- -}
- -
- -/* Walk all mappings of a given index of a file and writeprotect them */
- -static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
- -              unsigned long pfn)
- -{
- -      struct vm_area_struct *vma;
- -      pte_t pte, *ptep = NULL;
- -      pmd_t *pmdp = NULL;
- -      spinlock_t *ptl;
- -
- -      i_mmap_lock_read(mapping);
- -      vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
- -              struct mmu_notifier_range range;
- -              unsigned long address;
- -
- -              cond_resched();
- -
- -              if (!(vma->vm_flags & VM_SHARED))
- -                      continue;
- -
- -              address = pgoff_address(index, vma);
- -
- -              /*
- -               * follow_invalidate_pte() will use the range to call
- -               * mmu_notifier_invalidate_range_start() on our behalf before
- -               * taking any lock.
- -               */
- -              if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- -                                        &pmdp, &ptl))
- -                      continue;
- -
- -              /*
- -               * No need to call mmu_notifier_invalidate_range() as we are
- -               * downgrading page table protection not changing it to point
- -               * to a new page.
- -               *
- -               * See Documentation/vm/mmu_notifier.rst
- -               */
- -              if (pmdp) {
- -#ifdef CONFIG_FS_DAX_PMD
- -                      pmd_t pmd;
- -
- -                      if (pfn != pmd_pfn(*pmdp))
- -                              goto unlock_pmd;
- -                      if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
- -                              goto unlock_pmd;
- -
- -                      flush_cache_page(vma, address, pfn);
- -                      pmd = pmdp_invalidate(vma, address, pmdp);
- -                      pmd = pmd_wrprotect(pmd);
- -                      pmd = pmd_mkclean(pmd);
- -                      set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- -unlock_pmd:
- -#endif
- -                      spin_unlock(ptl);
- -              } else {
- -                      if (pfn != pte_pfn(*ptep))
- -                              goto unlock_pte;
- -                      if (!pte_dirty(*ptep) && !pte_write(*ptep))
- -                              goto unlock_pte;
- -
- -                      flush_cache_page(vma, address, pfn);
- -                      pte = ptep_clear_flush(vma, address, ptep);
- -                      pte = pte_wrprotect(pte);
- -                      pte = pte_mkclean(pte);
- -                      set_pte_at(vma->vm_mm, address, ptep, pte);
- -unlock_pte:
- -                      pte_unmap_unlock(ptep, ptl);
- -              }
- -
- -              mmu_notifier_invalidate_range_end(&range);
- -      }
- -      i_mmap_unlock_read(mapping);
- -}
- -
   static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
                 struct address_space *mapping, void *entry)
   {
- -      unsigned long pfn, index, count;
+ +      unsigned long pfn, index, count, end;
         long ret = 0;
+ +      struct vm_area_struct *vma;
   
         /*
          * A page got tagged dirty in DAX mapping? Something is seriously
@@@ -853,16 -936,8 +854,16 @@@
         pfn = dax_to_pfn(entry);
         count = 1UL << dax_entry_order(entry);
         index = xas->xa_index & ~(count - 1);
+ +      end = index + count - 1;
+ +
+ +      /* Walk all mappings of a given index of a file and writeprotect them */
+ +      i_mmap_lock_read(mapping);
+ +      vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
+ +              pfn_mkclean_range(pfn, count, index, vma);
+ +              cond_resched();
+ +      }
+ +      i_mmap_unlock_read(mapping);
   
- -      dax_entry_mkclean(mapping, index, pfn);
         dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
         /*
          * After we have flushed the cache, we can clear the dirty tag. There
@@@ -939,7 -1014,7 +940,7 @@@ static int dax_iomap_pfn(const struct i
   
         id = dax_read_lock();
         length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
-                                  NULL, pfnp);
+                                  DAX_ACCESS, NULL, pfnp);
         if (length < 0) {
                 rc = length;
                 goto out;
@@@ -1048,7 -1123,7 +1049,7 @@@ static int dax_memzero(struct dax_devic
         void *kaddr;
         long ret;
   
-       ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL);
+       ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
         if (ret > 0) {
                 memset(kaddr + offset, 0, size);
                 dax_flush(dax_dev, kaddr + offset, size);
@@@ -1165,6 -1240,7 +1166,7 @@@ static loff_t dax_iomap_iter(const stru
                 const size_t size = ALIGN(length + offset, PAGE_SIZE);
                 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
                 ssize_t map_len;
+               bool recovery = false;
                 void *kaddr;
   
                 if (fatal_signal_pending(current)) {
@@@ -1173,7 -1249,14 +1175,14 @@@
                 }
   
                 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
-                               &kaddr, NULL);
+                               DAX_ACCESS, &kaddr, NULL);
+               if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+                       map_len = dax_direct_access(dax_dev, pgoff,
+                                       PHYS_PFN(size), DAX_RECOVERY_WRITE,
+                                       &kaddr, NULL);
+                       if (map_len > 0)
+                               recovery = true;
+               }
                 if (map_len < 0) {
                         ret = map_len;
                         break;
@@@ -1185,7 -1268,10 +1194,10 @@@
                 if (map_len > end - pos)
                         map_len = end - pos;
   
-               if (iov_iter_rw(iter) == WRITE)
+               if (recovery)
+                       xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
+                                       map_len, iter);
+               else if (iov_iter_rw(iter) == WRITE)
                         xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
                                         map_len, iter);
                 else
author	Linus Torvalds <[email protected]>
	Fri, 27 May 2022 22:49:30 +0000 (15:49 -0700)
committer	Linus Torvalds <[email protected]>
	Fri, 27 May 2022 22:49:30 +0000 (15:49 -0700)
		1	2
arch/x86/kernel/cpu/mce/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/pat/set_memory.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm-linear.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm-log-writes.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/dax.c	patch \|	diff1 \|	diff2 \|	blob \| history