From: Linus Torvalds Date: Fri, 27 May 2022 22:49:30 +0000 (-0700) Subject: Merge tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdim... X-Git-Url: https://repo.jachan.dev/J-linux.git/commitdiff_plain/35cdd8656eac470b9abc9de8d4bd268fbc0fb34b?hp=-c Merge tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm Pull libnvdimm and DAX updates from Dan Williams: "New support for clearing memory errors when a file is in DAX mode, alongside with some other fixes and cleanups. Previously it was only possible to clear these errors using a truncate or hole-punch operation to trigger the filesystem to reallocate the block, now, any page aligned write can opportunistically clear errors as well. This change spans x86/mm, nvdimm, and fs/dax, and has received the appropriate sign-offs. Thanks to Jane for her work on this. Summary: - Add support for clearing memory error via pwrite(2) on DAX - Fix 'security overwrite' support in the presence of media errors - Miscellaneous cleanups and fixes for nfit_test (nvdimm unit tests)" * tag 'libnvdimm-for-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: pmem: implement pmem_recovery_write() pmem: refactor pmem_clear_poison() dax: add .recovery_write dax_operation dax: introduce DAX_RECOVERY_WRITE dax access mode mce: fix set_mce_nospec to always unmap the whole page x86/mce: relocate set{clear}_mce_nospec() functions acpi/nfit: rely on mce->misc to determine poison granularity testing: nvdimm: asm/mce.h is not needed in nfit.c testing: nvdimm: iomap: make __nfit_test_ioremap a macro nvdimm: Allow overwrite in the presence of disabled dimms tools/testing/nvdimm: remove unneeded flush_workqueue --- 35cdd8656eac470b9abc9de8d4bd268fbc0fb34b diff --combined arch/x86/kernel/cpu/mce/core.c index d775fcd74e98,fa67bb9d1afe..2c8ec5c71712 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@@ -69,9 -69,7 +69,9 @@@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int struct mce_bank { u64 ctl; /* subevents to enable */ - bool init; /* initialise bank? */ + + __u64 init : 1, /* initialise bank? 
*/ + __reserved_1 : 63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); @@@ -581,7 -579,7 +581,7 @@@ static int uc_decode_notifier(struct no pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn, whole_page(mce)); + set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; } @@@ -1318,7 -1316,7 +1318,7 @@@ static void kill_me_maybe(struct callba ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); sync_core(); return; } @@@ -1344,7 -1342,7 +1344,7 @@@ static void kill_me_never(struct callba p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); } static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) diff --combined arch/x86/mm/pat/set_memory.c index 0656db33574d,44f0d4260bd8..1abd5438f126 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@@ -19,6 -19,7 +19,7 @@@ #include #include #include + #include #include #include @@@ -29,7 -30,6 +30,6 @@@ #include #include #include - #include #include #include @@@ -638,6 -638,17 +638,6 @@@ pte_t *lookup_address(unsigned long add } EXPORT_SYMBOL_GPL(lookup_address); -/* - * Lookup the page table entry for a virtual address in a given mm. Return a - * pointer to the entry and the level of the mapping. - */ -pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, - unsigned int *level) -{ - return lookup_address_in_pgd(pgd_offset(mm, address), address, level); -} -EXPORT_SYMBOL_GPL(lookup_address_in_mm); - static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { @@@ -1805,7 -1816,7 +1805,7 @@@ static inline int cpa_clear_pages_array } /* - * _set_memory_prot is an internal helper for callers that have been passed + * __set_memory_prot is an internal helper for callers that have been passed * a pgprot_t value from upper layers and a reservation has already been taken. * If you want to set the pgprot to a specific page protocol, use the * set_memory_xx() functions. @@@ -1914,6 -1925,51 +1914,51 @@@ int set_memory_wb(unsigned long addr, i } EXPORT_SYMBOL(set_memory_wb); + /* Prevent speculative access to a page by marking it not-present */ + #ifdef CONFIG_X86_64 + int set_mce_nospec(unsigned long pfn) + { + unsigned long decoy_addr; + int rc; + + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; + /* + * We would like to just call: + * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_XX() properly sanitizing any __pa() + * results with __PHYSICAL_MASK or PTE_PFN_MASK. 
+ */ + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); + + rc = set_memory_np(decoy_addr, 1); + if (rc) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + return rc; + } + + static int set_memory_present(unsigned long *addr, int numpages) + { + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); + } + + /* Restore full speculative operation to the pfn. */ + int clear_mce_nospec(unsigned long pfn) + { + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); + + return set_memory_present(&addr, 1); + } + EXPORT_SYMBOL_GPL(clear_mce_nospec); + #endif /* CONFIG_X86_64 */ + int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) diff --combined drivers/md/dm-linear.c index 0a6abbbe3745,cdf48bc8c5b0..3212ef6aa81b --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@@ -84,12 -84,19 +84,12 @@@ static sector_t linear_map_sector(struc return lc->start + dm_target_offset(ti, bi_sector); } -static void linear_map_bio(struct dm_target *ti, struct bio *bio) +static int linear_map(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - linear_map_sector(ti, bio->bi_iter.bi_sector); -} - -static int linear_map(struct dm_target *ti, struct bio *bio) -{ - linear_map_bio(ti, bio); + bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); return DM_MAPIO_REMAPPED; } @@@ -165,11 -172,12 +165,12 @@@ static struct dax_device *linear_dax_pg } static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, @@@ -180,9 -188,18 +181,18 @@@ return dax_zero_page_range(dax_dev, pgoff, nr_pages); } + static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) + { + struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); + } + #else #define linear_dax_direct_access NULL #define linear_dax_zero_page_range NULL + #define linear_dax_recovery_write NULL #endif static struct target_type linear_target = { @@@ -200,6 -217,7 +210,7 @@@ .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_zero_page_range = linear_dax_zero_page_range, + .dax_recovery_write = linear_dax_recovery_write, }; int __init dm_linear_init(void) diff --combined drivers/md/dm-log-writes.c index e194226c89e5,22739dccdd17..20fd688f72e7 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@@ -866,8 -866,9 +866,8 @@@ static int log_writes_message(struct dm static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct log_writes_c *lc = ti->private; - struct request_queue *q = bdev_get_queue(lc->dev->bdev); - if (!q || !blk_queue_discard(q)) { + if (!bdev_max_discard_sectors(lc->dev->bdev)) { lc->device_supports_discard = false; limits->discard_granularity = lc->sectorsize; limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); @@@ -888,11 -889,12 +888,12 @@@ static struct dax_device *log_writes_da } static long 
log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); - return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); + return dax_direct_access(dax_dev, pgoff, nr_pages, mode, kaddr, pfn); } static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff, @@@ -903,9 -905,18 +904,18 @@@ return dax_zero_page_range(dax_dev, pgoff, nr_pages << PAGE_SHIFT); } + static size_t log_writes_dax_recovery_write(struct dm_target *ti, + pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) + { + struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); + + return dax_recovery_write(dax_dev, pgoff, addr, bytes, i); + } + #else #define log_writes_dax_direct_access NULL #define log_writes_dax_zero_page_range NULL + #define log_writes_dax_recovery_write NULL #endif static struct target_type log_writes_target = { @@@ -923,6 -934,7 +933,7 @@@ .io_hints = log_writes_io_hints, .direct_access = log_writes_dax_direct_access, .dax_zero_page_range = log_writes_dax_zero_page_range, + .dax_recovery_write = log_writes_dax_recovery_write, }; static int __init dm_log_writes_init(void) diff --combined drivers/md/dm.c index d62f1354ecbf,3fe76ab20069..dfb0a551bd88 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@@ -71,10 -71,6 +71,10 @@@ void dm_issue_global_event(void wake_up(&dm_global_eventq); } +DEFINE_STATIC_KEY_FALSE(stats_enabled); +DEFINE_STATIC_KEY_FALSE(swap_bios_enabled); +DEFINE_STATIC_KEY_FALSE(zoned_enabled); + /* * One of these is allocated (on-stack) per original bio. */ @@@ -84,8 -80,7 +84,8 @@@ struct clone_info struct dm_io *io; sector_t sector; unsigned sector_count; - bool submit_as_polled; + bool is_abnormal_io:1; + bool submit_as_polled:1; }; #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) @@@ -503,76 -498,69 +503,76 @@@ static bool bio_is_flush_with_data(stru return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); } -static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, - unsigned long start_time, struct dm_stats_aux *stats_aux) +static void dm_io_acct(struct dm_io *io, bool end) { - bool is_flush_with_data; - unsigned int bi_size; + struct dm_stats_aux *stats_aux = &io->stats_aux; + unsigned long start_time = io->start_time; + struct mapped_device *md = io->md; + struct bio *bio = io->orig_bio; + unsigned int sectors; - /* If REQ_PREFLUSH set save any payload but do not account it */ - is_flush_with_data = bio_is_flush_with_data(bio); - if (is_flush_with_data) { - bi_size = bio->bi_iter.bi_size; - bio->bi_iter.bi_size = 0; - } + /* + * If REQ_PREFLUSH set, don't account payload, it will be + * submitted (and accounted) after this flush completes. 
+ */ + if (bio_is_flush_with_data(bio)) + sectors = 0; + else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT)))) + sectors = bio_sectors(bio); + else + sectors = io->sectors; if (!end) - bio_start_io_acct_time(bio, start_time); + bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), + start_time); else - bio_end_io_acct(bio, start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&md->stats))) { + sector_t sector; + + if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT))) + sector = bio->bi_iter.bi_sector; + else + sector = bio_end_sector(bio) - io->sector_offset; - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), - bio->bi_iter.bi_sector, bio_sectors(bio), + sector, sectors, end, start_time, stats_aux); - - /* Restore bio's payload so it does get accounted upon requeue */ - if (is_flush_with_data) - bio->bi_iter.bi_size = bi_size; + } } -static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) +static void __dm_start_io_acct(struct dm_io *io) { - dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(io, false); } static void dm_start_io_acct(struct dm_io *io, struct bio *clone) { - /* Must account IO to DM device in terms of orig_bio */ - struct bio *bio = io->orig_bio; - /* * Ensure IO accounting is only ever started once. - * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ - if (!clone || - likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { - if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) - return; + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) + return; + + /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ + if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) { dm_io_set_flag(io, DM_IO_ACCOUNTED); } else { unsigned long flags; - if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - return; /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ spin_lock_irqsave(&io->lock, flags); dm_io_set_flag(io, DM_IO_ACCOUNTED); spin_unlock_irqrestore(&io->lock, flags); } - __dm_start_io_acct(io, bio); + __dm_start_io_acct(io); } -static void dm_end_io_acct(struct dm_io *io, struct bio *bio) +static void dm_end_io_acct(struct dm_io *io) { - dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(io, true); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) @@@ -581,9 -569,7 +581,9 @@@ struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); + clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs); + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; tio = clone_to_tio(clone); tio->flags = 0; @@@ -592,19 -578,17 +592,19 @@@ io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; - io->status = 0; - atomic_set(&io->io_count, 1); + io->status = BLK_STS_OK; + + /* one ref is for submission, the other is for completion */ + atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); - io->orig_bio = NULL; + io->orig_bio = bio; io->md = md; - io->map_task = current; spin_lock_init(&io->lock); io->start_time = jiffies; io->flags = 0; - dm_stats_record_start(&md->stats, &io->stats_aux); + if (static_branch_unlikely(&stats_enabled)) + dm_stats_record_start(&md->stats, &io->stats_aux); return io; } @@@ -615,7 -599,7 +615,7 @@@ static void free_io(struct dm_io *io } static struct bio *alloc_tio(struct clone_info *ci, struct 
dm_target *ti, - unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) + unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { struct dm_target_io *tio; struct bio *clone; @@@ -626,13 -610,10 +626,13 @@@ /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, - gfp_mask, &ci->io->md->bs); + struct mapped_device *md = ci->io->md; + + clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs); if (!clone) return NULL; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@@ -682,16 -663,14 +682,16 @@@ static void queue_io(struct mapped_devi * function to access the md->map field, and make sure they call * dm_put_live_table() when finished. */ -struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) +struct dm_table *dm_get_live_table(struct mapped_device *md, + int *srcu_idx) __acquires(md->io_barrier) { *srcu_idx = srcu_read_lock(&md->io_barrier); return srcu_dereference(md->map, &md->io_barrier); } -void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +void dm_put_live_table(struct mapped_device *md, + int srcu_idx) __releases(md->io_barrier) { srcu_read_unlock(&md->io_barrier, srcu_idx); } @@@ -717,24 -696,6 +717,24 @@@ static void dm_put_live_table_fast(stru rcu_read_unlock(); } +static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, + int *srcu_idx, struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + return dm_get_live_table_fast(md); + else + return dm_get_live_table(md, srcu_idx); +} + +static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx, + struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + dm_put_live_table_fast(md); + else + dm_put_live_table(md, srcu_idx); +} + static char *_dm_claim_ptr = "I belong to device-mapper"; /* @@@ -914,14 -875,14 +914,14 @@@ static void dm_io_complete(struct dm_i io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - dm_end_io_acct(io, bio); + dm_end_io_acct(io); else if (!io_error) { /* * Must handle target that DM_MAPIO_SUBMITTED only to * then bio_endio() rather than dm_submit_bio_remap() */ - __dm_start_io_acct(io, bio); - dm_end_io_acct(io, bio); + __dm_start_io_acct(io); + dm_end_io_acct(io); } free_io(io); smp_wmb(); @@@ -938,7 -899,7 +938,7 @@@ * may only reflect a subset of the pre-split original) * so clear REQ_POLLED in case of requeue. */ - bio->bi_opf &= ~REQ_POLLED; + bio_clear_polled(bio); if (io_error == BLK_STS_AGAIN) { /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ queue_io(md, bio); @@@ -962,35 -923,30 +962,35 @@@ } } -static inline bool dm_tio_is_normal(struct dm_target_io *tio) -{ - return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && - !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); -} - /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. 
*/ -void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +static inline void __dm_io_dec_pending(struct dm_io *io) { + if (atomic_dec_and_test(&io->io_count)) + dm_io_complete(io); +} + +static void dm_io_set_error(struct dm_io *io, blk_status_t error) +{ + unsigned long flags; + /* Push-back supersedes any I/O errors */ - if (unlikely(error)) { - unsigned long flags; - spin_lock_irqsave(&io->lock, flags); - if (!(io->status == BLK_STS_DM_REQUEUE && - __noflush_suspending(io->md))) - io->status = error; - spin_unlock_irqrestore(&io->lock, flags); + spin_lock_irqsave(&io->lock, flags); + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(io->md))) { + io->status = error; } + spin_unlock_irqrestore(&io->lock, flags); +} - if (atomic_dec_and_test(&io->io_count)) - dm_io_complete(io); +static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +{ + if (unlikely(error)) + dm_io_set_error(io, error); + + __dm_io_dec_pending(io); } void disable_discard(struct mapped_device *md) @@@ -999,6 -955,7 +999,6 @@@ /* device doesn't really support DISCARD, disable it */ limits->max_discard_sectors = 0; - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue); } void disable_write_zeroes(struct mapped_device *md) @@@ -1018,43 -975,35 +1018,43 @@@ static void clone_endio(struct bio *bio { blk_status_t error = bio->bi_status; struct dm_target_io *tio = clone_to_tio(bio); + struct dm_target *ti = tio->ti; + dm_endio_fn endio = ti->type->end_io; struct dm_io *io = tio->io; - struct mapped_device *md = tio->io->md; - dm_endio_fn endio = tio->ti->type->end_io; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct mapped_device *md = io->md; - if (unlikely(error == BLK_STS_TARGET)) { - if (bio_op(bio) == REQ_OP_DISCARD && - !q->limits.max_discard_sectors) - disable_discard(md); - else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !q->limits.max_write_zeroes_sectors) - disable_write_zeroes(md); - } + if (likely(bio->bi_bdev != md->disk->part0)) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (unlikely(error == BLK_STS_TARGET)) { + if (bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev)) + disable_discard(md); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !q->limits.max_write_zeroes_sectors) + disable_write_zeroes(md); + } - if (blk_queue_is_zoned(q)) - dm_zone_endio(io, bio); + if (static_branch_unlikely(&zoned_enabled) && + unlikely(blk_queue_is_zoned(q))) + dm_zone_endio(io, bio); + } if (endio) { - int r = endio(tio->ti, bio, &error); + int r = endio(ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - /* - * Requeuing writes to a sequential zone of a zoned - * target will break the sequential write pattern: - * fail such IO. - */ - if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) - error = BLK_STS_IOERR; - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. 
+ */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; + } else error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: @@@ -1068,9 -1017,10 +1068,9 @@@ } } - if (unlikely(swap_bios_limit(tio->ti, bio))) { - struct mapped_device *md = io->md; + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); - } free_tio(bio); dm_io_dec_pending(io, error); @@@ -1143,7 -1093,8 +1143,8 @@@ static struct dm_target *dm_dax_get_liv } static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, - long nr_pages, void **kaddr, pfn_t *pfn) + long nr_pages, enum dax_access_mode mode, void **kaddr, + pfn_t *pfn) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; @@@ -1161,7 -1112,7 +1162,7 @@@ if (len < 1) goto out; nr_pages = min(len, nr_pages); - ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); + ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn); out: dm_put_live_table(md, srcu_idx); @@@ -1196,6 -1147,25 +1197,25 @@@ static int dm_dax_zero_page_range(struc return ret; } + static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, + void *addr, size_t bytes, struct iov_iter *i) + { + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + int srcu_idx; + long ret = 0; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + if (!ti || !ti->type->dax_recovery_write) + goto out; + + ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i); + out: + dm_put_live_table(md, srcu_idx); + return ret; + } + /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management @@@ -1212,7 -1182,7 +1232,7 @@@ * +--------------------+---------------+-------+ * * <-------------- *tio->len_ptr ---------------> - * <------- bi_size -------> + * <----- bio_sectors -----> * <-- n_sectors --> * * Region 1 was already iterated over with bio_advance or similar function. @@@ -1229,25 -1199,25 +1249,25 @@@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); - unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + unsigned bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(op_is_zone_mgmt(bio_op(bio))); BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); - BUG_ON(bi_size > *tio->len_ptr); - BUG_ON(n_sectors > bi_size); + BUG_ON(bio_sectors > *tio->len_ptr); + BUG_ON(n_sectors > bio_sectors); - *tio->len_ptr -= bi_size - n_sectors; + *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; -} -EXPORT_SYMBOL_GPL(dm_accept_partial_bio); -static inline void __dm_submit_bio_remap(struct bio *clone, - dev_t dev, sector_t old_sector) -{ - trace_block_bio_remap(clone, dev, old_sector); - submit_bio_noacct(clone); + /* + * __split_and_process_bio() may have already saved mapped part + * for accounting but it is being reduced so update accordingly. 
+ */ + dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT); + tio->io->sectors = n_sectors; } +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); /* * @clone: clone bio that DM core passed to target's .map function @@@ -1263,6 -1233,8 +1283,6 @@@ void dm_submit_bio_remap(struct bio *cl struct dm_target_io *tio = clone_to_tio(clone); struct dm_io *io = tio->io; - WARN_ON_ONCE(!tio->ti->accounts_remapped_io); - /* establish bio that will get submitted */ if (!tgt_clone) tgt_clone = clone; @@@ -1271,11 -1243,22 +1291,11 @@@ * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED. */ - if (io->map_task == current) { - /* Still in target's map function */ - dm_io_set_flag(io, DM_IO_START_ACCT); - } else { - /* - * Called by another thread, managed by DM target, - * wait for dm_split_and_process_bio() to store - * io->orig_bio - */ - while (unlikely(!smp_load_acquire(&io->orig_bio))) - msleep(1); - dm_start_io_acct(io, clone); - } + dm_start_io_acct(io, clone); - __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), + trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk), tio->old_sector); + submit_bio_noacct(tgt_clone); } EXPORT_SYMBOL_GPL(dm_submit_bio_remap); @@@ -1298,53 -1281,55 +1318,53 @@@ static noinline void __set_swap_bios_li static void __map_bio(struct bio *clone) { struct dm_target_io *tio = clone_to_tio(clone); - int r; - struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; + int r; clone->bi_end_io = clone_endio; /* * Map the clone. */ - dm_io_inc_pending(io); tio->old_sector = clone->bi_iter.bi_sector; - if (unlikely(swap_bios_limit(ti, clone))) { - struct mapped_device *md = io->md; + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) { int latch = get_swap_bios(); if (unlikely(latch != md->swap_bios)) __set_swap_bios_limit(md, latch); down(&md->swap_bios_semaphore); } - /* - * Check if the IO needs a special mapping due to zone append emulation - * on zoned target. In this case, dm_zone_map_bio() calls the target - * map operation. - */ - if (dm_emulate_zone_append(io->md)) - r = dm_zone_map_bio(tio); - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Check if the IO needs a special mapping due to zone append + * emulation on zoned target. In this case, dm_zone_map_bio() + * calls the target map operation. + */ + if (unlikely(dm_emulate_zone_append(md))) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + } else r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: /* target has assumed ownership of this io */ if (!ti->accounts_remapped_io) - dm_io_set_flag(io, DM_IO_START_ACCT); + dm_start_io_acct(io, clone); break; case DM_MAPIO_REMAPPED: - /* - * the bio has been remapped so dispatch it, but defer - * dm_start_io_acct() until after possible bio_split(). 
- */ - __dm_submit_bio_remap(clone, disk_devt(io->md->disk), - tio->old_sector); - dm_io_set_flag(io, DM_IO_START_ACCT); + dm_submit_bio_remap(clone, NULL); break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: - if (unlikely(swap_bios_limit(ti, clone))) - up(&io->md->swap_bios_semaphore); + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) + up(&md->swap_bios_semaphore); free_tio(clone); if (r == DM_MAPIO_KILL) dm_io_dec_pending(io, BLK_STS_IOERR); @@@ -1357,31 -1342,6 +1377,31 @@@ } } +static void setup_split_accounting(struct clone_info *ci, unsigned len) +{ + struct dm_io *io = ci->io; + + if (ci->sector_count > len) { + /* + * Split needed, save the mapped part for accounting. + * NOTE: dm_accept_partial_bio() will update accordingly. + */ + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = len; + } + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&io->md->stats))) { + /* + * Save bi_sector in terms of its offset from end of + * original bio, only needed for DM-stats' benefit. + * - saved regardless of whether split needed so that + * dm_accept_partial_bio() doesn't need to. + */ + io->sector_offset = bio_end_sector(ci->bio) - ci->sector; + } +} + static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { @@@ -1411,22 -1371,18 +1431,22 @@@ } } -static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, +static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; + int ret = 0; switch (num_bios) { case 0: break; case 1: + if (len) + setup_split_accounting(ci, *len); clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); __map_bio(clone); + ret = 1; break; default: /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ @@@ -1434,12 -1390,9 +1454,12 @@@ while ((clone = bio_list_pop(&blist))) { dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); __map_bio(clone); + ret += 1; } break; } + + return ret; } static void __send_empty_flush(struct clone_info *ci) @@@ -1460,19 -1413,8 +1480,19 @@@ ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + while ((ti = dm_table_get_target(ci->map, target_nr++))) { + int bios; + + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following subtraction + */ + atomic_sub(1, &ci->io->io_count); bio_uninit(ci->bio); } @@@ -1481,18 -1423,11 +1501,18 @@@ static void __send_changing_extent_only unsigned num_bios) { unsigned len; + int bios; len = min_t(sector_t, ci->sector_count, max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); - __send_duplicate_bios(ci, ti, num_bios, &len); + atomic_add(num_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following (+1) subtraction + */ + atomic_sub(num_bios - bios + 1, &ci->io->io_count); ci->sector += len; ci->sector_count -= len; @@@ -1500,24 -1435,21 +1520,24 @@@ static bool is_abnormal_io(struct bio *bio) { - bool r = false; + 
unsigned int op = bio_op(bio); - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - r = true; - break; + if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return true; + default: + break; + } } - return r; + return false; } -static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, - int *result) +static blk_status_t __process_abnormal_io(struct clone_info *ci, + struct dm_target *ti) { unsigned num_bios = 0; @@@ -1531,6 -1463,8 +1551,6 @@@ case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; break; - default: - return false; } /* @@@ -1539,15 -1473,17 +1559,15 @@@ * reconfiguration might also have changed that since the * check was performed. */ - if (!num_bios) - *result = -EOPNOTSUPP; - else { - __send_changing_extent_only(ci, ti, num_bios); - *result = 0; - } - return true; + if (unlikely(!num_bios)) + return BLK_STS_NOTSUPP; + + __send_changing_extent_only(ci, ti, num_bios); + return BLK_STS_OK; } /* - * Reuse ->bi_private as hlist head for storing all dm_io instances + * Reuse ->bi_private as dm_io list head for storing all dm_io instances * associated with this bio, and this bio's bi_private needs to be * stored in dm_io->data before the reuse. * @@@ -1555,53 -1491,54 +1575,53 @@@ * touch it after splitting. Meantime it won't be changed by anyone after * bio is submitted. So this reuse is safe. */ -static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) +static inline struct dm_io **dm_poll_list_head(struct bio *bio) { - return (struct hlist_head *)&bio->bi_private; + return (struct dm_io **)&bio->bi_private; } static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); + struct dm_io **head = dm_poll_list_head(bio); if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { bio->bi_opf |= REQ_DM_POLL_LIST; /* * Save .bi_private into dm_io, so that we can reuse - * .bi_private as hlist head for storing dm_io list + * .bi_private as dm_io list head for storing dm_io list */ io->data = bio->bi_private; - INIT_HLIST_HEAD(head); - /* tell block layer to poll for completion */ bio->bi_cookie = ~BLK_QC_T_NONE; + + io->next = NULL; } else { /* * bio recursed due to split, reuse original poll list, * and save bio->bi_private too. */ - io->data = hlist_entry(head->first, struct dm_io, node)->data; + io->data = (*head)->data; + io->next = *head; } - hlist_add_head(&io->node, head); + *head = io; } /* * Select the correct strategy for processing a non-flush bio. 
*/ -static int __split_and_process_bio(struct clone_info *ci) +static blk_status_t __split_and_process_bio(struct clone_info *ci) { struct bio *clone; struct dm_target *ti; unsigned len; - int r; ti = dm_table_find_target(ci->map, ci->sector); - if (!ti) - return -EIO; - - if (__process_abnormal_io(ci, ti, &r)) - return r; + if (unlikely(!ti)) + return BLK_STS_IOERR; + else if (unlikely(ci->is_abnormal_io)) + return __process_abnormal_io(ci, ti); /* * Only support bio polling for normal IO, and the target io is @@@ -1610,30 -1547,27 +1630,30 @@@ ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + setup_split_accounting(ci, len); clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); __map_bio(clone); ci->sector += len; ci->sector_count -= len; - return 0; + return BLK_STS_OK; } static void init_clone_info(struct clone_info *ci, struct mapped_device *md, - struct dm_table *map, struct bio *bio) + struct dm_table *map, struct bio *bio, bool is_abnormal) { ci->map = map; ci->io = alloc_io(md, bio); ci->bio = bio; + ci->is_abnormal_io = is_abnormal; ci->submit_as_polled = false; ci->sector = bio->bi_iter.bi_sector; ci->sector_count = bio_sectors(bio); /* Shouldn't happen but sector_count was being set to 0 so... */ - if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) + if (static_branch_unlikely(&zoned_enabled) && + WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) ci->sector_count = 0; } @@@ -1644,21 -1578,10 +1664,21 @@@ static void dm_split_and_process_bio(st struct dm_table *map, struct bio *bio) { struct clone_info ci; - struct bio *orig_bio = NULL; - int error = 0; + struct dm_io *io; + blk_status_t error = BLK_STS_OK; + bool is_abnormal; - init_clone_info(&ci, md, map, bio); + is_abnormal = is_abnormal_io(bio); + if (unlikely(is_abnormal)) { + /* + * Use blk_queue_split() for abnormal IO (e.g. discard, etc) + * otherwise associated queue_limits won't be imposed. + */ + blk_queue_split(&bio); + } + + init_clone_info(&ci, md, map, bio, is_abnormal); + io = ci.io; if (bio->bi_opf & REQ_PREFLUSH) { __send_empty_flush(&ci); @@@ -1667,34 -1590,40 +1687,34 @@@ } error = __split_and_process_bio(&ci); - ci.io->map_task = NULL; if (error || !ci.sector_count) goto out; - /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. - * We take a clone of the original to store in ci.io->orig_bio to be - * used by dm_end_io_acct() and for dm_io_complete() to use for - * completion handling. */ - orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, - GFP_NOIO, &md->queue->bio_split); - bio_chain(orig_bio, bio); - trace_block_split(orig_bio, bio->bi_iter.bi_sector); + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); submit_bio_noacct(bio); out: - if (!orig_bio) - orig_bio = bio; - smp_store_release(&ci.io->orig_bio, orig_bio); - if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) - dm_start_io_acct(ci.io, NULL); - /* * Drop the extra reference count for non-POLLED bio, and hold one * reference for POLLED bio, which will be released in dm_poll_bio * - * Add every dm_io instance into the hlist_head which is stored in - * bio->bi_private, so that dm_poll_bio can poll them all. + * Add every dm_io instance into the dm_io list head which is stored + * in bio->bi_private, so that dm_poll_bio can poll them all. 
*/ - if (error || !ci.submit_as_polled) - dm_io_dec_pending(ci.io, errno_to_blk_status(error)); - else - dm_queue_poll_io(bio, ci.io); + if (error || !ci.submit_as_polled) { + /* + * In case of submission failure, the extra reference for + * submitting io isn't consumed yet + */ + if (error) + atomic_dec(&io->io_count); + dm_io_dec_pending(io, error); + } else + dm_queue_poll_io(bio, io); } static void dm_submit_bio(struct bio *bio) @@@ -1703,7 -1632,7 +1723,7 @@@ int srcu_idx; struct dm_table *map; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table_bio(md, &srcu_idx, bio); /* If suspended, or map not yet available, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || @@@ -1717,9 -1646,16 +1737,9 @@@ goto out; } - /* - * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) - * otherwise associated queue_limits won't be imposed. - */ - if (is_abnormal_io(bio)) - blk_queue_split(&bio); - dm_split_and_process_bio(md, map, bio); out: - dm_put_live_table(md, srcu_idx); + dm_put_live_table_bio(md, srcu_idx, bio); } static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, @@@ -1738,16 -1674,18 +1758,16 @@@ static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); - struct hlist_head tmp = HLIST_HEAD_INIT; - struct hlist_node *next; - struct dm_io *io; + struct dm_io **head = dm_poll_list_head(bio); + struct dm_io *list = *head; + struct dm_io *tmp = NULL; + struct dm_io *curr, *next; /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ if (!(bio->bi_opf & REQ_DM_POLL_LIST)) return 0; - WARN_ON_ONCE(hlist_empty(head)); - - hlist_move_list(head, &tmp); + WARN_ON_ONCE(!list); /* * Restore .bi_private before possibly completing dm_io. @@@ -1758,27 -1696,24 +1778,27 @@@ * clearing REQ_DM_POLL_LIST here. */ bio->bi_opf &= ~REQ_DM_POLL_LIST; - bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; + bio->bi_private = list->data; - hlist_for_each_entry_safe(io, next, &tmp, node) { - if (dm_poll_dm_io(io, iob, flags)) { - hlist_del_init(&io->node); + for (curr = list, next = curr->next; curr; curr = next, next = + curr ? curr->next : NULL) { + if (dm_poll_dm_io(curr, iob, flags)) { /* - * clone_endio() has already occurred, so passing - * error as 0 here doesn't override io->status + * clone_endio() has already occurred, so no + * error handling is needed here. */ - dm_io_dec_pending(io, 0); + __dm_io_dec_pending(curr); + } else { + curr->next = tmp; + tmp = curr; } } /* Not done? 
*/ - if (!hlist_empty(&tmp)) { + if (tmp) { bio->bi_opf |= REQ_DM_POLL_LIST; /* Reset bio->bi_private to dm_io list head */ - hlist_move_list(&tmp, head); + *head = tmp; return 0; } return 1; @@@ -2987,8 -2922,8 +3007,8 @@@ int dm_noflush_suspending(struct dm_tar EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_io_data_size, - unsigned min_pool_size) + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; @@@ -3004,7 -2939,7 +3024,7 @@@ pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); + ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0); if (ret) goto out; if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) @@@ -3231,6 -3166,7 +3251,7 @@@ static const struct block_device_operat static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .zero_page_range = dm_dax_zero_page_range, + .recovery_write = dm_dax_recovery_write, }; /* diff --combined fs/dax.c index 1ac12e877f4f,a1e4b45cbf55..4155a6107fa1 --- a/fs/dax.c +++ b/fs/dax.c @@@ -24,7 -24,6 +24,7 @@@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@@ -722,7 -721,8 +722,8 @@@ static int copy_cow_page_dax(struct vm_ int id; id = dax_read_lock(); - rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, &kaddr, NULL); + rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, + &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; @@@ -790,12 -790,95 +791,12 @@@ static void *dax_insert_entry(struct xa return entry; } -static inline -unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) -{ - unsigned long address; - - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - return address; -} - -/* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, - unsigned long pfn) -{ - struct vm_area_struct *vma; - pte_t pte, *ptep = NULL; - pmd_t *pmdp = NULL; - spinlock_t *ptl; - - i_mmap_lock_read(mapping); - vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - struct mmu_notifier_range range; - unsigned long address; - - cond_resched(); - - if (!(vma->vm_flags & VM_SHARED)) - continue; - - address = pgoff_address(index, vma); - - /* - * follow_invalidate_pte() will use the range to call - * mmu_notifier_invalidate_range_start() on our behalf before - * taking any lock. - */ - if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep, - &pmdp, &ptl)) - continue; - - /* - * No need to call mmu_notifier_invalidate_range() as we are - * downgrading page table protection not changing it to point - * to a new page. 
- * - * See Documentation/vm/mmu_notifier.rst - */ - if (pmdp) { -#ifdef CONFIG_FS_DAX_PMD - pmd_t pmd; - - if (pfn != pmd_pfn(*pmdp)) - goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) - goto unlock_pmd; - - flush_cache_page(vma, address, pfn); - pmd = pmdp_invalidate(vma, address, pmdp); - pmd = pmd_wrprotect(pmd); - pmd = pmd_mkclean(pmd); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); -unlock_pmd: -#endif - spin_unlock(ptl); - } else { - if (pfn != pte_pfn(*ptep)) - goto unlock_pte; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock_pte; - - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); -unlock_pte: - pte_unmap_unlock(ptep, ptl); - } - - mmu_notifier_invalidate_range_end(&range); - } - i_mmap_unlock_read(mapping); -} - static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn, index, count; + unsigned long pfn, index, count, end; long ret = 0; + struct vm_area_struct *vma; /* * A page got tagged dirty in DAX mapping? Something is seriously @@@ -853,16 -936,8 +854,16 @@@ pfn = dax_to_pfn(entry); count = 1UL << dax_entry_order(entry); index = xas->xa_index & ~(count - 1); + end = index + count - 1; + + /* Walk all mappings of a given index of a file and writeprotect them */ + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { + pfn_mkclean_range(pfn, count, index, vma); + cond_resched(); + } + i_mmap_unlock_read(mapping); - dax_entry_mkclean(mapping, index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There @@@ -939,7 -1014,7 +940,7 @@@ static int dax_iomap_pfn(const struct i id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - NULL, pfnp); + DAX_ACCESS, NULL, pfnp); if (length < 0) { rc = length; goto out; @@@ -1048,7 -1123,7 +1049,7 @@@ static int dax_memzero(struct dax_devic void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); + ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret > 0) { memset(kaddr + offset, 0, size); dax_flush(dax_dev, kaddr + offset, size); @@@ -1165,6 -1240,7 +1166,7 @@@ static loff_t dax_iomap_iter(const stru const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); ssize_t map_len; + bool recovery = false; void *kaddr; if (fatal_signal_pending(current)) { @@@ -1173,7 -1249,14 +1175,14 @@@ } map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), - &kaddr, NULL); + DAX_ACCESS, &kaddr, NULL); + if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + map_len = dax_direct_access(dax_dev, pgoff, + PHYS_PFN(size), DAX_RECOVERY_WRITE, + &kaddr, NULL); + if (map_len > 0) + recovery = true; + } if (map_len < 0) { ret = map_len; break; @@@ -1185,7 -1268,10 +1194,10 @@@ if (map_len > end - pos) map_len = end - pos; - if (iov_iter_rw(iter) == WRITE) + if (recovery) + xfer = dax_recovery_write(dax_dev, pgoff, kaddr, + map_len, iter); + else if (iov_iter_rw(iter) == WRITE) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else
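
As a hedged illustration of the headline item in the pull request above ("Add support for clearing memory error via pwrite(2) on DAX"), the sketch below shows what the user-visible flow might look like from userspace. The mount point /mnt/pmem and the file name are assumptions made for illustration only, not paths taken from this series. With these changes, a page-aligned, page-sized write can opportunistically clear a poisoned page, where previously only a truncate or hole-punch that forced the filesystem to reallocate the block could do so.

/*
 * Hypothetical userspace sketch of the behavior described in the merge
 * message: a page-aligned pwrite(2) to a file on a DAX-mounted filesystem
 * may now clear a latent memory error in the written range. The path and
 * offset below are illustrative assumptions, not values from this diff.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	void *buf;
	int fd;

	/* Page-aligned buffer so the write covers whole pages. */
	if (posix_memalign(&buf, pagesize, pagesize))
		return 1;
	memset(buf, 0, pagesize);

	fd = open("/mnt/pmem/data", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Page-aligned, page-sized write over the poisoned range; on media
	 * error the kernel write path can fall back to the recovery-write
	 * mode introduced by this series instead of failing outright.
	 */
	if (pwrite(fd, buf, pagesize, 0) != pagesize)
		perror("pwrite");

	close(fd);
	free(buf);
	return 0;
}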