From: Linus Torvalds
Date: Mon, 13 May 2024 20:03:54 +0000 (-0700)
Subject: Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux
X-Git-Tag: v6.10-rc1~219
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/0c9f4ac808b017a0013cee92a30de980550145d5?hp=-c

Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - Add a partscan attribute in sysfs, fixing an issue with systemd
   relying on an internal interface that went away.

 - Attempt #2 at making long running discards interruptible. The
   previous attempt went into 6.9, but we ended up mostly reverting it
   as it had issues.

 - Remove old ida_simple API in bcache

 - Support for zoned write plugging, greatly improving the performance
   on zoned devices.

 - Remove the old throttle low interface, which has been marked
   experimental since 2017, never made it beyond that, and isn't being
   used.

 - Remove page->index debugging checks in brd, as they haven't caught
   anything and to prepare us for removing page->index in struct page.

 - MD pull request from Song

 - Don't schedule block workers on isolated CPUs

* tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux: (84 commits)
  blk-throttle: delay initialization until configuration
  blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
  block: fix that util can be greater than 100%
  block: support to account io_ticks precisely
  block: add plug while submitting IO
  bcache: fix variable length array abuse in btree_iter
  bcache: Remove usage of the deprecated ida_simple_xx() API
  md: Revert "md: Fix overflow in is_mddev_idle"
  blk-lib: check for kill signal in ioctl BLKDISCARD
  block: add a bio_await_chain helper
  block: add a blk_alloc_discard_bio helper
  block: add a bio_chain_and_submit helper
  block: move discard checks into the ioctl handler
  block: remove the discard_granularity check in __blkdev_issue_discard
  block/ioctl: prefer different overflow check
  null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION()
  block: fix and simplify blkdevparts= cmdline parsing
  block: refine the EOF check in blkdev_iomap_begin
  block: add a partscan sysfs attribute for disks
  block: add a disk_has_partscan helper
  ...
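Two userspace sketches may help illustrate the interfaces touched by this
pull. Both are editor-added illustrations, not code from the series. The
first assumes the new attribute is exposed as /sys/block/<disk>/partscan
and reads back "0" or "1":

    /* Check whether the kernel will scan a disk for partitions. */
    #include <stdio.h>

    int main(int argc, char **argv)
    {
            char path[64], buf[4] = "";
            FILE *f;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <disk>\n", argv[0]);
                    return 1;
            }
            snprintf(path, sizeof(path), "/sys/block/%s/partscan", argv[1]);
            f = fopen(path, "r");
            if (!f) {
                    perror(path);   /* attribute is new in this cycle */
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("%s: partscan=%s", argv[1], buf);
            fclose(f);
            return 0;
    }

The BLKDISCARD ioctl ABI itself is unchanged by the interruptible-discard
work; what changes is that a long-running discard can now be aborted by a
fatal signal instead of running to completion. A minimal caller looks like
the following (device name and range are placeholders):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>

    int main(void)
    {
            /* start and length in bytes, logical-block aligned */
            uint64_t range[2] = { 0, 1ULL << 30 };
            int fd = open("/dev/sdX", O_WRONLY);    /* placeholder device */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, BLKDISCARD, &range) < 0)
                    perror("BLKDISCARD");   /* a fatal signal can now abort it */
            close(fd);
            return 0;
    }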
--- 0c9f4ac808b017a0013cee92a30de980550145d5
diff --combined block/blk-cgroup.c
index 059467086b13,5e1f10525677..4b1a35ab0ea4
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@@ -218,8 -218,7 +218,7 @@@ static void blkg_async_bio_workfn(struc
 
 	/* as long as there are pending bios, @blkg can't go away */
 	spin_lock(&blkg->async_bio_lock);
-	bio_list_merge(&bios, &blkg->async_bios);
-	bio_list_init(&blkg->async_bios);
+	bio_list_merge_init(&bios, &blkg->async_bios);
 	spin_unlock(&blkg->async_bio_lock);
 
 	/* start plug only when bio_list contains at least 2 bios */
@@@ -1409,12 -1408,6 +1408,12 @@@ static int blkcg_css_online(struct cgro
 	return 0;
 }
 
+void blkg_init_queue(struct request_queue *q)
+{
+	INIT_LIST_HEAD(&q->blkg_list);
+	mutex_init(&q->blkcg_mutex);
+}
+
 int blkcg_init_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
@@@ -1422,6 -1415,9 +1421,6 @@@
 	bool preloaded;
 	int ret;
 
-	INIT_LIST_HEAD(&q->blkg_list);
-	mutex_init(&q->blkcg_mutex);
-
 	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
 	if (!new_blkg)
 		return -ENOMEM;
@@@ -1444,14 -1440,8 +1443,8 @@@
 	if (ret)
 		goto err_destroy_all;
 
-	ret = blk_throtl_init(disk);
-	if (ret)
-		goto err_ioprio_exit;
-
 	return 0;
 
-err_ioprio_exit:
-	blk_ioprio_exit(disk);
 err_destroy_all:
 	blkg_destroy_all(disk);
 	return ret;
diff --combined block/blk-core.c
index b795ac177281,8566bbd8aeba..01186333c88e
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@@ -442,8 -442,6 +442,8 @@@ struct request_queue *blk_alloc_queue(s
 	init_waitqueue_head(&q->mq_freeze_wq);
 	mutex_init(&q->mq_freeze_lock);
 
+	blkg_init_queue(q);
+
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
 	 * See blk_register_queue() for details.
@@@ -591,8 -589,7 +591,7 @@@ static inline blk_status_t blk_check_zo
 		return BLK_STS_NOTSUPP;
 
 	/* The bio sector must point to the start of a sequential zone */
-	if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
-	    !bio_zone_is_seq(bio))
+	if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
 		return BLK_STS_IOERR;
 
 	/*
@@@ -604,7 -601,7 +603,7 @@@
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > q->limits.max_zone_append_sectors)
+	if (nr_sectors > queue_max_zone_append_sectors(q))
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
@@@ -649,11 -646,13 +648,13 @@@ static void __submit_bio(struct bio *bi
 static void __submit_bio_noacct(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack[2];
+	struct blk_plug plug;
 
 	BUG_ON(bio->bi_next);
 
 	bio_list_init(&bio_list_on_stack[0]);
 	current->bio_list = bio_list_on_stack;
+	blk_start_plug(&plug);
 
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@@ -687,19 -686,23 +688,23 @@@
 		bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
 	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
 
+	blk_finish_plug(&plug);
 	current->bio_list = NULL;
 }
 
 static void __submit_bio_noacct_mq(struct bio *bio)
 {
 	struct bio_list bio_list[2] = { };
+	struct blk_plug plug;
 
 	current->bio_list = bio_list;
+	blk_start_plug(&plug);
 
 	do {
 		__submit_bio(bio);
 	} while ((bio = bio_list_pop(&bio_list[0])));
 
+	blk_finish_plug(&plug);
 	current->bio_list = NULL;
 }
 
@@@ -910,12 -913,6 +915,6 @@@ int bio_poll(struct bio *bio, struct io
 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
 		return 0;
 
-	/*
-	 * As the requests that require a zone lock are not plugged in the
-	 * first place, directly accessing the plug instead of using
-	 * blk_mq_plug() should not have any consequences during flushing for
-	 * zoned devices.
-	 */
 	blk_flush_plug(current->plug, false);
 
 	/*
@@@ -987,10 -984,11 +986,11 @@@ void update_io_ticks(struct block_devic
 	unsigned long stamp;
 again:
 	stamp = READ_ONCE(part->bd_stamp);
-	if (unlikely(time_after(now, stamp))) {
-		if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
-			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
-	}
+	if (unlikely(time_after(now, stamp)) &&
+	    likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+	    (end || part_in_flight(part)))
+		__part_stat_add(part, io_ticks, now - stamp);
+
 	if (part->bd_partno) {
 		part = bdev_whole(part);
 		goto again;
@@@ -1197,7 -1195,6 +1197,7 @@@ void __blk_flush_plug(struct blk_plug *
 	if (unlikely(!rq_list_empty(plug->cached_rq)))
 		blk_mq_free_plug_rqs(plug);
 
+	plug->cur_ktime = 0;
 	current->flags &= ~PF_BLOCK_TS;
 }
 
diff --combined block/blk-settings.c
index 9d6033e01f2e,715f4b6356c4..ebba05a2bc7f
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@@ -182,15 -182,16 +182,15 @@@ static int blk_validate_limits(struct q
 		return -EINVAL;
 
 	/*
-	 * Devices that require a virtual boundary do not support scatter/gather
-	 * I/O natively, but instead require a descriptor list entry for each
-	 * page (which might not be identical to the Linux PAGE_SIZE). Because
-	 * of that they are not limited by our notion of "segment size".
+	 * Stacking device may have both virtual boundary and max segment
+	 * size limit, so allow this setting now, and long-term the two
+	 * might need to move out of stacking limits since we have immutable
+	 * bvec and lower layer bio splitting is supposed to handle the two
+	 * correctly.
 	 */
 	if (lim->virt_boundary_mask) {
-		if (WARN_ON_ONCE(lim->max_segment_size &&
-				 lim->max_segment_size != UINT_MAX))
-			return -EINVAL;
-		lim->max_segment_size = UINT_MAX;
+		if (!lim->max_segment_size)
+			lim->max_segment_size = UINT_MAX;
 	} else {
 		/*
 		 * The maximum segment size has an odd historic 64k default that
@@@ -411,24 -412,32 +411,32 @@@ EXPORT_SYMBOL(blk_queue_max_write_zeroe
  * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
  * @q:  the request queue for the device
  * @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands. If
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
  **/
 void blk_queue_max_zone_append_sectors(struct request_queue *q,
 		unsigned int max_zone_append_sectors)
 {
-	unsigned int max_sectors;
+	unsigned int max_sectors = 0;
 
 	if (WARN_ON(!blk_queue_is_zoned(q)))
 		return;
 
-	max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
-	max_sectors = min(q->limits.chunk_sectors, max_sectors);
+	if (max_zone_append_sectors) {
+		max_sectors = min(q->limits.max_hw_sectors,
+				  max_zone_append_sectors);
+		max_sectors = min(q->limits.chunk_sectors, max_sectors);
 
-	/*
-	 * Signal eventual driver bugs resulting in the max_zone_append sectors limit
-	 * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
-	 * or the max_hw_sectors limit not set.
-	 */
-	WARN_ON(!max_sectors);
+		/*
+		 * Signal eventual driver bugs resulting in the max_zone_append
+		 * sectors limit being 0 due to the chunk_sectors limit (zone
+		 * size) not set or the max_hw_sectors limit not set.
+		 */
+		WARN_ON_ONCE(!max_sectors);
+	}
 
 	q->limits.max_zone_append_sectors = max_sectors;
 }
@@@ -755,8 -764,8 +763,8 @@@ int blk_stack_limits(struct queue_limit
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_zone_append_sectors = min(t->max_zone_append_sectors,
-					b->max_zone_append_sectors);
+	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+					 queue_limits_max_zone_append_sectors(b));
 	t->bounce = max(t->bounce, b->bounce);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@@ -1043,22 -1052,6 +1051,6 @@@ void blk_queue_write_cache(struct reque
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
-/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q:		the request queue for the target device
- * @features:	Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
-					  unsigned int features)
-{
-	q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
 /**
  * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
  * @q:		the request queue for the device
diff --combined block/fops.c
index af6c244314af,5159ef3a1948..7a163f7fe2d8
--- a/block/fops.c
+++ b/block/fops.c
@@@ -44,18 -44,15 +44,15 @@@ static bool blkdev_dio_unaligned(struc
 #define DIO_INLINE_BIO_VECS 4
 
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
-		struct iov_iter *iter, unsigned int nr_pages)
+		struct iov_iter *iter, struct block_device *bdev,
+		unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
 	ssize_t ret;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
@@@ -161,9 -158,8 +158,8 @@@ static void blkdev_bio_end_io(struct bi
 }
 
 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-		unsigned int nr_pages)
+		struct block_device *bdev, unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
@@@ -172,9 -168,6 +168,6 @@@
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@@ -302,9 -295,9 +295,9 @@@ static void blkdev_bio_end_io_async(str
 
 static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 					struct iov_iter *iter,
+					struct block_device *bdev,
 					unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	bool is_read = iov_iter_rw(iter) == READ;
 	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	struct blkdev_dio *dio;
@@@ -312,9 -305,6 +305,6 @@@
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@@ -368,18 -358,23 +358,23 @@@
 
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
+	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
 
+	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+		return -EINVAL;
+
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
-			return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-		return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+			return __blkdev_direct_IO_simple(iocb, iter, bdev,
+							nr_pages);
+		return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
 	}
-	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+	return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
 }
 
 static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@@ -390,7 -385,7 +385,7 @@@
 
 	iomap->bdev = bdev;
 	iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
-	if (iomap->offset >= isize)
+	if (offset >= isize)
 		return -EIO;
 	iomap->type = IOMAP_MAPPED;
 	iomap->addr = iomap->offset;
@@@ -863,7 -858,6 +858,7 @@@ const struct file_operations def_blk_fo
 	.splice_read	= filemap_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
+	.fop_flags	= FOP_BUFFER_RASYNC,
 };
 
 static __init int blkdev_init(void)
diff --combined block/ioctl.c
index f505f9c341eb,d7a6c6931a1e..c7db3bd2d653
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@@ -33,7 -33,7 +33,7 @@@ static int blkpg_do_ioctl(struct block_
 	if (op == BLKPG_DEL_PARTITION)
 		return bdev_del_partition(disk, p.pno);
 
-	if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+	if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
 		return -EINVAL;
 	/* Check that the partition is aligned to the block size */
 	if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
@@@ -95,9 -95,12 +95,12 @@@ static int compat_blkpg_ioctl(struct bl
 static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 		unsigned long arg)
 {
-	uint64_t range[2];
-	uint64_t start, len, end;
+	unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
 	struct inode *inode = bdev->bd_inode;
-	uint64_t range[2], start, len;
++	uint64_t range[2], start, len, end;
+	struct bio *prev = NULL, *bio;
+	sector_t sector, nr_sects;
+	struct blk_plug plug;
 	int err;
 
 	if (!(mode & BLK_OPEN_WRITE))
@@@ -105,6 -108,8 +108,8 @@@
 	if (!bdev_max_discard_sectors(bdev))
 		return -EOPNOTSUPP;
+	if (bdev_read_only(bdev))
+		return -EPERM;
 
 	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 		return -EFAULT;
@@@ -112,20 -117,44 +117,45 @@@
 	start = range[0];
 	len = range[1];
 
-	if (start & 511)
+	if (!len)
 		return -EINVAL;
-	if (len & 511)
+	if ((start | len) & bs_mask)
 		return -EINVAL;
 
-	if (start + len > bdev_nr_bytes(bdev))
+	if (check_add_overflow(start, len, &end) ||
+	    end > bdev_nr_bytes(bdev))
 		return -EINVAL;
 
 	filemap_invalidate_lock(inode->i_mapping);
 	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
 	if (err)
 		goto fail;
-	err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+
+	sector = start >> SECTOR_SHIFT;
+	nr_sects = len >> SECTOR_SHIFT;
+
+	blk_start_plug(&plug);
+	while (1) {
+		if (fatal_signal_pending(current)) {
+			if (prev)
+				bio_await_chain(prev);
+			err = -EINTR;
+			goto out_unplug;
+		}
+		bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+				GFP_KERNEL);
+		if (!bio)
+			break;
+		prev = bio_chain_and_submit(prev, bio);
+	}
+	if (prev) {
+		err = submit_bio_wait(prev);
+		if (err == -EOPNOTSUPP)
+			err = 0;
+		bio_put(prev);
+	}
+out_unplug:
+	blk_finish_plug(&plug);
 fail:
 	filemap_invalidate_unlock(inode->i_mapping);
 	return err;
@@@ -563,8 -592,7 +593,8 @@@ static int blkdev_common_ioctl(struct b
 			return -EACCES;
 		if (bdev_is_partition(bdev))
 			return -EINVAL;
-		return disk_scan_partitions(bdev->bd_disk, mode);
+		return disk_scan_partitions(bdev->bd_disk,
+					    mode | BLK_OPEN_STRICT_SCAN);
 	case BLKTRACESTART:
 	case BLKTRACESTOP:
 	case BLKTRACETEARDOWN:
diff --combined drivers/block/null_blk/main.c
index ed33cf7192d2,f7b9078f6913..4005a8b685e8
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@@ -225,6 -225,10 +225,10 @@@ static unsigned long g_cache_size
 module_param_named(cache_size, g_cache_size, ulong, 0444);
 MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
 
+static bool g_fua = true;
+module_param_named(fua, g_fua, bool, 0444);
+MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true");
+
 static unsigned int g_mbps;
 module_param_named(mbps, g_mbps, uint, 0444);
 MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
@@@ -253,6 -257,11 +257,11 @@@ static unsigned int g_zone_max_active
 module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
 MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
 
+static int g_zone_append_max_sectors = INT_MAX;
+module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
+MODULE_PARM_DESC(zone_append_max_sectors,
+		 "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");
+
 static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
@@@ -436,10 -445,12 +445,12 @@@ NULLB_DEVICE_ATTR(zone_capacity, ulong
 NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
 NULLB_DEVICE_ATTR(no_sched, bool, NULL);
 NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
 NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+NULLB_DEVICE_ATTR(fua, bool, NULL);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@@ -580,12 -591,14 +591,14 @@@ static struct configfs_attribute *nullb
 	&nullb_device_attr_zone_nr_conv,
 	&nullb_device_attr_zone_max_open,
 	&nullb_device_attr_zone_max_active,
+	&nullb_device_attr_zone_append_max_sectors,
 	&nullb_device_attr_zone_readonly,
 	&nullb_device_attr_zone_offline,
 	&nullb_device_attr_virt_boundary,
 	&nullb_device_attr_no_sched,
 	&nullb_device_attr_shared_tags,
 	&nullb_device_attr_shared_tag_bitmap,
+	&nullb_device_attr_fua,
 	NULL,
 };
 
@@@ -664,14 -677,14 +677,14 @@@ nullb_group_drop_item(struct config_gro
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
 	return snprintf(page, PAGE_SIZE,
-			"badblocks,blocking,blocksize,cache_size,"
+			"badblocks,blocking,blocksize,cache_size,fua,"
 			"completion_nsec,discard,home_node,hw_queue_depth,"
 			"irqmode,max_sectors,mbps,memory_backed,no_sched,"
 			"poll_queues,power,queue_mode,shared_tag_bitmap,"
 			"shared_tags,size,submit_queues,use_per_node_hctx,"
 			"virt_boundary,zoned,zone_capacity,zone_max_active,"
 			"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
-			"zone_size\n");
+			"zone_size,zone_append_max_sectors\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@@ -751,10 -764,13 +764,13 @@@ static struct nullb_device *null_alloc_
 	dev->zone_nr_conv = g_zone_nr_conv;
 	dev->zone_max_open = g_zone_max_open;
 	dev->zone_max_active = g_zone_max_active;
+	dev->zone_append_max_sectors = g_zone_append_max_sectors;
 	dev->virt_boundary = g_virt_boundary;
 	dev->no_sched = g_no_sched;
 	dev->shared_tags = g_shared_tags;
 	dev->shared_tag_bitmap = g_shared_tag_bitmap;
+	dev->fua = g_fua;
+
 	return dev;
 }
 
@@@ -1151,7 -1167,7 +1167,7 @@@ blk_status_t null_handle_discard(struc
 	return BLK_STS_OK;
 }
 
-static int null_handle_flush(struct nullb *nullb)
+static blk_status_t null_handle_flush(struct nullb *nullb)
 {
 	int err;
 
@@@ -1168,7 -1184,7 +1184,7 @@@
 	WARN_ON(!radix_tree_empty(&nullb->dev->cache));
 	spin_unlock_irq(&nullb->lock);
-	return err;
+	return errno_to_blk_status(err);
 }
 
 static int null_transfer(struct nullb *nullb, struct page *page,
@@@ -1206,7 -1222,7 +1222,7 @@@ static int null_handle_rq(struct nullb_
 {
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct nullb *nullb = cmd->nq->dev->nullb;
-	int err;
+	int err = 0;
 	unsigned int len;
 	sector_t sector = blk_rq_pos(rq);
 	struct req_iterator iter;
@@@ -1218,15 -1234,13 +1234,13 @@@
 		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
 				    op_is_write(req_op(rq)), sector,
 				    rq->cmd_flags & REQ_FUA);
-		if (err) {
-			spin_unlock_irq(&nullb->lock);
-			return err;
-		}
+		if (err)
+			break;
 		sector += len >> SECTOR_SHIFT;
 	}
 	spin_unlock_irq(&nullb->lock);
 
-	return 0;
+	return errno_to_blk_status(err);
 }
 
 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@@ -1273,8 -1287,8 +1287,8 @@@ static inline blk_status_t null_handle_
 	if (op == REQ_OP_DISCARD)
 		return null_handle_discard(dev, sector, nr_sectors);
 
-	return errno_to_blk_status(null_handle_rq(cmd));
+	return null_handle_rq(cmd);
 }
 
 static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
@@@ -1343,7 -1357,7 +1357,7 @@@ static void null_handle_cmd(struct null
 	blk_status_t sts;
 
 	if (op == REQ_OP_FLUSH) {
-		cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+		cmd->error = null_handle_flush(nullb);
 		goto out;
 	}
 
@@@ -1912,7 -1926,7 +1926,7 @@@ static int null_add_dev(struct nullb_de
 
 	if (dev->cache_size > 0) {
 		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
-		blk_queue_write_cache(nullb->q, true, true);
+		blk_queue_write_cache(nullb->q, true, dev->fua);
 	}
 
 	nullb->q->queuedata = nullb;
@@@ -1965,10 -1979,10 +1979,10 @@@
 out_ida_free:
 	ida_free(&nullb_indexes, nullb->index);
-out_cleanup_zone:
-	null_free_zoned_dev(dev);
 out_cleanup_disk:
 	put_disk(nullb->disk);
+out_cleanup_zone:
+	null_free_zoned_dev(dev);
 out_cleanup_tags:
 	if (nullb->tag_set == &nullb->__tag_set)
 		blk_mq_free_tag_set(nullb->tag_set);
@@@ -2113,10 -2127,13 +2127,13 @@@ static void __exit null_exit(void
 
 	if (tag_set.ops)
 		blk_mq_free_tag_set(&tag_set);
+
+	mutex_destroy(&lock);
 }
 
 module_init(null_init);
 module_exit(null_exit);
 
 MODULE_AUTHOR("Jens Axboe ");
+MODULE_DESCRIPTION("multi queue aware block test driver");
 MODULE_LICENSE("GPL");
diff --combined drivers/block/ublk_drv.c
index 374e4efa8759,851c78913de2..176657dce3e3
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@@ -221,7 -221,7 +221,7 @@@ static int ublk_get_nr_zones(const stru
 
 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
 {
-	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+	return blk_revalidate_disk_zones(ub->ub_disk);
 }
 
 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
@@@ -249,8 -249,7 +249,7 @@@
 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
-	blk_queue_required_elevator_features(ub->ub_disk->queue,
-					     ELEVATOR_F_ZBD_SEQ_WRITE);
+
 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
 }
 
@@@ -2177,8 -2176,7 +2176,8 @@@ static int ublk_ctrl_start_dev(struct u
 		.max_hw_sectors		= p->max_sectors,
 		.chunk_sectors		= p->chunk_sectors,
 		.virt_boundary_mask	= p->virt_boundary_mask,
-
+		.max_segments		= USHRT_MAX,
+		.max_segment_size	= UINT_MAX,
 	};
 	struct gendisk *disk;
 	int ret = -EINVAL;
diff --combined drivers/md/dm.c
index 7d0746b37c8e,2369d10c8475..597dd7a25823
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@@ -765,7 -765,7 +765,7 @@@ static struct table_device *open_table_
 	return td;
 
 out_blkdev_put:
-	fput(bdev_file);
+	__fput_sync(bdev_file);
 out_free_td:
 	kfree(td);
 	return ERR_PTR(r);
@@@ -778,13 -778,7 +778,13 @@@ static void close_table_device(struct t
 {
 	if (md->disk->slave_dir)
 		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
-	fput(td->dm_dev.bdev_file);
+
+	/* Leverage async fput() if DMF_DEFERRED_REMOVE set */
+	if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+		fput(td->dm_dev.bdev_file);
+	else
+		__fput_sync(td->dm_dev.bdev_file);
+
 	put_dax(td->dm_dev.dax_dev);
 	list_del(&td->list);
 	kfree(td);
@@@ -1428,25 -1422,12 +1428,12 @@@ static void __map_bio(struct bio *clone
 		down(&md->swap_bios_semaphore);
 	}
 
-	if (static_branch_unlikely(&zoned_enabled)) {
-		/*
-		 * Check if the IO needs a special mapping due to zone append
-		 * emulation on zoned target. In this case, dm_zone_map_bio()
-		 * calls the target map operation.
-		 */
-		if (unlikely(dm_emulate_zone_append(md)))
-			r = dm_zone_map_bio(tio);
-		else
-			goto do_map;
-	} else {
-do_map:
-		if (likely(ti->type->map == linear_map))
-			r = linear_map(ti, clone);
-		else if (ti->type->map == stripe_map)
-			r = stripe_map(ti, clone);
-		else
-			r = ti->type->map(ti, clone);
-	}
+	if (likely(ti->type->map == linear_map))
+		r = linear_map(ti, clone);
+	else if (ti->type->map == stripe_map)
+		r = stripe_map(ti, clone);
+	else
+		r = ti->type->map(ti, clone);
 
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
@@@ -1774,6 -1755,33 +1761,33 @@@ static void init_clone_info(struct clon
 	ci->sector_count = 0;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+					   struct bio *bio)
+{
+	/*
+	 * For mapped device that need zone append emulation, we must
+	 * split any large BIO that straddles zone boundaries.
+	 */
+	return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+		!bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+	return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+}
+#else
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+					   struct bio *bio)
+{
+	return false;
+}
+
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+	return false;
+}
+#endif
+
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
@@@ -1783,19 -1791,32 +1797,32 @@@ static void dm_split_and_process_bio(st
 	struct clone_info ci;
 	struct dm_io *io;
 	blk_status_t error = BLK_STS_OK;
-	bool is_abnormal;
+	bool is_abnormal, need_split;
+
+	need_split = is_abnormal = is_abnormal_io(bio);
+	if (static_branch_unlikely(&zoned_enabled))
+		need_split = is_abnormal || dm_zone_bio_needs_split(md, bio);
 
-	is_abnormal = is_abnormal_io(bio);
-	if (unlikely(is_abnormal)) {
+	if (unlikely(need_split)) {
 		/*
 		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
 		 * otherwise associated queue_limits won't be imposed.
+		 * Also split the BIO for mapped devices needing zone append
+		 * emulation to ensure that the BIO does not cross zone
+		 * boundaries.
 		 */
 		bio = bio_split_to_limits(bio);
 		if (!bio)
 			return;
 	}
 
+	/*
+	 * Use the block layer zone write plugging for mapped devices that
+	 * need zone append emulation (e.g. dm-crypt).
+	 */
+	if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+		return;
+
 	/* Only support nowait for normal IO */
 	if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
 		io = alloc_io(md, bio, GFP_NOWAIT);
@@@ -2016,7 -2037,6 +2043,6 @@@ static void cleanup_mapped_device(struc
 		md->dax_dev = NULL;
 	}
 
-	dm_cleanup_zoned_dev(md);
 	if (md->disk) {
 		spin_lock(&_minor_lock);
 		md->disk->private_data = NULL;
diff --combined drivers/nvme/host/core.c
index 095f59e7aa93,c9955ecd1790..bf7615cb36ee
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@@ -628,6 -628,27 +628,6 @@@ bool nvme_change_ctrl_state(struct nvme
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 
-/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
-	switch (nvme_ctrl_state(ctrl)) {
-	case NVME_CTRL_NEW:
-	case NVME_CTRL_LIVE:
-	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_CONNECTING:
-		return false;
-	case NVME_CTRL_DELETING:
-	case NVME_CTRL_DELETING_NOIO:
-	case NVME_CTRL_DEAD:
-		return true;
-	default:
-		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
-		return true;
-	}
-}
-
 /*
  * Waits for the controller state to be resetting, or returns false if it is
  * not possible to ever transition to that state.
@@@ -2055,7 -2076,6 +2055,7 @@@ static int nvme_update_ns_info_block(st
 	bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
 	struct queue_limits lim;
 	struct nvme_id_ns_nvm *nvm = NULL;
+	struct nvme_zone_info zi = {};
 	struct nvme_id_ns *id;
 	sector_t capacity;
 	unsigned lbaf;
@@@ -2068,10 -2088,9 +2068,10 @@@
 	if (id->ncap == 0) {
 		/* namespace not allocated or attached */
 		info->is_removed = true;
-		ret = -ENODEV;
+		ret = -ENXIO;
 		goto out;
 	}
+	lbaf = nvme_lbaf_index(id->flbas);
 
 	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
 		ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
@@@ -2079,14 -2098,8 +2079,14 @@@
 			goto out;
 	}
 
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+	    ns->head->ids.csi == NVME_CSI_ZNS) {
+		ret = nvme_query_zone_info(ns, lbaf, &zi);
+		if (ret < 0)
+			goto out;
+	}
+
 	blk_mq_freeze_queue(ns->disk->queue);
-	lbaf = nvme_lbaf_index(id->flbas);
 	ns->head->lba_shift = id->lbaf[lbaf].ds;
 	ns->head->nuse = le64_to_cpu(id->nuse);
 	capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
@@@ -2099,8 -2112,13 +2099,8 @@@
 		capacity = 0;
 	nvme_config_discard(ns, &lim);
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-	    ns->head->ids.csi == NVME_CSI_ZNS) {
-		ret = nvme_update_zone_info(ns, lbaf, &lim);
-		if (ret) {
-			blk_mq_unfreeze_queue(ns->disk->queue);
-			goto out;
-		}
-	}
+	    ns->head->ids.csi == NVME_CSI_ZNS)
+		nvme_update_zone_info(ns, &lim, &zi);
 	ret = queue_limits_commit_update(ns->disk->queue, &lim);
 	if (ret) {
 		blk_mq_unfreeze_queue(ns->disk->queue);
@@@ -2132,7 -2150,7 +2132,7 @@@
 	blk_mq_unfreeze_queue(ns->disk->queue);
 
 	if (blk_queue_is_zoned(ns->queue)) {
-		ret = blk_revalidate_disk_zones(ns->disk, NULL);
+		ret = blk_revalidate_disk_zones(ns->disk);
 		if (ret && !nvme_first_scan(ns->disk))
 			goto out;
 	}
@@@ -2183,7 -2201,6 +2183,7 @@@ static int nvme_update_ns_info(struct n
 	}
 
 	if (!ret && nvme_ns_head_multipath(ns->head)) {
+		struct queue_limits *ns_lim = &ns->disk->queue->limits;
 		struct queue_limits lim;
 
 		blk_mq_freeze_queue(ns->head->disk->queue);
@@@ -2195,26 -2212,7 +2195,26 @@@
 		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
 		nvme_mpath_revalidate_paths(ns);
 
+		/*
+		 * queue_limits mixes values that are the hardware limitations
+		 * for bio splitting with what is the device configuration.
+		 *
+		 * For NVMe the device configuration can change after e.g. a
+		 * Format command, and we really want to pick up the new format
+		 * value here.  But we must still stack the queue limits to the
+		 * least common denominator for multipathing to split the bios
+		 * properly.
+		 *
+		 * To work around this, we explicitly set the device
+		 * configuration to those that we just queried, but only stack
+		 * the splitting limits in to make sure we still obey possibly
+		 * lower limitations of other controllers.
+		 */
 		lim = queue_limits_start_update(ns->head->disk->queue);
+		lim.logical_block_size = ns_lim->logical_block_size;
+		lim.physical_block_size = ns_lim->physical_block_size;
+		lim.io_min = ns_lim->io_min;
+		lim.io_opt = ns_lim->io_opt;
 		queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
 					ns->head->disk->disk_name);
 		ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
@@@ -3660,7 -3658,7 +3660,7 @@@ static int nvme_init_ns_head(struct nvm
 			 "Found shared namespace %d, but multipathing not supported.\n",
 			 info->nsid);
 		dev_warn_once(ctrl->device,
-			"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
+			"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
 	}
 }
 
diff --combined drivers/scsi/scsi_lib.c
index 5b3230ef51fe,9ca96116bd33..967b6d62bb37
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@@ -635,9 -635,10 +635,9 @@@ static bool scsi_end_request(struct req
 	if (blk_queue_add_random(q))
 		add_disk_randomness(req->q->disk);
 
-	if (!blk_rq_is_passthrough(req)) {
-		WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
-		cmd->flags &= ~SCMD_INITIALIZED;
-	}
+	WARN_ON_ONCE(!blk_rq_is_passthrough(req) &&
+		     !(cmd->flags & SCMD_INITIALIZED));
+	cmd->flags = 0;
 
 	/*
 	 * Calling rcu_barrier() is not necessary here because the
@@@ -1869,7 -1870,6 +1869,6 @@@ out_put_budget
 	case BLK_STS_OK:
 		break;
 	case BLK_STS_RESOURCE:
-	case BLK_STS_ZONE_RESOURCE:
 		if (scsi_device_blocked(sdev))
 			ret = BLK_STS_DEV_RESOURCE;
 		break;
diff --combined drivers/scsi/sd.c
index 65cdc8b77e35,dcba9530ffa5..64c5129044b3
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@@ -1260,12 -1260,6 +1260,6 @@@ static blk_status_t sd_setup_read_write
 		}
 	}
 
-	if (req_op(rq) == REQ_OP_ZONE_APPEND) {
-		ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
-		if (ret)
-			goto fail;
-	}
-
 	fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
 	dix = scsi_prot_sg_count(cmd);
 	dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
@@@ -1348,7 -1342,6 +1342,6 @@@ static blk_status_t sd_init_command(str
 		return sd_setup_flush_cmnd(cmd);
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
-	case REQ_OP_ZONE_APPEND:
 		return sd_setup_read_write_cmnd(cmd);
 	case REQ_OP_ZONE_RESET:
 		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
@@@ -3120,7 -3113,6 +3113,7 @@@ static void sd_read_io_hints(struct scs
 {
 	struct scsi_device *sdp = sdkp->device;
 	const struct scsi_io_group_descriptor *desc, *start, *end;
+	u16 permanent_stream_count_old;
 	struct scsi_sense_hdr sshdr;
 	struct scsi_mode_data data;
 	int res;
@@@ -3141,13 -3133,12 +3134,13 @@@
 	for (desc = start; desc < end; desc++)
 		if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start))
 			break;
+	permanent_stream_count_old = sdkp->permanent_stream_count;
 	sdkp->permanent_stream_count = desc - start;
 	if (sdkp->rscs && sdkp->permanent_stream_count < 2)
 		sd_printk(KERN_INFO, sdkp,
 			  "Unexpected: RSCS has been set and the permanent stream count is %u\n",
 			  sdkp->permanent_stream_count);
-	else if (sdkp->permanent_stream_count)
+	else if (sdkp->permanent_stream_count != permanent_stream_count_old)
 		sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n",
 			  sdkp->permanent_stream_count);
 }
@@@ -3922,7 -3913,7 +3915,7 @@@ static int sd_probe(struct device *dev
 	error = device_add_disk(dev, gd, NULL);
 	if (error) {
-		put_device(&sdkp->disk_dev);
+		device_unregister(&sdkp->disk_dev);
 		put_disk(gd);
 		goto out;
 	}
@@@ -3981,7 -3972,6 +3974,6 @@@ static void scsi_disk_release(struct de
 	struct scsi_disk *sdkp = to_scsi_disk(dev);
 
 	ida_free(&sd_index_ida, sdkp->index);
-	sd_zbc_free_zone_info(sdkp);
 	put_device(&sdkp->device->sdev_gendev);
 	free_opal_dev(sdkp->opal_dev);
 
diff --combined include/linux/blk_types.h
index c3e098b21c16,5751292fee6a..25dbf1097085
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@@ -88,9 -88,15 +88,9 @@@ struct block_device
 
 /*
  * Block error status values.  See block/blk-core:blk_errors for the details.
- * Alpha cannot write a byte atomically, so we need to use 32-bit value.
  */
-#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
-typedef u32 __bitwise blk_status_t;
-typedef u32 blk_short_t;
-#else
 typedef u8 __bitwise blk_status_t;
 typedef u16 blk_short_t;
-#endif
 #define	BLK_STS_OK 0
 #define BLK_STS_NOTSUPP		((__force blk_status_t)1)
 #define BLK_STS_TIMEOUT		((__force blk_status_t)2)
@@@ -130,18 -136,6 +130,6 @@@
  */
 #define BLK_STS_DEV_RESOURCE	((__force blk_status_t)13)
 
-/*
- * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
- * related resources are unavailable, but the driver can guarantee the queue
- * will be rerun in the future once the resources become available again.
- *
- * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
- * a zone specific resource and IO to a different zone on the same device could
- * still be served. Examples of that are zones that are write-locked, but a read
- * to the same zone could be served.
- */
-#define BLK_STS_ZONE_RESOURCE	((__force blk_status_t)14)
-
 /*
  * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
  * path if the device returns a status indicating that too many zone resources
@@@ -149,7 -143,7 +137,7 @@@
  * after the number of open zones decreases below the device's limits, which is
  * reported in the request_queue's max_open_zones.
  */
-#define BLK_STS_ZONE_OPEN_RESOURCE	((__force blk_status_t)15)
+#define BLK_STS_ZONE_OPEN_RESOURCE	((__force blk_status_t)14)
 
 /*
  * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
@@@ -158,20 -152,20 +146,20 @@@
  * after the number of active zones decreases below the device's limits, which
  * is reported in the request_queue's max_active_zones.
  */
-#define BLK_STS_ZONE_ACTIVE_RESOURCE	((__force blk_status_t)16)
+#define BLK_STS_ZONE_ACTIVE_RESOURCE	((__force blk_status_t)15)
 
 /*
  * BLK_STS_OFFLINE is returned from the driver when the target device is offline
  * or is being taken offline. This could help differentiate the case where a
  * device is intentionally being shut down from a real I/O error.
  */
-#define BLK_STS_OFFLINE		((__force blk_status_t)17)
+#define BLK_STS_OFFLINE		((__force blk_status_t)16)
 
 /*
  * BLK_STS_DURATION_LIMIT is returned from the driver when the target device
  * aborted the command because it exceeded one of its Command Duration Limits.
  */
-#define BLK_STS_DURATION_LIMIT	((__force blk_status_t)18)
+#define BLK_STS_DURATION_LIMIT	((__force blk_status_t)17)
 
 /**
  * blk_path_error - returns true if error may be path related
@@@ -228,7 -222,12 +216,12 @@@ struct bio
 
 	struct bvec_iter	bi_iter;
 
-	blk_qc_t		bi_cookie;
+	union {
+		/* for polled bios: */
+		blk_qc_t		bi_cookie;
+		/* for plugged zoned writes only: */
+		unsigned int		__bi_nr_segments;
+	};
 	bio_end_io_t		*bi_end_io;
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
@@@ -298,7 -297,8 +291,8 @@@ enum
 	BIO_QOS_THROTTLED,	/* bio went through rq_qos throttle path */
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
-	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
 	BIO_FLAG_LAST
 };
 
diff --combined include/linux/blkdev.h
index 69e7da33ca49,26acf80c50c0..69c4f113db42
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -128,8 -128,6 +128,8 @@@ typedef unsigned int __bitwise blk_mode
 #define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
 /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
 #define BLK_OPEN_RESTRICT_WRITES	((__force blk_mode_t)(1 << 5))
+/* return partition scanning errors */
+#define BLK_OPEN_STRICT_SCAN	((__force blk_mode_t)(1 << 6))
 
 struct gendisk {
 	/*
@@@ -179,22 -177,21 +179,21 @@@
 #ifdef CONFIG_BLK_DEV_ZONED
 	/*
-	 * Zoned block device information for request dispatch control.
-	 * nr_zones is the total number of zones of the device. This is always
-	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
-	 * bits which indicates if a zone is conventional (bit set) or
-	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
-	 * bits which indicates if a zone is write locked, that is, if a write
-	 * request targeting the zone was dispatched.
-	 *
-	 * Reads of this information must be protected with blk_queue_enter() /
-	 * blk_queue_exit(). Modifying this information is only allowed while
-	 * no requests are being processed. See also blk_mq_freeze_queue() and
-	 * blk_mq_unfreeze_queue().
+	 * Zoned block device information. Reads of this information must be
+	 * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+	 * information is only allowed while no requests are being processed.
+	 * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
 	 */
 	unsigned int		nr_zones;
+	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
-	unsigned long		*seq_zones_wlock;
+	unsigned int		zone_wplugs_hash_bits;
+	spinlock_t		zone_wplugs_lock;
+	struct mempool_s	*zone_wplugs_pool;
+	struct hlist_head	*zone_wplugs_hash;
+	struct list_head	zone_wplugs_err_list;
+	struct work_struct	zone_wplugs_work;
+	struct workqueue_struct *zone_wplugs_wq;
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 #if IS_ENABLED(CONFIG_CDROM)
@@@ -233,6 -230,19 +232,19 @@@ static inline unsigned int disk_openers
 	return atomic_read(&disk->part0->bd_openers);
 }
 
+/**
+ * disk_has_partscan - return %true if partition scanning is enabled on a disk
+ * @disk: disk to check
+ *
+ * Returns %true if partitions scanning is enabled for @disk, or %false if
+ * partition scanning is disabled either permanently or temporarily.
+ */
+static inline bool disk_has_partscan(struct gendisk *disk)
+{
+	return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
+		!test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+}
+
 /*
  * The gendisk is refcounted by the part0 block_device, and the bd_device
  * therein is also used for device model presentation in sysfs.
@@@ -331,8 -341,7 +343,7 @@@ int blkdev_report_zones(struct block_de
 		unsigned int nr_zones, report_zones_cb cb, void *data);
 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 		sector_t sectors, sector_t nr_sectors);
-int blk_revalidate_disk_zones(struct gendisk *disk,
-		void (*update_driver_data)(struct gendisk *disk));
+int blk_revalidate_disk_zones(struct gendisk *disk);
 
 /*
  * Independent access ranges: struct blk_independent_access_range describes
@@@ -449,8 -458,6 +460,6 @@@ struct request_queue
 
 	atomic_t		nr_active_requests_shared_tags;
 
-	unsigned int		required_elevator_features;
-
 	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;
@@@ -633,15 -640,6 +642,6 @@@ static inline unsigned int disk_zone_no
 	return sector >> ilog2(disk->queue->limits.chunk_sectors);
 }
 
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
-	if (!blk_queue_is_zoned(disk->queue))
-		return false;
-	if (!disk->conv_zones_bitmap)
-		return true;
-	return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
 static inline void disk_set_max_open_zones(struct gendisk *disk,
 		unsigned int max_open_zones)
 {
@@@ -664,6 -662,7 +664,7 @@@ static inline unsigned int bdev_max_act
 	return bdev->bd_disk->queue->limits.max_active_zones;
 }
 
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int bdev_nr_zones(struct block_device *bdev)
 {
@@@ -674,10 -673,6 +675,6 @@@ static inline unsigned int disk_nr_zone
 {
 	return 0;
 }
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
-	return false;
-}
 static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
 {
 	return 0;
@@@ -691,6 -686,10 +688,10 @@@ static inline unsigned int bdev_max_act
 {
 	return 0;
 }
+static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+	return false;
+}
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 static inline unsigned int blk_queue_depth(struct request_queue *q)
@@@ -855,9 -854,11 +856,11 @@@ static inline unsigned int bio_zone_no(
 	return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
 }
 
-static inline unsigned int bio_zone_is_seq(struct bio *bio)
+static inline bool bio_straddles_zones(struct bio *bio)
 {
-	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
+	return bio_sectors(bio) &&
+		bio_zone_no(bio) !=
+		disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
 }
 
 /*
@@@ -942,14 -943,6 +945,6 @@@ disk_alloc_independent_access_ranges(st
 void disk_set_independent_access_ranges(struct gendisk *disk,
 				struct blk_independent_access_ranges *iars);
 
-/*
- * Elevator features for blk_queue_required_elevator_features:
- */
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
-
-extern void blk_queue_required_elevator_features(struct request_queue *q,
-						 unsigned int features);
 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 					      struct device *dev);
 
@@@ -1156,12 -1149,29 +1151,29 @@@ static inline unsigned int queue_max_se
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
 {
+	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-	const struct queue_limits *l = &q->limits;
 
+	return min_not_zero(l->max_zone_append_sectors, max_sectors);
+}
+
+static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
+{
+	if (!blk_queue_is_zoned(q))
+		return 0;
+
+	return queue_limits_max_zone_append_sectors(&q->limits);
+}
+
+static inline bool queue_emulates_zone_append(struct request_queue *q)
+{
+	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+}
 
-	return min(l->max_zone_append_sectors, l->max_sectors);
+static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+{
+	return queue_emulates_zone_append(bdev_get_queue(bdev));
 }
 
 static inline unsigned int
@@@ -1303,18 -1313,6 +1315,6 @@@ static inline unsigned int bdev_zone_no
 	return disk_zone_no(bdev->bd_disk, sec);
 }
 
-/* Whether write serialization is required for @op on zoned devices. */
-static inline bool op_needs_zoned_write_locking(enum req_op op)
-{
-	return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
-}
-
-static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
-					  enum req_op op)
-{
-	return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
-}
-
 static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
@@@ -1330,6 -1328,12 +1330,12 @@@ static inline sector_t bdev_offset_from
 	return sector & (bdev_zone_sectors(bdev) - 1);
 }
 
+static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+{
+	return bdev_offset_from_zone_start(bio->bi_bdev,
+					   bio->bi_iter.bi_sector);
+}
+
 static inline bool bdev_is_zone_start(struct block_device *bdev,
 				      sector_t sector)
 {
@@@ -1507,6 -1511,16 +1513,6 @@@ struct blk_holder_ops
 	 * Thaw the file system mounted on the block device.
 	 */
 	int (*thaw)(struct block_device *bdev);
-
-	/*
-	 * If needed, get a reference to the holder.
-	 */
-	void (*get_holder)(void *holder);
-
-	/*
-	 * Release the holder.
-	 */
-	void (*put_holder)(void *holder);
 };
 
 /*
@@@ -1577,7 -1591,6 +1583,7 @@@ static inline int early_lookup_bdev(con
 
 int bdev_freeze(struct block_device *bdev);
 int bdev_thaw(struct block_device *bdev);
+void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {
 	struct request *req_list;