Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux

author Linus Torvalds <[email protected]>
Mon, 13 May 2024 20:03:54 +0000 (13:03 -0700)
committer Linus Torvalds <[email protected]>
Mon, 13 May 2024 20:03:54 +0000 (13:03 -0700)
diff --combined block/blk-cgroup.c

index 059467086b13123b26630c1e84942980f3001216,5e1f1052567731da34e32e9dcabb7b24800529f9..4b1a35ab0ea4c9c73a910a8c7454bb9ca2229881
--- 1/block/blk-cgroup.c
--- 2/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@@ -218,8 -218,7 +218,7 @@@ static void blkg_async_bio_workfn(struc
   
         /* as long as there are pending bios, @blkg can't go away */
         spin_lock(&blkg->async_bio_lock);
-       bio_list_merge(&bios, &blkg->async_bios);
-       bio_list_init(&blkg->async_bios);
+       bio_list_merge_init(&bios, &blkg->async_bios);
         spin_unlock(&blkg->async_bio_lock);
   
         /* start plug only when bio_list contains at least 2 bios */
@@@ -1409,12 -1408,6 +1408,12 @@@ static int blkcg_css_online(struct cgro
         return 0;
   }
   
+ +void blkg_init_queue(struct request_queue *q)
+ +{
+ +      INIT_LIST_HEAD(&q->blkg_list);
+ +      mutex_init(&q->blkcg_mutex);
+ +}
+ +
   int blkcg_init_disk(struct gendisk *disk)
   {
         struct request_queue *q = disk->queue;
@@@ -1422,6 -1415,9 +1421,6 @@@
         bool preloaded;
         int ret;
   
- -      INIT_LIST_HEAD(&q->blkg_list);
- -      mutex_init(&q->blkcg_mutex);
- -
         new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
         if (!new_blkg)
                 return -ENOMEM;
@@@ -1444,14 -1440,8 +1443,8 @@@
         if (ret)
                 goto err_destroy_all;
   
-       ret = blk_throtl_init(disk);
-       if (ret)
-               goto err_ioprio_exit;
- 
         return 0;
   
- err_ioprio_exit:
-       blk_ioprio_exit(disk);
   err_destroy_all:
         blkg_destroy_all(disk);
         return ret;
diff --combined block/blk-core.c

index b795ac177281ad7adec63528d53def2fff1139a5,8566bbd8aeba2c41ce6dcd29ab2bd680139fe3e3..01186333c88ec5bbcc5c1bb9cbe2bbe723f84a91
--- 1/block/blk-core.c
--- 2/block/blk-core.c
+++ b/block/blk-core.c
@@@ -442,8 -442,6 +442,8 @@@ struct request_queue *blk_alloc_queue(s
         init_waitqueue_head(&q->mq_freeze_wq);
         mutex_init(&q->mq_freeze_lock);
   
+ +      blkg_init_queue(q);
+ +
         /*
          * Init percpu_ref in atomic mode so that it's faster to shutdown.
          * See blk_register_queue() for details.
@@@ -591,8 -589,7 +591,7 @@@ static inline blk_status_t blk_check_zo
                 return BLK_STS_NOTSUPP;
   
         /* The bio sector must point to the start of a sequential zone */
-       if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
-           !bio_zone_is_seq(bio))
+       if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
                 return BLK_STS_IOERR;
   
         /*
@@@ -604,7 -601,7 +603,7 @@@
                 return BLK_STS_IOERR;
   
         /* Make sure the BIO is small enough and will not get split */
-       if (nr_sectors > q->limits.max_zone_append_sectors)
+       if (nr_sectors > queue_max_zone_append_sectors(q))
                 return BLK_STS_IOERR;
   
         bio->bi_opf |= REQ_NOMERGE;
@@@ -649,11 -646,13 +648,13 @@@ static void __submit_bio(struct bio *bi
   static void __submit_bio_noacct(struct bio *bio)
   {
         struct bio_list bio_list_on_stack[2];
+       struct blk_plug plug;
   
         BUG_ON(bio->bi_next);
   
         bio_list_init(&bio_list_on_stack[0]);
         current->bio_list = bio_list_on_stack;
+       blk_start_plug(&plug);
   
         do {
                 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@@ -687,19 -686,23 +688,23 @@@
                 bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
         } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
   
+       blk_finish_plug(&plug);
         current->bio_list = NULL;
   }
   
   static void __submit_bio_noacct_mq(struct bio *bio)
   {
         struct bio_list bio_list[2] = { };
+       struct blk_plug plug;
   
         current->bio_list = bio_list;
+       blk_start_plug(&plug);
   
         do {
                 __submit_bio(bio);
         } while ((bio = bio_list_pop(&bio_list[0])));
   
+       blk_finish_plug(&plug);
         current->bio_list = NULL;
   }
   
@@@ -910,12 -913,6 +915,6 @@@ int bio_poll(struct bio *bio, struct io
             !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
                 return 0;
   
-       /*
-        * As the requests that require a zone lock are not plugged in the
-        * first place, directly accessing the plug instead of using
-        * blk_mq_plug() should not have any consequences during flushing for
-        * zoned devices.
-        */
         blk_flush_plug(current->plug, false);
   
         /*
@@@ -987,10 -984,11 +986,11 @@@ void update_io_ticks(struct block_devic
         unsigned long stamp;
   again:
         stamp = READ_ONCE(part->bd_stamp);
-       if (unlikely(time_after(now, stamp))) {
-               if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
-                       __part_stat_add(part, io_ticks, end ? now - stamp : 1);
-       }
+       if (unlikely(time_after(now, stamp)) &&
+           likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+           (end || part_in_flight(part)))
+               __part_stat_add(part, io_ticks, now - stamp);
+ 
         if (part->bd_partno) {
                 part = bdev_whole(part);
                 goto again;
@@@ -1197,7 -1195,6 +1197,7 @@@ void __blk_flush_plug(struct blk_plug *
         if (unlikely(!rq_list_empty(plug->cached_rq)))
                 blk_mq_free_plug_rqs(plug);
   
+ +      plug->cur_ktime = 0;
         current->flags &= ~PF_BLOCK_TS;
   }
   
diff --combined block/blk-settings.c

index 9d6033e01f2e170307cbe5ffdb0365d1eb5c8b9b,715f4b6356c4652d663d32a7abf7bb9f358e3379..ebba05a2bc7f590c80e1ee37cc0c409d26ba2f6d
--- 1/block/blk-settings.c
--- 2/block/blk-settings.c
+++ b/block/blk-settings.c
@@@ -182,15 -182,16 +182,15 @@@ static int blk_validate_limits(struct q
                 return -EINVAL;
   
         /*
- -       * Devices that require a virtual boundary do not support scatter/gather
- -       * I/O natively, but instead require a descriptor list entry for each
- -       * page (which might not be identical to the Linux PAGE_SIZE).  Because
- -       * of that they are not limited by our notion of "segment size".
+ +       * Stacking device may have both virtual boundary and max segment
+ +       * size limit, so allow this setting now, and long-term the two
+ +       * might need to move out of stacking limits since we have immutable
+ +       * bvec and lower layer bio splitting is supposed to handle the two
+ +       * correctly.
          */
         if (lim->virt_boundary_mask) {
- -              if (WARN_ON_ONCE(lim->max_segment_size &&
- -                               lim->max_segment_size != UINT_MAX))
- -                      return -EINVAL;
- -              lim->max_segment_size = UINT_MAX;
+ +              if (!lim->max_segment_size)
+ +                      lim->max_segment_size = UINT_MAX;
         } else {
                 /*
                  * The maximum segment size has an odd historic 64k default that
@@@ -411,24 -412,32 +411,32 @@@ EXPORT_SYMBOL(blk_queue_max_write_zeroe
    * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
    * @q:  the request queue for the device
    * @max_zone_append_sectors: maximum number of sectors to write per command
+  *
+  * Sets the maximum number of sectors allowed for zone append commands.
+  * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+  * not natively support zone append operations and that the block layer must
+  * emulate these operations using regular writes.
    **/
   void blk_queue_max_zone_append_sectors(struct request_queue *q,
                 unsigned int max_zone_append_sectors)
   {
-       unsigned int max_sectors;
+       unsigned int max_sectors = 0;
   
         if (WARN_ON(!blk_queue_is_zoned(q)))
                 return;
   
-       max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
-       max_sectors = min(q->limits.chunk_sectors, max_sectors);
+       if (max_zone_append_sectors) {
+               max_sectors = min(q->limits.max_hw_sectors,
+                                 max_zone_append_sectors);
+               max_sectors = min(q->limits.chunk_sectors, max_sectors);
   
-       /*
-        * Signal eventual driver bugs resulting in the max_zone_append sectors limit
-        * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
-        * or the max_hw_sectors limit not set.
-        */
-       WARN_ON(!max_sectors);
+               /*
+                * Signal eventual driver bugs resulting in the max_zone_append
+                * sectors limit being 0 due to the chunk_sectors limit (zone
+                * size) not set or the max_hw_sectors limit not set.
+                */
+               WARN_ON_ONCE(!max_sectors);
+       }
   
         q->limits.max_zone_append_sectors = max_sectors;
   }
@@@ -755,8 -764,8 +763,8 @@@ int blk_stack_limits(struct queue_limit
         t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
         t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
                                         b->max_write_zeroes_sectors);
-       t->max_zone_append_sectors = min(t->max_zone_append_sectors,
-                                       b->max_zone_append_sectors);
+       t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+                                        queue_limits_max_zone_append_sectors(b));
         t->bounce = max(t->bounce, b->bounce);
   
         t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@@ -1043,22 -1052,6 +1051,6 @@@ void blk_queue_write_cache(struct reque
   }
   EXPORT_SYMBOL_GPL(blk_queue_write_cache);
   
- /**
-  * blk_queue_required_elevator_features - Set a queue required elevator features
-  * @q:                the request queue for the target device
-  * @features: Required elevator features OR'ed together
-  *
-  * Tell the block layer that for the device controlled through @q, only the
-  * only elevators that can be used are those that implement at least the set of
-  * features specified by @features.
-  */
- void blk_queue_required_elevator_features(struct request_queue *q,
-                                         unsigned int features)
- {
-       q->required_elevator_features = features;
- }
- EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
- 
   /**
    * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
    * @q:                the request queue for the device
diff --combined block/fops.c

index af6c244314afadb0674c1c354bf749de0f1ef74f,5159ef3a1948746c665e1c5d7151ac31dc73e6ea..7a163f7fe2d8ccb4e130214805633422bb05dbe7
--- 1/block/fops.c
--- 2/block/fops.c
+++ b/block/fops.c
@@@ -44,18 -44,15 +44,15 @@@ static bool blkdev_dio_unaligned(struc
   #define DIO_INLINE_BIO_VECS 4
   
   static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
-               struct iov_iter *iter, unsigned int nr_pages)
+               struct iov_iter *iter, struct block_device *bdev,
+               unsigned int nr_pages)
   {
-       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
         struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
         loff_t pos = iocb->ki_pos;
         bool should_dirty = false;
         struct bio bio;
         ssize_t ret;
   
-       if (blkdev_dio_unaligned(bdev, pos, iter))
-               return -EINVAL;
- 
         if (nr_pages <= DIO_INLINE_BIO_VECS)
                 vecs = inline_vecs;
         else {
@@@ -161,9 -158,8 +158,8 @@@ static void blkdev_bio_end_io(struct bi
   }
   
   static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-               unsigned int nr_pages)
+               struct block_device *bdev, unsigned int nr_pages)
   {
-       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
         struct blk_plug plug;
         struct blkdev_dio *dio;
         struct bio *bio;
@@@ -172,9 -168,6 +168,6 @@@
         loff_t pos = iocb->ki_pos;
         int ret = 0;
   
-       if (blkdev_dio_unaligned(bdev, pos, iter))
-               return -EINVAL;
- 
         if (iocb->ki_flags & IOCB_ALLOC_CACHE)
                 opf |= REQ_ALLOC_CACHE;
         bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@@ -302,9 -295,9 +295,9 @@@ static void blkdev_bio_end_io_async(str
   
   static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
                                         struct iov_iter *iter,
+                                       struct block_device *bdev,
                                         unsigned int nr_pages)
   {
-       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
         bool is_read = iov_iter_rw(iter) == READ;
         blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
         struct blkdev_dio *dio;
@@@ -312,9 -305,6 +305,6 @@@
         loff_t pos = iocb->ki_pos;
         int ret = 0;
   
-       if (blkdev_dio_unaligned(bdev, pos, iter))
-               return -EINVAL;
- 
         if (iocb->ki_flags & IOCB_ALLOC_CACHE)
                 opf |= REQ_ALLOC_CACHE;
         bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@@ -368,18 -358,23 +358,23 @@@
   
   static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
   {
+       struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
         unsigned int nr_pages;
   
         if (!iov_iter_count(iter))
                 return 0;
   
+       if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+               return -EINVAL;
+ 
         nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
         if (likely(nr_pages <= BIO_MAX_VECS)) {
                 if (is_sync_kiocb(iocb))
-                       return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-               return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+                       return __blkdev_direct_IO_simple(iocb, iter, bdev,
+                                                       nr_pages);
+               return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
         }
-       return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+       return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
   }
   
   static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@@ -390,7 -385,7 +385,7 @@@
   
         iomap->bdev = bdev;
         iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
-       if (iomap->offset >= isize)
+       if (offset >= isize)
                 return -EIO;
         iomap->type = IOMAP_MAPPED;
         iomap->addr = iomap->offset;
@@@ -863,7 -858,6 +858,7 @@@ const struct file_operations def_blk_fo
         .splice_read    = filemap_splice_read,
         .splice_write   = iter_file_splice_write,
         .fallocate      = blkdev_fallocate,
+ +      .fop_flags      = FOP_BUFFER_RASYNC,
   };
   
   static __init int blkdev_init(void)
diff --combined block/ioctl.c

index f505f9c341eb08bd57bbcb729f603b5ac48453f0,d7a6c6931a1e7219a8687932f7955b5f8ba4e819..c7db3bd2d653c8c08b7294d393eaeca5784e79ea
--- 1/block/ioctl.c
--- 2/block/ioctl.c
+++ b/block/ioctl.c
@@@ -33,7 -33,7 +33,7 @@@ static int blkpg_do_ioctl(struct block_
         if (op == BLKPG_DEL_PARTITION)
                 return bdev_del_partition(disk, p.pno);
   
-       if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+       if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
                 return -EINVAL;
         /* Check that the partition is aligned to the block size */
         if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
@@@ -95,9 -95,12 +95,12 @@@ static int compat_blkpg_ioctl(struct bl
   static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
                 unsigned long arg)
   {
-       uint64_t range[2];
-       uint64_t start, len, end;
+       unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
         struct inode *inode = bdev->bd_inode;
- -      uint64_t range[2], start, len;
++      uint64_t range[2], start, len, end;
+       struct bio *prev = NULL, *bio;
+       sector_t sector, nr_sects;
+       struct blk_plug plug;
         int err;
   
         if (!(mode & BLK_OPEN_WRITE))
@@@ -105,6 -108,8 +108,8 @@@
   
         if (!bdev_max_discard_sectors(bdev))
                 return -EOPNOTSUPP;
+       if (bdev_read_only(bdev))
+               return -EPERM;
   
         if (copy_from_user(range, (void __user *)arg, sizeof(range)))
                 return -EFAULT;
@@@ -112,20 -117,44 +117,45 @@@
         start = range[0];
         len = range[1];
   
-       if (start & 511)
+       if (!len)
                 return -EINVAL;
-       if (len & 511)
+       if ((start | len) & bs_mask)
                 return -EINVAL;
   
- -      if (start + len > bdev_nr_bytes(bdev))
+ +      if (check_add_overflow(start, len, &end) ||
+ +          end > bdev_nr_bytes(bdev))
                 return -EINVAL;
   
         filemap_invalidate_lock(inode->i_mapping);
         err = truncate_bdev_range(bdev, mode, start, start + len - 1);
         if (err)
                 goto fail;
-       err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+ 
+       sector = start >> SECTOR_SHIFT;
+       nr_sects = len >> SECTOR_SHIFT;
+ 
+       blk_start_plug(&plug);
+       while (1) {
+               if (fatal_signal_pending(current)) {
+                       if (prev)
+                               bio_await_chain(prev);
+                       err = -EINTR;
+                       goto out_unplug;
+               }
+               bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+                               GFP_KERNEL);
+               if (!bio)
+                       break;
+               prev = bio_chain_and_submit(prev, bio);
+       }
+       if (prev) {
+               err = submit_bio_wait(prev);
+               if (err == -EOPNOTSUPP)
+                       err = 0;
+               bio_put(prev);
+       }
+ out_unplug:
+       blk_finish_plug(&plug);
   fail:
         filemap_invalidate_unlock(inode->i_mapping);
         return err;
@@@ -563,8 -592,7 +593,8 @@@ static int blkdev_common_ioctl(struct b
                         return -EACCES;
                 if (bdev_is_partition(bdev))
                         return -EINVAL;
- -              return disk_scan_partitions(bdev->bd_disk, mode);
+ +              return disk_scan_partitions(bdev->bd_disk,
+ +                              mode | BLK_OPEN_STRICT_SCAN);
         case BLKTRACESTART:
         case BLKTRACESTOP:
         case BLKTRACETEARDOWN:
diff --combined drivers/block/null_blk/main.c

index ed33cf7192d21672fb389a93c20fbbb887796337,f7b9078f69138b4d577ca3485ebbddbf4383855a..4005a8b685e89373645d51005fe37c370632bcaa
--- 1/drivers/block/null_blk/main.c
--- 2/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@@ -225,6 -225,10 +225,10 @@@ static unsigned long g_cache_size
   module_param_named(cache_size, g_cache_size, ulong, 0444);
   MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
   
+ static bool g_fua = true;
+ module_param_named(fua, g_fua, bool, 0444);
+ MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true");
+ 
   static unsigned int g_mbps;
   module_param_named(mbps, g_mbps, uint, 0444);
   MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
@@@ -253,6 -257,11 +257,11 @@@ static unsigned int g_zone_max_active
   module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
   MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
   
+ static int g_zone_append_max_sectors = INT_MAX;
+ module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
+ MODULE_PARM_DESC(zone_append_max_sectors,
+                "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");
+ 
   static struct nullb_device *null_alloc_dev(void);
   static void null_free_dev(struct nullb_device *dev);
   static void null_del_dev(struct nullb *nullb);
@@@ -436,10 -445,12 +445,12 @@@ NULLB_DEVICE_ATTR(zone_capacity, ulong
   NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
   NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
   NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+ NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
   NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
   NULLB_DEVICE_ATTR(no_sched, bool, NULL);
   NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
   NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+ NULLB_DEVICE_ATTR(fua, bool, NULL);
   
   static ssize_t nullb_device_power_show(struct config_item *item, char *page)
   {
@@@ -580,12 -591,14 +591,14 @@@ static struct configfs_attribute *nullb
         &nullb_device_attr_zone_nr_conv,
         &nullb_device_attr_zone_max_open,
         &nullb_device_attr_zone_max_active,
+       &nullb_device_attr_zone_append_max_sectors,
         &nullb_device_attr_zone_readonly,
         &nullb_device_attr_zone_offline,
         &nullb_device_attr_virt_boundary,
         &nullb_device_attr_no_sched,
         &nullb_device_attr_shared_tags,
         &nullb_device_attr_shared_tag_bitmap,
+       &nullb_device_attr_fua,
         NULL,
   };
   
@@@ -664,14 -677,14 +677,14 @@@ nullb_group_drop_item(struct config_gro
   static ssize_t memb_group_features_show(struct config_item *item, char *page)
   {
         return snprintf(page, PAGE_SIZE,
-                       "badblocks,blocking,blocksize,cache_size,"
+                       "badblocks,blocking,blocksize,cache_size,fua,"
                         "completion_nsec,discard,home_node,hw_queue_depth,"
                         "irqmode,max_sectors,mbps,memory_backed,no_sched,"
                         "poll_queues,power,queue_mode,shared_tag_bitmap,"
                         "shared_tags,size,submit_queues,use_per_node_hctx,"
                         "virt_boundary,zoned,zone_capacity,zone_max_active,"
                         "zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
-                       "zone_size\n");
+                       "zone_size,zone_append_max_sectors\n");
   }
   
   CONFIGFS_ATTR_RO(memb_group_, features);
@@@ -751,10 -764,13 +764,13 @@@ static struct nullb_device *null_alloc_
         dev->zone_nr_conv = g_zone_nr_conv;
         dev->zone_max_open = g_zone_max_open;
         dev->zone_max_active = g_zone_max_active;
+       dev->zone_append_max_sectors = g_zone_append_max_sectors;
         dev->virt_boundary = g_virt_boundary;
         dev->no_sched = g_no_sched;
         dev->shared_tags = g_shared_tags;
         dev->shared_tag_bitmap = g_shared_tag_bitmap;
+       dev->fua = g_fua;
+ 
         return dev;
   }
   
@@@ -1151,7 -1167,7 +1167,7 @@@ blk_status_t null_handle_discard(struc
         return BLK_STS_OK;
   }
   
- static int null_handle_flush(struct nullb *nullb)
+ static blk_status_t null_handle_flush(struct nullb *nullb)
   {
         int err;
   
@@@ -1168,7 -1184,7 +1184,7 @@@
   
         WARN_ON(!radix_tree_empty(&nullb->dev->cache));
         spin_unlock_irq(&nullb->lock);
-       return err;
+       return errno_to_blk_status(err);
   }
   
   static int null_transfer(struct nullb *nullb, struct page *page,
@@@ -1206,7 -1222,7 +1222,7 @@@ static int null_handle_rq(struct nullb_
   {
         struct request *rq = blk_mq_rq_from_pdu(cmd);
         struct nullb *nullb = cmd->nq->dev->nullb;
-       int err;
+       int err = 0;
         unsigned int len;
         sector_t sector = blk_rq_pos(rq);
         struct req_iterator iter;
@@@ -1218,15 -1234,13 +1234,13 @@@
                 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
                                      op_is_write(req_op(rq)), sector,
                                      rq->cmd_flags & REQ_FUA);
-               if (err) {
-                       spin_unlock_irq(&nullb->lock);
-                       return err;
-               }
+               if (err)
+                       break;
                 sector += len >> SECTOR_SHIFT;
         }
         spin_unlock_irq(&nullb->lock);
   
-       return 0;
+       return errno_to_blk_status(err);
   }
   
   static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@@ -1273,8 -1287,8 +1287,8 @@@ static inline blk_status_t null_handle_
   
         if (op == REQ_OP_DISCARD)
                 return null_handle_discard(dev, sector, nr_sectors);
-       return errno_to_blk_status(null_handle_rq(cmd));
   
+       return null_handle_rq(cmd);
   }
   
   static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
@@@ -1343,7 -1357,7 +1357,7 @@@ static void null_handle_cmd(struct null
         blk_status_t sts;
   
         if (op == REQ_OP_FLUSH) {
-               cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+               cmd->error = null_handle_flush(nullb);
                 goto out;
         }
   
@@@ -1912,7 -1926,7 +1926,7 @@@ static int null_add_dev(struct nullb_de
   
         if (dev->cache_size > 0) {
                 set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
-               blk_queue_write_cache(nullb->q, true, true);
+               blk_queue_write_cache(nullb->q, true, dev->fua);
         }
   
         nullb->q->queuedata = nullb;
@@@ -1965,10 -1979,10 +1979,10 @@@
   
   out_ida_free:
         ida_free(&nullb_indexes, nullb->index);
- -out_cleanup_zone:
- -      null_free_zoned_dev(dev);
   out_cleanup_disk:
         put_disk(nullb->disk);
+ +out_cleanup_zone:
+ +      null_free_zoned_dev(dev);
   out_cleanup_tags:
         if (nullb->tag_set == &nullb->__tag_set)
                 blk_mq_free_tag_set(nullb->tag_set);
@@@ -2113,10 -2127,13 +2127,13 @@@ static void __exit null_exit(void
   
         if (tag_set.ops)
                 blk_mq_free_tag_set(&tag_set);
+ 
+       mutex_destroy(&lock);
   }
   
   module_init(null_init);
   module_exit(null_exit);
   
   MODULE_AUTHOR("Jens Axboe <[email protected]>");
+ MODULE_DESCRIPTION("multi queue aware block test driver");
   MODULE_LICENSE("GPL");
diff --combined drivers/block/ublk_drv.c

index 374e4efa8759fba62df2cdbbc49c9428ddb5ea5b,851c78913de2b28294f4b59bbad2a9b87aa63f73..176657dce3e3850c34d01936c9cd19a68d48f7a1
--- 1/drivers/block/ublk_drv.c
--- 2/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@@ -221,7 -221,7 +221,7 @@@ static int ublk_get_nr_zones(const stru
   
   static int ublk_revalidate_disk_zones(struct ublk_device *ub)
   {
-       return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+       return blk_revalidate_disk_zones(ub->ub_disk);
   }
   
   static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
@@@ -249,8 -249,7 +249,7 @@@
   static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
   {
         blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
-       blk_queue_required_elevator_features(ub->ub_disk->queue,
-                                            ELEVATOR_F_ZBD_SEQ_WRITE);
+ 
         ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
   }
   
@@@ -2177,8 -2176,7 +2176,8 @@@ static int ublk_ctrl_start_dev(struct u
                 .max_hw_sectors         = p->max_sectors,
                 .chunk_sectors          = p->chunk_sectors,
                 .virt_boundary_mask     = p->virt_boundary_mask,
- -
+ +              .max_segments           = USHRT_MAX,
+ +              .max_segment_size       = UINT_MAX,
         };
         struct gendisk *disk;
         int ret = -EINVAL;
diff --combined drivers/md/dm.c

index 7d0746b37c8ec791f111d6e589476eb2b500e9d4,2369d10c84753864fa0f25cff290b6d62b3e0276..597dd7a258234aeb4b982f74cbd9ae62ffb368ce
--- 1/drivers/md/dm.c
--- 2/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@@ -765,7 -765,7 +765,7 @@@ static struct table_device *open_table_
         return td;
   
   out_blkdev_put:
- -      fput(bdev_file);
+ +      __fput_sync(bdev_file);
   out_free_td:
         kfree(td);
         return ERR_PTR(r);
@@@ -778,13 -778,7 +778,13 @@@ static void close_table_device(struct t
   {
         if (md->disk->slave_dir)
                 bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
- -      fput(td->dm_dev.bdev_file);
+ +
+ +      /* Leverage async fput() if DMF_DEFERRED_REMOVE set */
+ +      if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+ +              fput(td->dm_dev.bdev_file);
+ +      else
+ +              __fput_sync(td->dm_dev.bdev_file);
+ +
         put_dax(td->dm_dev.dax_dev);
         list_del(&td->list);
         kfree(td);
@@@ -1428,25 -1422,12 +1428,12 @@@ static void __map_bio(struct bio *clone
                 down(&md->swap_bios_semaphore);
         }
   
-       if (static_branch_unlikely(&zoned_enabled)) {
-               /*
-                * Check if the IO needs a special mapping due to zone append
-                * emulation on zoned target. In this case, dm_zone_map_bio()
-                * calls the target map operation.
-                */
-               if (unlikely(dm_emulate_zone_append(md)))
-                       r = dm_zone_map_bio(tio);
-               else
-                       goto do_map;
-       } else {
- do_map:
-               if (likely(ti->type->map == linear_map))
-                       r = linear_map(ti, clone);
-               else if (ti->type->map == stripe_map)
-                       r = stripe_map(ti, clone);
-               else
-                       r = ti->type->map(ti, clone);
-       }
+       if (likely(ti->type->map == linear_map))
+               r = linear_map(ti, clone);
+       else if (ti->type->map == stripe_map)
+               r = stripe_map(ti, clone);
+       else
+               r = ti->type->map(ti, clone);
   
         switch (r) {
         case DM_MAPIO_SUBMITTED:
@@@ -1774,6 -1755,33 +1761,33 @@@ static void init_clone_info(struct clon
                 ci->sector_count = 0;
   }
   
+ #ifdef CONFIG_BLK_DEV_ZONED
+ static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+                                          struct bio *bio)
+ {
+       /*
+        * For mapped devices that need zone append emulation, we must
+        * split any large BIO that straddles zone boundaries.
+        */
+       return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+               !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+ }
+ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+ {
+       return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+ }
+ #else
+ static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+                                          struct bio *bio)
+ {
+       return false;
+ }
+ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+ {
+       return false;
+ }
+ #endif
+ 
   /*
    * Entry point to split a bio into clones and submit them to the targets.
    */
@@@ -1783,19 -1791,32 +1797,32 @@@ static void dm_split_and_process_bio(st
         struct clone_info ci;
         struct dm_io *io;
         blk_status_t error = BLK_STS_OK;
-       bool is_abnormal;
+       bool is_abnormal, need_split;
+ 
+       need_split = is_abnormal = is_abnormal_io(bio);
+       if (static_branch_unlikely(&zoned_enabled))
+               need_split = is_abnormal || dm_zone_bio_needs_split(md, bio);
   
-       is_abnormal = is_abnormal_io(bio);
-       if (unlikely(is_abnormal)) {
+       if (unlikely(need_split)) {
                 /*
                  * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
                  * otherwise associated queue_limits won't be imposed.
+                * Also split the BIO for mapped devices needing zone append
+                * emulation to ensure that the BIO does not cross zone
+                * boundaries.
                  */
                 bio = bio_split_to_limits(bio);
                 if (!bio)
                         return;
         }
   
+       /*
+        * Use the block layer zone write plugging for mapped devices that
+        * need zone append emulation (e.g. dm-crypt).
+        */
+       if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+               return;
+ 
         /* Only support nowait for normal IO */
         if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
                 io = alloc_io(md, bio, GFP_NOWAIT);
@@@ -2016,7 -2037,6 +2043,6 @@@ static void cleanup_mapped_device(struc
                 md->dax_dev = NULL;
         }
   
-       dm_cleanup_zoned_dev(md);
         if (md->disk) {
                 spin_lock(&_minor_lock);
                 md->disk->private_data = NULL;
diff --combined drivers/nvme/host/core.c

index 095f59e7aa937aa3ae9d8e3fc3f1a25608c99f25,c9955ecd1790851f87a973e1655952717188ee32..bf7615cb36ee6e03f0f515d7ddc1d416a2e300b8
--- 1/drivers/nvme/host/core.c
--- 2/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@@ -628,6 -628,27 +628,6 @@@ bool nvme_change_ctrl_state(struct nvme
   }
   EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
   
- -/*
- - * Returns true for sink states that can't ever transition back to live.
- - */
- -static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
- -{
- -      switch (nvme_ctrl_state(ctrl)) {
- -      case NVME_CTRL_NEW:
- -      case NVME_CTRL_LIVE:
- -      case NVME_CTRL_RESETTING:
- -      case NVME_CTRL_CONNECTING:
- -              return false;
- -      case NVME_CTRL_DELETING:
- -      case NVME_CTRL_DELETING_NOIO:
- -      case NVME_CTRL_DEAD:
- -              return true;
- -      default:
- -              WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
- -              return true;
- -      }
- -}
- -
   /*
    * Waits for the controller state to be resetting, or returns false if it is
    * not possible to ever transition to that state.
@@@ -2055,7 -2076,6 +2055,7 @@@ static int nvme_update_ns_info_block(st
         bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
         struct queue_limits lim;
         struct nvme_id_ns_nvm *nvm = NULL;
+ +      struct nvme_zone_info zi = {};
         struct nvme_id_ns *id;
         sector_t capacity;
         unsigned lbaf;
@@@ -2068,10 -2088,9 +2068,10 @@@
         if (id->ncap == 0) {
                 /* namespace not allocated or attached */
                 info->is_removed = true;
- -              ret = -ENODEV;
+ +              ret = -ENXIO;
                 goto out;
         }
+ +      lbaf = nvme_lbaf_index(id->flbas);
   
         if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
                 ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
@@@ -2079,14 -2098,8 +2079,14 @@@
                         goto out;
         }
   
+ +      if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ +          ns->head->ids.csi == NVME_CSI_ZNS) {
+ +              ret = nvme_query_zone_info(ns, lbaf, &zi);
+ +              if (ret < 0)
+ +                      goto out;
+ +      }
+ +
         blk_mq_freeze_queue(ns->disk->queue);
- -      lbaf = nvme_lbaf_index(id->flbas);
         ns->head->lba_shift = id->lbaf[lbaf].ds;
         ns->head->nuse = le64_to_cpu(id->nuse);
         capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
@@@ -2099,8 -2112,13 +2099,8 @@@
                 capacity = 0;
         nvme_config_discard(ns, &lim);
         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- -          ns->head->ids.csi == NVME_CSI_ZNS) {
- -              ret = nvme_update_zone_info(ns, lbaf, &lim);
- -              if (ret) {
- -                      blk_mq_unfreeze_queue(ns->disk->queue);
- -                      goto out;
- -              }
- -      }
+ +          ns->head->ids.csi == NVME_CSI_ZNS)
+ +              nvme_update_zone_info(ns, &lim, &zi);
         ret = queue_limits_commit_update(ns->disk->queue, &lim);
         if (ret) {
                 blk_mq_unfreeze_queue(ns->disk->queue);
@@@ -2132,7 -2150,7 +2132,7 @@@
         blk_mq_unfreeze_queue(ns->disk->queue);
   
         if (blk_queue_is_zoned(ns->queue)) {
-               ret = blk_revalidate_disk_zones(ns->disk, NULL);
+               ret = blk_revalidate_disk_zones(ns->disk);
                 if (ret && !nvme_first_scan(ns->disk))
                         goto out;
         }
@@@ -2183,7 -2201,6 +2183,7 @@@ static int nvme_update_ns_info(struct n
         }
   
         if (!ret && nvme_ns_head_multipath(ns->head)) {
+ +              struct queue_limits *ns_lim = &ns->disk->queue->limits;
                 struct queue_limits lim;
   
                 blk_mq_freeze_queue(ns->head->disk->queue);
@@@ -2195,26 -2212,7 +2195,26 @@@
                 set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
                 nvme_mpath_revalidate_paths(ns);
   
+ +              /*
+ +               * queue_limits mixes values that are the hardware limitations
+ +               * for bio splitting with what is the device configuration.
+ +               *
+ +               * For NVMe the device configuration can change after e.g. a
+ +               * Format command, and we really want to pick up the new format
+ +               * value here.  But we must still stack the queue limits to the
+ +               * least common denominator for multipathing to split the bios
+ +               * properly.
+ +               *
+ +               * To work around this, we explicitly set the device
+ +               * configuration to those that we just queried, but only stack
+ +               * the splitting limits in to make sure we still obey possibly
+ +               * lower limitations of other controllers.
+ +               */
                 lim = queue_limits_start_update(ns->head->disk->queue);
+ +              lim.logical_block_size = ns_lim->logical_block_size;
+ +              lim.physical_block_size = ns_lim->physical_block_size;
+ +              lim.io_min = ns_lim->io_min;
+ +              lim.io_opt = ns_lim->io_opt;
                 queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
                                         ns->head->disk->disk_name);
                 ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
@@@ -3660,7 -3658,7 +3660,7 @@@ static int nvme_init_ns_head(struct nvm
                                 "Found shared namespace %d, but multipathing not supported.\n",
                                 info->nsid);
                         dev_warn_once(ctrl->device,
- -                              "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
+ +                              "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
                 }
         }
   
diff --combined drivers/scsi/scsi_lib.c

index 5b3230ef51fe61bce58ba1cc83bff7cb0a6ddbc1,9ca96116bd3325c0df08b528c996079b78569b72..967b6d62bb37e1c463de3df7659c582c81d8ec8c
--- 1/drivers/scsi/scsi_lib.c
--- 2/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@@ -635,9 -635,10 +635,9 @@@ static bool scsi_end_request(struct req
         if (blk_queue_add_random(q))
                 add_disk_randomness(req->q->disk);
   
- -      if (!blk_rq_is_passthrough(req)) {
- -              WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
- -              cmd->flags &= ~SCMD_INITIALIZED;
- -      }
+ +      WARN_ON_ONCE(!blk_rq_is_passthrough(req) &&
+ +                   !(cmd->flags & SCMD_INITIALIZED));
+ +      cmd->flags = 0;
   
         /*
          * Calling rcu_barrier() is not necessary here because the
@@@ -1869,7 -1870,6 +1869,6 @@@ out_put_budget
         case BLK_STS_OK:
                 break;
         case BLK_STS_RESOURCE:
-       case BLK_STS_ZONE_RESOURCE:
                 if (scsi_device_blocked(sdev))
                         ret = BLK_STS_DEV_RESOURCE;
                 break;
diff --combined drivers/scsi/sd.c

index 65cdc8b77e358546fd1768fe4ec1bb4588c4b692,dcba9530ffa5269e6321f0bacb244524e6be636d..64c5129044b3053b54b827d6316a52ef5aa49732
--- 1/drivers/scsi/sd.c
--- 2/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@@ -1260,12 -1260,6 +1260,6 @@@ static blk_status_t sd_setup_read_write
                 }
         }
   
-       if (req_op(rq) == REQ_OP_ZONE_APPEND) {
-               ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
-               if (ret)
-                       goto fail;
-       }
- 
         fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
         dix = scsi_prot_sg_count(cmd);
         dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
@@@ -1348,7 -1342,6 +1342,6 @@@ static blk_status_t sd_init_command(str
                 return sd_setup_flush_cmnd(cmd);
         case REQ_OP_READ:
         case REQ_OP_WRITE:
-       case REQ_OP_ZONE_APPEND:
                 return sd_setup_read_write_cmnd(cmd);
         case REQ_OP_ZONE_RESET:
                 return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
@@@ -3120,7 -3113,6 +3113,7 @@@ static void sd_read_io_hints(struct scs
   {
         struct scsi_device *sdp = sdkp->device;
         const struct scsi_io_group_descriptor *desc, *start, *end;
+ +      u16 permanent_stream_count_old;
         struct scsi_sense_hdr sshdr;
         struct scsi_mode_data data;
         int res;
@@@ -3141,13 -3133,12 +3134,13 @@@
         for (desc = start; desc < end; desc++)
                 if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start))
                         break;
+ +      permanent_stream_count_old = sdkp->permanent_stream_count;
         sdkp->permanent_stream_count = desc - start;
         if (sdkp->rscs && sdkp->permanent_stream_count < 2)
                 sd_printk(KERN_INFO, sdkp,
                           "Unexpected: RSCS has been set and the permanent stream count is %u\n",
                           sdkp->permanent_stream_count);
- -      else if (sdkp->permanent_stream_count)
+ +      else if (sdkp->permanent_stream_count != permanent_stream_count_old)
                 sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n",
                           sdkp->permanent_stream_count);
   }
@@@ -3922,7 -3913,7 +3915,7 @@@ static int sd_probe(struct device *dev
   
         error = device_add_disk(dev, gd, NULL);
         if (error) {
- -              put_device(&sdkp->disk_dev);
+ +              device_unregister(&sdkp->disk_dev);
                 put_disk(gd);
                 goto out;
         }
@@@ -3981,7 -3972,6 +3974,6 @@@ static void scsi_disk_release(struct de
         struct scsi_disk *sdkp = to_scsi_disk(dev);
   
         ida_free(&sd_index_ida, sdkp->index);
-       sd_zbc_free_zone_info(sdkp);
         put_device(&sdkp->device->sdev_gendev);
         free_opal_dev(sdkp->opal_dev);
   
diff --combined include/linux/blk_types.h

index c3e098b21c161ecaf0bb13fd2e96fc3e2b4a1d57,5751292fee6a6c3540cbe3453a4a08422dc29198..25dbf1097085588a9f947bede4ad2467ba1eeea2
--- 1/include/linux/blk_types.h
--- 2/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@@ -88,9 -88,15 +88,9 @@@ struct block_device 
   
   /*
    * Block error status values.  See block/blk-core:blk_errors for the details.
- - * Alpha cannot write a byte atomically, so we need to use 32-bit value.
    */
- -#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
- -typedef u32 __bitwise blk_status_t;
- -typedef u32 blk_short_t;
- -#else
   typedef u8 __bitwise blk_status_t;
   typedef u16 blk_short_t;
- -#endif
   #define       BLK_STS_OK 0
   #define BLK_STS_NOTSUPP               ((__force blk_status_t)1)
   #define BLK_STS_TIMEOUT               ((__force blk_status_t)2)
@@@ -130,18 -136,6 +130,6 @@@
    */
   #define BLK_STS_DEV_RESOURCE  ((__force blk_status_t)13)
   
- /*
-  * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
-  * related resources are unavailable, but the driver can guarantee the queue
-  * will be rerun in the future once the resources become available again.
-  *
-  * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
-  * a zone specific resource and IO to a different zone on the same device could
-  * still be served. Examples of that are zones that are write-locked, but a read
-  * to the same zone could be served.
-  */
- #define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14)
- 
   /*
    * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
    * path if the device returns a status indicating that too many zone resources
@@@ -149,7 -143,7 +137,7 @@@
    * after the number of open zones decreases below the device's limits, which is
    * reported in the request_queue's max_open_zones.
    */
- #define BLK_STS_ZONE_OPEN_RESOURCE    ((__force blk_status_t)15)
+ #define BLK_STS_ZONE_OPEN_RESOURCE    ((__force blk_status_t)14)
   
   /*
    * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
@@@ -158,20 -152,20 +146,20 @@@
    * after the number of active zones decreases below the device's limits, which
    * is reported in the request_queue's max_active_zones.
    */
- #define BLK_STS_ZONE_ACTIVE_RESOURCE  ((__force blk_status_t)16)
+ #define BLK_STS_ZONE_ACTIVE_RESOURCE  ((__force blk_status_t)15)
   
   /*
    * BLK_STS_OFFLINE is returned from the driver when the target device is offline
    * or is being taken offline. This could help differentiate the case where a
    * device is intentionally being shut down from a real I/O error.
    */
- #define BLK_STS_OFFLINE               ((__force blk_status_t)17)
+ #define BLK_STS_OFFLINE               ((__force blk_status_t)16)
   
   /*
    * BLK_STS_DURATION_LIMIT is returned from the driver when the target device
    * aborted the command because it exceeded one of its Command Duration Limits.
    */
- #define BLK_STS_DURATION_LIMIT        ((__force blk_status_t)18)
+ #define BLK_STS_DURATION_LIMIT        ((__force blk_status_t)17)
   
   /**
    * blk_path_error - returns true if error may be path related
@@@ -228,7 -222,12 +216,12 @@@ struct bio 
   
         struct bvec_iter        bi_iter;
   
-       blk_qc_t                bi_cookie;
+       union {
+               /* for polled bios: */
+               blk_qc_t                bi_cookie;
+               /* for plugged zoned writes only: */
+               unsigned int            __bi_nr_segments;
+       };
         bio_end_io_t            *bi_end_io;
         void                    *bi_private;
   #ifdef CONFIG_BLK_CGROUP
@@@ -298,7 -297,8 +291,8 @@@ enum 
         BIO_QOS_THROTTLED,      /* bio went through rq_qos throttle path */
         BIO_QOS_MERGED,         /* but went through rq_qos merge path */
         BIO_REMAPPED,
-       BIO_ZONE_WRITE_LOCKED,  /* Owns a zoned device zone write lock */
+       BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+       BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
         BIO_FLAG_LAST
   };
   
diff --combined include/linux/blkdev.h

index 69e7da33ca49a6f5caeec85c56728069cc61a69e,26acf80c50c03de65ff800e12664cf828d9e203b..69c4f113db428dfb44bea0d69d140e7969806792
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -128,8 -128,6 +128,8 @@@ typedef unsigned int __bitwise blk_mode
   #define BLK_OPEN_WRITE_IOCTL  ((__force blk_mode_t)(1 << 4))
   /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
   #define BLK_OPEN_RESTRICT_WRITES      ((__force blk_mode_t)(1 << 5))
+ +/* return partition scanning errors */
+ +#define BLK_OPEN_STRICT_SCAN  ((__force blk_mode_t)(1 << 6))
   
   struct gendisk {
         /*
@@@ -179,22 -177,21 +179,21 @@@
   
   #ifdef CONFIG_BLK_DEV_ZONED
         /*
-        * Zoned block device information for request dispatch control.
-        * nr_zones is the total number of zones of the device. This is always
-        * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
-        * bits which indicates if a zone is conventional (bit set) or
-        * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
-        * bits which indicates if a zone is write locked, that is, if a write
-        * request targeting the zone was dispatched.
-        *
-        * Reads of this information must be protected with blk_queue_enter() /
-        * blk_queue_exit(). Modifying this information is only allowed while
-        * no requests are being processed. See also blk_mq_freeze_queue() and
-        * blk_mq_unfreeze_queue().
+        * Zoned block device information. Reads of this information must be
+        * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+        * information is only allowed while no requests are being processed.
+        * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
          */
         unsigned int            nr_zones;
+       unsigned int            zone_capacity;
         unsigned long           *conv_zones_bitmap;
-       unsigned long           *seq_zones_wlock;
+       unsigned int            zone_wplugs_hash_bits;
+       spinlock_t              zone_wplugs_lock;
+       struct mempool_s        *zone_wplugs_pool;
+       struct hlist_head       *zone_wplugs_hash;
+       struct list_head        zone_wplugs_err_list;
+       struct work_struct      zone_wplugs_work;
+       struct workqueue_struct *zone_wplugs_wq;
   #endif /* CONFIG_BLK_DEV_ZONED */
   
   #if IS_ENABLED(CONFIG_CDROM)
@@@ -233,6 -230,19 +232,19 @@@ static inline unsigned int disk_openers
         return atomic_read(&disk->part0->bd_openers);
   }
   
+ /**
+  * disk_has_partscan - return %true if partition scanning is enabled on a disk
+  * @disk: disk to check
+  *
+  * Returns %true if partitions scanning is enabled for @disk, or %false if
+  * partition scanning is disabled either permanently or temporarily.
+  */
+ static inline bool disk_has_partscan(struct gendisk *disk)
+ {
+       return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
+               !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+ }
+ 
   /*
    * The gendisk is refcounted by the part0 block_device, and the bd_device
    * therein is also used for device model presentation in sysfs.
@@@ -331,8 -341,7 +343,7 @@@ int blkdev_report_zones(struct block_de
                 unsigned int nr_zones, report_zones_cb cb, void *data);
   int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
                 sector_t sectors, sector_t nr_sectors);
- int blk_revalidate_disk_zones(struct gendisk *disk,
-               void (*update_driver_data)(struct gendisk *disk));
+ int blk_revalidate_disk_zones(struct gendisk *disk);
   
   /*
    * Independent access ranges: struct blk_independent_access_range describes
@@@ -449,8 -458,6 +460,6 @@@ struct request_queue 
   
         atomic_t                nr_active_requests_shared_tags;
   
-       unsigned int            required_elevator_features;
- 
         struct blk_mq_tags      *sched_shared_tags;
   
         struct list_head        icq_list;
@@@ -633,15 -640,6 +642,6 @@@ static inline unsigned int disk_zone_no
         return sector >> ilog2(disk->queue->limits.chunk_sectors);
   }
   
- static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
- {
-       if (!blk_queue_is_zoned(disk->queue))
-               return false;
-       if (!disk->conv_zones_bitmap)
-               return true;
-       return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
- }
- 
   static inline void disk_set_max_open_zones(struct gendisk *disk,
                 unsigned int max_open_zones)
   {
@@@ -664,6 -662,7 +664,7 @@@ static inline unsigned int bdev_max_act
         return bdev->bd_disk->queue->limits.max_active_zones;
   }
   
+ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
   #else /* CONFIG_BLK_DEV_ZONED */
   static inline unsigned int bdev_nr_zones(struct block_device *bdev)
   {
@@@ -674,10 -673,6 +675,6 @@@ static inline unsigned int disk_nr_zone
   {
         return 0;
   }
- static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
- {
-       return false;
- }
   static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
   {
         return 0;
@@@ -691,6 -686,10 +688,10 @@@ static inline unsigned int bdev_max_act
   {
         return 0;
   }
+ static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+ {
+       return false;
+ }
   #endif /* CONFIG_BLK_DEV_ZONED */
   
   static inline unsigned int blk_queue_depth(struct request_queue *q)
@@@ -855,9 -854,11 +856,11 @@@ static inline unsigned int bio_zone_no(
         return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
   }
   
- static inline unsigned int bio_zone_is_seq(struct bio *bio)
+ static inline bool bio_straddles_zones(struct bio *bio)
   {
-       return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
+       return bio_sectors(bio) &&
+               bio_zone_no(bio) !=
+               disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
   }
   
   /*
@@@ -942,14 -943,6 +945,6 @@@ disk_alloc_independent_access_ranges(st
   void disk_set_independent_access_ranges(struct gendisk *disk,
                                 struct blk_independent_access_ranges *iars);
   
- /*
-  * Elevator features for blk_queue_required_elevator_features:
-  */
- /* Supports zoned block devices sequential write constraint */
- #define ELEVATOR_F_ZBD_SEQ_WRITE      (1U << 0)
- 
- extern void blk_queue_required_elevator_features(struct request_queue *q,
-                                                unsigned int features);
   extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
                                               struct device *dev);
   
@@@ -1156,12 -1149,29 +1151,29 @@@ static inline unsigned int queue_max_se
         return q->limits.max_segment_size;
   }
   
- static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+ static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
   {
+       unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
   
-       const struct queue_limits *l = &q->limits;
+       return min_not_zero(l->max_zone_append_sectors, max_sectors);
+ }
+ 
+ static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
+ {
+       if (!blk_queue_is_zoned(q))
+               return 0;
+ 
+       return queue_limits_max_zone_append_sectors(&q->limits);
+ }
+ 
+ static inline bool queue_emulates_zone_append(struct request_queue *q)
+ {
+       return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+ }
   
-       return min(l->max_zone_append_sectors, l->max_sectors);
+ static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+ {
+       return queue_emulates_zone_append(bdev_get_queue(bdev));
   }
   
   static inline unsigned int
@@@ -1303,18 -1313,6 +1315,6 @@@ static inline unsigned int bdev_zone_no
         return disk_zone_no(bdev->bd_disk, sec);
   }
   
- /* Whether write serialization is required for @op on zoned devices. */
- static inline bool op_needs_zoned_write_locking(enum req_op op)
- {
-       return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
- }
- 
- static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
-                                         enum req_op op)
- {
-       return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
- }
- 
   static inline sector_t bdev_zone_sectors(struct block_device *bdev)
   {
         struct request_queue *q = bdev_get_queue(bdev);
@@@ -1330,6 -1328,12 +1330,12 @@@ static inline sector_t bdev_offset_from
         return sector & (bdev_zone_sectors(bdev) - 1);
   }
   
+ static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+ {
+       return bdev_offset_from_zone_start(bio->bi_bdev,
+                                          bio->bi_iter.bi_sector);
+ }
+ 
   static inline bool bdev_is_zone_start(struct block_device *bdev,
                                       sector_t sector)
   {
@@@ -1507,6 -1511,16 +1513,6 @@@ struct blk_holder_ops 
          * Thaw the file system mounted on the block device.
          */
         int (*thaw)(struct block_device *bdev);
- -
- -      /*
- -       * If needed, get a reference to the holder.
- -       */
- -      void (*get_holder)(void *holder);
- -
- -      /*
- -       * Release the holder.
- -       */
- -      void (*put_holder)(void *holder);
   };
   
   /*
@@@ -1577,7 -1591,6 +1583,7 @@@ static inline int early_lookup_bdev(con
   
   int bdev_freeze(struct block_device *bdev);
   int bdev_thaw(struct block_device *bdev);
+ +void bdev_fput(struct file *bdev_file);
   
   struct io_comp_batch {
         struct request *req_list;
author	Linus Torvalds <[email protected]>
	Mon, 13 May 2024 20:03:54 +0000 (13:03 -0700)
committer	Linus Torvalds <[email protected]>
	Mon, 13 May 2024 20:03:54 +0000 (13:03 -0700)
		1	2
block/blk-cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-settings.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/fops.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/ioctl.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/null_blk/main.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/block/ublk_drv.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/dm.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/nvme/host/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/scsi_lib.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/scsi/sd.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blk_types.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history