From: Linus Torvalds
Date: Mon, 13 May 2024 20:03:54 +0000 (-0700)
Subject: Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux
X-Git-Tag: v6.10-rc1~219
X-Git-Url: https://repo.jachan.dev/linux.git/commitdiff_plain/0c9f4ac808b017a0013cee92a30de980550145d5?hp=-c

Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - Add a partscan attribute in sysfs, fixing an issue with systemd
   relying on an internal interface that went away.

 - Attempt #2 at making long running discards interruptible. The
   previous attempt went into 6.9, but we ended up mostly reverting it
   as it had issues.

 - Remove old ida_simple API in bcache

 - Support for zoned write plugging, greatly improving the performance
   on zoned devices.

 - Remove the old throttle low interface, which has been marked
   experimental since 2017, never made it beyond that, and isn't being
   used.

 - Remove page->index debugging checks in brd, as they haven't caught
   anything and to prepare us for removing page->index in struct page.

 - MD pull request from Song

 - Don't schedule block workers on isolated CPUs

* tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux: (84 commits)
  blk-throttle: delay initialization until configuration
  blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
  block: fix that util can be greater than 100%
  block: support to account io_ticks precisely
  block: add plug while submitting IO
  bcache: fix variable length array abuse in btree_iter
  bcache: Remove usage of the deprecated ida_simple_xx() API
  md: Revert "md: Fix overflow in is_mddev_idle"
  blk-lib: check for kill signal in ioctl BLKDISCARD
  block: add a bio_await_chain helper
  block: add a blk_alloc_discard_bio helper
  block: add a bio_chain_and_submit helper
  block: move discard checks into the ioctl handler
  block: remove the discard_granularity check in __blkdev_issue_discard
  block/ioctl: prefer different overflow check
  null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION()
  block: fix and simplify blkdevparts= cmdline parsing
  block: refine the EOF check in blkdev_iomap_begin
  block: add a partscan sysfs attribute for disks
  block: add a disk_has_partscan helper
  ...
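Two userspace sketches may help illustrate the interfaces touched by this
pull. Both are editor-added illustrations, not code from the series. The
first assumes the new attribute is exposed as /sys/block/<disk>/partscan
and reads back "0" or "1":

    /* Check whether the kernel will scan a disk for partitions. */
    #include <stdio.h>

    int main(int argc, char **argv)
    {
            char path[64], buf[4] = "";
            FILE *f;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <disk>\n", argv[0]);
                    return 1;
            }
            snprintf(path, sizeof(path), "/sys/block/%s/partscan", argv[1]);
            f = fopen(path, "r");
            if (!f) {
                    perror(path);   /* attribute is new in this cycle */
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("%s: partscan=%s", argv[1], buf);
            fclose(f);
            return 0;
    }

The BLKDISCARD ioctl ABI itself is unchanged by the interruptible-discard
work; what changes is that a long-running discard can now be aborted by a
fatal signal instead of running to completion. A minimal caller looks like
the following (device name and range are placeholders):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/fs.h>

    int main(void)
    {
            /* start and length in bytes, logical-block aligned */
            uint64_t range[2] = { 0, 1ULL << 30 };
            int fd = open("/dev/sdX", O_WRONLY);    /* placeholder device */

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (ioctl(fd, BLKDISCARD, &range) < 0)
                    perror("BLKDISCARD");   /* a fatal signal can now abort it */
            close(fd);
            return 0;
    }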
--- 0c9f4ac808b017a0013cee92a30de980550145d5
diff --combined block/blk-cgroup.c
index 059467086b13,5e1f10525677..4b1a35ab0ea4
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@@ -218,8 -218,7 +218,7 @@@ static void blkg_async_bio_workfn(struc
 
 	/* as long as there are pending bios, @blkg can't go away */
 	spin_lock(&blkg->async_bio_lock);
-	bio_list_merge(&bios, &blkg->async_bios);
-	bio_list_init(&blkg->async_bios);
+	bio_list_merge_init(&bios, &blkg->async_bios);
 	spin_unlock(&blkg->async_bio_lock);
 
 	/* start plug only when bio_list contains at least 2 bios */
@@@ -1409,12 -1408,6 +1408,12 @@@ static int blkcg_css_online(struct cgro
 	return 0;
 }
 
+void blkg_init_queue(struct request_queue *q)
+{
+	INIT_LIST_HEAD(&q->blkg_list);
+	mutex_init(&q->blkcg_mutex);
+}
+
 int blkcg_init_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
@@@ -1422,6 -1415,9 +1421,6 @@@
 	bool preloaded;
 	int ret;
 
-	INIT_LIST_HEAD(&q->blkg_list);
-	mutex_init(&q->blkcg_mutex);
-
 	new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
 	if (!new_blkg)
 		return -ENOMEM;
@@@ -1444,14 -1440,8 +1443,8 @@@
 	if (ret)
 		goto err_destroy_all;
 
-	ret = blk_throtl_init(disk);
-	if (ret)
-		goto err_ioprio_exit;
-
 	return 0;
 
-err_ioprio_exit:
-	blk_ioprio_exit(disk);
 err_destroy_all:
 	blkg_destroy_all(disk);
 	return ret;
diff --combined block/blk-core.c
index b795ac177281,8566bbd8aeba..01186333c88e
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@@ -442,8 -442,6 +442,8 @@@ struct request_queue *blk_alloc_queue(s
 	init_waitqueue_head(&q->mq_freeze_wq);
 	mutex_init(&q->mq_freeze_lock);
 
+	blkg_init_queue(q);
+
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
 	 * See blk_register_queue() for details.
@@@ -591,8 -589,7 +591,7 @@@ static inline blk_status_t blk_check_zo
 		return BLK_STS_NOTSUPP;
 
 	/* The bio sector must point to the start of a sequential zone */
-	if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
-	    !bio_zone_is_seq(bio))
+	if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
 		return BLK_STS_IOERR;
 
 	/*
@@@ -604,7 -601,7 +603,7 @@@
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > q->limits.max_zone_append_sectors)
+	if (nr_sectors > queue_max_zone_append_sectors(q))
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
@@@ -649,11 -646,13 +648,13 @@@ static void __submit_bio(struct bio *bi
 static void __submit_bio_noacct(struct bio *bio)
 {
 	struct bio_list bio_list_on_stack[2];
+	struct blk_plug plug;
 
 	BUG_ON(bio->bi_next);
 
 	bio_list_init(&bio_list_on_stack[0]);
 	current->bio_list = bio_list_on_stack;
+	blk_start_plug(&plug);
 
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@@ -687,19 -686,23 +688,23 @@@
 		bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
 	} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
 
+	blk_finish_plug(&plug);
 	current->bio_list = NULL;
 }
 
 static void __submit_bio_noacct_mq(struct bio *bio)
 {
 	struct bio_list bio_list[2] = { };
+	struct blk_plug plug;
 
 	current->bio_list = bio_list;
+	blk_start_plug(&plug);
 
 	do {
 		__submit_bio(bio);
 	} while ((bio = bio_list_pop(&bio_list[0])));
 
+	blk_finish_plug(&plug);
 	current->bio_list = NULL;
 }
 
@@@ -910,12 -913,6 +915,6 @@@ int bio_poll(struct bio *bio, struct io
 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
 		return 0;
 
-	/*
-	 * As the requests that require a zone lock are not plugged in the
-	 * first place, directly accessing the plug instead of using
-	 * blk_mq_plug() should not have any consequences during flushing for
-	 * zoned devices.
-	 */
 	blk_flush_plug(current->plug, false);
 
 	/*
@@@ -987,10 -984,11 +986,11 @@@ void update_io_ticks(struct block_devic
 	unsigned long stamp;
 again:
 	stamp = READ_ONCE(part->bd_stamp);
-	if (unlikely(time_after(now, stamp))) {
-		if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
-			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
-	}
+	if (unlikely(time_after(now, stamp)) &&
+	    likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+	    (end || part_in_flight(part)))
+		__part_stat_add(part, io_ticks, now - stamp);
+
 	if (part->bd_partno) {
 		part = bdev_whole(part);
 		goto again;
@@@ -1197,7 -1195,6 +1197,7 @@@ void __blk_flush_plug(struct blk_plug *
 	if (unlikely(!rq_list_empty(plug->cached_rq)))
 		blk_mq_free_plug_rqs(plug);
 
+	plug->cur_ktime = 0;
 	current->flags &= ~PF_BLOCK_TS;
 }
 
diff --combined block/blk-settings.c
index 9d6033e01f2e,715f4b6356c4..ebba05a2bc7f
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@@ -182,15 -182,16 +182,15 @@@ static int blk_validate_limits(struct q
 		return -EINVAL;
 
 	/*
-	 * Devices that require a virtual boundary do not support scatter/gather
-	 * I/O natively, but instead require a descriptor list entry for each
-	 * page (which might not be identical to the Linux PAGE_SIZE). Because
-	 * of that they are not limited by our notion of "segment size".
+	 * Stacking device may have both virtual boundary and max segment
+	 * size limit, so allow this setting now, and long-term the two
+	 * might need to move out of stacking limits since we have immutable
+	 * bvec and lower layer bio splitting is supposed to handle the two
+	 * correctly.
 	 */
 	if (lim->virt_boundary_mask) {
-		if (WARN_ON_ONCE(lim->max_segment_size &&
-				 lim->max_segment_size != UINT_MAX))
-			return -EINVAL;
-		lim->max_segment_size = UINT_MAX;
+		if (!lim->max_segment_size)
+			lim->max_segment_size = UINT_MAX;
 	} else {
 		/*
 		 * The maximum segment size has an odd historic 64k default that
@@@ -411,24 -412,32 +411,32 @@@ EXPORT_SYMBOL(blk_queue_max_write_zeroe
  * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
  * @q:  the request queue for the device
  * @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands. If
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
  **/
 void blk_queue_max_zone_append_sectors(struct request_queue *q,
 		unsigned int max_zone_append_sectors)
 {
-	unsigned int max_sectors;
+	unsigned int max_sectors = 0;
 
 	if (WARN_ON(!blk_queue_is_zoned(q)))
 		return;
 
-	max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
-	max_sectors = min(q->limits.chunk_sectors, max_sectors);
+	if (max_zone_append_sectors) {
+		max_sectors = min(q->limits.max_hw_sectors,
+				  max_zone_append_sectors);
+		max_sectors = min(q->limits.chunk_sectors, max_sectors);
 
-	/*
-	 * Signal eventual driver bugs resulting in the max_zone_append sectors limit
-	 * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
-	 * or the max_hw_sectors limit not set.
-	 */
-	WARN_ON(!max_sectors);
+		/*
+		 * Signal eventual driver bugs resulting in the max_zone_append
+		 * sectors limit being 0 due to the chunk_sectors limit (zone
+		 * size) not set or the max_hw_sectors limit not set.
+		 */
+		WARN_ON_ONCE(!max_sectors);
+	}
 
 	q->limits.max_zone_append_sectors = max_sectors;
 }
@@@ -755,8 -764,8 +763,8 @@@ int blk_stack_limits(struct queue_limit
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_zone_append_sectors = min(t->max_zone_append_sectors,
-					b->max_zone_append_sectors);
+	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+					 queue_limits_max_zone_append_sectors(b));
 	t->bounce = max(t->bounce, b->bounce);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@@ -1043,22 -1052,6 +1051,6 @@@ void blk_queue_write_cache(struct reque
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
-/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q:		the request queue for the target device
- * @features:	Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
-					  unsigned int features)
-{
-	q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
 /**
  * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
  * @q:		the request queue for the device
diff --combined block/fops.c
index af6c244314af,5159ef3a1948..7a163f7fe2d8
--- a/block/fops.c
+++ b/block/fops.c
@@@ -44,18 -44,15 +44,15 @@@ static bool blkdev_dio_unaligned(struc
 #define DIO_INLINE_BIO_VECS 4
 
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
-		struct iov_iter *iter, unsigned int nr_pages)
+		struct iov_iter *iter, struct block_device *bdev,
+		unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
 	ssize_t ret;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
@@@ -161,9 -158,8 +158,8 @@@ static void blkdev_bio_end_io(struct bi
 }
 
 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-		unsigned int nr_pages)
+		struct block_device *bdev, unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
@@@ -172,9 -168,6 +168,6 @@@
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@@ -302,9 -295,9 +295,9 @@@ static void blkdev_bio_end_io_async(str
 
 static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 					struct iov_iter *iter,
+					struct block_device *bdev,
 					unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	bool is_read = iov_iter_rw(iter) == READ;
 	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	struct blkdev_dio *dio;
@@@ -312,9 -305,6 +305,6 @@@
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@@ -368,18 -358,23 +358,23 @@@
 
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
+	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
 
+	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+		return -EINVAL;
+
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
-			return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-		return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+			return __blkdev_direct_IO_simple(iocb, iter, bdev,
+							nr_pages);
+		return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
 	}
-	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+	return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
 }
 
 static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@@ -390,7 -385,7 +385,7 @@@
 
 	iomap->bdev = bdev;
 	iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
-	if (iomap->offset >= isize)
+	if (offset >= isize)
 		return -EIO;
 	iomap->type = IOMAP_MAPPED;
 	iomap->addr = iomap->offset;
@@@ -863,7 -858,6 +858,7 @@@ const struct file_operations def_blk_fo
 	.splice_read	= filemap_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
+	.fop_flags	= FOP_BUFFER_RASYNC,
 };
 
 static __init int blkdev_init(void)
diff --combined block/ioctl.c
index f505f9c341eb,d7a6c6931a1e..c7db3bd2d653
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@@ -33,7 -33,7 +33,7 @@@ static int blkpg_do_ioctl(struct block_
 	if (op == BLKPG_DEL_PARTITION)
 		return bdev_del_partition(disk, p.pno);
 
-	if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+	if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
 		return -EINVAL;
 	/* Check that the partition is aligned to the block size */
 	if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
@@@ -95,9 -95,12 +95,12 @@@ static int compat_blkpg_ioctl(struct bl
 static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 		unsigned long arg)
 {
-	uint64_t range[2];
-	uint64_t start, len, end;
+	unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
 	struct inode *inode = bdev->bd_inode;
-	uint64_t range[2], start, len;
++	uint64_t range[2], start, len, end;
+	struct bio *prev = NULL, *bio;
+	sector_t sector, nr_sects;
+	struct blk_plug plug;
 	int err;
 
 	if (!(mode & BLK_OPEN_WRITE))
@@@ -105,6 -108,8 +108,8 @@@
 	if (!bdev_max_discard_sectors(bdev))
 		return -EOPNOTSUPP;
+	if (bdev_read_only(bdev))
+		return -EPERM;
 
 	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 		return -EFAULT;
@@@ -112,20 -117,44 +117,45 @@@
 	start = range[0];
 	len = range[1];
 
-	if (start & 511)
+	if (!len)
 		return -EINVAL;
-	if (len & 511)
+	if ((start | len) & bs_mask)
 		return -EINVAL;
 
-	if (start + len > bdev_nr_bytes(bdev))
+	if (check_add_overflow(start, len, &end) ||
+	    end > bdev_nr_bytes(bdev))
 		return -EINVAL;
 
 	filemap_invalidate_lock(inode->i_mapping);
 	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
 	if (err)
 		goto fail;
-	err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+
+	sector = start >> SECTOR_SHIFT;
+	nr_sects = len >> SECTOR_SHIFT;
+
+	blk_start_plug(&plug);
+	while (1) {
+		if (fatal_signal_pending(current)) {
+			if (prev)
+				bio_await_chain(prev);
+			err = -EINTR;
+			goto out_unplug;
+		}
+		bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+				GFP_KERNEL);
+		if (!bio)
+			break;
+		prev = bio_chain_and_submit(prev, bio);
+	}
+	if (prev) {
+		err = submit_bio_wait(prev);
+		if (err == -EOPNOTSUPP)
+			err = 0;
+		bio_put(prev);
+	}
+out_unplug:
+	blk_finish_plug(&plug);
 fail:
 	filemap_invalidate_unlock(inode->i_mapping);
 	return err;
@@@ -563,8 -592,7 +593,8 @@@ static int blkdev_common_ioctl(struct b
 			return -EACCES;
 		if (bdev_is_partition(bdev))
 			return -EINVAL;
-		return disk_scan_partitions(bdev->bd_disk, mode);
+		return disk_scan_partitions(bdev->bd_disk,
+					    mode | BLK_OPEN_STRICT_SCAN);
 	case BLKTRACESTART:
 	case BLKTRACESTOP:
 	case BLKTRACETEARDOWN:
diff --combined drivers/block/null_blk/main.c
index ed33cf7192d2,f7b9078f6913..4005a8b685e8
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@@ -225,6 -225,10 +225,10 @@@ static unsigned long g_cache_size
 module_param_named(cache_size, g_cache_size, ulong, 0444);
 MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
 
+static bool g_fua = true;
+module_param_named(fua, g_fua, bool, 0444);
+MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true");
+
 static unsigned int g_mbps;
 module_param_named(mbps, g_mbps, uint, 0444);
 MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
@@@ -253,6 -257,11 +257,11 @@@ static unsigned int g_zone_max_active
 module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
 MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
 
+static int g_zone_append_max_sectors = INT_MAX;
+module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
+MODULE_PARM_DESC(zone_append_max_sectors,
+		 "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");
+
 static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
@@@ -436,10 -445,12 +445,12 @@@ NULLB_DEVICE_ATTR(zone_capacity, ulong
 NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
 NULLB_DEVICE_ATTR(no_sched, bool, NULL);
 NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
 NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+NULLB_DEVICE_ATTR(fua, bool, NULL);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@@ -580,12 -591,14 +591,14 @@@ static struct configfs_attribute *nullb
 	&nullb_device_attr_zone_nr_conv,
 	&nullb_device_attr_zone_max_open,
 	&nullb_device_attr_zone_max_active,
+	&nullb_device_attr_zone_append_max_sectors,
 	&nullb_device_attr_zone_readonly,
 	&nullb_device_attr_zone_offline,
 	&nullb_device_attr_virt_boundary,
 	&nullb_device_attr_no_sched,
 	&nullb_device_attr_shared_tags,
 	&nullb_device_attr_shared_tag_bitmap,
+	&nullb_device_attr_fua,
 	NULL,
 };
 
@@@ -664,14 -677,14 +677,14 @@@ nullb_group_drop_item(struct config_gro
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
 	return snprintf(page, PAGE_SIZE,
-			"badblocks,blocking,blocksize,cache_size,"
+			"badblocks,blocking,blocksize,cache_size,fua,"
 			"completion_nsec,discard,home_node,hw_queue_depth,"
 			"irqmode,max_sectors,mbps,memory_backed,no_sched,"
 			"poll_queues,power,queue_mode,shared_tag_bitmap,"
 			"shared_tags,size,submit_queues,use_per_node_hctx,"
 			"virt_boundary,zoned,zone_capacity,zone_max_active,"
 			"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
-			"zone_size\n");
+			"zone_size,zone_append_max_sectors\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@@ -751,10 -764,13 +764,13 @@@ static struct nullb_device *null_alloc_
 	dev->zone_nr_conv = g_zone_nr_conv;
 	dev->zone_max_open = g_zone_max_open;
 	dev->zone_max_active = g_zone_max_active;
+	dev->zone_append_max_sectors = g_zone_append_max_sectors;
 	dev->virt_boundary = g_virt_boundary;
 	dev->no_sched = g_no_sched;
 	dev->shared_tags = g_shared_tags;
 	dev->shared_tag_bitmap = g_shared_tag_bitmap;
+	dev->fua = g_fua;
+
 	return dev;
 }
 
@@@ -1151,7 -1167,7 +1167,7 @@@ blk_status_t null_handle_discard(struc
 	return BLK_STS_OK;
 }
 
-static int null_handle_flush(struct nullb *nullb)
+static blk_status_t null_handle_flush(struct nullb *nullb)
 {
 	int err;
 
@@@ -1168,7 -1184,7 +1184,7 @@@
 	WARN_ON(!radix_tree_empty(&nullb->dev->cache));
 	spin_unlock_irq(&nullb->lock);
-	return err;
+	return errno_to_blk_status(err);
 }
 
 static int null_transfer(struct nullb *nullb, struct page *page,
@@@ -1206,7 -1222,7 +1222,7 @@@ static int null_handle_rq(struct nullb_
 {
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
 	struct nullb *nullb = cmd->nq->dev->nullb;
-	int err;
+	int err = 0;
 	unsigned int len;
 	sector_t sector = blk_rq_pos(rq);
 	struct req_iterator iter;
@@@ -1218,15 -1234,13 +1234,13 @@@
 		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
 				    op_is_write(req_op(rq)), sector,
 				    rq->cmd_flags & REQ_FUA);
-		if (err) {
-			spin_unlock_irq(&nullb->lock);
-			return err;
-		}
+		if (err)
+			break;
 		sector += len >> SECTOR_SHIFT;
 	}
 	spin_unlock_irq(&nullb->lock);
 
-	return 0;
+	return errno_to_blk_status(err);
 }
 
 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@@ -1273,8 -1287,8 +1287,8 @@@ static inline blk_status_t null_handle_
 	if (op == REQ_OP_DISCARD)
 		return null_handle_discard(dev, sector, nr_sectors);
 
-	return errno_to_blk_status(null_handle_rq(cmd));
+	return null_handle_rq(cmd);
 }
 
 static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
@@@ -1343,7 -1357,7 +1357,7 @@@ static void null_handle_cmd(struct null
 	blk_status_t sts;
 
 	if (op == REQ_OP_FLUSH) {
-		cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+		cmd->error = null_handle_flush(nullb);
 		goto out;
 	}
 
@@@ -1912,7 -1926,7 +1926,7 @@@ static int null_add_dev(struct nullb_de
 
 	if (dev->cache_size > 0) {
 		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
-		blk_queue_write_cache(nullb->q, true, true);
+		blk_queue_write_cache(nullb->q, true, dev->fua);
 	}
 
 	nullb->q->queuedata = nullb;
@@@ -1965,10 -1979,10 +1979,10 @@@
 out_ida_free:
 	ida_free(&nullb_indexes, nullb->index);
-out_cleanup_zone:
-	null_free_zoned_dev(dev);
 out_cleanup_disk:
 	put_disk(nullb->disk);
+out_cleanup_zone:
+	null_free_zoned_dev(dev);
 out_cleanup_tags:
 	if (nullb->tag_set == &nullb->__tag_set)
 		blk_mq_free_tag_set(nullb->tag_set);
@@@ -2113,10 -2127,13 +2127,13 @@@ static void __exit null_exit(void
 
 	if (tag_set.ops)
 		blk_mq_free_tag_set(&tag_set);
+
+	mutex_destroy(&lock);
 }
 
 module_init(null_init);
 module_exit(null_exit);
 
 MODULE_AUTHOR("Jens Axboe ");
+MODULE_DESCRIPTION("multi queue aware block test driver");
 MODULE_LICENSE("GPL");
diff --combined drivers/block/ublk_drv.c
index 374e4efa8759,851c78913de2..176657dce3e3
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@@ -221,7 -221,7 +221,7 @@@ static int ublk_get_nr_zones(const stru
 
 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
 {
-	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+	return blk_revalidate_disk_zones(ub->ub_disk);
 }
 
 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
@@@ -249,8 -249,7 +249,7 @@@
 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
-	blk_queue_required_elevator_features(ub->ub_disk->queue,
-					     ELEVATOR_F_ZBD_SEQ_WRITE);
+
 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
 }
 
@@@ -2177,8 -2176,7 +2176,8 @@@ static int ublk_ctrl_start_dev(struct u
 		.max_hw_sectors		= p->max_sectors,
 		.chunk_sectors		= p->chunk_sectors,
 		.virt_boundary_mask	= p->virt_boundary_mask,
-
+		.max_segments		= USHRT_MAX,
+		.max_segment_size	= UINT_MAX,
 	};
 	struct gendisk *disk;
 	int ret = -EINVAL;
diff --combined drivers/md/dm.c
index 7d0746b37c8e,2369d10c8475..597dd7a25823
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@@ -765,7 -765,7 +765,7 @@@ static struct table_device *open_table_
 	return td;
 
 out_blkdev_put:
-	fput(bdev_file);
+	__fput_sync(bdev_file);
 out_free_td:
 	kfree(td);
 	return ERR_PTR(r);
@@@ -778,13 -778,7 +778,13 @@@ static void close_table_device(struct t
 {
 	if (md->disk->slave_dir)
 		bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
-	fput(td->dm_dev.bdev_file);
+
+	/* Leverage async fput() if DMF_DEFERRED_REMOVE set */
+	if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+		fput(td->dm_dev.bdev_file);
+	else
+		__fput_sync(td->dm_dev.bdev_file);
+
 	put_dax(td->dm_dev.dax_dev);
 	list_del(&td->list);
 	kfree(td);
@@@ -1428,25 -1422,12 +1428,12 @@@ static void __map_bio(struct bio *clone
 		down(&md->swap_bios_semaphore);
 	}
 
-	if (static_branch_unlikely(&zoned_enabled)) {
-		/*
-		 * Check if the IO needs a special mapping due to zone append
-		 * emulation on zoned target. In this case, dm_zone_map_bio()
-		 * calls the target map operation.
-		 */
-		if (unlikely(dm_emulate_zone_append(md)))
-			r = dm_zone_map_bio(tio);
-		else
-			goto do_map;
-	} else {
-do_map:
-		if (likely(ti->type->map == linear_map))
-			r = linear_map(ti, clone);
-		else if (ti->type->map == stripe_map)
-			r = stripe_map(ti, clone);
-		else
-			r = ti->type->map(ti, clone);
-	}
+	if (likely(ti->type->map == linear_map))
+		r = linear_map(ti, clone);
+	else if (ti->type->map == stripe_map)
+		r = stripe_map(ti, clone);
+	else
+		r = ti->type->map(ti, clone);
 
 	switch (r) {
 	case DM_MAPIO_SUBMITTED:
@@@ -1774,6 -1755,33 +1761,33 @@@ static void init_clone_info(struct clon
 	ci->sector_count = 0;
 }
 
+#ifdef CONFIG_BLK_DEV_ZONED
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+					   struct bio *bio)
+{
+	/*
+	 * For mapped device that need zone append emulation, we must
+	 * split any large BIO that straddles zone boundaries.
+	 */
+	return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+		!bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+	return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+}
+#else
+static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+					   struct bio *bio)
+{
+	return false;
+}
+
+static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+{
+	return false;
+}
+#endif
+
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
@@@ -1783,19 -1791,32 +1797,32 @@@ static void dm_split_and_process_bio(st
 	struct clone_info ci;
 	struct dm_io *io;
 	blk_status_t error = BLK_STS_OK;
-	bool is_abnormal;
+	bool is_abnormal, need_split;
+
+	need_split = is_abnormal = is_abnormal_io(bio);
+	if (static_branch_unlikely(&zoned_enabled))
+		need_split = is_abnormal || dm_zone_bio_needs_split(md, bio);
 
-	is_abnormal = is_abnormal_io(bio);
-	if (unlikely(is_abnormal)) {
+	if (unlikely(need_split)) {
 		/*
 		 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
 		 * otherwise associated queue_limits won't be imposed.
+		 * Also split the BIO for mapped devices needing zone append
+		 * emulation to ensure that the BIO does not cross zone
+		 * boundaries.
 		 */
 		bio = bio_split_to_limits(bio);
 		if (!bio)
 			return;
 	}
 
+	/*
+	 * Use the block layer zone write plugging for mapped devices that
+	 * need zone append emulation (e.g. dm-crypt).
+	 */
+	if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+		return;
+
 	/* Only support nowait for normal IO */
 	if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
 		io = alloc_io(md, bio, GFP_NOWAIT);
@@@ -2016,7 -2037,6 +2043,6 @@@ static void cleanup_mapped_device(struc
 		md->dax_dev = NULL;
 	}
 
-	dm_cleanup_zoned_dev(md);
 	if (md->disk) {
 		spin_lock(&_minor_lock);
 		md->disk->private_data = NULL;
diff --combined drivers/nvme/host/core.c
index 095f59e7aa93,c9955ecd1790..bf7615cb36ee
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@@ -628,6 -628,27 +628,6 @@@ bool nvme_change_ctrl_state(struct nvme
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 
-/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
-	switch (nvme_ctrl_state(ctrl)) {
-	case NVME_CTRL_NEW:
-	case NVME_CTRL_LIVE:
-	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_CONNECTING:
-		return false;
-	case NVME_CTRL_DELETING:
-	case NVME_CTRL_DELETING_NOIO:
-	case NVME_CTRL_DEAD:
-		return true;
-	default:
-		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
-		return true;
-	}
-}
-
 /*
  * Waits for the controller state to be resetting, or returns false if it is
  * not possible to ever transition to that state.
@@@ -2055,7 -2076,6 +2055,7 @@@ static int nvme_update_ns_info_block(st
 	bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
 	struct queue_limits lim;
 	struct nvme_id_ns_nvm *nvm = NULL;
+	struct nvme_zone_info zi = {};
 	struct nvme_id_ns *id;
 	sector_t capacity;
 	unsigned lbaf;
@@@ -2068,10 -2088,9 +2068,10 @@@
 	if (id->ncap == 0) {
 		/* namespace not allocated or attached */
 		info->is_removed = true;
-		ret = -ENODEV;
+		ret = -ENXIO;
 		goto out;
 	}
+	lbaf = nvme_lbaf_index(id->flbas);
 
 	if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
 		ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
@@@ -2079,14 -2098,8 +2079,14 @@@
 			goto out;
 	}
 
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+	    ns->head->ids.csi == NVME_CSI_ZNS) {
+		ret = nvme_query_zone_info(ns, lbaf, &zi);
+		if (ret < 0)
+			goto out;
+	}
+
 	blk_mq_freeze_queue(ns->disk->queue);
-	lbaf = nvme_lbaf_index(id->flbas);
 	ns->head->lba_shift = id->lbaf[lbaf].ds;
 	ns->head->nuse = le64_to_cpu(id->nuse);
 	capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
@@@ -2099,8 -2112,13 +2099,8 @@@
 		capacity = 0;
 	nvme_config_discard(ns, &lim);
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-	    ns->head->ids.csi == NVME_CSI_ZNS) {
-		ret = nvme_update_zone_info(ns, lbaf, &lim);
-		if (ret) {
-			blk_mq_unfreeze_queue(ns->disk->queue);
-			goto out;
-		}
-	}
+	    ns->head->ids.csi == NVME_CSI_ZNS)
+		nvme_update_zone_info(ns, &lim, &zi);
 	ret = queue_limits_commit_update(ns->disk->queue, &lim);
 	if (ret) {
 		blk_mq_unfreeze_queue(ns->disk->queue);
@@@ -2132,7 -2150,7 +2132,7 @@@
 	blk_mq_unfreeze_queue(ns->disk->queue);
 
 	if (blk_queue_is_zoned(ns->queue)) {
-		ret = blk_revalidate_disk_zones(ns->disk, NULL);
+		ret = blk_revalidate_disk_zones(ns->disk);
 		if (ret && !nvme_first_scan(ns->disk))
 			goto out;
 	}
@@@ -2183,7 -2201,6 +2183,7 @@@ static int nvme_update_ns_info(struct n
 	}
 
 	if (!ret && nvme_ns_head_multipath(ns->head)) {
+		struct queue_limits *ns_lim = &ns->disk->queue->limits;
 		struct queue_limits lim;
 
 		blk_mq_freeze_queue(ns->head->disk->queue);
@@@ -2195,26 -2212,7 +2195,26 @@@
 		set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
 		nvme_mpath_revalidate_paths(ns);
 
+		/*
+		 * queue_limits mixes values that are the hardware limitations
+		 * for bio splitting with what is the device configuration.
+		 *
+		 * For NVMe the device configuration can change after e.g. a
+		 * Format command, and we really want to pick up the new format
+		 * value here.  But we must still stack the queue limits to the
+		 * least common denominator for multipathing to split the bios
+		 * properly.
+		 *
+		 * To work around this, we explicitly set the device
+		 * configuration to those that we just queried, but only stack
+		 * the splitting limits in to make sure we still obey possibly
+		 * lower limitations of other controllers.
+		 */
 		lim = queue_limits_start_update(ns->head->disk->queue);
+		lim.logical_block_size = ns_lim->logical_block_size;
+		lim.physical_block_size = ns_lim->physical_block_size;
+		lim.io_min = ns_lim->io_min;
+		lim.io_opt = ns_lim->io_opt;
 		queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
 					ns->head->disk->disk_name);
 		ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
@@@ -3660,7 -3658,7 +3660,7 @@@ static int nvme_init_ns_head(struct nvm
 			 "Found shared namespace %d, but multipathing not supported.\n",
 			 info->nsid);
 		dev_warn_once(ctrl->device,
-			"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
+			"Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
 	}
 }
 
diff --combined drivers/scsi/scsi_lib.c
index 5b3230ef51fe,9ca96116bd33..967b6d62bb37
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@@ -635,9 -635,10 +635,9 @@@ static bool scsi_end_request(struct req
 	if (blk_queue_add_random(q))
 		add_disk_randomness(req->q->disk);
 
-	if (!blk_rq_is_passthrough(req)) {
-		WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
-		cmd->flags &= ~SCMD_INITIALIZED;
-	}
+	WARN_ON_ONCE(!blk_rq_is_passthrough(req) &&
+		     !(cmd->flags & SCMD_INITIALIZED));
+	cmd->flags = 0;
 
 	/*
 	 * Calling rcu_barrier() is not necessary here because the
@@@ -1869,7 -1870,6 +1869,6 @@@ out_put_budget
 	case BLK_STS_OK:
 		break;
 	case BLK_STS_RESOURCE:
-	case BLK_STS_ZONE_RESOURCE:
 		if (scsi_device_blocked(sdev))
 			ret = BLK_STS_DEV_RESOURCE;
 		break;
diff --combined drivers/scsi/sd.c
index 65cdc8b77e35,dcba9530ffa5..64c5129044b3
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@@ -1260,12 -1260,6 +1260,6 @@@ static blk_status_t sd_setup_read_write
 		}
 	}
 
-	if (req_op(rq) == REQ_OP_ZONE_APPEND) {
-		ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
-		if (ret)
-			goto fail;
-	}
-
 	fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
 	dix = scsi_prot_sg_count(cmd);
 	dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
@@@ -1348,7 -1342,6 +1342,6 @@@ static blk_status_t sd_init_command(str
 		return sd_setup_flush_cmnd(cmd);
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
-	case REQ_OP_ZONE_APPEND:
 		return sd_setup_read_write_cmnd(cmd);
 	case REQ_OP_ZONE_RESET:
 		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
@@@ -3120,7 -3113,6 +3113,7 @@@ static void sd_read_io_hints(struct scs
 {
 	struct scsi_device *sdp = sdkp->device;
 	const struct scsi_io_group_descriptor *desc, *start, *end;
+	u16 permanent_stream_count_old;
 	struct scsi_sense_hdr sshdr;
 	struct scsi_mode_data data;
 	int res;
@@@ -3141,13 -3133,12 +3134,13 @@@
 	for (desc = start; desc < end; desc++)
 		if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start))
 			break;
+	permanent_stream_count_old = sdkp->permanent_stream_count;
 	sdkp->permanent_stream_count = desc - start;
 	if (sdkp->rscs && sdkp->permanent_stream_count < 2)
 		sd_printk(KERN_INFO, sdkp,
 			  "Unexpected: RSCS has been set and the permanent stream count is %u\n",
 			  sdkp->permanent_stream_count);
-	else if (sdkp->permanent_stream_count)
+	else if (sdkp->permanent_stream_count != permanent_stream_count_old)
 		sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n",
 			  sdkp->permanent_stream_count);
 }
@@@ -3922,7 -3913,7 +3915,7 @@@ static int sd_probe(struct device *dev
 	error = device_add_disk(dev, gd, NULL);
 	if (error) {
-		put_device(&sdkp->disk_dev);
+		device_unregister(&sdkp->disk_dev);
 		put_disk(gd);
 		goto out;
 	}
@@@ -3981,7 -3972,6 +3974,6 @@@ static void scsi_disk_release(struct de
 	struct scsi_disk *sdkp = to_scsi_disk(dev);
 
 	ida_free(&sd_index_ida, sdkp->index);
-	sd_zbc_free_zone_info(sdkp);
 	put_device(&sdkp->device->sdev_gendev);
 	free_opal_dev(sdkp->opal_dev);
 
diff --combined include/linux/blk_types.h
index c3e098b21c16,5751292fee6a..25dbf1097085
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@@ -88,9 -88,15 +88,9 @@@ struct block_device
 
 /*
  * Block error status values.  See block/blk-core:blk_errors for the details.
- * Alpha cannot write a byte atomically, so we need to use 32-bit value.
  */
-#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
-typedef u32 __bitwise blk_status_t;
-typedef u32 blk_short_t;
-#else
 typedef u8 __bitwise blk_status_t;
 typedef u16 blk_short_t;
-#endif
 #define	BLK_STS_OK 0
 #define BLK_STS_NOTSUPP		((__force blk_status_t)1)
 #define BLK_STS_TIMEOUT		((__force blk_status_t)2)
@@@ -130,18 -136,6 +130,6 @@@
  */
 #define BLK_STS_DEV_RESOURCE	((__force blk_status_t)13)
 
-/*
- * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
- * related resources are unavailable, but the driver can guarantee the queue
- * will be rerun in the future once the resources become available again.
- *
- * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
- * a zone specific resource and IO to a different zone on the same device could
- * still be served. Examples of that are zones that are write-locked, but a read
- * to the same zone could be served.
- */
-#define BLK_STS_ZONE_RESOURCE	((__force blk_status_t)14)
-
 /*
  * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
  * path if the device returns a status indicating that too many zone resources
@@@ -149,7 -143,7 +137,7 @@@
  * after the number of open zones decreases below the device's limits, which is
  * reported in the request_queue's max_open_zones.
  */
-#define BLK_STS_ZONE_OPEN_RESOURCE	((__force blk_status_t)15)
+#define BLK_STS_ZONE_OPEN_RESOURCE	((__force blk_status_t)14)
 
 /*
  * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
@@@ -158,20 -152,20 +146,20 @@@
  * after the number of active zones decreases below the device's limits, which
  * is reported in the request_queue's max_active_zones.
  */
-#define BLK_STS_ZONE_ACTIVE_RESOURCE	((__force blk_status_t)16)
+#define BLK_STS_ZONE_ACTIVE_RESOURCE	((__force blk_status_t)15)
 
 /*
  * BLK_STS_OFFLINE is returned from the driver when the target device is offline
  * or is being taken offline. This could help differentiate the case where a
  * device is intentionally being shut down from a real I/O error.
  */
-#define BLK_STS_OFFLINE		((__force blk_status_t)17)
+#define BLK_STS_OFFLINE		((__force blk_status_t)16)
 
 /*
  * BLK_STS_DURATION_LIMIT is returned from the driver when the target device
  * aborted the command because it exceeded one of its Command Duration Limits.
  */
-#define BLK_STS_DURATION_LIMIT	((__force blk_status_t)18)
+#define BLK_STS_DURATION_LIMIT	((__force blk_status_t)17)
 
 /**
  * blk_path_error - returns true if error may be path related
@@@ -228,7 -222,12 +216,12 @@@ struct bio
 
 	struct bvec_iter	bi_iter;
 
-	blk_qc_t		bi_cookie;
+	union {
+		/* for polled bios: */
+		blk_qc_t		bi_cookie;
+		/* for plugged zoned writes only: */
+		unsigned int		__bi_nr_segments;
+	};
 	bio_end_io_t		*bi_end_io;
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
@@@ -298,7 -297,8 +291,8 @@@ enum
 	BIO_QOS_THROTTLED,	/* bio went through rq_qos throttle path */
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
-	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
 	BIO_FLAG_LAST
 };
 
diff --combined include/linux/blkdev.h
index 69e7da33ca49,26acf80c50c0..69c4f113db42
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -128,8 -128,6 +128,8 @@@ typedef unsigned int __bitwise blk_mode
 #define BLK_OPEN_WRITE_IOCTL	((__force blk_mode_t)(1 << 4))
 /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
 #define BLK_OPEN_RESTRICT_WRITES	((__force blk_mode_t)(1 << 5))
+/* return partition scanning errors */
+#define BLK_OPEN_STRICT_SCAN	((__force blk_mode_t)(1 << 6))
 
 struct gendisk {
 	/*
@@@ -179,22 -177,21 +179,21 @@@
 #ifdef CONFIG_BLK_DEV_ZONED
 	/*
-	 * Zoned block device information for request dispatch control.
-	 * nr_zones is the total number of zones of the device. This is always
-	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
-	 * bits which indicates if a zone is conventional (bit set) or
-	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
-	 * bits which indicates if a zone is write locked, that is, if a write
-	 * request targeting the zone was dispatched.
-	 *
-	 * Reads of this information must be protected with blk_queue_enter() /
-	 * blk_queue_exit(). Modifying this information is only allowed while
-	 * no requests are being processed. See also blk_mq_freeze_queue() and
-	 * blk_mq_unfreeze_queue().
+	 * Zoned block device information. Reads of this information must be
+	 * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+	 * information is only allowed while no requests are being processed.
+	 * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
 	 */
 	unsigned int		nr_zones;
+	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
-	unsigned long		*seq_zones_wlock;
+	unsigned int		zone_wplugs_hash_bits;
+	spinlock_t		zone_wplugs_lock;
+	struct mempool_s	*zone_wplugs_pool;
+	struct hlist_head	*zone_wplugs_hash;
+	struct list_head	zone_wplugs_err_list;
+	struct work_struct	zone_wplugs_work;
+	struct workqueue_struct *zone_wplugs_wq;
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 #if IS_ENABLED(CONFIG_CDROM)
@@@ -233,6 -230,19 +232,19 @@@ static inline unsigned int disk_openers
 	return atomic_read(&disk->part0->bd_openers);
 }
 
+/**
+ * disk_has_partscan - return %true if partition scanning is enabled on a disk
+ * @disk: disk to check
+ *
+ * Returns %true if partitions scanning is enabled for @disk, or %false if
+ * partition scanning is disabled either permanently or temporarily.
+ */
+static inline bool disk_has_partscan(struct gendisk *disk)
+{
+	return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
+		!test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+}
+
 /*
  * The gendisk is refcounted by the part0 block_device, and the bd_device
  * therein is also used for device model presentation in sysfs.
@@@ -331,8 -341,7 +343,7 @@@ int blkdev_report_zones(struct block_de
 		unsigned int nr_zones, report_zones_cb cb, void *data);
 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 		sector_t sectors, sector_t nr_sectors);
-int blk_revalidate_disk_zones(struct gendisk *disk,
-		void (*update_driver_data)(struct gendisk *disk));
+int blk_revalidate_disk_zones(struct gendisk *disk);
 
 /*
  * Independent access ranges: struct blk_independent_access_range describes
@@@ -449,8 -458,6 +460,6 @@@ struct request_queue
 
 	atomic_t		nr_active_requests_shared_tags;
 
-	unsigned int		required_elevator_features;
-
 	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;
@@@ -633,15 -640,6 +642,6 @@@ static inline unsigned int disk_zone_no
 	return sector >> ilog2(disk->queue->limits.chunk_sectors);
 }
 
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
-	if (!blk_queue_is_zoned(disk->queue))
-		return false;
-	if (!disk->conv_zones_bitmap)
-		return true;
-	return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
 static inline void disk_set_max_open_zones(struct gendisk *disk,
 		unsigned int max_open_zones)
 {
@@@ -664,6 -662,7 +664,7 @@@ static inline unsigned int bdev_max_act
 	return bdev->bd_disk->queue->limits.max_active_zones;
 }
 
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int bdev_nr_zones(struct block_device *bdev)
 {
@@@ -674,10 -673,6 +675,6 @@@ static inline unsigned int disk_nr_zone
 {
 	return 0;
 }
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
-	return false;
-}
 static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
 {
 	return 0;
@@@ -691,6 -686,10 +688,10 @@@ static inline unsigned int bdev_max_act
 {
 	return 0;
 }
+static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+	return false;
+}
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 static inline unsigned int blk_queue_depth(struct request_queue *q)
@@@ -855,9 -854,11 +856,11 @@@ static inline unsigned int bio_zone_no(
 	return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
 }
 
-static inline unsigned int bio_zone_is_seq(struct bio *bio)
+static inline bool bio_straddles_zones(struct bio *bio)
 {
-	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
+	return bio_sectors(bio) &&
+		bio_zone_no(bio) !=
+		disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
 }
 
 /*
@@@ -942,14 -943,6 +945,6 @@@ disk_alloc_independent_access_ranges(st
 void disk_set_independent_access_ranges(struct gendisk *disk,
 				struct blk_independent_access_ranges *iars);
 
-/*
- * Elevator features for blk_queue_required_elevator_features:
- */
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
-
-extern void blk_queue_required_elevator_features(struct request_queue *q,
-						 unsigned int features);
 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 					      struct device *dev);
 
@@@ -1156,12 -1149,29 +1151,29 @@@ static inline unsigned int queue_max_se
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
 {
+	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-	const struct queue_limits *l = &q->limits;
 
+	return min_not_zero(l->max_zone_append_sectors, max_sectors);
+}
+
+static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
+{
+	if (!blk_queue_is_zoned(q))
+		return 0;
+
+	return queue_limits_max_zone_append_sectors(&q->limits);
+}
+
+static inline bool queue_emulates_zone_append(struct request_queue *q)
+{
+	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+}
 
-	return min(l->max_zone_append_sectors, l->max_sectors);
+static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+{
+	return queue_emulates_zone_append(bdev_get_queue(bdev));
 }
 
 static inline unsigned int
@@@ -1303,18 -1313,6 +1315,6 @@@ static inline unsigned int bdev_zone_no
 	return disk_zone_no(bdev->bd_disk, sec);
 }
 
-/* Whether write serialization is required for @op on zoned devices. */
-static inline bool op_needs_zoned_write_locking(enum req_op op)
-{
-	return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
-}
-
-static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
-					  enum req_op op)
-{
-	return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
-}
-
 static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
@@@ -1330,6 -1328,12 +1330,12 @@@ static inline sector_t bdev_offset_from
 	return sector & (bdev_zone_sectors(bdev) - 1);
 }
 
+static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+{
+	return bdev_offset_from_zone_start(bio->bi_bdev,
+					   bio->bi_iter.bi_sector);
+}
+
 static inline bool bdev_is_zone_start(struct block_device *bdev,
 				      sector_t sector)
 {
@@@ -1507,6 -1511,16 +1513,6 @@@ struct blk_holder_ops
 	 * Thaw the file system mounted on the block device.
 	 */
 	int (*thaw)(struct block_device *bdev);
-
-	/*
-	 * If needed, get a reference to the holder.
-	 */
-	void (*get_holder)(void *holder);
-
-	/*
-	 * Release the holder.
-	 */
-	void (*put_holder)(void *holder);
 };
 
 /*
@@@ -1577,7 -1591,6 +1583,7 @@@ static inline int early_lookup_bdev(con
 
 int bdev_freeze(struct block_device *bdev);
 int bdev_thaw(struct block_device *bdev);
+void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {
 	struct request *req_list;