/* as long as there are pending bios, @blkg can't go away */
spin_lock(&blkg->async_bio_lock);
- bio_list_merge(&bios, &blkg->async_bios);
- bio_list_init(&blkg->async_bios);
+ bio_list_merge_init(&bios, &blkg->async_bios);
spin_unlock(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
return 0;
}
+void blkg_init_queue(struct request_queue *q)
+{
+ INIT_LIST_HEAD(&q->blkg_list);
+ mutex_init(&q->blkcg_mutex);
+}
+
int blkcg_init_disk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
bool preloaded;
int ret;
- INIT_LIST_HEAD(&q->blkg_list);
- mutex_init(&q->blkcg_mutex);
-
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
if (ret)
goto err_destroy_all;
- ret = blk_throtl_init(disk);
- if (ret)
- goto err_ioprio_exit;
-
return 0;
- err_ioprio_exit:
- blk_ioprio_exit(disk);
err_destroy_all:
blkg_destroy_all(disk);
return ret;
init_waitqueue_head(&q->mq_freeze_wq);
mutex_init(&q->mq_freeze_lock);
+ blkg_init_queue(q);
+
/*
* Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details.
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
- !bio_zone_is_seq(bio))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
- if (nr_sectors > q->limits.max_zone_append_sectors)
+ if (nr_sectors > queue_max_zone_append_sectors(q))
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
static void __submit_bio_noacct(struct bio *bio)
{
struct bio_list bio_list_on_stack[2];
+ struct blk_plug plug;
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
+ blk_start_plug(&plug);
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+ blk_finish_plug(&plug);
current->bio_list = NULL;
}
static void __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
+ struct blk_plug plug;
current->bio_list = bio_list;
+ blk_start_plug(&plug);
do {
__submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
+ blk_finish_plug(&plug);
current->bio_list = NULL;
}
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
- /*
- * As the requests that require a zone lock are not plugged in the
- * first place, directly accessing the plug instead of using
- * blk_mq_plug() should not have any consequences during flushing for
- * zoned devices.
- */
blk_flush_plug(current->plug, false);
/*
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
- if (unlikely(time_after(now, stamp))) {
- if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
- __part_stat_add(part, io_ticks, end ? now - stamp : 1);
- }
+ if (unlikely(time_after(now, stamp)) &&
+ likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
+ (end || part_in_flight(part)))
+ __part_stat_add(part, io_ticks, now - stamp);
+
if (part->bd_partno) {
part = bdev_whole(part);
goto again;
if (unlikely(!rq_list_empty(plug->cached_rq)))
blk_mq_free_plug_rqs(plug);
+ plug->cur_ktime = 0;
current->flags &= ~PF_BLOCK_TS;
}
return -EINVAL;
/*
- * Devices that require a virtual boundary do not support scatter/gather
- * I/O natively, but instead require a descriptor list entry for each
- * page (which might not be identical to the Linux PAGE_SIZE). Because
- * of that they are not limited by our notion of "segment size".
+ * Stacking device may have both virtual boundary and max segment
+ * size limit, so allow this setting now, and long-term the two
+ * might need to move out of stacking limits since we have immutable
+ * bvec and lower layer bio splitting is supposed to handle the two
+ * correctly.
*/
if (lim->virt_boundary_mask) {
- if (WARN_ON_ONCE(lim->max_segment_size &&
- lim->max_segment_size != UINT_MAX))
- return -EINVAL;
- lim->max_segment_size = UINT_MAX;
+ if (!lim->max_segment_size)
+ lim->max_segment_size = UINT_MAX;
} else {
/*
* The maximum segment size has an odd historic 64k default that
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
* @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands. If
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
**/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors)
{
- unsigned int max_sectors;
+ unsigned int max_sectors = 0;
if (WARN_ON(!blk_queue_is_zoned(q)))
return;
- max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
- max_sectors = min(q->limits.chunk_sectors, max_sectors);
+ if (max_zone_append_sectors) {
+ max_sectors = min(q->limits.max_hw_sectors,
+ max_zone_append_sectors);
+ max_sectors = min(q->limits.chunk_sectors, max_sectors);
- /*
- * Signal eventual driver bugs resulting in the max_zone_append sectors limit
- * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
- * or the max_hw_sectors limit not set.
- */
- WARN_ON(!max_sectors);
+ /*
+ * Signal eventual driver bugs resulting in the max_zone_append
+ * sectors limit being 0 due to the chunk_sectors limit (zone
+ * size) not set or the max_hw_sectors limit not set.
+ */
+ WARN_ON_ONCE(!max_sectors);
+ }
q->limits.max_zone_append_sectors = max_sectors;
}
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
- t->max_zone_append_sectors = min(t->max_zone_append_sectors,
- b->max_zone_append_sectors);
+ t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+ queue_limits_max_zone_append_sectors(b));
t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
- /**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q: the request queue for the target device
- * @features: Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
- void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features)
- {
- q->required_elevator_features = features;
- }
- EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
/**
* blk_queue_can_use_dma_map_merging - configure queue for merging segments.
* @q: the request queue for the device
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, struct block_device *bdev,
+ unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ struct block_device *bdev, unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
+ struct block_device *bdev,
unsigned int nr_pages)
{
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+ return -EINVAL;
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, bdev,
+ nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
}
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->bdev = bdev;
iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
- if (iomap->offset >= isize)
+ if (offset >= isize)
return -EIO;
iomap->type = IOMAP_MAPPED;
iomap->addr = iomap->offset;
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
+ .fop_flags = FOP_BUFFER_RASYNC,
};
static __init int blkdev_init(void)
if (op == BLKPG_DEL_PARTITION)
return bdev_del_partition(disk, p.pno);
- if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+ if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
return -EINVAL;
/* Check that the partition is aligned to the block size */
if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
- uint64_t range[2];
- uint64_t start, len, end;
+ unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
struct inode *inode = bdev->bd_inode;
- uint64_t range[2], start, len;
++ uint64_t range[2], start, len, end;
+ struct bio *prev = NULL, *bio;
+ sector_t sector, nr_sects;
+ struct blk_plug plug;
int err;
if (!(mode & BLK_OPEN_WRITE))
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
+ if (bdev_read_only(bdev))
+ return -EPERM;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
start = range[0];
len = range[1];
- if (start & 511)
+ if (!len)
return -EINVAL;
- if (len & 511)
+ if ((start | len) & bs_mask)
return -EINVAL;
- if (start + len > bdev_nr_bytes(bdev))
+ if (check_add_overflow(start, len, &end) ||
+ end > bdev_nr_bytes(bdev))
return -EINVAL;
filemap_invalidate_lock(inode->i_mapping);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
goto fail;
- err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+
+ sector = start >> SECTOR_SHIFT;
+ nr_sects = len >> SECTOR_SHIFT;
+
+ blk_start_plug(&plug);
+ while (1) {
+ if (fatal_signal_pending(current)) {
+ if (prev)
+ bio_await_chain(prev);
+ err = -EINTR;
+ goto out_unplug;
+ }
+ bio = blk_alloc_discard_bio(bdev, §or, &nr_sects,
+ GFP_KERNEL);
+ if (!bio)
+ break;
+ prev = bio_chain_and_submit(prev, bio);
+ }
+ if (prev) {
+ err = submit_bio_wait(prev);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ bio_put(prev);
+ }
+ out_unplug:
+ blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(inode->i_mapping);
return err;
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
- return disk_scan_partitions(bdev->bd_disk, mode);
+ return disk_scan_partitions(bdev->bd_disk,
+ mode | BLK_OPEN_STRICT_SCAN);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
module_param_named(cache_size, g_cache_size, ulong, 0444);
MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+ static bool g_fua = true;
+ module_param_named(fua, g_fua, bool, 0444);
+ MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true");
+
static unsigned int g_mbps;
module_param_named(mbps, g_mbps, uint, 0444);
MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
+ static int g_zone_append_max_sectors = INT_MAX;
+ module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444);
+ MODULE_PARM_DESC(zone_append_max_sectors,
+ "Maximum size of a zone append command (in 512B sectors). Specify 0 for zone append emulation");
+
static struct nullb_device *null_alloc_dev(void);
static void null_free_dev(struct nullb_device *dev);
static void null_del_dev(struct nullb *nullb);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+ NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+ NULLB_DEVICE_ATTR(fua, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
{
&nullb_device_attr_zone_nr_conv,
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
+ &nullb_device_attr_zone_append_max_sectors,
&nullb_device_attr_zone_readonly,
&nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tags,
&nullb_device_attr_shared_tag_bitmap,
+ &nullb_device_attr_fua,
NULL,
};
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
return snprintf(page, PAGE_SIZE,
- "badblocks,blocking,blocksize,cache_size,"
+ "badblocks,blocking,blocksize,cache_size,fua,"
"completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
"poll_queues,power,queue_mode,shared_tag_bitmap,"
"shared_tags,size,submit_queues,use_per_node_hctx,"
"virt_boundary,zoned,zone_capacity,zone_max_active,"
"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
- "zone_size\n");
+ "zone_size,zone_append_max_sectors\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
dev->zone_nr_conv = g_zone_nr_conv;
dev->zone_max_open = g_zone_max_open;
dev->zone_max_active = g_zone_max_active;
+ dev->zone_append_max_sectors = g_zone_append_max_sectors;
dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched;
dev->shared_tags = g_shared_tags;
dev->shared_tag_bitmap = g_shared_tag_bitmap;
+ dev->fua = g_fua;
+
return dev;
}
return BLK_STS_OK;
}
- static int null_handle_flush(struct nullb *nullb)
+ static blk_status_t null_handle_flush(struct nullb *nullb)
{
int err;
WARN_ON(!radix_tree_empty(&nullb->dev->cache));
spin_unlock_irq(&nullb->lock);
- return err;
+ return errno_to_blk_status(err);
}
static int null_transfer(struct nullb *nullb, struct page *page,
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
- int err;
+ int err = 0;
unsigned int len;
sector_t sector = blk_rq_pos(rq);
struct req_iterator iter;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(req_op(rq)), sector,
rq->cmd_flags & REQ_FUA);
- if (err) {
- spin_unlock_irq(&nullb->lock);
- return err;
- }
+ if (err)
+ break;
sector += len >> SECTOR_SHIFT;
}
spin_unlock_irq(&nullb->lock);
- return 0;
+ return errno_to_blk_status(err);
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors);
- return errno_to_blk_status(null_handle_rq(cmd));
+ return null_handle_rq(cmd);
}
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
blk_status_t sts;
if (op == REQ_OP_FLUSH) {
- cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+ cmd->error = null_handle_flush(nullb);
goto out;
}
if (dev->cache_size > 0) {
set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
- blk_queue_write_cache(nullb->q, true, true);
+ blk_queue_write_cache(nullb->q, true, dev->fua);
}
nullb->q->queuedata = nullb;
out_ida_free:
ida_free(&nullb_indexes, nullb->index);
-out_cleanup_zone:
- null_free_zoned_dev(dev);
out_cleanup_disk:
put_disk(nullb->disk);
+out_cleanup_zone:
+ null_free_zoned_dev(dev);
out_cleanup_tags:
if (nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
if (tag_set.ops)
blk_mq_free_tag_set(&tag_set);
+
+ mutex_destroy(&lock);
}
module_init(null_init);
module_exit(null_exit);
+ MODULE_DESCRIPTION("multi queue aware block test driver");
MODULE_LICENSE("GPL");
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
- return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+ return blk_revalidate_disk_zones(ub->ub_disk);
}
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
- blk_queue_required_elevator_features(ub->ub_disk->queue,
- ELEVATOR_F_ZBD_SEQ_WRITE);
+
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}
.max_hw_sectors = p->max_sectors,
.chunk_sectors = p->chunk_sectors,
.virt_boundary_mask = p->virt_boundary_mask,
-
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
};
struct gendisk *disk;
int ret = -EINVAL;
return td;
out_blkdev_put:
- fput(bdev_file);
+ __fput_sync(bdev_file);
out_free_td:
kfree(td);
return ERR_PTR(r);
{
if (md->disk->slave_dir)
bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
- fput(td->dm_dev.bdev_file);
+
+ /* Leverage async fput() if DMF_DEFERRED_REMOVE set */
+ if (unlikely(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+ fput(td->dm_dev.bdev_file);
+ else
+ __fput_sync(td->dm_dev.bdev_file);
+
put_dax(td->dm_dev.dax_dev);
list_del(&td->list);
kfree(td);
down(&md->swap_bios_semaphore);
}
- if (static_branch_unlikely(&zoned_enabled)) {
- /*
- * Check if the IO needs a special mapping due to zone append
- * emulation on zoned target. In this case, dm_zone_map_bio()
- * calls the target map operation.
- */
- if (unlikely(dm_emulate_zone_append(md)))
- r = dm_zone_map_bio(tio);
- else
- goto do_map;
- } else {
- do_map:
- if (likely(ti->type->map == linear_map))
- r = linear_map(ti, clone);
- else if (ti->type->map == stripe_map)
- r = stripe_map(ti, clone);
- else
- r = ti->type->map(ti, clone);
- }
+ if (likely(ti->type->map == linear_map))
+ r = linear_map(ti, clone);
+ else if (ti->type->map == stripe_map)
+ r = stripe_map(ti, clone);
+ else
+ r = ti->type->map(ti, clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
ci->sector_count = 0;
}
+ #ifdef CONFIG_BLK_DEV_ZONED
+ static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+ {
+ /*
+ * For mapped device that need zone append emulation, we must
+ * split any large BIO that straddles zone boundaries.
+ */
+ return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
+ !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+ }
+ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+ {
+ return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+ }
+ #else
+ static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
+ struct bio *bio)
+ {
+ return false;
+ }
+ static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
+ {
+ return false;
+ }
+ #endif
+
/*
* Entry point to split a bio into clones and submit them to the targets.
*/
struct clone_info ci;
struct dm_io *io;
blk_status_t error = BLK_STS_OK;
- bool is_abnormal;
+ bool is_abnormal, need_split;
+
+ need_split = is_abnormal = is_abnormal_io(bio);
+ if (static_branch_unlikely(&zoned_enabled))
+ need_split = is_abnormal || dm_zone_bio_needs_split(md, bio);
- is_abnormal = is_abnormal_io(bio);
- if (unlikely(is_abnormal)) {
+ if (unlikely(need_split)) {
/*
* Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
* otherwise associated queue_limits won't be imposed.
+ * Also split the BIO for mapped devices needing zone append
+ * emulation to ensure that the BIO does not cross zone
+ * boundaries.
*/
bio = bio_split_to_limits(bio);
if (!bio)
return;
}
+ /*
+ * Use the block layer zone write plugging for mapped devices that
+ * need zone append emulation (e.g. dm-crypt).
+ */
+ if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
+ return;
+
/* Only support nowait for normal IO */
if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
io = alloc_io(md, bio, GFP_NOWAIT);
md->dax_dev = NULL;
}
- dm_cleanup_zoned_dev(md);
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
-/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
- switch (nvme_ctrl_state(ctrl)) {
- case NVME_CTRL_NEW:
- case NVME_CTRL_LIVE:
- case NVME_CTRL_RESETTING:
- case NVME_CTRL_CONNECTING:
- return false;
- case NVME_CTRL_DELETING:
- case NVME_CTRL_DELETING_NOIO:
- case NVME_CTRL_DEAD:
- return true;
- default:
- WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
- return true;
- }
-}
-
/*
* Waits for the controller state to be resetting, or returns false if it is
* not possible to ever transition to that state.
bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
struct queue_limits lim;
struct nvme_id_ns_nvm *nvm = NULL;
+ struct nvme_zone_info zi = {};
struct nvme_id_ns *id;
sector_t capacity;
unsigned lbaf;
if (id->ncap == 0) {
/* namespace not allocated or attached */
info->is_removed = true;
- ret = -ENODEV;
+ ret = -ENXIO;
goto out;
}
+ lbaf = nvme_lbaf_index(id->flbas);
if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
goto out;
}
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS) {
+ ret = nvme_query_zone_info(ns, lbaf, &zi);
+ if (ret < 0)
+ goto out;
+ }
+
blk_mq_freeze_queue(ns->disk->queue);
- lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
capacity = 0;
nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- ns->head->ids.csi == NVME_CSI_ZNS) {
- ret = nvme_update_zone_info(ns, lbaf, &lim);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue);
- goto out;
- }
- }
+ ns->head->ids.csi == NVME_CSI_ZNS)
+ nvme_update_zone_info(ns, &lim, &zi);
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
- ret = blk_revalidate_disk_zones(ns->disk, NULL);
+ ret = blk_revalidate_disk_zones(ns->disk);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
}
if (!ret && nvme_ns_head_multipath(ns->head)) {
+ struct queue_limits *ns_lim = &ns->disk->queue->limits;
struct queue_limits lim;
blk_mq_freeze_queue(ns->head->disk->queue);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
+ /*
+ * queue_limits mixes values that are the hardware limitations
+ * for bio splitting with what is the device configuration.
+ *
+ * For NVMe the device configuration can change after e.g. a
+ * Format command, and we really want to pick up the new format
+ * value here. But we must still stack the queue limits to the
+ * least common denominator for multipathing to split the bios
+ * properly.
+ *
+ * To work around this, we explicitly set the device
+ * configuration to those that we just queried, but only stack
+ * the splitting limits in to make sure we still obey possibly
+ * lower limitations of other controllers.
+ */
lim = queue_limits_start_update(ns->head->disk->queue);
+ lim.logical_block_size = ns_lim->logical_block_size;
+ lim.physical_block_size = ns_lim->physical_block_size;
+ lim.io_min = ns_lim->io_min;
+ lim.io_opt = ns_lim->io_opt;
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name);
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
"Found shared namespace %d, but multipathing not supported.\n",
info->nsid);
dev_warn_once(ctrl->device,
- "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n.");
+ "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
}
}
if (blk_queue_add_random(q))
add_disk_randomness(req->q->disk);
- if (!blk_rq_is_passthrough(req)) {
- WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
- cmd->flags &= ~SCMD_INITIALIZED;
- }
+ WARN_ON_ONCE(!blk_rq_is_passthrough(req) &&
+ !(cmd->flags & SCMD_INITIALIZED));
+ cmd->flags = 0;
/*
* Calling rcu_barrier() is not necessary here because the
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
- case BLK_STS_ZONE_RESOURCE:
if (scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
break;
}
}
- if (req_op(rq) == REQ_OP_ZONE_APPEND) {
- ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks);
- if (ret)
- goto fail;
- }
-
fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0;
dix = scsi_prot_sg_count(cmd);
dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type);
return sd_setup_flush_cmnd(cmd);
case REQ_OP_READ:
case REQ_OP_WRITE:
- case REQ_OP_ZONE_APPEND:
return sd_setup_read_write_cmnd(cmd);
case REQ_OP_ZONE_RESET:
return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
{
struct scsi_device *sdp = sdkp->device;
const struct scsi_io_group_descriptor *desc, *start, *end;
+ u16 permanent_stream_count_old;
struct scsi_sense_hdr sshdr;
struct scsi_mode_data data;
int res;
for (desc = start; desc < end; desc++)
if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start))
break;
+ permanent_stream_count_old = sdkp->permanent_stream_count;
sdkp->permanent_stream_count = desc - start;
if (sdkp->rscs && sdkp->permanent_stream_count < 2)
sd_printk(KERN_INFO, sdkp,
"Unexpected: RSCS has been set and the permanent stream count is %u\n",
sdkp->permanent_stream_count);
- else if (sdkp->permanent_stream_count)
+ else if (sdkp->permanent_stream_count != permanent_stream_count_old)
sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n",
sdkp->permanent_stream_count);
}
error = device_add_disk(dev, gd, NULL);
if (error) {
- put_device(&sdkp->disk_dev);
+ device_unregister(&sdkp->disk_dev);
put_disk(gd);
goto out;
}
struct scsi_disk *sdkp = to_scsi_disk(dev);
ida_free(&sd_index_ida, sdkp->index);
- sd_zbc_free_zone_info(sdkp);
put_device(&sdkp->device->sdev_gendev);
free_opal_dev(sdkp->opal_dev);
/*
* Block error status values. See block/blk-core:blk_errors for the details.
- * Alpha cannot write a byte atomically, so we need to use 32-bit value.
*/
-#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
-typedef u32 __bitwise blk_status_t;
-typedef u32 blk_short_t;
-#else
typedef u8 __bitwise blk_status_t;
typedef u16 blk_short_t;
-#endif
#define BLK_STS_OK 0
#define BLK_STS_NOTSUPP ((__force blk_status_t)1)
#define BLK_STS_TIMEOUT ((__force blk_status_t)2)
*/
#define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
- /*
- * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
- * related resources are unavailable, but the driver can guarantee the queue
- * will be rerun in the future once the resources become available again.
- *
- * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
- * a zone specific resource and IO to a different zone on the same device could
- * still be served. Examples of that are zones that are write-locked, but a read
- * to the same zone could be served.
- */
- #define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14)
-
/*
* BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
* path if the device returns a status indicating that too many zone resources
* after the number of open zones decreases below the device's limits, which is
* reported in the request_queue's max_open_zones.
*/
- #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15)
+ #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14)
/*
* BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
* after the number of active zones decreases below the device's limits, which
* is reported in the request_queue's max_active_zones.
*/
- #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16)
+ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15)
/*
* BLK_STS_OFFLINE is returned from the driver when the target device is offline
* or is being taken offline. This could help differentiate the case where a
* device is intentionally being shut down from a real I/O error.
*/
- #define BLK_STS_OFFLINE ((__force blk_status_t)17)
+ #define BLK_STS_OFFLINE ((__force blk_status_t)16)
/*
* BLK_STS_DURATION_LIMIT is returned from the driver when the target device
* aborted the command because it exceeded one of its Command Duration Limits.
*/
- #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18)
+ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17)
/**
* blk_path_error - returns true if error may be path related
struct bvec_iter bi_iter;
- blk_qc_t bi_cookie;
+ union {
+ /* for polled bios: */
+ blk_qc_t bi_cookie;
+ /* for plugged zoned writes only: */
+ unsigned int __bi_nr_segments;
+ };
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
- BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+ BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
BIO_FLAG_LAST
};
#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4))
/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */
#define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5))
+/* return partition scanning errors */
+#define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6))
struct gendisk {
/*
#ifdef CONFIG_BLK_DEV_ZONED
/*
- * Zoned block device information for request dispatch control.
- * nr_zones is the total number of zones of the device. This is always
- * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
- * bits which indicates if a zone is conventional (bit set) or
- * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
- * bits which indicates if a zone is write locked, that is, if a write
- * request targeting the zone was dispatched.
- *
- * Reads of this information must be protected with blk_queue_enter() /
- * blk_queue_exit(). Modifying this information is only allowed while
- * no requests are being processed. See also blk_mq_freeze_queue() and
- * blk_mq_unfreeze_queue().
+ * Zoned block device information. Reads of this information must be
+ * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+ * information is only allowed while no requests are being processed.
+ * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
*/
unsigned int nr_zones;
+ unsigned int zone_capacity;
unsigned long *conv_zones_bitmap;
- unsigned long *seq_zones_wlock;
+ unsigned int zone_wplugs_hash_bits;
+ spinlock_t zone_wplugs_lock;
+ struct mempool_s *zone_wplugs_pool;
+ struct hlist_head *zone_wplugs_hash;
+ struct list_head zone_wplugs_err_list;
+ struct work_struct zone_wplugs_work;
+ struct workqueue_struct *zone_wplugs_wq;
#endif /* CONFIG_BLK_DEV_ZONED */
#if IS_ENABLED(CONFIG_CDROM)
return atomic_read(&disk->part0->bd_openers);
}
+ /**
+ * disk_has_partscan - return %true if partition scanning is enabled on a disk
+ * @disk: disk to check
+ *
+ * Returns %true if partitions scanning is enabled for @disk, or %false if
+ * partition scanning is disabled either permanently or temporarily.
+ */
+ static inline bool disk_has_partscan(struct gendisk *disk)
+ {
+ return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
+ !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+ }
+
/*
* The gendisk is refcounted by the part0 block_device, and the bd_device
* therein is also used for device model presentation in sysfs.
unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sectors, sector_t nr_sectors);
- int blk_revalidate_disk_zones(struct gendisk *disk,
- void (*update_driver_data)(struct gendisk *disk));
+ int blk_revalidate_disk_zones(struct gendisk *disk);
/*
* Independent access ranges: struct blk_independent_access_range describes
atomic_t nr_active_requests_shared_tags;
- unsigned int required_elevator_features;
-
struct blk_mq_tags *sched_shared_tags;
struct list_head icq_list;
return sector >> ilog2(disk->queue->limits.chunk_sectors);
}
- static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
- {
- if (!blk_queue_is_zoned(disk->queue))
- return false;
- if (!disk->conv_zones_bitmap)
- return true;
- return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
- }
-
static inline void disk_set_max_open_zones(struct gendisk *disk,
unsigned int max_open_zones)
{
return bdev->bd_disk->queue->limits.max_active_zones;
}
+ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
{
{
return 0;
}
- static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
- {
- return false;
- }
static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
{
return 0;
{
return 0;
}
+ static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+ {
+ return false;
+ }
#endif /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int blk_queue_depth(struct request_queue *q)
return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
}
- static inline unsigned int bio_zone_is_seq(struct bio *bio)
+ static inline bool bio_straddles_zones(struct bio *bio)
{
- return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
+ return bio_sectors(bio) &&
+ bio_zone_no(bio) !=
+ disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
}
/*
void disk_set_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *iars);
- /*
- * Elevator features for blk_queue_required_elevator_features:
- */
- /* Supports zoned block devices sequential write constraint */
- #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0)
-
- extern void blk_queue_required_elevator_features(struct request_queue *q,
- unsigned int features);
extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
struct device *dev);
return q->limits.max_segment_size;
}
- static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+ static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
{
+ unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
- const struct queue_limits *l = &q->limits;
+ return min_not_zero(l->max_zone_append_sectors, max_sectors);
+ }
+
+ static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
+ {
+ if (!blk_queue_is_zoned(q))
+ return 0;
+
+ return queue_limits_max_zone_append_sectors(&q->limits);
+ }
+
+ static inline bool queue_emulates_zone_append(struct request_queue *q)
+ {
+ return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+ }
- return min(l->max_zone_append_sectors, l->max_sectors);
+ static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+ {
+ return queue_emulates_zone_append(bdev_get_queue(bdev));
}
static inline unsigned int
return disk_zone_no(bdev->bd_disk, sec);
}
- /* Whether write serialization is required for @op on zoned devices. */
- static inline bool op_needs_zoned_write_locking(enum req_op op)
- {
- return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
- }
-
- static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
- enum req_op op)
- {
- return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
- }
-
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
return sector & (bdev_zone_sectors(bdev) - 1);
}
+ static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+ {
+ return bdev_offset_from_zone_start(bio->bi_bdev,
+ bio->bi_iter.bi_sector);
+ }
+
static inline bool bdev_is_zone_start(struct block_device *bdev,
sector_t sector)
{
* Thaw the file system mounted on the block device.
*/
int (*thaw)(struct block_device *bdev);
-
- /*
- * If needed, get a reference to the holder.
- */
- void (*get_holder)(void *holder);
-
- /*
- * Release the holder.
- */
- void (*put_holder)(void *holder);
};
/*
int bdev_freeze(struct block_device *bdev);
int bdev_thaw(struct block_device *bdev);
+void bdev_fput(struct file *bdev_file);
struct io_comp_batch {
struct request *req_list;