#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
+ #include <linux/sched/sysctl.h>
+ #include <linux/blk-crypto.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
refcount_set(&rq->ref, 1);
+ blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
REQ_OP_NAME(ZONE_OPEN),
REQ_OP_NAME(ZONE_CLOSE),
REQ_OP_NAME(ZONE_FINISH),
+ REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
REQ_OP_NAME(SCSI_IN),
bio_advance(bio, nbytes);
+ if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ */
+ if (bio->bi_iter.bi_size)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
}
}
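+ /*
+  * Take a reference on the bio's request queue. On failure the bio is
+  * completed with -EAGAIN for REQ_NOWAIT submitters on a live queue, or
+  * with an I/O error otherwise.
+  */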
+ static inline int bio_queue_enter(struct bio *bio)
+ {
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ int ret;
+
+ ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
+ if (unlikely(ret)) {
+ if (nowait && !blk_queue_dying(q))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ }
+
+ return ret;
+ }
+
void blk_queue_exit(struct request_queue *q)
{
percpu_ref_put(&q->q_usage_counter);
if (ret)
goto fail_id;
- q->backing_dev_info = bdi_alloc_node(GFP_KERNEL, node_id);
+ q->backing_dev_info = bdi_alloc(node_id);
if (!q->backing_dev_info)
goto fail_split;
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
- q->backing_dev_info->name = "block";
q->node = node_id;
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
}
EXPORT_SYMBOL(blk_put_request);
+ static void blk_account_io_merge_bio(struct request *req)
+ {
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+ }
+
bool bio_attempt_back_merge(struct request *req, struct bio *bio,
unsigned int nr_segs)
{
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_free_ctx(bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_do_front_merge(req, bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
no_merge:
req_set_nomerge(q, req);
return ret;
}
+ /*
+ * Check write append to a zoned block device.
+ */
+ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
+ struct bio *bio)
+ {
+ sector_t pos = bio->bi_iter.bi_sector;
+ int nr_sectors = bio_sectors(bio);
+
+ /* Only applicable to zoned block devices */
+ if (!blk_queue_is_zoned(q))
+ return BLK_STS_NOTSUPP;
+
+ /* The bio sector must point to the start of a sequential zone */
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
+ !blk_queue_zone_is_seq(q, pos))
+ return BLK_STS_IOERR;
+
+ /*
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
+ * split and could result in non-contiguous sectors being written in
+ * different zones.
+ */
+ if (nr_sectors > q->limits.chunk_sectors)
+ return BLK_STS_IOERR;
+
+ /* Make sure the BIO is small enough and will not get split */
+ if (nr_sectors > q->limits.max_zone_append_sectors)
+ return BLK_STS_IOERR;
+
+ bio->bi_opf |= REQ_NOMERGE;
+
+ return BLK_STS_OK;
+ }
+
static noinline_for_stack bool
generic_make_request_checks(struct bio *bio)
{
}
/*
- * Non-mq queues do not honor REQ_NOWAIT, so complete a bio
- * with BLK_STS_AGAIN status in order to catch -EAGAIN and
- * to give a chance to the caller to repeat request gracefully.
+ * For a REQ_NOWAIT based request, return -EOPNOTSUPP
+ * if the queue is not a request based queue.
*/
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) {
- status = BLK_STS_AGAIN;
- goto end_io;
- }
+ if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
+ goto not_supported;
if (should_fail_bio(bio))
goto end_io;
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
+ case REQ_OP_ZONE_APPEND:
+ status = blk_check_zone_append(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
}
/*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
+ * Various block parts want %current->io_context, so allocate it up
+ * front rather than dealing with lots of pain to allocate it only
+ * where needed. This may fail and the block layer knows how to live
+ * with it.
*/
- create_io_context(GFP_ATOMIC, q->node);
+ if (unlikely(!current->io_context))
+ create_task_io_context(current, GFP_ATOMIC, q->node);
if (!blkcg_bio_issue_check(q, bio))
return false;
return false;
}
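+ /*
+  * Run inline encryption preparation on the bio and, if that succeeds, hand
+  * it off either to blk-mq or to the queue's legacy ->make_request_fn.
+  */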
+ static blk_qc_t do_make_request(struct bio *bio)
+ {
+ struct request_queue *q = bio->bi_disk->queue;
+ blk_qc_t ret = BLK_QC_T_NONE;
+
+ if (blk_crypto_bio_prep(&bio)) {
+ if (!q->make_request_fn)
+ return blk_mq_make_request(q, bio);
+ ret = q->make_request_fn(q, bio);
+ }
+ blk_queue_exit(q);
+ return ret;
+ }
+
/**
- * generic_make_request - hand a buffer to its device driver for I/O
+ * generic_make_request - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status. The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may resubmit the bio to
- * a lower device by calling into generic_make_request recursively, which
- * means the bio should NOT be touched after the call to ->make_request_fn.
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
*/
blk_qc_t generic_make_request(struct bio *bio)
{
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
- blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
- BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, flags) == 0)) {
+ if (likely(bio_queue_enter(bio) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
- ret = q->make_request_fn(q, bio);
-
- blk_queue_exit(q);
+ ret = do_make_request(bio);
/* sort new bios into those for a lower level
* and those for the same level
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } else {
- if (unlikely(!blk_queue_dying(q) &&
- (bio->bi_opf & REQ_NOWAIT)))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
}
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
*
* This function behaves like generic_make_request(), but does not protect
* against recursion. Must only be used if the called driver is known
- * to not call generic_make_request (or direct_make_request) again from
- * its make_request function. (Calling direct_make_request again from
- * a workqueue is perfectly fine as that doesn't recurse).
+ * to be blk-mq based.
*/
blk_qc_t direct_make_request(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
- bool nowait = bio->bi_opf & REQ_NOWAIT;
- blk_qc_t ret;
+ if (WARN_ON_ONCE(q->make_request_fn)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
if (!generic_make_request_checks(bio))
return BLK_QC_T_NONE;
-
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
- if (nowait && !blk_queue_dying(q))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
+ if (unlikely(bio_queue_enter(bio)))
+ return BLK_QC_T_NONE;
+ if (!blk_crypto_bio_prep(&bio)) {
+ blk_queue_exit(q);
return BLK_QC_T_NONE;
}
-
- ret = q->make_request_fn(q, bio);
- blk_queue_exit(q);
- return ret;
+ return blk_mq_make_request(q, bio);
}
EXPORT_SYMBOL_GPL(direct_make_request);
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
- * submit_bio() is very similar in purpose to generic_make_request(), and
- * uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be presetup and ready for I/O.
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
+ * fully set up &struct bio that describes the I/O that needs to be done. The
+ * bio will be sent to the device described by the bi_disk and bi_partno fields.
*
+ * The success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
+ * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
+ * been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
- bool workingset_read = false;
- unsigned long pflags;
- blk_qc_t ret;
-
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
- if (bio_flagged(bio, BIO_WORKINGSET))
- workingset_read = true;
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
}
/*
- * If we're reading data that is part of the userspace
- * workingset, count submission time as memory stall. When the
- * device is congested, or the submitting cgroup IO-throttled,
- * submission can be a significant part of overall IO time.
+ * If we're reading data that is part of the userspace workingset, count
+ * submission time as memory stall. When the device is congested, or
+ * the submitting cgroup IO-throttled, submission can be a significant
+ * part of overall IO time.
*/
- if (workingset_read)
- psi_memstall_enter(&pflags);
-
- ret = generic_make_request(bio);
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
+ bio_flagged(bio, BIO_WORKINGSET))) {
+ unsigned long pflags;
+ blk_qc_t ret;
- if (workingset_read)
+ psi_memstall_enter(&pflags);
+ ret = generic_make_request(bio);
psi_memstall_leave(&pflags);
- return ret;
+ return ret;
+ }
+
+ return generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
+ if (blk_crypto_insert_cloned_request(rq))
+ return BLK_STS_IOERR;
+
if (blk_queue_io_stat(q))
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
- void blk_account_io_completion(struct request *req, unsigned int bytes)
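+ /*
+  * Account io_ticks for @part and, for a partition, also for the whole
+  * disk. The per-partition stamp is advanced with a lockless cmpxchg() so
+  * that the counter is only updated when jiffies has moved on: by one tick
+  * at I/O start and by the full elapsed time at I/O completion (@end).
+  */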
+ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
+ {
+ unsigned long stamp;
+ again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+ }
+
+ static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
update_io_ticks(part, jiffies, true);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
- part_dec_in_flight(req->q, part, rq_data_dir(req));
+ part_stat_unlock();
hd_struct_put(part);
- part_stat_unlock();
}
}
- void blk_account_io_start(struct request *rq, bool new_io)
+ void blk_account_io_start(struct request *rq)
{
- struct hd_struct *part;
- int rw = rq_data_dir(rq);
-
if (!blk_do_io_stat(rq))
return;
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
part_stat_lock();
+ update_io_ticks(rq->part, jiffies, false);
+ part_stat_unlock();
+ }
- if (!new_io) {
- part = rq->part;
- part_stat_inc(part, merges[rw]);
- } else {
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!hd_struct_try_get(part)) {
- /*
- * The partition is already being removed,
- * the request will be accounted on the disk only
- *
- * We take a reference on disk->part0 although that
- * partition will never be deleted, so we can treat
- * it as any other partition.
- */
- part = &rq->rq_disk->part0;
- hd_struct_get(part);
- }
- part_inc_in_flight(rq->q, part, rw);
- rq->part = part;
- }
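+ /*
+  * Start-of-I/O accounting helper for drivers that account bios directly
+  * against the whole-disk part0: bump io_ticks, the I/O and sector counts
+  * and the in-flight counter, and return the start time in jiffies for a
+  * later disk_end_io_acct() call.
+  */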
+ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
+ unsigned int op)
+ {
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+
+ part_stat_lock();
+ update_io_ticks(part, now, false);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, sectors[sgrp], sectors);
+ part_stat_local_inc(part, in_flight[op_is_write(op)]);
+ part_stat_unlock();
- update_io_ticks(part, jiffies, false);
+ return now;
+ }
+ EXPORT_SYMBOL(disk_start_io_acct);
+
+ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+ unsigned long start_time)
+ {
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+ unsigned long duration = now - start_time;
+ part_stat_lock();
+ update_io_ticks(part, now, true);
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
+ EXPORT_SYMBOL(disk_end_io_acct);
/*
* Steal bios from a request and add them to a bio list.
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->ioprio = rq_src->ioprio;
- rq->extra_len = rq_src->extra_len;
+
+ if (rq->bio)
+ blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask);
return 0;
}
EXPORT_SYMBOL(blk_finish_plug);
+ void blk_io_schedule(void)
+ {
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+
+ if (timeout)
+ io_schedule_timeout(timeout);
+ else
+ io_schedule();
+ }
+ EXPORT_SYMBOL_GPL(blk_io_schedule);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
link->flags |= DL_FLAG_STATELESS;
goto reorder;
} else {
+ link->flags |= DL_FLAG_STATELESS;
goto out;
}
}
flags & DL_FLAG_PM_RUNTIME)
pm_runtime_resume(supplier);
+ list_add_tail_rcu(&link->s_node, &supplier->links.consumers);
+ list_add_tail_rcu(&link->c_node, &consumer->links.suppliers);
+
if (flags & DL_FLAG_SYNC_STATE_ONLY) {
dev_dbg(consumer,
"Linked as a sync state only consumer to %s\n",
dev_name(supplier));
goto out;
}
+
reorder:
/*
* Move the consumer and all of the devices depending on it to the end
*/
device_reorder_to_tail(consumer, NULL);
- list_add_tail_rcu(&link->s_node, &supplier->links.consumers);
- list_add_tail_rcu(&link->c_node, &consumer->links.suppliers);
-
dev_dbg(consumer, "Linked as a consumer to %s\n", dev_name(supplier));
- out:
+out:
device_pm_unlock();
device_links_write_unlock();
list_add_tail(&sup->links.defer_sync, &deferred_sync);
}
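+/*
+ * Stop managing a device link: clear DL_FLAG_MANAGED, reset the link status
+ * and drop the reference held on the link.
+ */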
+static void device_link_drop_managed(struct device_link *link)
+{
+ link->flags &= ~DL_FLAG_MANAGED;
+ WRITE_ONCE(link->status, DL_STATE_NONE);
+ kref_put(&link->kref, __device_link_del);
+}
+
/**
* device_links_driver_bound - Update device links after probing its driver.
* @dev: Device to update the links for.
*/
void device_links_driver_bound(struct device *dev)
{
- struct device_link *link;
+ struct device_link *link, *ln;
LIST_HEAD(sync_list);
/*
else
__device_links_queue_sync_state(dev, &sync_list);
- list_for_each_entry(link, &dev->links.suppliers, c_node) {
+ list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
+ struct device *supplier;
+
if (!(link->flags & DL_FLAG_MANAGED))
continue;
- WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
- WRITE_ONCE(link->status, DL_STATE_ACTIVE);
+ supplier = link->supplier;
+ if (link->flags & DL_FLAG_SYNC_STATE_ONLY) {
+ /*
+ * When DL_FLAG_SYNC_STATE_ONLY is set, it means no
+ * other DL_MANAGED_LINK_FLAGS have been set. So, it's
+ * safe to drop the managed link completely.
+ */
+ device_link_drop_managed(link);
+ } else {
+ WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
+ WRITE_ONCE(link->status, DL_STATE_ACTIVE);
+ }
+ /*
+ * This needs to be done even for the deleted
+ * DL_FLAG_SYNC_STATE_ONLY device link in case it was the last
+ * device link that was preventing the supplier from getting a
+ * sync_state() call.
+ */
if (defer_sync_state_count)
- __device_links_supplier_defer_sync(link->supplier);
+ __device_links_supplier_defer_sync(supplier);
else
- __device_links_queue_sync_state(link->supplier,
- &sync_list);
+ __device_links_queue_sync_state(supplier, &sync_list);
}
dev->links.status = DL_DEV_DRIVER_BOUND;
device_links_flush_sync_list(&sync_list, dev);
}
-static void device_link_drop_managed(struct device_link *link)
-{
- link->flags &= ~DL_FLAG_MANAGED;
- WRITE_ONCE(link->status, DL_STATE_NONE);
- kref_put(&link->kref, __device_link_del);
-}
-
/**
* __device_links_no_driver - Update links of a device without a driver.
* @dev: Device without a driver.
else if (dev->class && dev->class->dev_release)
dev->class->dev_release(dev);
else
- WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/kobject.txt.\n",
+ WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
dev_name(dev));
kfree(p);
}
return fw_devlink_flags;
}
+static bool fw_devlink_is_permissive(void)
+{
+ return fw_devlink_flags == DL_FLAG_SYNC_STATE_ONLY;
+}
+
/**
* device_add - add device to device hierarchy.
* @dev: device.
if (fw_devlink_flags && is_fwnode_dev &&
fwnode_has_op(dev->fwnode, add_links)) {
fw_ret = fwnode_call_int_op(dev->fwnode, add_links, dev);
- if (fw_ret == -ENODEV)
+ if (fw_ret == -ENODEV && !fw_devlink_is_permissive())
device_link_wait_for_mandatory_supplier(dev);
else if (fw_ret)
device_link_wait_for_optional_supplier(dev);
return ERR_PTR(retval);
}
- /**
- * device_create_vargs - creates a device and registers it with sysfs
- * @class: pointer to the struct class that this device should be registered to
- * @parent: pointer to the parent struct device of this new device, if any
- * @devt: the dev_t for the char device to be added
- * @drvdata: the data to be added to the device for callbacks
- * @fmt: string for the device's name
- * @args: va_list for the device's name
- *
- * This function can be used by char device classes. A struct device
- * will be created in sysfs, registered to the specified class.
- *
- * A "dev" file will be created, showing the dev_t for the device, if
- * the dev_t is not 0,0.
- * If a pointer to a parent struct device is passed in, the newly created
- * struct device will be a child of that device in sysfs.
- * The pointer to the struct device will be returned from the call.
- * Any further sysfs files that might be required can be created using this
- * pointer.
- *
- * Returns &struct device pointer on success, or ERR_PTR() on error.
- *
- * Note: the struct class passed to this function must have previously
- * been created with a call to class_create().
- */
- struct device *device_create_vargs(struct class *class, struct device *parent,
- dev_t devt, void *drvdata, const char *fmt,
- va_list args)
- {
- return device_create_groups_vargs(class, parent, devt, drvdata, NULL,
- fmt, args);
- }
- EXPORT_SYMBOL_GPL(device_create_vargs);
-
/**
* device_create - creates a device and registers it with sysfs
* @class: pointer to the struct class that this device should be registered to
struct device *dev;
va_start(vargs, fmt);
- dev = device_create_vargs(class, parent, devt, drvdata, fmt, vargs);
+ dev = device_create_groups_vargs(class, parent, devt, drvdata, NULL,
+ fmt, vargs);
va_end(vargs);
return dev;
}
else
dev->fwnode = fwnode;
}
+EXPORT_SYMBOL_GPL(set_secondary_fwnode);
/**
* device_set_of_node_from_dev - reuse device-tree node of another device
static int loop_kthread_worker_fn(void *worker_ptr)
{
- current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO;
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
return kthread_worker_fn(worker_ptr);
}
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
- lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
lo->tag_set.driver_data = lo;
err = blk_mq_alloc_tag_set(&lo->tag_set);
return errno_to_blk_status(err);
}
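+ /*
+  * Zero-fill the bios of a read command when the device has no memory
+  * backing, so that the command does not complete with uninitialized data.
+  */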
+ static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
+ {
+ struct nullb_device *dev = cmd->nq->dev;
+ struct bio *bio;
+
+ if (dev->memory_backed)
+ return;
+
+ if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
+ zero_fill_bio(cmd->bio);
+ } else if (req_op(cmd->rq) == REQ_OP_READ) {
+ __rq_for_each_bio(bio, cmd->rq)
+ zero_fill_bio(bio);
+ }
+ }
+
static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{
+ /*
+ * Since root privileges are required to configure the null_blk
+ * driver, it is fine that this driver does not initialize the
+ * data buffers of read commands. Zero-initialize these buffers
+ * anyway if KMSAN is enabled to prevent KMSAN from complaining
+ * about null_blk not initializing read data buffers.
+ */
+ if (IS_ENABLED(CONFIG_KMSAN))
+ nullb_zero_read_cmd_buffer(cmd);
+
/* Complete IO by inline, softirq or timer */
switch (cmd->nq->dev->irqmode) {
case NULL_IRQ_SOFTIRQ:
static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
{
pr_info("rq %p timed out\n", rq);
- blk_mq_complete_request(rq);
+ blk_mq_force_complete_rq(rq);
return BLK_EH_DONE;
}
{
if (nullb->dev->discard == false)
return;
+
+ if (nullb->dev->zoned) {
+ nullb->dev->discard = false;
+ pr_info("discard option is ignored in zoned mode\n");
+ return;
+ }
+
nullb->q->limits.discard_granularity = nullb->dev->blocksize;
nullb->q->limits.discard_alignment = nullb->dev->blocksize;
blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
pr_err("zone_size must be power-of-two\n");
return -EINVAL;
}
+ if (dev->zone_size > dev->size) {
+ pr_err("Zone size larger than device capacity\n");
+ return -EINVAL;
+ }
dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
dev->nr_zones = dev_size >>
int null_register_zoned_dev(struct nullb *nullb)
{
+ struct nullb_device *dev = nullb->dev;
struct request_queue *q = nullb->q;
- if (queue_is_mq(q))
- return blk_revalidate_disk_zones(nullb->disk);
+ if (queue_is_mq(q)) {
+ int ret = blk_revalidate_disk_zones(nullb->disk, NULL);
+
+ if (ret)
+ return ret;
+ } else {
+ blk_queue_chunk_sectors(q, dev->zone_size_sects);
+ q->nr_zones = blkdev_nr_zones(nullb->disk);
+ }
- blk_queue_chunk_sectors(q, nullb->dev->zone_size_sects);
- q->nr_zones = blkdev_nr_zones(nullb->disk);
+ blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
return 0;
}
}
static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
- unsigned int nr_sectors)
+ unsigned int nr_sectors, bool append)
{
struct nullb_device *dev = cmd->nq->dev;
unsigned int zno = null_zone_no(dev, sector);
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
- /* Writes must be at the write pointer position */
- if (sector != zone->wp)
+ /*
+ * Regular writes must be at the write pointer position.
+ * Zone append writes are automatically issued at the write
+ * pointer and the written position is returned through the
+ * request or BIO sector.
+ */
+ if (append) {
+ sector = zone->wp;
+ if (cmd->bio)
+ cmd->bio->bi_iter.bi_sector = sector;
+ else
+ cmd->rq->__sector = sector;
+ } else if (sector != zone->wp) {
return BLK_STS_IOERR;
+ }
if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
{
switch (op) {
case REQ_OP_WRITE:
- return null_zone_write(cmd, sector, nr_sectors);
+ return null_zone_write(cmd, sector, nr_sectors, false);
+ case REQ_OP_ZONE_APPEND:
+ return null_zone_write(cmd, sector, nr_sectors, true);
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_RESET_ALL:
case REQ_OP_ZONE_OPEN:
config.id = -1;
config.dev = &mtd->dev;
- config.name = mtd->name;
+ config.name = dev_name(&mtd->dev);
config.owner = THIS_MODULE;
config.reg_read = mtd_nvmem_reg_read;
config.size = mtd->size;
struct backing_dev_info *bdi;
int ret;
- bdi = bdi_alloc(GFP_KERNEL);
+ bdi = bdi_alloc(NUMA_NO_NODE);
if (!bdi)
return ERR_PTR(-ENOMEM);
- bdi->name = name;
/*
* We put '-0' suffix to the name to get the same name format as we
* used to get. Since this is called only once, we get a unique name.
scsi_io_completion_action(cmd, result);
}
- static blk_status_t scsi_init_sgtable(struct request *req,
- struct scsi_data_buffer *sdb)
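+ /*
+  * Check whether a passthrough, non-write request on a device with a DMA
+  * drain buffer needs that buffer appended to its scatterlist.
+  */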
+ static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev,
+ struct request *rq)
{
- int count;
-
- /*
- * If sg table allocation fails, requeue request later.
- */
- if (unlikely(sg_alloc_table_chained(&sdb->table,
- blk_rq_nr_phys_segments(req), sdb->table.sgl,
- SCSI_INLINE_SG_CNT)))
- return BLK_STS_RESOURCE;
-
- /*
- * Next, walk the list, and fill in the addresses and sizes of
- * each segment.
- */
- count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
- BUG_ON(count > sdb->table.nents);
- sdb->table.nents = count;
- sdb->length = blk_rq_payload_bytes(req);
- return BLK_STS_OK;
+ return sdev->dma_drain_len && blk_rq_is_passthrough(rq) &&
+ !op_is_write(req_op(rq)) &&
+ sdev->host->hostt->dma_need_drain(rq);
}
/*
*/
blk_status_t scsi_init_io(struct scsi_cmnd *cmd)
{
+ struct scsi_device *sdev = cmd->device;
struct request *rq = cmd->request;
+ unsigned short nr_segs = blk_rq_nr_phys_segments(rq);
+ struct scatterlist *last_sg = NULL;
blk_status_t ret;
+ bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq);
+ int count;
- if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
+ if (WARN_ON_ONCE(!nr_segs))
return BLK_STS_IOERR;
- ret = scsi_init_sgtable(rq, &cmd->sdb);
- if (ret)
- return ret;
+ /*
+ * Make sure there is space for the drain. The driver must adjust
+ * max_hw_segments to be prepared for this.
+ */
+ if (need_drain)
+ nr_segs++;
+
+ /*
+ * If sg table allocation fails, requeue request later.
+ */
+ if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs,
+ cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT)))
+ return BLK_STS_RESOURCE;
+
+ /*
+ * Next, walk the list, and fill in the addresses and sizes of
+ * each segment.
+ */
+ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg);
+
+ if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) {
+ unsigned int pad_len =
+ (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
+
+ last_sg->length += pad_len;
+ cmd->extra_len += pad_len;
+ }
+
+ if (need_drain) {
+ sg_unmark_end(last_sg);
+ last_sg = sg_next(last_sg);
+ sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len);
+ sg_mark_end(last_sg);
+
+ cmd->extra_len += sdev->dma_drain_len;
+ count++;
+ }
+
+ BUG_ON(count > cmd->sdb.table.nents);
+ cmd->sdb.table.nents = count;
+ cmd->sdb.length = blk_rq_payload_bytes(rq);
if (blk_integrity_rq(rq)) {
struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
- int ivecs, count;
+ int ivecs;
if (WARN_ON_ONCE(!prot_sdb)) {
/*
struct request_queue *q = hctx->queue;
struct scsi_device *sdev = q->queuedata;
- if (scsi_dev_queue_ready(q, sdev))
- return true;
-
- if (atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))
- blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY);
- return false;
+ return scsi_dev_queue_ready(q, sdev);
}
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
+ case BLK_STS_ZONE_RESOURCE:
if (atomic_read(&sdev->device_busy) ||
scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
switch (oldstate) {
case SDEV_RUNNING:
case SDEV_CREATED_BLOCK:
+ case SDEV_QUIESCE:
case SDEV_OFFLINE:
break;
default:
#include <linux/blkdev.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
+ #include <linux/mutex.h>
#include <asm/unaligned.h>
#include "sd.h"
+ static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone)
+ {
+ if (zone->type == ZBC_ZONE_TYPE_CONV)
+ return 0;
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return zone->wp - zone->start;
+ case BLK_ZONE_COND_FULL:
+ return zone->len;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ default:
+ /*
+ * Offline and read-only zones do not have a valid
+ * write pointer. Use 0 as for an empty zone.
+ */
+ return 0;
+ }
+ }
+
static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
unsigned int idx, report_zones_cb cb, void *data)
{
struct scsi_device *sdp = sdkp->device;
struct blk_zone zone = { 0 };
+ int ret;
zone.type = buf[0] & 0x0f;
zone.cond = (buf[1] >> 4) & 0xf;
zone.cond == ZBC_ZONE_COND_FULL)
zone.wp = zone.start + zone.len;
- return cb(&zone, idx, data);
+ ret = cb(&zone, idx, data);
+ if (ret)
+ return ret;
+
+ if (sdkp->rev_wp_offset)
+ sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone);
+
+ return 0;
}
/**
while (bufsize >= SECTOR_SIZE) {
buf = __vmalloc(bufsize,
- GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY,
- PAGE_KERNEL);
+ GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY);
if (buf) {
*buflen = bufsize;
return buf;
return ret;
}
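+ /*
+  * Checks common to zone management and zone append command setup: the
+  * device must be zoned and unchanged, and the request must be aligned to a
+  * zone start sector.
+  */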
+ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd)
+ {
+ struct request *rq = cmd->request;
+ struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+ sector_t sector = blk_rq_pos(rq);
+
+ if (!sd_is_zoned(sdkp))
+ /* Not a zoned device */
+ return BLK_STS_IOERR;
+
+ if (sdkp->device->changed)
+ return BLK_STS_IOERR;
+
+ if (sector & (sd_zbc_zone_sectors(sdkp) - 1))
+ /* Unaligned request */
+ return BLK_STS_IOERR;
+
+ return BLK_STS_OK;
+ }
+
+ #define SD_ZBC_INVALID_WP_OFST (~0u)
+ #define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1)
+
+ static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+ {
+ struct scsi_disk *sdkp = data;
+
+ lockdep_assert_held(&sdkp->zones_wp_offset_lock);
+
+ sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone);
+
+ return 0;
+ }
+
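+ /*
+  * Work function: issue a REPORT ZONES command for each zone whose cached
+  * write pointer offset is marked SD_ZBC_UPDATING_WP_OFST, refresh the cache
+  * from the reply and drop the device reference taken when the update was
+  * scheduled.
+  */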
+ static void sd_zbc_update_wp_offset_workfn(struct work_struct *work)
+ {
+ struct scsi_disk *sdkp;
+ unsigned int zno;
+ int ret;
+
+ sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work);
+
+ spin_lock_bh(&sdkp->zones_wp_offset_lock);
+ for (zno = 0; zno < sdkp->nr_zones; zno++) {
+ if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
+ continue;
+
+ spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+ ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf,
+ SD_BUF_SIZE,
+ zno * sdkp->zone_blocks, true);
+ spin_lock_bh(&sdkp->zones_wp_offset_lock);
+ if (!ret)
+ sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64,
+ zno, sd_zbc_update_wp_offset_cb,
+ sdkp);
+ }
+ spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+
+ scsi_device_put(sdkp->device);
+ }
+
+ /**
+ * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command.
+ * @cmd: the command to setup
+ * @lba: the LBA to patch
+ * @nr_blocks: the number of LBAs to be written
+ *
+ * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND.
+ * sd_zbc_prepare_zone_append() handles the necessary zone write locking and
+ * patching of the lba for an emulated ZONE_APPEND command.
+ *
+ * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will
+ * schedule a REPORT ZONES command and return BLK_STS_DEV_RESOURCE.
+ */
+ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba,
+ unsigned int nr_blocks)
+ {
+ struct request *rq = cmd->request;
+ struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+ unsigned int wp_offset, zno = blk_rq_zone_no(rq);
+ blk_status_t ret;
+
+ ret = sd_zbc_cmnd_checks(cmd);
+ if (ret != BLK_STS_OK)
+ return ret;
+
+ if (!blk_rq_zone_is_seq(rq))
+ return BLK_STS_IOERR;
+
+ /* Unlock of the write lock will happen in sd_zbc_complete() */
+ if (!blk_req_zone_write_trylock(rq))
+ return BLK_STS_ZONE_RESOURCE;
+
+ spin_lock_bh(&sdkp->zones_wp_offset_lock);
+ wp_offset = sdkp->zones_wp_offset[zno];
+ switch (wp_offset) {
+ case SD_ZBC_INVALID_WP_OFST:
+ /*
+ * We are about to schedule work to update a zone write pointer
+ * offset, which will cause the zone append command to be
+ * requeued. So make sure that the scsi device does not go away
+ * while the work is being processed.
+ */
+ if (scsi_device_get(sdkp->device)) {
+ ret = BLK_STS_IOERR;
+ break;
+ }
+ sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST;
+ schedule_work(&sdkp->zone_wp_offset_work);
+ fallthrough;
+ case SD_ZBC_UPDATING_WP_OFST:
+ ret = BLK_STS_DEV_RESOURCE;
+ break;
+ default:
+ wp_offset = sectors_to_logical(sdkp->device, wp_offset);
+ if (wp_offset + nr_blocks > sdkp->zone_blocks) {
+ ret = BLK_STS_IOERR;
+ break;
+ }
+
+ *lba += wp_offset;
+ }
+ spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+ if (ret)
+ blk_req_zone_write_unlock(rq);
+ return ret;
+ }
+
/**
* sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations
* can be RESET WRITE POINTER, OPEN, CLOSE or FINISH.
unsigned char op, bool all)
{
struct request *rq = cmd->request;
- struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
+ struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t block = sectors_to_logical(sdkp->device, sector);
+ blk_status_t ret;
- if (!sd_is_zoned(sdkp))
- /* Not a zoned device */
- return BLK_STS_IOERR;
-
- if (sdkp->device->changed)
- return BLK_STS_IOERR;
-
- if (sector & (sd_zbc_zone_sectors(sdkp) - 1))
- /* Unaligned request */
- return BLK_STS_IOERR;
+ ret = sd_zbc_cmnd_checks(cmd);
+ if (ret != BLK_STS_OK)
+ return ret;
cmd->cmd_len = 16;
memset(cmd->cmnd, 0, cmd->cmd_len);
return BLK_STS_OK;
}
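+ /*
+  * Check if a command completion must update the cached zone write pointer
+  * offsets: zone append, zone finish and zone reset commands always do,
+  * regular writes only when they target a sequential zone.
+  */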
+ static bool sd_zbc_need_zone_wp_update(struct request *rq)
+ {
+ switch (req_op(rq)) {
+ case REQ_OP_ZONE_APPEND:
+ case REQ_OP_ZONE_FINISH:
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ return true;
+ case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ return blk_rq_zone_is_seq(rq);
+ default:
+ return false;
+ }
+ }
+
+ /**
+ * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion
+ * @cmd: Completed command
+ * @good_bytes: Command reply bytes
+ *
+ * Called from sd_zbc_complete() to handle the update of the cached zone write
+ * pointer value in case an update is needed.
+ */
+ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
+ unsigned int good_bytes)
+ {
+ int result = cmd->result;
+ struct request *rq = cmd->request;
+ struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+ unsigned int zno = blk_rq_zone_no(rq);
+ enum req_opf op = req_op(rq);
+
+ /*
+ * If we got an error for a command that needs updating the write
+ * pointer offset cache, we must mark the zone wp offset entry as
+ * invalid to force an update from disk the next time a zone append
+ * command is issued.
+ */
+ spin_lock_bh(&sdkp->zones_wp_offset_lock);
+
+ if (result && op != REQ_OP_ZONE_RESET_ALL) {
+ if (op == REQ_OP_ZONE_APPEND) {
+ /* Force complete completion (no retry) */
+ good_bytes = 0;
+ scsi_set_resid(cmd, blk_rq_bytes(rq));
+ }
+
+ /*
+ * Force an update of the zone write pointer offset on
+ * the next zone append access.
+ */
+ if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
+ sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
+ goto unlock_wp_offset;
+ }
+
+ switch (op) {
+ case REQ_OP_ZONE_APPEND:
+ rq->__sector += sdkp->zones_wp_offset[zno];
+ fallthrough;
+ case REQ_OP_WRITE_ZEROES:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE:
+ if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp))
+ sdkp->zones_wp_offset[zno] +=
+ good_bytes >> SECTOR_SHIFT;
+ break;
+ case REQ_OP_ZONE_RESET:
+ sdkp->zones_wp_offset[zno] = 0;
+ break;
+ case REQ_OP_ZONE_FINISH:
+ sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp);
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ memset(sdkp->zones_wp_offset, 0,
+ sdkp->nr_zones * sizeof(unsigned int));
+ break;
+ default:
+ break;
+ }
+
+ unlock_wp_offset:
+ spin_unlock_bh(&sdkp->zones_wp_offset_lock);
+
+ return good_bytes;
+ }
+
/**
* sd_zbc_complete - ZBC command post processing.
* @cmd: Completed command
* @good_bytes: Command reply bytes
* @sshdr: command sense header
*
- * Called from sd_done(). Process report zones reply and handle reset zone
- * and write commands errors.
+ * Called from sd_done() to handle zone command errors and updates to the
+ * device queue zone write pointer offset cache.
*/
- void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
+ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
struct scsi_sense_hdr *sshdr)
{
int result = cmd->result;
* so be quiet about the error.
*/
rq->rq_flags |= RQF_QUIET;
- }
+ } else if (sd_zbc_need_zone_wp_update(rq))
+ good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
+
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ blk_req_zone_write_unlock(rq);
+
+ return good_bytes;
}
/**
return 0;
}
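+ /*
+  * Callback for blk_revalidate_disk_zones(): swap in the write pointer
+  * offset array built while parsing the zone report.
+  */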
+ static void sd_zbc_revalidate_zones_cb(struct gendisk *disk)
+ {
+ struct scsi_disk *sdkp = scsi_disk(disk);
+
+ swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset);
+ }
+
+ static int sd_zbc_revalidate_zones(struct scsi_disk *sdkp,
+ u32 zone_blocks,
+ unsigned int nr_zones)
+ {
+ struct gendisk *disk = sdkp->disk;
+ int ret = 0;
+
+ /*
+ * Make sure zone revalidation is serialized to ensure exclusive
+ * updates of the scsi disk data.
+ */
+ mutex_lock(&sdkp->rev_mutex);
+
+ /*
+ * Revalidate the disk zones to update the device request queue zone
+ * bitmaps and the zone write pointer offset array. Do this only once
+ * the device capacity is set, on the second revalidate execution during
+ * disk scan, or if something changed when executing a normal revalidate.
+ */
+ if (sdkp->first_scan) {
+ sdkp->zone_blocks = zone_blocks;
+ sdkp->nr_zones = nr_zones;
+ goto unlock;
+ }
+
+ if (sdkp->zone_blocks == zone_blocks &&
+ sdkp->nr_zones == nr_zones &&
+ disk->queue->nr_zones == nr_zones)
+ goto unlock;
+
+ sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_NOIO);
+ if (!sdkp->rev_wp_offset) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb);
+
+ kvfree(sdkp->rev_wp_offset);
+ sdkp->rev_wp_offset = NULL;
+
+ unlock:
+ mutex_unlock(&sdkp->rev_mutex);
+
+ return ret;
+ }
+
int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
{
struct gendisk *disk = sdkp->disk;
+ struct request_queue *q = disk->queue;
unsigned int nr_zones;
u32 zone_blocks = 0;
+ u32 max_append;
int ret;
if (!sd_is_zoned(sdkp))
goto err;
/* The drive satisfies the kernel restrictions: set it up */
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
- blk_queue_required_elevator_features(sdkp->disk->queue,
- ELEVATOR_F_ZBD_SEQ_WRITE);
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+ blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
/* READ16/WRITE16 is mandatory for ZBC disks */
sdkp->device->use_16_for_rw = 1;
sdkp->device->use_10_for_rw = 0;
+ ret = sd_zbc_revalidate_zones(sdkp, zone_blocks, nr_zones);
+ if (ret)
+ goto err;
+
/*
- * Revalidate the disk zone bitmaps once the block device capacity is
- * set on the second revalidate execution during disk scan and if
- * something changed when executing a normal revalidate.
+ * On the first scan 'chunk_sectors' isn't set up yet, so calling
+ * blk_queue_max_zone_append_sectors() will result in a WARN(). Defer
+ * this setting to the second scan.
*/
- if (sdkp->first_scan) {
- sdkp->zone_blocks = zone_blocks;
- sdkp->nr_zones = nr_zones;
+ if (sdkp->first_scan)
return 0;
- }
- if (sdkp->zone_blocks != zone_blocks ||
- sdkp->nr_zones != nr_zones ||
- disk->queue->nr_zones != nr_zones) {
- ret = blk_revalidate_disk_zones(disk);
- if (ret != 0)
- goto err;
- sdkp->zone_blocks = zone_blocks;
- sdkp->nr_zones = nr_zones;
- }
+ max_append = min_t(u32, logical_to_sectors(sdkp->device, zone_blocks),
+ q->limits.max_segments << (PAGE_SHIFT - 9));
+ max_append = min_t(u32, max_append, queue_max_hw_sectors(q));
+
+ blk_queue_max_zone_append_sectors(q, max_append);
return 0;
sdkp->nr_zones,
sdkp->zone_blocks);
}
+
+ int sd_zbc_init_disk(struct scsi_disk *sdkp)
+ {
+ if (!sd_is_zoned(sdkp))
+ return 0;
+
+ sdkp->zones_wp_offset = NULL;
+ spin_lock_init(&sdkp->zones_wp_offset_lock);
+ sdkp->rev_wp_offset = NULL;
+ mutex_init(&sdkp->rev_mutex);
+ INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn);
+ sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL);
+ if (!sdkp->zone_wp_update_buf)
+ return -ENOMEM;
+
+ return 0;
+ }
+
+ void sd_zbc_release_disk(struct scsi_disk *sdkp)
+ {
+ kvfree(sdkp->zones_wp_offset);
+ sdkp->zones_wp_offset = NULL;
+ kfree(sdkp->zone_wp_update_buf);
+ sdkp->zone_wp_update_buf = NULL;
+ }
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc, true))
- io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc, true))
- io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
return block_read_full_page(page, blkdev_get_block);
}
-static int blkdev_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void blkdev_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+ mpage_readahead(rac, blkdev_get_block);
}
static int blkdev_write_begin(struct file *file, struct address_space *mapping,
* i_mutex and doing so causes performance issues with concurrent
* O_SYNC writers to a block device.
*/
- error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
+ error = blkdev_issue_flush(bdev, GFP_KERNEL);
if (error == -EOPNOTSUPP)
error = 0;
blk_queue_exit(bdev->bd_queue);
return result;
}
- EXPORT_SYMBOL_GPL(bdev_read_page);
/**
* bdev_write_page() - Start writing a page to a block device
blk_queue_exit(bdev->bd_queue);
return result;
}
- EXPORT_SYMBOL_GPL(bdev_write_page);
/*
* pseudo-fs
static LIST_HEAD(all_bdevs);
- /*
- * If there is a bdev inode for this device, unhash it so that it gets evicted
- * as soon as last inode reference is dropped.
- */
- void bdev_unhash_inode(dev_t dev)
- {
- struct inode *inode;
-
- inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
- if (inode) {
- remove_inode_hash(inode);
- iput(inode);
- }
- }
-
struct block_device *bdget(dev_t dev)
{
struct block_device *bdev;
lockdep_assert_held(&bdev->bd_mutex);
rescan:
- ret = blk_drop_partitions(disk, bdev);
+ ret = blk_drop_partitions(bdev);
if (ret)
return ret;
if (bdev_read_only(I_BDEV(bd_inode)))
return -EPERM;
- /* uswsusp needs write permission to the swap */
- if (IS_SWAPFILE(bd_inode) && !hibernation_available())
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode))
return -ETXTBSY;
if (!iov_iter_count(from))
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
- .readpages = blkdev_readpages,
+ .readahead = blkdev_readahead,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
+ fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
return res;
}
-static bool ext4_dummy_context(struct inode *inode)
+static const union fscrypt_context *
+ext4_get_dummy_context(struct super_block *sb)
{
- return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
+ return EXT4_SB(sb)->s_dummy_enc_ctx.ctx;
}
static bool ext4_has_stable_inodes(struct super_block *sb)
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
- .dummy_context = ext4_dummy_context,
+ .get_dummy_context = ext4_get_dummy_context,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
- {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
{Opt_err, 0, 0}
};
}
#endif
+static int ext4_set_test_dummy_encryption(struct super_block *sb,
+ const char *opt,
+ const substring_t *arg,
+ bool is_remount)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (is_remount && !sbi->s_dummy_enc_ctx.ctx) {
+ ext4_msg(sb, KERN_WARNING,
+ "Can't set test_dummy_encryption on remount");
+ return -1;
+ }
+ err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx);
+ if (err) {
+ if (err == -EEXIST)
+ ext4_msg(sb, KERN_WARNING,
+ "Can't change test_dummy_encryption on remount");
+ else if (err == -EINVAL)
+ ext4_msg(sb, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", opt);
+ else
+ ext4_msg(sb, KERN_WARNING,
+ "Error processing option \"%s\" [%d]",
+ opt, err);
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+#else
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mount option ignored");
+#endif
+ return 1;
+}
+
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
substring_t *args, unsigned long *journal_devnum,
unsigned int *journal_ioprio, int is_remount)
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_FS_ENCRYPTION
- sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mode enabled");
-#else
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
-#endif
+ return ext4_set_test_dummy_encryption(sb, opt, &args[0],
+ is_remount);
} else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
if (test_opt(sb, DATA_ERR_ABORT))
SEQ_OPTS_PUTS("data_err=abort");
- if (DUMMY_ENCRYPTION_ENABLED(sbi))
- SEQ_OPTS_PUTS("test_dummy_encryption");
+
+ fscrypt_show_test_dummy_encryption(seq, sep, sb);
ext4_show_quota_options(seq, sb);
return 0;
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
+ fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
ext4_blkdev_remove(sbi);
brelse(bh);
out_fail:
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
if (!ret)
ret = err;
}
static unsigned long get_nr_dirty_pages(void)
{
return global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS) +
get_nr_dirty_inodes();
}
WARN(bdi_cap_writeback_dirty(wb->bdi) &&
!test_bit(WB_registered, &wb->state),
- "bdi-%s not registered\n", wb->bdi->name);
+ "bdi-%s not registered\n", bdi_dev_name(wb->bdi));
inode->dirtied_when = jiffies;
if (dirtytime)
static unsigned int isofs_get_last_session(struct super_block *sb, s32 session)
{
- struct cdrom_multisession ms_info;
- unsigned int vol_desc_start;
- struct block_device *bdev = sb->s_bdev;
- int i;
+ struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk);
+ unsigned int vol_desc_start = 0;
- vol_desc_start=0;
- ms_info.addr_format=CDROM_LBA;
if (session > 0) {
- struct cdrom_tocentry Te;
- Te.cdte_track=session;
- Te.cdte_format=CDROM_LBA;
- i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te);
- if (!i) {
+ struct cdrom_tocentry te;
+
+ if (!cdi)
+ return 0;
+
+ te.cdte_track = session;
+ te.cdte_format = CDROM_LBA;
+ if (cdrom_read_tocentry(cdi, &te) == 0) {
printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n",
- session, Te.cdte_addr.lba,
- Te.cdte_ctrl&CDROM_DATA_TRACK);
- if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4)
- return Te.cdte_addr.lba;
+ session, te.cdte_addr.lba,
+ te.cdte_ctrl & CDROM_DATA_TRACK);
+ if ((te.cdte_ctrl & CDROM_DATA_TRACK) == 4)
+ return te.cdte_addr.lba;
}
printk(KERN_ERR "ISOFS: Invalid session number or type of track\n");
}
- i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info);
- if (session > 0)
- printk(KERN_ERR "ISOFS: Invalid session number\n");
- #if 0
- printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i);
- if (i==0) {
- printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no");
- printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba);
- }
- #endif
- if (i==0)
+
+ if (cdi) {
+ struct cdrom_multisession ms_info;
+
+ ms_info.addr_format = CDROM_LBA;
+ if (cdrom_multisession(cdi, &ms_info) == 0) {
#if WE_OBEY_THE_WRITTEN_STANDARDS
- if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
+ /* necessary for a valid ms_info.addr */
+ if (ms_info.xa_flag)
#endif
- vol_desc_start=ms_info.addr.lba;
+ vol_desc_start = ms_info.addr.lba;
+ }
+ }
+
return vol_desc_start;
}
return mpage_readpage(page, isofs_get_block);
}
-static int isofs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void isofs_readahead(struct readahead_control *rac)
{
- return mpage_readpages(mapping, pages, nr_pages, isofs_get_block);
+ mpage_readahead(rac, isofs_get_block);
}
static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
static const struct address_space_operations isofs_aops = {
.readpage = isofs_readpage,
- .readpages = isofs_readpages,
+ .readahead = isofs_readahead,
.bmap = _isofs_bmap
};
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (bdev->bd_fsfreeze_count > 0) {
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- blkdev_put(bdev, mode);
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+ blkdev_put(bdev, mode);
return -EBUSY;
}
int err;
va_list args;
- bdi = bdi_alloc(GFP_KERNEL);
+ bdi = bdi_alloc(NUMA_NO_NODE);
if (!bdi)
return -ENOMEM;
- bdi->name = sb->s_type->name;
-
va_start(args, fmt);
err = bdi_register_va(bdi, fmt, args);
va_end(args);
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/crc32.h>
+ #include <linux/task_io_accounting_ops.h>
#include "zonefs.h"
return iomap_readpage(page, &zonefs_iomap_ops);
}
-static int zonefs_readpages(struct file *unused, struct address_space *mapping,
- struct list_head *pages, unsigned int nr_pages)
+static void zonefs_readahead(struct readahead_control *rac)
{
- return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops);
+ iomap_readahead(rac, &zonefs_iomap_ops);
}
/*
static const struct address_space_operations zonefs_file_aops = {
.readpage = zonefs_readpage,
- .readpages = zonefs_readpages,
+ .readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
.set_page_dirty = iomap_set_page_dirty,
if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
ret = file_write_and_wait_range(file, start, end);
if (!ret)
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
if (ret)
zonefs_io_error(inode, true);
.end_io = zonefs_file_write_dio_end_io,
};
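+ /*
+  * Handle a synchronous direct write to a sequential zone file using a zone
+  * append BIO issued at the zone start sector and waiting for its completion
+  * before advancing the file position.
+  */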
+ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+ {
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int max;
+ struct bio *bio;
+ ssize_t size;
+ int nr_pages;
+ ssize_t ret;
+
+ nr_pages = iov_iter_npages(from, BIO_MAX_PAGES);
+ if (!nr_pages)
+ return 0;
+
+ max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
+ max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
+ iov_iter_truncate(from, max);
+
+ bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set);
+ if (!bio)
+ return -ENOMEM;
+
+ bio_set_dev(bio, bdev);
+ bio->bi_iter.bi_sector = zi->i_zsector;
+ bio->bi_write_hint = iocb->ki_hint;
+ bio->bi_ioprio = iocb->ki_ioprio;
+ bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
+ if (iocb->ki_flags & IOCB_DSYNC)
+ bio->bi_opf |= REQ_FUA;
+
+ ret = bio_iov_iter_get_pages(bio, from);
+ if (unlikely(ret)) {
+ bio_io_error(bio);
+ return ret;
+ }
+ size = bio->bi_iter.bi_size;
+ task_io_account_write(ret);
+
+ if (iocb->ki_flags & IOCB_HIPRI)
+ bio_set_polled(bio, iocb);
+
+ ret = submit_bio_wait(bio);
+
+ bio_put(bio);
+
+ zonefs_file_write_dio_end_io(iocb, size, ret, 0);
+ if (ret >= 0) {
+ iocb->ki_pos += size;
+ return size;
+ }
+
+ return ret;
+ }
+
/*
* Handle direct writes. For sequential zone files, this is the only possible
* write path. For these files, check that the user is issuing writes
struct inode *inode = file_inode(iocb->ki_filp);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
+ bool sync = is_sync_kiocb(iocb);
+ bool append = false;
size_t count;
ssize_t ret;
* as this can cause write reordering (e.g. the first aio gets EAGAIN
* on the inode lock but the second goes through but is now unaligned).
*/
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) &&
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
(iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
}
/* Enforce sequential writes (append only) in sequential zones */
- mutex_lock(&zi->i_truncate_mutex);
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && iocb->ki_pos != zi->i_wpoffset) {
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
+ mutex_lock(&zi->i_truncate_mutex);
+ if (iocb->ki_pos != zi->i_wpoffset) {
+ mutex_unlock(&zi->i_truncate_mutex);
+ ret = -EINVAL;
+ goto inode_unlock;
+ }
mutex_unlock(&zi->i_truncate_mutex);
- ret = -EINVAL;
- goto inode_unlock;
+ append = sync;
}
- mutex_unlock(&zi->i_truncate_mutex);
- ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
- &zonefs_write_dio_ops, is_sync_kiocb(iocb));
+ if (append)
+ ret = zonefs_file_dio_append(iocb, from);
+ else
+ ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+ &zonefs_write_dio_ops, sync);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
struct page;
struct address_space;
struct writeback_control;
+struct readahead_control;
/*
* Write life time hint values.
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
+ void (*readahead)(struct readahead_control *);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
+ errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
__u32 handle_bytes;
int handle_type;
/* file identifier */
- unsigned char f_handle[0];
+ unsigned char f_handle[];
};
static inline struct file *get_file(struct file *f)
/* Being remounted read-only */
int s_readonly_remount;
+ /* per-sb errseq_t for reporting writeback errors via syncfs */
+ errseq_t s_wb_err;
+
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
-extern int vfs_whiteout(struct inode *, struct dentry *);
+
+static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+{
+ return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
int open_flag);
#ifdef CONFIG_BLOCK
extern int register_blkdev(unsigned int, const char *);
extern void unregister_blkdev(unsigned int, const char *);
- extern void bdev_unhash_inode(dev_t dev);
extern struct block_device *bdget(dev_t);
extern struct block_device *bdgrab(struct block_device *bdev);
extern void bd_set_size(struct block_device *, loff_t size);
extern int revalidate_disk(struct gendisk *);
extern int check_disk_change(struct block_device *);
extern int __invalidate_device(struct block_device *, bool);
- extern int invalidate_partition(struct gendisk *, int);
#endif
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
return errseq_sample(&mapping->wb_err);
}
+/**
+ * file_sample_sb_err - sample the current errseq_t to test for later errors
+ * @file: file to be sampled
+ *
+ * Grab the most current superblock-level errseq_t value for the given
+ * struct file.
+ */
+static inline errseq_t file_sample_sb_err(struct file *file)
+{
+ return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
+}
+
static inline int filemap_nr_thps(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS