(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET))
#define IS_READ_CAP_16(cdb) \
-((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0)
+((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0)
/* Request Sense Helper Macros */
#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \
nvme_offset += unit_num_blocks;
- nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
+ nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
if (nvme_sc != NVME_SC_SUCCESS) {
nvme_unmap_user_pages(dev,
(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
c.common.opcode = nvme_cmd_flush;
c.common.nsid = cpu_to_le32(ns->ns_id);
- nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
+ nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
if (res)
goto out;
c.common.opcode = nvme_cmd_flush;
c.common.nsid = cpu_to_le32(ns->ns_id);
- nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
+ nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
if (res)
c.dsm.nr = cpu_to_le32(ndesc - 1);
c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
- nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
+ nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
return -EFAULT;
+ /*
+ * Prime the hdr with good status for scsi commands that don't require
+ * an nvme command for translation.
+ */
+ retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS);
+ if (retcode)
+ return retcode;
+
opcode = cmd[0];
switch (opcode) {
case READ_CAPACITY:
retcode = nvme_trans_read_capacity(ns, hdr, cmd);
break;
- case SERVICE_ACTION_IN:
+ case SERVICE_ACTION_IN_16:
if (IS_READ_CAP_16(cmd))
retcode = nvme_trans_read_capacity(ns, hdr, cmd);
else
return retcode;
}
- #ifdef CONFIG_COMPAT
- typedef struct sg_io_hdr32 {
- compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
- compat_int_t dxfer_direction; /* [i] data transfer direction */
- unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */
- unsigned char mx_sb_len; /* [i] max length to write to sbp */
- unsigned short iovec_count; /* [i] 0 implies no scatter gather */
- compat_uint_t dxfer_len; /* [i] byte count of data transfer */
- compat_uint_t dxferp; /* [i], [*io] points to data transfer memory
- or scatter gather list */
- compat_uptr_t cmdp; /* [i], [*i] points to command to perform */
- compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */
- compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */
- compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */
- compat_int_t pack_id; /* [i->o] unused internally (normally) */
- compat_uptr_t usr_ptr; /* [i->o] unused internally */
- unsigned char status; /* [o] scsi status */
- unsigned char masked_status; /* [o] shifted, masked scsi status */
- unsigned char msg_status; /* [o] messaging level data (optional) */
- unsigned char sb_len_wr; /* [o] byte count actually written to sbp */
- unsigned short host_status; /* [o] errors from host adapter */
- unsigned short driver_status; /* [o] errors from software driver */
- compat_int_t resid; /* [o] dxfer_len - actual_transferred */
- compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */
- compat_uint_t info; /* [o] auxiliary information */
- } sg_io_hdr32_t; /* 64 bytes long (on sparc32) */
-
- typedef struct sg_iovec32 {
- compat_uint_t iov_base;
- compat_uint_t iov_len;
- } sg_iovec32_t;
-
- static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
- {
- sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
- sg_iovec32_t __user *iov32 = dxferp;
- int i;
-
- for (i = 0; i < iovec_count; i++) {
- u32 base, len;
-
- if (get_user(base, &iov32[i].iov_base) ||
- get_user(len, &iov32[i].iov_len) ||
- put_user(compat_ptr(base), &iov[i].iov_base) ||
- put_user(len, &iov[i].iov_len))
- return -EFAULT;
- }
-
- if (put_user(iov, &sgio->dxferp))
- return -EFAULT;
- return 0;
- }
-
- int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg)
- {
- sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg;
- sg_io_hdr_t __user *sgio;
- u16 iovec_count;
- u32 data;
- void __user *dxferp;
- int err;
- int interface_id;
-
- if (get_user(interface_id, &sgio32->interface_id))
- return -EFAULT;
- if (interface_id != 'S')
- return -EINVAL;
-
- if (get_user(iovec_count, &sgio32->iovec_count))
- return -EFAULT;
-
- {
- void __user *top = compat_alloc_user_space(0);
- void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
- (iovec_count * sizeof(sg_iovec_t)));
- if (new > top)
- return -EINVAL;
-
- sgio = new;
- }
-
- /* Ok, now construct. */
- if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
- (2 * sizeof(int)) +
- (2 * sizeof(unsigned char)) +
- (1 * sizeof(unsigned short)) +
- (1 * sizeof(unsigned int))))
- return -EFAULT;
-
- if (get_user(data, &sgio32->dxferp))
- return -EFAULT;
- dxferp = compat_ptr(data);
- if (iovec_count) {
- if (sg_build_iovec(sgio, dxferp, iovec_count))
- return -EFAULT;
- } else {
- if (put_user(dxferp, &sgio->dxferp))
- return -EFAULT;
- }
-
- {
- unsigned char __user *cmdp;
- unsigned char __user *sbp;
-
- if (get_user(data, &sgio32->cmdp))
- return -EFAULT;
- cmdp = compat_ptr(data);
-
- if (get_user(data, &sgio32->sbp))
- return -EFAULT;
- sbp = compat_ptr(data);
-
- if (put_user(cmdp, &sgio->cmdp) ||
- put_user(sbp, &sgio->sbp))
- return -EFAULT;
- }
-
- if (copy_in_user(&sgio->timeout, &sgio32->timeout,
- 3 * sizeof(int)))
- return -EFAULT;
-
- if (get_user(data, &sgio32->usr_ptr))
- return -EFAULT;
- if (put_user(compat_ptr(data), &sgio->usr_ptr))
- return -EFAULT;
-
- err = nvme_sg_io(ns, sgio);
- if (err >= 0) {
- void __user *datap;
-
- if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
- sizeof(int)) ||
- get_user(datap, &sgio->usr_ptr) ||
- put_user((u32)(unsigned long)datap,
- &sgio32->usr_ptr) ||
- copy_in_user(&sgio32->status, &sgio->status,
- (4 * sizeof(unsigned char)) +
- (2 * sizeof(unsigned short)) +
- (3 * sizeof(int))))
- err = -EFAULT;
- }
-
- return err;
- }
- #endif
-
int nvme_sg_get_version_num(int __user *ip)
{
return put_user(sg_version_num, ip);
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
+#include <linux/wait.h>
#include <trace/events/block.h>
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
#define DMF_DEFERRED_REMOVE 7
+#define DMF_SUSPENDED_INTERNALLY 8
/*
* A dummy definition to make RCU happy.
* Use dm_get_live_table{_fast} or take suspend_lock for
* dereference.
*/
- struct dm_table *map;
+ struct dm_table __rcu *map;
struct list_head table_devices;
struct mutex table_devices_lock;
goto out;
tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->ioctl)
+ goto out;
if (dm_suspended_md(md)) {
r = -EAGAIN;
goto out;
}
- if (tgt->type->ioctl)
- r = tgt->type->ioctl(tgt, cmd, arg);
+ r = tgt->type->ioctl(tgt, cmd, arg);
out:
dm_put_live_table(md, srcu_idx);
struct mapped_device *md = io->md;
struct bio *bio = io->bio;
unsigned long duration = jiffies - io->start_time;
- int pending, cpu;
+ int pending;
int rw = bio_data_dir(bio);
- cpu = part_stat_lock();
- part_round_stats(cpu, &dm_disk(md)->part0);
- part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
- part_stat_unlock();
+ generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
* Find maximum amount of I/O that won't need splitting
*/
max_sectors = min(max_io_len(bvm->bi_sector, ti),
- (sector_t) BIO_MAX_SECTORS);
+ (sector_t) queue_max_sectors(q));
max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
- if (max_size < 0)
+ if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
max_size = 0;
/*
max_size = ti->type->merge(ti, bvm, biovec, max_size);
/*
* If the target doesn't support merge method and some of the devices
- * provided their merge_bvec method (we know this by looking at
- * queue_max_hw_sectors), then we can't allow bios with multiple vector
- * entries. So always set max_size to 0, and the code below allows
- * just one page.
+ * provided their merge_bvec method (we know this by looking for the
+ * max_hw_sectors that dm_set_device_limits may set), then we can't
+ * allow bios with multiple vector entries. So always set max_size
+ * to 0, and the code below allows just one page.
*/
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
max_size = 0;
{
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
- int cpu;
int srcu_idx;
struct dm_table *map;
map = dm_get_live_table(md, &srcu_idx);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
- part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
- part_stat_unlock();
+ generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
/* if we're suspended, we have to queue this io for later */
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
merge_is_optional = dm_table_merge_is_optional(t);
- old_map = md->map;
+ old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
else
clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
- dm_sync_table(md);
+ if (old_map)
+ dm_sync_table(md);
return old_map;
}
*/
static struct dm_table *__unbind(struct mapped_device *md)
{
- struct dm_table *map = md->map;
+ struct dm_table *map = rcu_dereference_protected(md->map, 1);
if (!map)
return NULL;
}
/*
- * We need to be able to change a mapping table under a mounted
- * filesystem. For example we might want to move some data in
- * the background. Before the table can be swapped with
- * dm_bind_table, dm_suspend must be called to flush any in
- * flight bios and ensure that any further io gets deferred.
- */
-/*
- * Suspend mechanism in request-based dm.
+ * If __dm_suspend returns 0, the device is completely quiescent
+ * now. There is no request-processing activity. All new requests
+ * are being added to md->deferred list.
*
- * 1. Flush all I/Os by lock_fs() if needed.
- * 2. Stop dispatching any I/O by stopping the request_queue.
- * 3. Wait for all in-flight I/Os to be completed or requeued.
- *
- * To abort suspend, start the request_queue.
+ * Caller must hold md->suspend_lock
*/
-int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
+ unsigned suspend_flags, int interruptible)
{
- struct dm_table *map = NULL;
- int r = 0;
- int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
- int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
-
- mutex_lock(&md->suspend_lock);
-
- if (dm_suspended_md(md)) {
- r = -EINVAL;
- goto out_unlock;
- }
-
- map = md->map;
+ bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
+ bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
+ int r;
/*
* DMF_NOFLUSH_SUSPENDING must be set before presuspend.
if (noflush)
set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- /* This does not get reverted if there's an error later. */
+ /*
+ * This gets reverted if there's an error later and the targets
+ * provide the .presuspend_undo hook.
+ */
dm_table_presuspend_targets(map);
/*
*/
if (!noflush && do_lockfs) {
r = lock_fs(md);
- if (r)
- goto out_unlock;
+ if (r) {
+ dm_table_presuspend_undo_targets(map);
+ return r;
+ }
}
/*
* flush_workqueue(md->wq).
*/
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
- synchronize_srcu(&md->io_barrier);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
/*
* Stop md->queue before flushing md->wq in case request-based
* We call dm_wait_for_completion to wait for all existing requests
* to finish.
*/
- r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
+ r = dm_wait_for_completion(md, interruptible);
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
- synchronize_srcu(&md->io_barrier);
+ if (map)
+ synchronize_srcu(&md->io_barrier);
/* were we interrupted ? */
if (r < 0) {
start_queue(md->queue);
unlock_fs(md);
- goto out_unlock; /* pushback list is already flushed, so skip flush */
+ dm_table_presuspend_undo_targets(map);
+ /* pushback list is already flushed, so skip flush */
}
- /*
- * If dm_wait_for_completion returned 0, the device is completely
- * quiescent now. There is no request-processing activity. All new
- * requests are being added to md->deferred list.
- */
+ return r;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem. For example we might want to move some data in
+ * the background. Before the table can be swapped with
+ * dm_bind_table, dm_suspend must be called to flush any in
+ * flight bios and ensure that any further io gets deferred.
+ */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
+ *
+ * To abort suspend, start the request_queue.
+ */
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+ struct dm_table *map = NULL;
+ int r = 0;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (dm_suspended_md(md)) {
+ r = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
+ if (r)
+ goto out_unlock;
set_bit(DMF_SUSPENDED, &md->flags);
return r;
}
-int dm_resume(struct mapped_device *md)
+static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
- int r = -EINVAL;
- struct dm_table *map = NULL;
-
- mutex_lock(&md->suspend_lock);
- if (!dm_suspended_md(md))
- goto out;
-
- map = md->map;
- if (!map || !dm_table_get_size(map))
- goto out;
-
- r = dm_table_resume_targets(map);
- if (r)
- goto out;
+ if (map) {
+ int r = dm_table_resume_targets(map);
+ if (r)
+ return r;
+ }
dm_queue_flush(md);
unlock_fs(md);
+ return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+ int r = -EINVAL;
+ struct dm_table *map = NULL;
+
+retry:
+ mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+ if (!dm_suspended_md(md))
+ goto out;
+
+ if (dm_suspended_internally_md(md)) {
+ /* already internally suspended, wait for internal resume */
+ mutex_unlock(&md->suspend_lock);
+ r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+ if (r)
+ return r;
+ goto retry;
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ if (!map || !dm_table_get_size(map))
+ goto out;
+
+ r = __dm_resume(md, map);
+ if (r)
+ goto out;
+
clear_bit(DMF_SUSPENDED, &md->flags);
r = 0;
* Internal suspend/resume works like userspace-driven suspend. It waits
* until all bios finish and prevents issuing new bios to the target drivers.
* It may be used only from the kernel.
- *
- * Internal suspend holds md->suspend_lock, which prevents interaction with
- * userspace-driven suspend.
*/
-void dm_internal_suspend(struct mapped_device *md)
+static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
- mutex_lock(&md->suspend_lock);
+ struct dm_table *map = NULL;
+
+ if (dm_suspended_internally_md(md))
+ return; /* nested internal suspend */
+
+ if (dm_suspended_md(md)) {
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ return; /* nest suspend */
+ }
+
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+ /*
+ * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
+ * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
+ * would require changing .presuspend to return an error -- avoid this
+ * until there is a need for more elaborate variants of internal suspend.
+ */
+ (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
+
+ set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+
+ dm_table_postsuspend_targets(map);
+}
+
+static void __dm_internal_resume(struct mapped_device *md)
+{
+ if (!dm_suspended_internally_md(md))
+ return; /* resume from nested internal suspend */
+
if (dm_suspended_md(md))
+ goto done; /* resume from nested suspend */
+
+ /*
+ * NOTE: existing callers don't need to call dm_table_resume_targets
+ * (which may fail -- so best to avoid it for now by passing NULL map)
+ */
+ (void) __dm_resume(md, NULL);
+
+done:
+ clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
+}
+
+void dm_internal_suspend_noflush(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
+
+void dm_internal_resume(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ __dm_internal_resume(md);
+ mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume);
+
+/*
+ * Fast variants of internal suspend/resume hold md->suspend_lock,
+ * which prevents interaction with userspace-driven suspend.
+ */
+
+void dm_internal_suspend_fast(struct mapped_device *md)
+{
+ mutex_lock(&md->suspend_lock);
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
return;
set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
-void dm_internal_resume(struct mapped_device *md)
+void dm_internal_resume_fast(struct mapped_device *md)
{
- if (dm_suspended_md(md))
+ if (dm_suspended_md(md) || dm_suspended_internally_md(md))
goto done;
dm_queue_flush(md);
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_suspended_internally_md(struct mapped_device *md)
+{
+ return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+}
+
int dm_test_deferred_remove_flag(struct mapped_device *md)
{
return test_bit(DMF_DEFERRED_REMOVE, &md->flags);