[linux.git] / drivers / nvme / host / core.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/async.h>
8 #include <linux/blkdev.h>
9 #include <linux/blk-mq.h>
10 #include <linux/blk-integrity.h>
11 #include <linux/compat.h>
12 #include <linux/delay.h>
13 #include <linux/errno.h>
14 #include <linux/hdreg.h>
15 #include <linux/kernel.h>
16 #include <linux/module.h>
17 #include <linux/backing-dev.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/pr.h>
21 #include <linux/ptrace.h>
22 #include <linux/nvme_ioctl.h>
23 #include <linux/pm_qos.h>
24 #include <linux/ratelimit.h>
25 #include <asm/unaligned.h>
26
27 #include "nvme.h"
28 #include "fabrics.h"
29 #include <linux/nvme-auth.h>
30
31 #define CREATE_TRACE_POINTS
32 #include "trace.h"
33
34 #define NVME_MINORS             (1U << MINORBITS)
35
36 struct nvme_ns_info {
37         struct nvme_ns_ids ids;
38         u32 nsid;
39         __le32 anagrpid;
40         u8 pi_offset;
41         bool is_shared;
42         bool is_readonly;
43         bool is_ready;
44         bool is_removed;
45 };
46
47 unsigned int admin_timeout = 60;
48 module_param(admin_timeout, uint, 0644);
49 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
50 EXPORT_SYMBOL_GPL(admin_timeout);
51
52 unsigned int nvme_io_timeout = 30;
53 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
54 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
55 EXPORT_SYMBOL_GPL(nvme_io_timeout);
56
57 static unsigned char shutdown_timeout = 5;
58 module_param(shutdown_timeout, byte, 0644);
59 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
60
61 static u8 nvme_max_retries = 5;
62 module_param_named(max_retries, nvme_max_retries, byte, 0644);
63 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
64
65 static unsigned long default_ps_max_latency_us = 100000;
66 module_param(default_ps_max_latency_us, ulong, 0644);
67 MODULE_PARM_DESC(default_ps_max_latency_us,
68                  "max power saving latency for new devices; use PM QOS to change per device");
69
70 static bool force_apst;
71 module_param(force_apst, bool, 0644);
72 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
73
74 static unsigned long apst_primary_timeout_ms = 100;
75 module_param(apst_primary_timeout_ms, ulong, 0644);
76 MODULE_PARM_DESC(apst_primary_timeout_ms,
77         "primary APST timeout in ms");
78
79 static unsigned long apst_secondary_timeout_ms = 2000;
80 module_param(apst_secondary_timeout_ms, ulong, 0644);
81 MODULE_PARM_DESC(apst_secondary_timeout_ms,
82         "secondary APST timeout in ms");
83
84 static unsigned long apst_primary_latency_tol_us = 15000;
85 module_param(apst_primary_latency_tol_us, ulong, 0644);
86 MODULE_PARM_DESC(apst_primary_latency_tol_us,
87         "primary APST latency tolerance in us");
88
89 static unsigned long apst_secondary_latency_tol_us = 100000;
90 module_param(apst_secondary_latency_tol_us, ulong, 0644);
91 MODULE_PARM_DESC(apst_secondary_latency_tol_us,
92         "secondary APST latency tolerance in us");
93
94 /*
95  * nvme_wq - hosts nvme related works that are not reset or delete
96  * nvme_reset_wq - hosts nvme reset works
97  * nvme_delete_wq - hosts nvme delete works
98  *
99  * nvme_wq hosts works such as scan, AEN handling, fw activation,
100  * keep-alive and periodic reconnects. nvme_reset_wq runs reset works,
101  * which also flush works hosted on nvme_wq for serialization purposes.
102  * nvme_delete_wq hosts controller deletion works, which flush reset
103  * works for serialization.
104  */
105 struct workqueue_struct *nvme_wq;
106 EXPORT_SYMBOL_GPL(nvme_wq);
107
108 struct workqueue_struct *nvme_reset_wq;
109 EXPORT_SYMBOL_GPL(nvme_reset_wq);
110
111 struct workqueue_struct *nvme_delete_wq;
112 EXPORT_SYMBOL_GPL(nvme_delete_wq);
113
114 static LIST_HEAD(nvme_subsystems);
115 DEFINE_MUTEX(nvme_subsystems_lock);
116
117 static DEFINE_IDA(nvme_instance_ida);
118 static dev_t nvme_ctrl_base_chr_devt;
119 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
120 static const struct class nvme_class = {
121         .name = "nvme",
122         .dev_uevent = nvme_class_uevent,
123 };
124
125 static const struct class nvme_subsys_class = {
126         .name = "nvme-subsystem",
127 };
128
129 static DEFINE_IDA(nvme_ns_chr_minor_ida);
130 static dev_t nvme_ns_chr_devt;
131 static const struct class nvme_ns_chr_class = {
132         .name = "nvme-generic",
133 };
134
135 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
136 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
137                                            unsigned nsid);
138 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
139                                    struct nvme_command *cmd);
140
141 void nvme_queue_scan(struct nvme_ctrl *ctrl)
142 {
143         /*
144          * Only queue new scan work when the admin and I/O queues are both alive
145          */
146         if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE && ctrl->tagset)
147                 queue_work(nvme_wq, &ctrl->scan_work);
148 }
149
150 /*
151  * Use this function to proceed with scheduling reset_work for a controller
152  * that had previously been set to the resetting state. This is intended for
153  * code paths that can't be interrupted by other reset attempts. A hot removal
154  * may prevent this from succeeding.
155  */
156 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
157 {
158         if (nvme_ctrl_state(ctrl) != NVME_CTRL_RESETTING)
159                 return -EBUSY;
160         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
161                 return -EBUSY;
162         return 0;
163 }
164 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
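/*
 * Editor's sketch (not part of the original source): nvme_reset_ctrl() below
 * performs the transition to NVME_CTRL_RESETTING itself, while the helper
 * above only queues reset_work for a controller that is already in that
 * state.  A caller that has made the transition on its own would do roughly:
 *
 *	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 *		nvme_try_sched_reset(ctrl);
 *
 * and must be prepared for -EBUSY, e.g. when a hot removal races with the
 * reset.
 */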
165
166 static void nvme_failfast_work(struct work_struct *work)
167 {
168         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
169                         struct nvme_ctrl, failfast_work);
170
171         if (nvme_ctrl_state(ctrl) != NVME_CTRL_CONNECTING)
172                 return;
173
174         set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
175         dev_info(ctrl->device, "failfast expired\n");
176         nvme_kick_requeue_lists(ctrl);
177 }
178
179 static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
180 {
181         if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
182                 return;
183
184         schedule_delayed_work(&ctrl->failfast_work,
185                               ctrl->opts->fast_io_fail_tmo * HZ);
186 }
187
188 static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
189 {
190         if (!ctrl->opts)
191                 return;
192
193         cancel_delayed_work_sync(&ctrl->failfast_work);
194         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
195 }
196
197
198 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
199 {
200         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
201                 return -EBUSY;
202         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
203                 return -EBUSY;
204         return 0;
205 }
206 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
207
208 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
209 {
210         int ret;
211
212         ret = nvme_reset_ctrl(ctrl);
213         if (!ret) {
214                 flush_work(&ctrl->reset_work);
215                 if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
216                         ret = -ENETRESET;
217         }
218
219         return ret;
220 }
221
222 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
223 {
224         dev_info(ctrl->device,
225                  "Removing ctrl: NQN \"%s\"\n", nvmf_ctrl_subsysnqn(ctrl));
226
227         flush_work(&ctrl->reset_work);
228         nvme_stop_ctrl(ctrl);
229         nvme_remove_namespaces(ctrl);
230         ctrl->ops->delete_ctrl(ctrl);
231         nvme_uninit_ctrl(ctrl);
232 }
233
234 static void nvme_delete_ctrl_work(struct work_struct *work)
235 {
236         struct nvme_ctrl *ctrl =
237                 container_of(work, struct nvme_ctrl, delete_work);
238
239         nvme_do_delete_ctrl(ctrl);
240 }
241
242 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
243 {
244         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
245                 return -EBUSY;
246         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
247                 return -EBUSY;
248         return 0;
249 }
250 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
251
252 void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
253 {
254         /*
255          * Keep a reference until nvme_do_delete_ctrl() completes,
256          * since ->delete_ctrl can free the controller.
257          */
258         nvme_get_ctrl(ctrl);
259         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
260                 nvme_do_delete_ctrl(ctrl);
261         nvme_put_ctrl(ctrl);
262 }
263
264 static blk_status_t nvme_error_status(u16 status)
265 {
266         switch (status & NVME_SCT_SC_MASK) {
267         case NVME_SC_SUCCESS:
268                 return BLK_STS_OK;
269         case NVME_SC_CAP_EXCEEDED:
270                 return BLK_STS_NOSPC;
271         case NVME_SC_LBA_RANGE:
272         case NVME_SC_CMD_INTERRUPTED:
273         case NVME_SC_NS_NOT_READY:
274                 return BLK_STS_TARGET;
275         case NVME_SC_BAD_ATTRIBUTES:
276         case NVME_SC_ONCS_NOT_SUPPORTED:
277         case NVME_SC_INVALID_OPCODE:
278         case NVME_SC_INVALID_FIELD:
279         case NVME_SC_INVALID_NS:
280                 return BLK_STS_NOTSUPP;
281         case NVME_SC_WRITE_FAULT:
282         case NVME_SC_READ_ERROR:
283         case NVME_SC_UNWRITTEN_BLOCK:
284         case NVME_SC_ACCESS_DENIED:
285         case NVME_SC_READ_ONLY:
286         case NVME_SC_COMPARE_FAILED:
287                 return BLK_STS_MEDIUM;
288         case NVME_SC_GUARD_CHECK:
289         case NVME_SC_APPTAG_CHECK:
290         case NVME_SC_REFTAG_CHECK:
291         case NVME_SC_INVALID_PI:
292                 return BLK_STS_PROTECTION;
293         case NVME_SC_RESERVATION_CONFLICT:
294                 return BLK_STS_RESV_CONFLICT;
295         case NVME_SC_HOST_PATH_ERROR:
296                 return BLK_STS_TRANSPORT;
297         case NVME_SC_ZONE_TOO_MANY_ACTIVE:
298                 return BLK_STS_ZONE_ACTIVE_RESOURCE;
299         case NVME_SC_ZONE_TOO_MANY_OPEN:
300                 return BLK_STS_ZONE_OPEN_RESOURCE;
301         default:
302                 return BLK_STS_IOERR;
303         }
304 }
305
306 static void nvme_retry_req(struct request *req)
307 {
308         unsigned long delay = 0;
309         u16 crd;
310
311         /* The mask and shift result must be <= 3 */
312         crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
313         if (crd)
314                 delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
315
316         nvme_req(req)->retries++;
317         blk_mq_requeue_request(req, false);
318         blk_mq_delay_kick_requeue_list(req->q, delay);
319 }
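/*
 * Editor's example of the delay calculation above: a CQE status with
 * CRD == 2 selects crdt[1] from Identify Controller; if that field is 5,
 * the requeue is delayed by 5 * 100 = 500 milliseconds, matching the
 * 100 millisecond units of the Command Retry Delay Times.
 */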
320
321 static void nvme_log_error(struct request *req)
322 {
323         struct nvme_ns *ns = req->q->queuedata;
324         struct nvme_request *nr = nvme_req(req);
325
326         if (ns) {
327                 pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
328                        ns->disk ? ns->disk->disk_name : "?",
329                        nvme_get_opcode_str(nr->cmd->common.opcode),
330                        nr->cmd->common.opcode,
331                        nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
332                        blk_rq_bytes(req) >> ns->head->lba_shift,
333                        nvme_get_error_status_str(nr->status),
334                        NVME_SCT(nr->status),            /* Status Code Type */
335                        nr->status & NVME_SC_MASK,       /* Status Code */
336                        nr->status & NVME_STATUS_MORE ? "MORE " : "",
337                        nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
338                 return;
339         }
340
341         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n",
342                            dev_name(nr->ctrl->device),
343                            nvme_get_admin_opcode_str(nr->cmd->common.opcode),
344                            nr->cmd->common.opcode,
345                            nvme_get_error_status_str(nr->status),
346                            NVME_SCT(nr->status),        /* Status Code Type */
347                            nr->status & NVME_SC_MASK,   /* Status Code */
348                            nr->status & NVME_STATUS_MORE ? "MORE " : "",
349                            nr->status & NVME_STATUS_DNR  ? "DNR "  : "");
350 }
351
352 static void nvme_log_err_passthru(struct request *req)
353 {
354         struct nvme_ns *ns = req->q->queuedata;
355         struct nvme_request *nr = nvme_req(req);
356
357         pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s"
358                 "cdw10=0x%x cdw11=0x%x cdw12=0x%x cdw13=0x%x cdw14=0x%x cdw15=0x%x\n",
359                 ns ? ns->disk->disk_name : dev_name(nr->ctrl->device),
360                 ns ? nvme_get_opcode_str(nr->cmd->common.opcode) :
361                      nvme_get_admin_opcode_str(nr->cmd->common.opcode),
362                 nr->cmd->common.opcode,
363                 nvme_get_error_status_str(nr->status),
364                 NVME_SCT(nr->status),           /* Status Code Type */
365                 nr->status & NVME_SC_MASK,      /* Status Code */
366                 nr->status & NVME_STATUS_MORE ? "MORE " : "",
367                 nr->status & NVME_STATUS_DNR  ? "DNR "  : "",
368                 nr->cmd->common.cdw10,
369                 nr->cmd->common.cdw11,
370                 nr->cmd->common.cdw12,
371                 nr->cmd->common.cdw13,
372                 nr->cmd->common.cdw14,
373                 nr->cmd->common.cdw15);
374 }
375
376 enum nvme_disposition {
377         COMPLETE,
378         RETRY,
379         FAILOVER,
380         AUTHENTICATE,
381 };
382
383 static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
384 {
385         if (likely(nvme_req(req)->status == 0))
386                 return COMPLETE;
387
388         if (blk_noretry_request(req) ||
389             (nvme_req(req)->status & NVME_STATUS_DNR) ||
390             nvme_req(req)->retries >= nvme_max_retries)
391                 return COMPLETE;
392
393         if ((nvme_req(req)->status & NVME_SCT_SC_MASK) == NVME_SC_AUTH_REQUIRED)
394                 return AUTHENTICATE;
395
396         if (req->cmd_flags & REQ_NVME_MPATH) {
397                 if (nvme_is_path_error(nvme_req(req)->status) ||
398                     blk_queue_dying(req->q))
399                         return FAILOVER;
400         } else {
401                 if (blk_queue_dying(req->q))
402                         return COMPLETE;
403         }
404
405         return RETRY;
406 }
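/*
 * Editor's summary of the checks above: successful commands, commands marked
 * DNR or noretry, commands that exhausted nvme_max_retries, and commands on
 * a dying non-multipath queue complete; NVME_SC_AUTH_REQUIRED triggers
 * re-authentication; path errors or a dying queue on multipath requests fail
 * over; everything else is retried.
 */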
407
408 static inline void nvme_end_req_zoned(struct request *req)
409 {
410         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
411             req_op(req) == REQ_OP_ZONE_APPEND) {
412                 struct nvme_ns *ns = req->q->queuedata;
413
414                 req->__sector = nvme_lba_to_sect(ns->head,
415                         le64_to_cpu(nvme_req(req)->result.u64));
416         }
417 }
418
419 static inline void __nvme_end_req(struct request *req)
420 {
421         nvme_end_req_zoned(req);
422         nvme_trace_bio_complete(req);
423         if (req->cmd_flags & REQ_NVME_MPATH)
424                 nvme_mpath_end_request(req);
425 }
426
427 void nvme_end_req(struct request *req)
428 {
429         blk_status_t status = nvme_error_status(nvme_req(req)->status);
430
431         if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
432                 if (blk_rq_is_passthrough(req))
433                         nvme_log_err_passthru(req);
434                 else
435                         nvme_log_error(req);
436         }
437         __nvme_end_req(req);
438         blk_mq_end_request(req, status);
439 }
440
441 void nvme_complete_rq(struct request *req)
442 {
443         struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
444
445         trace_nvme_complete_rq(req);
446         nvme_cleanup_cmd(req);
447
448         /*
449          * Completions of long-running commands should not be able to
450          * defer sending of periodic keep alives, since the controller
451          * may have completed processing such commands a long time ago
452          * (arbitrarily close to command submission time).
453          * req->deadline - req->timeout is the command submission time
454          * in jiffies.
455          */
456         if (ctrl->kas &&
457             req->deadline - req->timeout >= ctrl->ka_last_check_time)
458                 ctrl->comp_seen = true;
459
460         switch (nvme_decide_disposition(req)) {
461         case COMPLETE:
462                 nvme_end_req(req);
463                 return;
464         case RETRY:
465                 nvme_retry_req(req);
466                 return;
467         case FAILOVER:
468                 nvme_failover_req(req);
469                 return;
470         case AUTHENTICATE:
471 #ifdef CONFIG_NVME_HOST_AUTH
472                 queue_work(nvme_wq, &ctrl->dhchap_auth_work);
473                 nvme_retry_req(req);
474 #else
475                 nvme_end_req(req);
476 #endif
477                 return;
478         }
479 }
480 EXPORT_SYMBOL_GPL(nvme_complete_rq);
481
482 void nvme_complete_batch_req(struct request *req)
483 {
484         trace_nvme_complete_rq(req);
485         nvme_cleanup_cmd(req);
486         __nvme_end_req(req);
487 }
488 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
489
490 /*
491  * Called to unwind from ->queue_rq on a failed command submission so that the
492  * multipathing code gets called to potentially fail over to another path.
493  * The caller needs to unwind all transport specific resource allocations and
494  * must propagate the return value.
495  */
496 blk_status_t nvme_host_path_error(struct request *req)
497 {
498         nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
499         blk_mq_set_request_complete(req);
500         nvme_complete_rq(req);
501         return BLK_STS_OK;
502 }
503 EXPORT_SYMBOL_GPL(nvme_host_path_error);
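/*
 * Editor's sketch of the intended calling convention (illustrative only,
 * using hypothetical transport helpers): a fabrics ->queue_rq() that fails
 * after nvme_setup_cmd() unwinds its own allocations and returns the value
 * of nvme_host_path_error() so multipath can retry on another path:
 *
 *	ret = my_transport_send(queue, req);		// hypothetical
 *	if (unlikely(ret)) {
 *		my_transport_unmap_data(queue, req);	// hypothetical
 *		return nvme_host_path_error(req);
 *	}
 */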
504
505 bool nvme_cancel_request(struct request *req, void *data)
506 {
507         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
508                                 "Cancelling I/O %d", req->tag);
509
510         /* don't abort a completed or idle request */
511         if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
512                 return true;
513
514         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
515         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
516         blk_mq_complete_request(req);
517         return true;
518 }
519 EXPORT_SYMBOL_GPL(nvme_cancel_request);
520
521 void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
522 {
523         if (ctrl->tagset) {
524                 blk_mq_tagset_busy_iter(ctrl->tagset,
525                                 nvme_cancel_request, ctrl);
526                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
527         }
528 }
529 EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
530
531 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
532 {
533         if (ctrl->admin_tagset) {
534                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
535                                 nvme_cancel_request, ctrl);
536                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
537         }
538 }
539 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
540
541 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
542                 enum nvme_ctrl_state new_state)
543 {
544         enum nvme_ctrl_state old_state;
545         unsigned long flags;
546         bool changed = false;
547
548         spin_lock_irqsave(&ctrl->lock, flags);
549
550         old_state = nvme_ctrl_state(ctrl);
551         switch (new_state) {
552         case NVME_CTRL_LIVE:
553                 switch (old_state) {
554                 case NVME_CTRL_NEW:
555                 case NVME_CTRL_RESETTING:
556                 case NVME_CTRL_CONNECTING:
557                         changed = true;
558                         fallthrough;
559                 default:
560                         break;
561                 }
562                 break;
563         case NVME_CTRL_RESETTING:
564                 switch (old_state) {
565                 case NVME_CTRL_NEW:
566                 case NVME_CTRL_LIVE:
567                         changed = true;
568                         fallthrough;
569                 default:
570                         break;
571                 }
572                 break;
573         case NVME_CTRL_CONNECTING:
574                 switch (old_state) {
575                 case NVME_CTRL_NEW:
576                 case NVME_CTRL_RESETTING:
577                         changed = true;
578                         fallthrough;
579                 default:
580                         break;
581                 }
582                 break;
583         case NVME_CTRL_DELETING:
584                 switch (old_state) {
585                 case NVME_CTRL_LIVE:
586                 case NVME_CTRL_RESETTING:
587                 case NVME_CTRL_CONNECTING:
588                         changed = true;
589                         fallthrough;
590                 default:
591                         break;
592                 }
593                 break;
594         case NVME_CTRL_DELETING_NOIO:
595                 switch (old_state) {
596                 case NVME_CTRL_DELETING:
597                 case NVME_CTRL_DEAD:
598                         changed = true;
599                         fallthrough;
600                 default:
601                         break;
602                 }
603                 break;
604         case NVME_CTRL_DEAD:
605                 switch (old_state) {
606                 case NVME_CTRL_DELETING:
607                         changed = true;
608                         fallthrough;
609                 default:
610                         break;
611                 }
612                 break;
613         default:
614                 break;
615         }
616
617         if (changed) {
618                 WRITE_ONCE(ctrl->state, new_state);
619                 wake_up_all(&ctrl->state_wq);
620         }
621
622         spin_unlock_irqrestore(&ctrl->lock, flags);
623         if (!changed)
624                 return false;
625
626         if (new_state == NVME_CTRL_LIVE) {
627                 if (old_state == NVME_CTRL_CONNECTING)
628                         nvme_stop_failfast_work(ctrl);
629                 nvme_kick_requeue_lists(ctrl);
630         } else if (new_state == NVME_CTRL_CONNECTING &&
631                 old_state == NVME_CTRL_RESETTING) {
632                 nvme_start_failfast_work(ctrl);
633         }
634         return changed;
635 }
636 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
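/*
 * Editor's summary of the transitions accepted by the switch above:
 *
 *	new state	accepted old states
 *	LIVE		NEW, RESETTING, CONNECTING
 *	RESETTING	NEW, LIVE
 *	CONNECTING	NEW, RESETTING
 *	DELETING	LIVE, RESETTING, CONNECTING
 *	DELETING_NOIO	DELETING, DEAD
 *	DEAD		DELETING
 *
 * Everything else is rejected and the state is left unchanged.
 */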
637
638 /*
639  * Waits for the controller state to be resetting, or returns false if it is
640  * not possible to ever transition to that state.
641  */
642 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
643 {
644         wait_event(ctrl->state_wq,
645                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
646                    nvme_state_terminal(ctrl));
647         return nvme_ctrl_state(ctrl) == NVME_CTRL_RESETTING;
648 }
649 EXPORT_SYMBOL_GPL(nvme_wait_reset);
650
651 static void nvme_free_ns_head(struct kref *ref)
652 {
653         struct nvme_ns_head *head =
654                 container_of(ref, struct nvme_ns_head, ref);
655
656         nvme_mpath_remove_disk(head);
657         ida_free(&head->subsys->ns_ida, head->instance);
658         cleanup_srcu_struct(&head->srcu);
659         nvme_put_subsystem(head->subsys);
660         kfree(head);
661 }
662
663 bool nvme_tryget_ns_head(struct nvme_ns_head *head)
664 {
665         return kref_get_unless_zero(&head->ref);
666 }
667
668 void nvme_put_ns_head(struct nvme_ns_head *head)
669 {
670         kref_put(&head->ref, nvme_free_ns_head);
671 }
672
673 static void nvme_free_ns(struct kref *kref)
674 {
675         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
676
677         put_disk(ns->disk);
678         nvme_put_ns_head(ns->head);
679         nvme_put_ctrl(ns->ctrl);
680         kfree(ns);
681 }
682
683 bool nvme_get_ns(struct nvme_ns *ns)
684 {
685         return kref_get_unless_zero(&ns->kref);
686 }
687
688 void nvme_put_ns(struct nvme_ns *ns)
689 {
690         kref_put(&ns->kref, nvme_free_ns);
691 }
692 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
693
694 static inline void nvme_clear_nvme_request(struct request *req)
695 {
696         nvme_req(req)->status = 0;
697         nvme_req(req)->retries = 0;
698         nvme_req(req)->flags = 0;
699         req->rq_flags |= RQF_DONTPREP;
700 }
701
702 /* initialize a passthrough request */
703 void nvme_init_request(struct request *req, struct nvme_command *cmd)
704 {
705         struct nvme_request *nr = nvme_req(req);
706         bool logging_enabled;
707
708         if (req->q->queuedata) {
709                 struct nvme_ns *ns = req->q->disk->private_data;
710
711                 logging_enabled = ns->head->passthru_err_log_enabled;
712                 req->timeout = NVME_IO_TIMEOUT;
713         } else { /* no queuedata implies admin queue */
714                 logging_enabled = nr->ctrl->passthru_err_log_enabled;
715                 req->timeout = NVME_ADMIN_TIMEOUT;
716         }
717
718         if (!logging_enabled)
719                 req->rq_flags |= RQF_QUIET;
720
721         /* passthru commands should let the driver set the SGL flags */
722         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
723
724         req->cmd_flags |= REQ_FAILFAST_DRIVER;
725         if (req->mq_hctx->type == HCTX_TYPE_POLL)
726                 req->cmd_flags |= REQ_POLLED;
727         nvme_clear_nvme_request(req);
728         memcpy(nr->cmd, cmd, sizeof(*cmd));
729 }
730 EXPORT_SYMBOL_GPL(nvme_init_request);
731
732 /*
733  * For a command we're not in a state to send to the device, the default
734  * action is to busy it and retry it after the controller state is recovered.
735  * However, if the controller is deleting, or if the request is marked for
736  * failfast or is an nvme multipath request, it is failed immediately.
737  *
738  * Note: commands used to initialize the controller will be marked for failfast.
739  * Note: nvme cli/ioctl commands are marked for failfast.
740  */
741 blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
742                 struct request *rq)
743 {
744         enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
745
746         if (state != NVME_CTRL_DELETING_NOIO &&
747             state != NVME_CTRL_DELETING &&
748             state != NVME_CTRL_DEAD &&
749             !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
750             !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
751                 return BLK_STS_RESOURCE;
752         return nvme_host_path_error(rq);
753 }
754 EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
755
756 bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
757                 bool queue_live, enum nvme_ctrl_state state)
758 {
759         struct nvme_request *req = nvme_req(rq);
760
761         /*
762          * Currently we have a problem sending passthru commands
763          * on the admin_q if the controller is not LIVE because we can't
764          * make sure that they are going out after the admin connect,
765          * controller enable and/or other commands in the initialization
766          * sequence. Until the controller is LIVE, fail with
767          * BLK_STS_RESOURCE so that they will be rescheduled.
768          */
769         if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
770                 return false;
771
772         if (ctrl->ops->flags & NVME_F_FABRICS) {
773                 /*
774                  * Only allow commands on a live queue, except for the connect
775                  * command, which is required to set the queue live in the
776                  * appropriate states.
777                  */
778                 switch (state) {
779                 case NVME_CTRL_CONNECTING:
780                         if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
781                             (req->cmd->fabrics.fctype == nvme_fabrics_type_connect ||
782                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send ||
783                              req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive))
784                                 return true;
785                         break;
786                 default:
787                         break;
788                 case NVME_CTRL_DEAD:
789                         return false;
790                 }
791         }
792
793         return queue_live;
794 }
795 EXPORT_SYMBOL_GPL(__nvme_check_ready);
796
797 static inline void nvme_setup_flush(struct nvme_ns *ns,
798                 struct nvme_command *cmnd)
799 {
800         memset(cmnd, 0, sizeof(*cmnd));
801         cmnd->common.opcode = nvme_cmd_flush;
802         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
803 }
804
805 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
806                 struct nvme_command *cmnd)
807 {
808         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
809         struct nvme_dsm_range *range;
810         struct bio *bio;
811
812         /*
813          * Some devices do not consider the DSM 'Number of Ranges' field when
814          * determining how much data to DMA. Always allocate memory for maximum
815          * number of segments to prevent device reading beyond end of buffer.
816          */
817         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
818
819         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
820         if (!range) {
821                 /*
822                  * If we fail to allocate our range, fall back to the controller
823                  * discard page. If that's also busy, it's safe to return
824                  * busy, as we know we can make progress once that's freed.
825                  */
826                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
827                         return BLK_STS_RESOURCE;
828
829                 range = page_address(ns->ctrl->discard_page);
830         }
831
832         if (queue_max_discard_segments(req->q) == 1) {
833                 u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
834                 u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
835
836                 range[0].cattr = cpu_to_le32(0);
837                 range[0].nlb = cpu_to_le32(nlb);
838                 range[0].slba = cpu_to_le64(slba);
839                 n = 1;
840         } else {
841                 __rq_for_each_bio(bio, req) {
842                         u64 slba = nvme_sect_to_lba(ns->head,
843                                                     bio->bi_iter.bi_sector);
844                         u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
845
846                         if (n < segments) {
847                                 range[n].cattr = cpu_to_le32(0);
848                                 range[n].nlb = cpu_to_le32(nlb);
849                                 range[n].slba = cpu_to_le64(slba);
850                         }
851                         n++;
852                 }
853         }
854
855         if (WARN_ON_ONCE(n != segments)) {
856                 if (virt_to_page(range) == ns->ctrl->discard_page)
857                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
858                 else
859                         kfree(range);
860                 return BLK_STS_IOERR;
861         }
862
863         memset(cmnd, 0, sizeof(*cmnd));
864         cmnd->dsm.opcode = nvme_cmd_dsm;
865         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
866         cmnd->dsm.nr = cpu_to_le32(segments - 1);
867         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
868
869         bvec_set_virt(&req->special_vec, range, alloc_size);
870         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
871
872         return BLK_STS_OK;
873 }
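/*
 * Editor's note: the range buffer attached via RQF_SPECIAL_PAYLOAD above is
 * released in nvme_cleanup_cmd(), which either clears discard_page_busy or
 * kfree()s the allocation, mirroring the two allocation paths in this
 * function.
 */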
874
875 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
876                               struct request *req)
877 {
878         u32 upper, lower;
879         u64 ref48;
880
881         /* both rw and write zeroes share the same reftag format */
882         switch (ns->head->guard_type) {
883         case NVME_NVM_NS_16B_GUARD:
884                 cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
885                 break;
886         case NVME_NVM_NS_64B_GUARD:
887                 ref48 = ext_pi_ref_tag(req);
888                 lower = lower_32_bits(ref48);
889                 upper = upper_32_bits(ref48);
890
891                 cmnd->rw.reftag = cpu_to_le32(lower);
892                 cmnd->rw.cdw3 = cpu_to_le32(upper);
893                 break;
894         default:
895                 break;
896         }
897 }
898
899 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
900                 struct request *req, struct nvme_command *cmnd)
901 {
902         memset(cmnd, 0, sizeof(*cmnd));
903
904         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
905                 return nvme_setup_discard(ns, req, cmnd);
906
907         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
908         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
909         cmnd->write_zeroes.slba =
910                 cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
911         cmnd->write_zeroes.length =
912                 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
913
914         if (!(req->cmd_flags & REQ_NOUNMAP) &&
915             (ns->head->features & NVME_NS_DEAC))
916                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
917
918         if (nvme_ns_has_pi(ns->head)) {
919                 cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
920
921                 switch (ns->head->pi_type) {
922                 case NVME_NS_DPS_PI_TYPE1:
923                 case NVME_NS_DPS_PI_TYPE2:
924                         nvme_set_ref_tag(ns, cmnd, req);
925                         break;
926                 }
927         }
928
929         return BLK_STS_OK;
930 }
931
932 /*
933  * NVMe does not support a dedicated command to issue an atomic write. A write
934  * which does not adhere to the device atomic limits will silently be executed
935  * non-atomically. The request issuer should ensure that the write is within
936  * the queue's atomic write limits, but just validate this in case it is not.
937  */
938 static bool nvme_valid_atomic_write(struct request *req)
939 {
940         struct request_queue *q = req->q;
941         u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
942
943         if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
944                 return false;
945
946         if (boundary_bytes) {
947                 u64 mask = boundary_bytes - 1, imask = ~mask;
948                 u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
949                 u64 end = start + blk_rq_bytes(req) - 1;
950
951                 /* If larger than the boundary size, it must cross a boundary */
952                 if (blk_rq_bytes(req) > boundary_bytes)
953                         return false;
954
955                 if ((start & imask) != (end & imask))
956                         return false;
957         }
958
959         return true;
960 }
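/*
 * Editor's worked example for the boundary check above: with
 * boundary_bytes == 16384, a 4 KiB write starting at byte offset 14336 ends
 * at byte 18431; start & ~mask is 0 while end & ~mask is 16384, so the write
 * crosses a boundary and is rejected.  The same write starting at byte 16384
 * stays inside one boundary window and is accepted.
 */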
961
962 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
963                 struct request *req, struct nvme_command *cmnd,
964                 enum nvme_opcode op)
965 {
966         u16 control = 0;
967         u32 dsmgmt = 0;
968
969         if (req->cmd_flags & REQ_FUA)
970                 control |= NVME_RW_FUA;
971         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
972                 control |= NVME_RW_LR;
973
974         if (req->cmd_flags & REQ_RAHEAD)
975                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
976
977         if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
978                 return BLK_STS_INVAL;
979
980         cmnd->rw.opcode = op;
981         cmnd->rw.flags = 0;
982         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
983         cmnd->rw.cdw2 = 0;
984         cmnd->rw.cdw3 = 0;
985         cmnd->rw.metadata = 0;
986         cmnd->rw.slba =
987                 cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
988         cmnd->rw.length =
989                 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
990         cmnd->rw.reftag = 0;
991         cmnd->rw.lbat = 0;
992         cmnd->rw.lbatm = 0;
993
994         if (ns->head->ms) {
995                 /*
996                  * If formatted with metadata, the block layer always provides a
997                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
998                  * we enable the PRACT bit for protection information or set the
999                  * namespace capacity to zero to prevent any I/O.
1000                  */
1001                 if (!blk_integrity_rq(req)) {
1002                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
1003                                 return BLK_STS_NOTSUPP;
1004                         control |= NVME_RW_PRINFO_PRACT;
1005                 }
1006
1007                 switch (ns->head->pi_type) {
1008                 case NVME_NS_DPS_PI_TYPE3:
1009                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
1010                         break;
1011                 case NVME_NS_DPS_PI_TYPE1:
1012                 case NVME_NS_DPS_PI_TYPE2:
1013                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
1014                                         NVME_RW_PRINFO_PRCHK_REF;
1015                         if (op == nvme_cmd_zone_append)
1016                                 control |= NVME_RW_APPEND_PIREMAP;
1017                         nvme_set_ref_tag(ns, cmnd, req);
1018                         break;
1019                 }
1020         }
1021
1022         cmnd->rw.control = cpu_to_le16(control);
1023         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
1024         return 0;
1025 }
1026
1027 void nvme_cleanup_cmd(struct request *req)
1028 {
1029         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
1030                 struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
1031
1032                 if (req->special_vec.bv_page == ctrl->discard_page)
1033                         clear_bit_unlock(0, &ctrl->discard_page_busy);
1034                 else
1035                         kfree(bvec_virt(&req->special_vec));
1036                 req->rq_flags &= ~RQF_SPECIAL_PAYLOAD;
1037         }
1038 }
1039 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
1040
1041 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)
1042 {
1043         struct nvme_command *cmd = nvme_req(req)->cmd;
1044         blk_status_t ret = BLK_STS_OK;
1045
1046         if (!(req->rq_flags & RQF_DONTPREP))
1047                 nvme_clear_nvme_request(req);
1048
1049         switch (req_op(req)) {
1050         case REQ_OP_DRV_IN:
1051         case REQ_OP_DRV_OUT:
1052                 /* these are setup prior to execution in nvme_init_request() */
1053                 break;
1054         case REQ_OP_FLUSH:
1055                 nvme_setup_flush(ns, cmd);
1056                 break;
1057         case REQ_OP_ZONE_RESET_ALL:
1058         case REQ_OP_ZONE_RESET:
1059                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
1060                 break;
1061         case REQ_OP_ZONE_OPEN:
1062                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
1063                 break;
1064         case REQ_OP_ZONE_CLOSE:
1065                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
1066                 break;
1067         case REQ_OP_ZONE_FINISH:
1068                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
1069                 break;
1070         case REQ_OP_WRITE_ZEROES:
1071                 ret = nvme_setup_write_zeroes(ns, req, cmd);
1072                 break;
1073         case REQ_OP_DISCARD:
1074                 ret = nvme_setup_discard(ns, req, cmd);
1075                 break;
1076         case REQ_OP_READ:
1077                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
1078                 break;
1079         case REQ_OP_WRITE:
1080                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
1081                 break;
1082         case REQ_OP_ZONE_APPEND:
1083                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
1084                 break;
1085         default:
1086                 WARN_ON_ONCE(1);
1087                 return BLK_STS_IOERR;
1088         }
1089
1090         cmd->common.command_id = nvme_cid(req);
1091         trace_nvme_setup_cmd(req, cmd);
1092         return ret;
1093 }
1094 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
1095
1096 /*
1097  * Return values:
1098  * 0:  success
1099  * >0: nvme controller's cqe status response
1100  * <0: kernel error in lieu of controller response
1101  */
1102 int nvme_execute_rq(struct request *rq, bool at_head)
1103 {
1104         blk_status_t status;
1105
1106         status = blk_execute_rq(rq, at_head);
1107         if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
1108                 return -EINTR;
1109         if (nvme_req(rq)->status)
1110                 return nvme_req(rq)->status;
1111         return blk_status_to_errno(status);
1112 }
1113 EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU);
1114
1115 /*
1116  * Returns 0 on success.  If the result is negative, it's a Linux error code;
1117  * if the result is positive, it's an NVM Express status code
1118  */
1119 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1120                 union nvme_result *result, void *buffer, unsigned bufflen,
1121                 int qid, nvme_submit_flags_t flags)
1122 {
1123         struct request *req;
1124         int ret;
1125         blk_mq_req_flags_t blk_flags = 0;
1126
1127         if (flags & NVME_SUBMIT_NOWAIT)
1128                 blk_flags |= BLK_MQ_REQ_NOWAIT;
1129         if (flags & NVME_SUBMIT_RESERVED)
1130                 blk_flags |= BLK_MQ_REQ_RESERVED;
1131         if (qid == NVME_QID_ANY)
1132                 req = blk_mq_alloc_request(q, nvme_req_op(cmd), blk_flags);
1133         else
1134                 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), blk_flags,
1135                                                 qid - 1);
1136
1137         if (IS_ERR(req))
1138                 return PTR_ERR(req);
1139         nvme_init_request(req, cmd);
1140         if (flags & NVME_SUBMIT_RETRY)
1141                 req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
1142
1143         if (buffer && bufflen) {
1144                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
1145                 if (ret)
1146                         goto out;
1147         }
1148
1149         ret = nvme_execute_rq(req, flags & NVME_SUBMIT_AT_HEAD);
1150         if (result && ret >= 0)
1151                 *result = nvme_req(req)->result;
1152  out:
1153         blk_mq_free_request(req);
1154         return ret;
1155 }
1156 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
1157
1158 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
1159                 void *buffer, unsigned bufflen)
1160 {
1161         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen,
1162                         NVME_QID_ANY, 0);
1163 }
1164 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
1165
1166 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1167 {
1168         u32 effects = 0;
1169
1170         if (ns) {
1171                 effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
1172                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
1173                         dev_warn_once(ctrl->device,
1174                                 "IO command:%02x has unusual effects:%08x\n",
1175                                 opcode, effects);
1176
1177                 /*
1178                  * NVME_CMD_EFFECTS_CSE_MASK causes a freeze of all I/O queues,
1179                  * which would deadlock when done on an I/O command.  Note that
1180                  * we already warn about an unusual effect above.
1181                  */
1182                 effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1183         } else {
1184                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1185
1186                 /* Ignore execution restrictions if any relaxation bits are set */
1187                 if (effects & NVME_CMD_EFFECTS_CSER_MASK)
1188                         effects &= ~NVME_CMD_EFFECTS_CSE_MASK;
1189         }
1190
1191         return effects;
1192 }
1193 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
1194
1195 u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
1196 {
1197         u32 effects = nvme_command_effects(ctrl, ns, opcode);
1198
1199         /*
1200          * For simplicity, IO to all namespaces is quiesced even if the command
1201          * effects say only one namespace is affected.
1202          */
1203         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1204                 mutex_lock(&ctrl->scan_lock);
1205                 mutex_lock(&ctrl->subsys->lock);
1206                 nvme_mpath_start_freeze(ctrl->subsys);
1207                 nvme_mpath_wait_freeze(ctrl->subsys);
1208                 nvme_start_freeze(ctrl);
1209                 nvme_wait_freeze(ctrl);
1210         }
1211         return effects;
1212 }
1213 EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU);
1214
1215 void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
1216                        struct nvme_command *cmd, int status)
1217 {
1218         if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
1219                 nvme_unfreeze(ctrl);
1220                 nvme_mpath_unfreeze(ctrl->subsys);
1221                 mutex_unlock(&ctrl->subsys->lock);
1222                 mutex_unlock(&ctrl->scan_lock);
1223         }
1224         if (effects & NVME_CMD_EFFECTS_CCC) {
1225                 if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY,
1226                                       &ctrl->flags)) {
1227                         dev_info(ctrl->device,
1228 "controller capabilities changed, reset may be required to take effect.\n");
1229                 }
1230         }
1231         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1232                 nvme_queue_scan(ctrl);
1233                 flush_work(&ctrl->scan_work);
1234         }
1235         if (ns)
1236                 return;
1237
1238         switch (cmd->common.opcode) {
1239         case nvme_admin_set_features:
1240                 switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) {
1241                 case NVME_FEAT_KATO:
1242                         /*
1243                          * The keep-alive command interval on the host should
1244                          * be updated when KATO is modified by a Set Features
1245                          * command.
1246                          */
1247                         if (!status)
1248                                 nvme_update_keep_alive(ctrl, cmd);
1249                         break;
1250                 default:
1251                         break;
1252                 }
1253                 break;
1254         default:
1255                 break;
1256         }
1257 }
1258 EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
1259
1260 /*
1261  * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1:
1262  *
1263  *   The host should send Keep Alive commands at half of the Keep Alive Timeout
1264  *   accounting for transport roundtrip times [..].
1265  */
1266 static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
1267 {
1268         unsigned long delay = ctrl->kato * HZ / 2;
1269
1270         /*
1271          * When using Traffic Based Keep Alive, we need to run
1272          * nvme_keep_alive_work at twice the normal frequency, as one
1273          * command completion can postpone sending a keep alive command
1274          * by up to twice the delay between runs.
1275          */
1276         if (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS)
1277                 delay /= 2;
1278         return delay;
1279 }
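/*
 * Editor's example: with a 10 second KATO the keep-alive work runs every
 * 10 * HZ / 2 jiffies, i.e. every 5 seconds, and every 2.5 seconds when the
 * controller advertises TBKAS in CTRATT.
 */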
1280
1281 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
1282 {
1283         unsigned long now = jiffies;
1284         unsigned long delay = nvme_keep_alive_work_period(ctrl);
1285         unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
1286
1287         if (time_after(now, ka_next_check_tm))
1288                 delay = 0;
1289         else
1290                 delay = ka_next_check_tm - now;
1291
1292         queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1293 }
1294
1295 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
1296                                                  blk_status_t status)
1297 {
1298         struct nvme_ctrl *ctrl = rq->end_io_data;
1299         unsigned long flags;
1300         bool startka = false;
1301         unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
1302         unsigned long delay = nvme_keep_alive_work_period(ctrl);
1303
1304         /*
1305          * Subtract off the keepalive RTT so nvme_keep_alive_work runs
1306          * at the desired frequency.
1307          */
1308         if (rtt <= delay) {
1309                 delay -= rtt;
1310         } else {
1311                 dev_warn(ctrl->device, "long keepalive RTT (%u ms)\n",
1312                          jiffies_to_msecs(rtt));
1313                 delay = 0;
1314         }
1315
1316         blk_mq_free_request(rq);
1317
1318         if (status) {
1319                 dev_err(ctrl->device,
1320                         "failed nvme_keep_alive_end_io error=%d\n",
1321                                 status);
1322                 return RQ_END_IO_NONE;
1323         }
1324
1325         ctrl->ka_last_check_time = jiffies;
1326         ctrl->comp_seen = false;
1327         spin_lock_irqsave(&ctrl->lock, flags);
1328         if (ctrl->state == NVME_CTRL_LIVE ||
1329             ctrl->state == NVME_CTRL_CONNECTING)
1330                 startka = true;
1331         spin_unlock_irqrestore(&ctrl->lock, flags);
1332         if (startka)
1333                 queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
1334         return RQ_END_IO_NONE;
1335 }
1336
1337 static void nvme_keep_alive_work(struct work_struct *work)
1338 {
1339         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1340                         struct nvme_ctrl, ka_work);
1341         bool comp_seen = ctrl->comp_seen;
1342         struct request *rq;
1343
1344         ctrl->ka_last_check_time = jiffies;
1345
1346         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1347                 dev_dbg(ctrl->device,
1348                         "reschedule traffic based keep-alive timer\n");
1349                 ctrl->comp_seen = false;
1350                 nvme_queue_keep_alive_work(ctrl);
1351                 return;
1352         }
1353
1354         rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd),
1355                                   BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
1356         if (IS_ERR(rq)) {
1357                 /* allocation failure, reset the controller */
1358                 dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));
1359                 nvme_reset_ctrl(ctrl);
1360                 return;
1361         }
1362         nvme_init_request(rq, &ctrl->ka_cmd);
1363
1364         rq->timeout = ctrl->kato * HZ;
1365         rq->end_io = nvme_keep_alive_end_io;
1366         rq->end_io_data = ctrl;
1367         blk_execute_rq_nowait(rq, false);
1368 }
1369
1370 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1371 {
1372         if (unlikely(ctrl->kato == 0))
1373                 return;
1374
1375         nvme_queue_keep_alive_work(ctrl);
1376 }
1377
1378 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1379 {
1380         if (unlikely(ctrl->kato == 0))
1381                 return;
1382
1383         cancel_delayed_work_sync(&ctrl->ka_work);
1384 }
1385 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1386
1387 static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
1388                                    struct nvme_command *cmd)
1389 {
1390         unsigned int new_kato =
1391                 DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000);
1392
1393         dev_info(ctrl->device,
1394                  "keep alive interval updated from %u ms to %u ms\n",
1395                  ctrl->kato * 1000 / 2, new_kato * 1000 / 2);
1396
1397         nvme_stop_keep_alive(ctrl);
1398         ctrl->kato = new_kato;
1399         nvme_start_keep_alive(ctrl);
1400 }
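/*
 * Editor's example: a Set Features KATO command with cdw11 == 15000 (the
 * timeout in milliseconds) gives new_kato = DIV_ROUND_UP(15000, 1000) = 15
 * seconds, and the message above reports the halved send interval, 7500 ms.
 */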
1401
1402 /*
1403  * In NVMe 1.0 the CNS field was just a binary controller or namespace
1404  * flag, so sending any newer CNS value has a big chance of not working.
1405  * Qemu unfortunately had that bug after reporting 1.1 version compliance
1406  * (but not for any later version).
1407  */
1408 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1409 {
1410         if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1411                 return ctrl->vs < NVME_VS(1, 2, 0);
1412         return ctrl->vs < NVME_VS(1, 1, 0);
1413 }
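/*
 * Editor's note: NVME_VS(major, minor, tertiary) from include/linux/nvme.h
 * packs the version as (major << 16) | (minor << 8) | tertiary, so the
 * comparisons above are plain numeric checks against the controller's
 * reported Version register, e.g. NVME_VS(1, 2, 0) == 0x10200.
 */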
1414
1415 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1416 {
1417         struct nvme_command c = { };
1418         int error;
1419
1420         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1421         c.identify.opcode = nvme_admin_identify;
1422         c.identify.cns = NVME_ID_CNS_CTRL;
1423
1424         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1425         if (!*id)
1426                 return -ENOMEM;
1427
1428         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1429                         sizeof(struct nvme_id_ctrl));
1430         if (error) {
1431                 kfree(*id);
1432                 *id = NULL;
1433         }
1434         return error;
1435 }
1436
1437 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1438                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1439 {
1440         const char *warn_str = "ctrl returned bogus length:";
1441         void *data = cur;
1442
1443         switch (cur->nidt) {
1444         case NVME_NIDT_EUI64:
1445                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1446                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1447                                  warn_str, cur->nidl);
1448                         return -1;
1449                 }
1450                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1451                         return NVME_NIDT_EUI64_LEN;
1452                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1453                 return NVME_NIDT_EUI64_LEN;
1454         case NVME_NIDT_NGUID:
1455                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1456                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1457                                  warn_str, cur->nidl);
1458                         return -1;
1459                 }
1460                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1461                         return NVME_NIDT_NGUID_LEN;
1462                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1463                 return NVME_NIDT_NGUID_LEN;
1464         case NVME_NIDT_UUID:
1465                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1466                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1467                                  warn_str, cur->nidl);
1468                         return -1;
1469                 }
1470                 if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
1471                         return NVME_NIDT_UUID_LEN;
1472                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1473                 return NVME_NIDT_UUID_LEN;
1474         case NVME_NIDT_CSI:
1475                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1476                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1477                                  warn_str, cur->nidl);
1478                         return -1;
1479                 }
1480                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1481                 *csi_seen = true;
1482                 return NVME_NIDT_CSI_LEN;
1483         default:
1484                 /* Skip unknown types */
1485                 return cur->nidl;
1486         }
1487 }
1488
1489 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl,
1490                 struct nvme_ns_info *info)
1491 {
1492         struct nvme_command c = { };
1493         bool csi_seen = false;
1494         int status, pos, len;
1495         void *data;
1496
1497         if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
1498                 return 0;
1499         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1500                 return 0;
1501
1502         c.identify.opcode = nvme_admin_identify;
1503         c.identify.nsid = cpu_to_le32(info->nsid);
1504         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1505
1506         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1507         if (!data)
1508                 return -ENOMEM;
1509
1510         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1511                                       NVME_IDENTIFY_DATA_SIZE);
1512         if (status) {
1513                 dev_warn(ctrl->device,
1514                         "Identify Descriptors failed (nsid=%u, status=0x%x)\n",
1515                         info->nsid, status);
1516                 goto free_data;
1517         }
1518
1519         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1520                 struct nvme_ns_id_desc *cur = data + pos;
1521
1522                 if (cur->nidl == 0)
1523                         break;
1524
1525                 len = nvme_process_ns_desc(ctrl, &info->ids, cur, &csi_seen);
1526                 if (len < 0)
1527                         break;
1528
1529                 len += sizeof(*cur);
1530         }
1531
1532         if (nvme_multi_css(ctrl) && !csi_seen) {
1533                 dev_warn(ctrl->device, "Command set not reported for nsid:%u\n",
1534                          info->nsid);
1535                 status = -EINVAL;
1536         }
1537
1538 free_data:
1539         kfree(data);
1540         return status;
1541 }
1542
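/*
 * Editor's note: illustrative sketch, not part of the driver.  Each
 * Namespace Identification Descriptor is a 4-byte header followed by NIDL
 * bytes of payload, and a NIDL of zero terminates the list; the loop above
 * advances by the NIDL returned from nvme_process_ns_desc() plus the header
 * size.  Hypothetical helper showing the same stride calculation:
 */
static inline int nvme_example_next_desc_off(const struct nvme_ns_id_desc *cur)
{
        if (!cur->nidl)
                return 0;       /* end of descriptor list */
        return sizeof(*cur) + cur->nidl;
}
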
1543 int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
1544                         struct nvme_id_ns **id)
1545 {
1546         struct nvme_command c = { };
1547         int error;
1548
1549         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1550         c.identify.opcode = nvme_admin_identify;
1551         c.identify.nsid = cpu_to_le32(nsid);
1552         c.identify.cns = NVME_ID_CNS_NS;
1553
1554         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1555         if (!*id)
1556                 return -ENOMEM;
1557
1558         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1559         if (error) {
1560                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1561                 kfree(*id);
1562                 *id = NULL;
1563         }
1564         return error;
1565 }
1566
1567 static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
1568                 struct nvme_ns_info *info)
1569 {
1570         struct nvme_ns_ids *ids = &info->ids;
1571         struct nvme_id_ns *id;
1572         int ret;
1573
1574         ret = nvme_identify_ns(ctrl, info->nsid, &id);
1575         if (ret)
1576                 return ret;
1577
1578         if (id->ncap == 0) {
1579                 /* namespace not allocated or attached */
1580                 info->is_removed = true;
1581                 ret = -ENODEV;
1582                 goto error;
1583         }
1584
1585         info->anagrpid = id->anagrpid;
1586         info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1587         info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1588         info->is_ready = true;
1589         if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
1590                 dev_info(ctrl->device,
1591                          "Ignoring bogus Namespace Identifiers\n");
1592         } else {
1593                 if (ctrl->vs >= NVME_VS(1, 1, 0) &&
1594                     !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
1595                         memcpy(ids->eui64, id->eui64, sizeof(ids->eui64));
1596                 if (ctrl->vs >= NVME_VS(1, 2, 0) &&
1597                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
1598                         memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
1599         }
1600
1601 error:
1602         kfree(id);
1603         return ret;
1604 }
1605
1606 static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
1607                 struct nvme_ns_info *info)
1608 {
1609         struct nvme_id_ns_cs_indep *id;
1610         struct nvme_command c = {
1611                 .identify.opcode        = nvme_admin_identify,
1612                 .identify.nsid          = cpu_to_le32(info->nsid),
1613                 .identify.cns           = NVME_ID_CNS_NS_CS_INDEP,
1614         };
1615         int ret;
1616
1617         id = kmalloc(sizeof(*id), GFP_KERNEL);
1618         if (!id)
1619                 return -ENOMEM;
1620
1621         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
1622         if (!ret) {
1623                 info->anagrpid = id->anagrpid;
1624                 info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
1625                 info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
1626                 info->is_ready = id->nstat & NVME_NSTAT_NRDY;
1627         }
1628         kfree(id);
1629         return ret;
1630 }
1631
1632 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1633                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1634 {
1635         union nvme_result res = { 0 };
1636         struct nvme_command c = { };
1637         int ret;
1638
1639         c.features.opcode = op;
1640         c.features.fid = cpu_to_le32(fid);
1641         c.features.dword11 = cpu_to_le32(dword11);
1642
1643         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1644                         buffer, buflen, NVME_QID_ANY, 0);
1645         if (ret >= 0 && result)
1646                 *result = le32_to_cpu(res.u32);
1647         return ret;
1648 }
1649
1650 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1651                       unsigned int dword11, void *buffer, size_t buflen,
1652                       u32 *result)
1653 {
1654         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1655                              buflen, result);
1656 }
1657 EXPORT_SYMBOL_GPL(nvme_set_features);
1658
1659 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1660                       unsigned int dword11, void *buffer, size_t buflen,
1661                       u32 *result)
1662 {
1663         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1664                              buflen, result);
1665 }
1666 EXPORT_SYMBOL_GPL(nvme_get_features);
1667
1668 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1669 {
1670         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1671         u32 result;
1672         int status, nr_io_queues;
1673
1674         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1675                         &result);
1676         if (status < 0)
1677                 return status;
1678
1679         /*
1680          * Degraded controllers might return an error when setting the queue
1681          * count.  We still want to be able to bring them online and offer
1682          * access to the admin queue, as that might be the only way to fix them up.
1683          */
1684         if (status > 0) {
1685                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1686                 *count = 0;
1687         } else {
1688                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1689                 *count = min(*count, nr_io_queues);
1690         }
1691
1692         return 0;
1693 }
1694 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
1695
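/*
 * Editor's note: illustrative sketch, not part of the driver.  The Number of
 * Queues feature is 0's based in both directions: the request encodes
 * "count - 1" for submission queues in bits 15:0 and for completion queues
 * in bits 31:16, and the controller's reply uses the same convention, hence
 * the "+ 1" when decoding above.  Hypothetical helper:
 */
static inline u32 nvme_example_nr_queues_dw11(unsigned int nr_io_queues)
{
        u32 zero_based = nr_io_queues - 1;

        return zero_based | (zero_based << 16);
}
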
1696 #define NVME_AEN_SUPPORTED \
1697         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1698          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1699
1700 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1701 {
1702         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1703         int status;
1704
1705         if (!supported_aens)
1706                 return;
1707
1708         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1709                         NULL, 0, &result);
1710         if (status)
1711                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1712                          supported_aens);
1713
1714         queue_work(nvme_wq, &ctrl->async_event_work);
1715 }
1716
1717 static int nvme_ns_open(struct nvme_ns *ns)
1718 {
1719
1720         /* should never be called due to GENHD_FL_HIDDEN */
1721         if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))
1722                 goto fail;
1723         if (!nvme_get_ns(ns))
1724                 goto fail;
1725         if (!try_module_get(ns->ctrl->ops->module))
1726                 goto fail_put_ns;
1727
1728         return 0;
1729
1730 fail_put_ns:
1731         nvme_put_ns(ns);
1732 fail:
1733         return -ENXIO;
1734 }
1735
1736 static void nvme_ns_release(struct nvme_ns *ns)
1737 {
1738
1739         module_put(ns->ctrl->ops->module);
1740         nvme_put_ns(ns);
1741 }
1742
1743 static int nvme_open(struct gendisk *disk, blk_mode_t mode)
1744 {
1745         return nvme_ns_open(disk->private_data);
1746 }
1747
1748 static void nvme_release(struct gendisk *disk)
1749 {
1750         nvme_ns_release(disk->private_data);
1751 }
1752
1753 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1754 {
1755         /* some standard values */
1756         geo->heads = 1 << 6;
1757         geo->sectors = 1 << 5;
1758         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1759         return 0;
1760 }
1761
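/*
 * Editor's note: worked example only, not part of the driver.  With the fake
 * geometry above (64 heads, 32 sectors per track) one cylinder spans
 * 64 * 32 = 2048 sectors, so the cylinder count is simply the capacity
 * shifted right by 11.  Hypothetical helper:
 */
static inline sector_t nvme_example_chs_cylinders(sector_t capacity_sectors)
{
        return capacity_sectors >> 11;
}
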
1762 static bool nvme_init_integrity(struct nvme_ns_head *head,
1763                 struct queue_limits *lim, struct nvme_ns_info *info)
1764 {
1765         struct blk_integrity *bi = &lim->integrity;
1766
1767         memset(bi, 0, sizeof(*bi));
1768
1769         if (!head->ms)
1770                 return true;
1771
1772         /*
1773          * PI can always be supported as we can ask the controller to simply
1774          * insert/strip it, which is not possible for other kinds of metadata.
1775          */
1776         if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
1777             !(head->features & NVME_NS_METADATA_SUPPORTED))
1778                 return nvme_ns_has_pi(head);
1779
1780         switch (head->pi_type) {
1781         case NVME_NS_DPS_PI_TYPE3:
1782                 switch (head->guard_type) {
1783                 case NVME_NVM_NS_16B_GUARD:
1784                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1785                         bi->tag_size = sizeof(u16) + sizeof(u32);
1786                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1787                         break;
1788                 case NVME_NVM_NS_64B_GUARD:
1789                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1790                         bi->tag_size = sizeof(u16) + 6;
1791                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1792                         break;
1793                 default:
1794                         break;
1795                 }
1796                 break;
1797         case NVME_NS_DPS_PI_TYPE1:
1798         case NVME_NS_DPS_PI_TYPE2:
1799                 switch (head->guard_type) {
1800                 case NVME_NVM_NS_16B_GUARD:
1801                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC;
1802                         bi->tag_size = sizeof(u16);
1803                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1804                                      BLK_INTEGRITY_REF_TAG;
1805                         break;
1806                 case NVME_NVM_NS_64B_GUARD:
1807                         bi->csum_type = BLK_INTEGRITY_CSUM_CRC64;
1808                         bi->tag_size = sizeof(u16);
1809                         bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE |
1810                                      BLK_INTEGRITY_REF_TAG;
1811                         break;
1812                 default:
1813                         break;
1814                 }
1815                 break;
1816         default:
1817                 break;
1818         }
1819
1820         bi->tuple_size = head->ms;
1821         bi->pi_offset = info->pi_offset;
1822         return true;
1823 }
1824
1825 static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
1826 {
1827         struct nvme_ctrl *ctrl = ns->ctrl;
1828
1829         if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
1830                 lim->max_hw_discard_sectors =
1831                         nvme_lba_to_sect(ns->head, ctrl->dmrsl);
1832         else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
1833                 lim->max_hw_discard_sectors = UINT_MAX;
1834         else
1835                 lim->max_hw_discard_sectors = 0;
1836
1837         lim->discard_granularity = lim->logical_block_size;
1838
1839         if (ctrl->dmrl)
1840                 lim->max_discard_segments = ctrl->dmrl;
1841         else
1842                 lim->max_discard_segments = NVME_DSM_MAX_RANGES;
1843 }
1844
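/*
 * Editor's note: illustrative sketch, not part of the driver.  DMRSL is
 * reported in logical blocks while max_hw_discard_sectors is in 512-byte
 * sectors, which is what the nvme_lba_to_sect()/nvme_sect_to_lba() calls
 * above convert between; the UINT_MAX guard keeps the converted value from
 * overflowing the 32-bit queue limit.  Assuming the usual shift-based
 * conversion, a hypothetical helper looks like this:
 */
static inline u64 nvme_example_lba_to_512b_sectors(u64 nlb, u8 lba_shift)
{
        /* e.g. 4 KiB logical blocks (lba_shift == 12) map to nlb << 3 */
        return nlb << (lba_shift - SECTOR_SHIFT);
}
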
1845 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1846 {
1847         return uuid_equal(&a->uuid, &b->uuid) &&
1848                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1849                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1850                 a->csi == b->csi;
1851 }
1852
1853 static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
1854                 struct nvme_id_ns_nvm **nvmp)
1855 {
1856         struct nvme_command c = {
1857                 .identify.opcode        = nvme_admin_identify,
1858                 .identify.nsid          = cpu_to_le32(nsid),
1859                 .identify.cns           = NVME_ID_CNS_CS_NS,
1860                 .identify.csi           = NVME_CSI_NVM,
1861         };
1862         struct nvme_id_ns_nvm *nvm;
1863         int ret;
1864
1865         nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
1866         if (!nvm)
1867                 return -ENOMEM;
1868
1869         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
1870         if (ret)
1871                 kfree(nvm);
1872         else
1873                 *nvmp = nvm;
1874         return ret;
1875 }
1876
1877 static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
1878                 struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
1879 {
1880         u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
1881         u8 guard_type;
1882
1883         /* no support for storage tag formats right now */
1884         if (nvme_elbaf_sts(elbaf))
1885                 return;
1886
1887         guard_type = nvme_elbaf_guard_type(elbaf);
1888         if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
1889              guard_type == NVME_NVM_NS_QTYPE_GUARD)
1890                 guard_type = nvme_elbaf_qualified_guard_type(elbaf);
1891
1892         head->guard_type = guard_type;
1893         switch (head->guard_type) {
1894         case NVME_NVM_NS_64B_GUARD:
1895                 head->pi_size = sizeof(struct crc64_pi_tuple);
1896                 break;
1897         case NVME_NVM_NS_16B_GUARD:
1898                 head->pi_size = sizeof(struct t10_pi_tuple);
1899                 break;
1900         default:
1901                 break;
1902         }
1903 }
1904
1905 static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
1906                 struct nvme_ns_head *head, struct nvme_id_ns *id,
1907                 struct nvme_id_ns_nvm *nvm, struct nvme_ns_info *info)
1908 {
1909         head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
1910         head->pi_type = 0;
1911         head->pi_size = 0;
1912         head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
1913         if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1914                 return;
1915
1916         if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
1917                 nvme_configure_pi_elbas(head, id, nvm);
1918         } else {
1919                 head->pi_size = sizeof(struct t10_pi_tuple);
1920                 head->guard_type = NVME_NVM_NS_16B_GUARD;
1921         }
1922
1923         if (head->pi_size && head->ms >= head->pi_size)
1924                 head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1925         if (!(id->dps & NVME_NS_DPS_PI_FIRST))
1926                 info->pi_offset = head->ms - head->pi_size;
1927
1928         if (ctrl->ops->flags & NVME_F_FABRICS) {
1929                 /*
1930                  * The NVMe over Fabrics specification only supports metadata as
1931                  * part of the extended data LBA.  We rely on HCA/HBA support to
1932                  * remap the separate metadata buffer from the block layer.
1933                  */
1934                 if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
1935                         return;
1936
1937                 head->features |= NVME_NS_EXT_LBAS;
1938
1939                 /*
1940                  * The current fabrics transport drivers support namespace
1941                  * metadata formats only if nvme_ns_has_pi() returns true.
1942                  * Suppress support for all other formats so the namespace will
1943                  * have a 0 capacity and not be usable through the block stack.
1944                  *
1945                  * Note, this check will need to be modified if any drivers
1946                  * gain the ability to use other metadata formats.
1947                  */
1948                 if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
1949                         head->features |= NVME_NS_METADATA_SUPPORTED;
1950         } else {
1951                 /*
1952                  * For PCIe controllers, we can't easily remap the separate
1953                  * metadata buffer from the block layer and thus require a
1954                  * separate metadata buffer for block layer metadata/PI support.
1955                  * We allow extended LBAs for the passthrough interface, though.
1956                  */
1957                 if (id->flbas & NVME_NS_FLBAS_META_EXT)
1958                         head->features |= NVME_NS_EXT_LBAS;
1959                 else
1960                         head->features |= NVME_NS_METADATA_SUPPORTED;
1961         }
1962 }
1963
1964
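/*
 * Editor's note: worked example only, not part of the driver.  When the
 * namespace does not place protection information first (NVME_NS_DPS_PI_FIRST
 * clear), the PI occupies the last pi_size bytes of the per-LBA metadata, so
 * with 64 bytes of metadata and an 8-byte 16b guard tuple the offset computed
 * above is 64 - 8 = 56; with "PI first" the offset stays 0.  Hypothetical
 * helper:
 */
static inline u16 nvme_example_pi_offset(u16 ms, u8 pi_size, bool pi_first)
{
        return pi_first ? 0 : ms - pi_size;
}
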
1965 static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
1966                         struct nvme_id_ns *id, struct queue_limits *lim,
1967                         u32 bs, u32 atomic_bs)
1968 {
1969         unsigned int boundary = 0;
1970
1971         if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
1972                 if (le16_to_cpu(id->nabspf))
1973                         boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
1974         }
1975         lim->atomic_write_hw_max = atomic_bs;
1976         lim->atomic_write_hw_boundary = boundary;
1977         lim->atomic_write_hw_unit_min = bs;
1978         lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
1979 }
1980
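/*
 * Editor's note: worked example only, not part of the driver.  NABSPF is
 * treated above as a 0's based count of logical blocks, so a NABSPF of 7 on
 * a 4 KiB-block namespace gives a boundary of (7 + 1) * 4096 = 32 KiB.
 * Hypothetical helper mirroring that conversion:
 */
static inline u32 nvme_example_atomic_boundary_bytes(u16 nabspf, u32 block_size)
{
        return nabspf ? (nabspf + 1u) * block_size : 0;
}
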
1981 static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
1982 {
1983         return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
1984 }
1985
1986 static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
1987                 struct queue_limits *lim)
1988 {
1989         lim->max_hw_sectors = ctrl->max_hw_sectors;
1990         lim->max_segments = min_t(u32, USHRT_MAX,
1991                 min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
1992         lim->max_integrity_segments = ctrl->max_integrity_segments;
1993         lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
1994         lim->max_segment_size = UINT_MAX;
1995         lim->dma_alignment = 3;
1996 }
1997
1998 static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
1999                 struct queue_limits *lim)
2000 {
2001         struct nvme_ns_head *head = ns->head;
2002         u32 bs = 1U << head->lba_shift;
2003         u32 atomic_bs, phys_bs, io_opt = 0;
2004         bool valid = true;
2005
2006         /*
2007          * The block layer can't support LBA sizes larger than the page size
2008          * or smaller than a sector size yet, so catch this early and don't
2009          * allow block I/O.
2010          */
2011         if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
2012                 bs = (1 << 9);
2013                 valid = false;
2014         }
2015
2016         atomic_bs = phys_bs = bs;
2017         if (id->nabo == 0) {
2018                 /*
2019                  * Bit 1 indicates whether NAWUPF is defined for this namespace
2020                  * and whether it should be used instead of AWUPF. If NAWUPF ==
2021                  * 0 then AWUPF must be used instead.
2022                  */
2023                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
2024                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
2025                 else
2026                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
2027
2028                 nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
2029         }
2030
2031         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
2032                 /* NPWG = Namespace Preferred Write Granularity */
2033                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
2034                 /* NOWS = Namespace Optimal Write Size */
2035                 if (id->nows)
2036                         io_opt = bs * (1 + le16_to_cpu(id->nows));
2037         }
2038
2039         /*
2040          * Linux filesystems assume writing a single physical block is
2041          * an atomic operation. Hence limit the physical block size to the
2042          * value of the Atomic Write Unit Power Fail parameter.
2043          */
2044         lim->logical_block_size = bs;
2045         lim->physical_block_size = min(phys_bs, atomic_bs);
2046         lim->io_min = phys_bs;
2047         lim->io_opt = io_opt;
2048         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
2049                 lim->max_write_zeroes_sectors = UINT_MAX;
2050         else
2051                 lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
2052         return valid;
2053 }
2054
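/*
 * Editor's note: worked example only, not part of the driver.  AWUPF and
 * NAWUPF are 0's based, so a value of 7 on a 4 KiB-block namespace yields an
 * atomic write unit of (1 + 7) * 4096 = 32 KiB; the physical block size is
 * then clamped to min(phys_bs, atomic_bs) above because Linux filesystems
 * assume a single physical block write is atomic.  Hypothetical helper:
 */
static inline u32 nvme_example_atomic_bs(u16 awupf_0s_based, u32 block_size)
{
        return (1u + awupf_0s_based) * block_size;
}
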
2055 static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
2056 {
2057         return info->is_readonly || test_bit(NVME_NS_FORCE_RO, &ns->flags);
2058 }
2059
2060 static inline bool nvme_first_scan(struct gendisk *disk)
2061 {
2062         /* nvme_alloc_ns() scans the disk prior to adding it */
2063         return !disk_live(disk);
2064 }
2065
2066 static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
2067                 struct queue_limits *lim)
2068 {
2069         struct nvme_ctrl *ctrl = ns->ctrl;
2070         u32 iob;
2071
2072         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2073             is_power_of_2(ctrl->max_hw_sectors))
2074                 iob = ctrl->max_hw_sectors;
2075         else
2076                 iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
2077
2078         if (!iob)
2079                 return;
2080
2081         if (!is_power_of_2(iob)) {
2082                 if (nvme_first_scan(ns->disk))
2083                         pr_warn("%s: ignoring unaligned IO boundary:%u\n",
2084                                 ns->disk->disk_name, iob);
2085                 return;
2086         }
2087
2088         if (blk_queue_is_zoned(ns->disk->queue)) {
2089                 if (nvme_first_scan(ns->disk))
2090                         pr_warn("%s: ignoring zoned namespace IO boundary\n",
2091                                 ns->disk->disk_name);
2092                 return;
2093         }
2094
2095         lim->chunk_sectors = iob;
2096 }
2097
2098 static int nvme_update_ns_info_generic(struct nvme_ns *ns,
2099                 struct nvme_ns_info *info)
2100 {
2101         struct queue_limits lim;
2102         int ret;
2103
2104         blk_mq_freeze_queue(ns->disk->queue);
2105         lim = queue_limits_start_update(ns->disk->queue);
2106         nvme_set_ctrl_limits(ns->ctrl, &lim);
2107         ret = queue_limits_commit_update(ns->disk->queue, &lim);
2108         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2109         blk_mq_unfreeze_queue(ns->disk->queue);
2110
2111         /* Hide the block-interface for these devices */
2112         if (!ret)
2113                 ret = -ENODEV;
2114         return ret;
2115 }
2116
2117 static int nvme_update_ns_info_block(struct nvme_ns *ns,
2118                 struct nvme_ns_info *info)
2119 {
2120         struct queue_limits lim;
2121         struct nvme_id_ns_nvm *nvm = NULL;
2122         struct nvme_zone_info zi = {};
2123         struct nvme_id_ns *id;
2124         sector_t capacity;
2125         unsigned lbaf;
2126         int ret;
2127
2128         ret = nvme_identify_ns(ns->ctrl, info->nsid, &id);
2129         if (ret)
2130                 return ret;
2131
2132         if (id->ncap == 0) {
2133                 /* namespace not allocated or attached */
2134                 info->is_removed = true;
2135                 ret = -ENXIO;
2136                 goto out;
2137         }
2138         lbaf = nvme_lbaf_index(id->flbas);
2139
2140         if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
2141                 ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
2142                 if (ret < 0)
2143                         goto out;
2144         }
2145
2146         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2147             ns->head->ids.csi == NVME_CSI_ZNS) {
2148                 ret = nvme_query_zone_info(ns, lbaf, &zi);
2149                 if (ret < 0)
2150                         goto out;
2151         }
2152
2153         blk_mq_freeze_queue(ns->disk->queue);
2154         ns->head->lba_shift = id->lbaf[lbaf].ds;
2155         ns->head->nuse = le64_to_cpu(id->nuse);
2156         capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
2157
2158         lim = queue_limits_start_update(ns->disk->queue);
2159         nvme_set_ctrl_limits(ns->ctrl, &lim);
2160         nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
2161         nvme_set_chunk_sectors(ns, id, &lim);
2162         if (!nvme_update_disk_info(ns, id, &lim))
2163                 capacity = 0;
2164         nvme_config_discard(ns, &lim);
2165         if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
2166             ns->head->ids.csi == NVME_CSI_ZNS)
2167                 nvme_update_zone_info(ns, &lim, &zi);
2168
2169         if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT)
2170                 lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
2171         else
2172                 lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
2173
2174         /*
2175          * Register a metadata profile for PI, or for plain non-integrity NVMe
2176          * metadata masquerading as Type 0 if supported; otherwise reject block
2177          * I/O to namespaces with metadata, except when the namespace supports
2178          * PI, as the controller can strip/insert it in that case.
2179          */
2180         if (!nvme_init_integrity(ns->head, &lim, info))
2181                 capacity = 0;
2182
2183         ret = queue_limits_commit_update(ns->disk->queue, &lim);
2184         if (ret) {
2185                 blk_mq_unfreeze_queue(ns->disk->queue);
2186                 goto out;
2187         }
2188
2189         set_capacity_and_notify(ns->disk, capacity);
2190
2191         /*
2192          * Only set the DEAC bit if the device guarantees that reads from
2193          * deallocated data return zeroes.  While the DEAC bit does not
2194          * require that, it must be a no-op if reads from deallocated data
2195          * do not return zeroes.
2196          */
2197         if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
2198                 ns->head->features |= NVME_NS_DEAC;
2199         set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
2200         set_bit(NVME_NS_READY, &ns->flags);
2201         blk_mq_unfreeze_queue(ns->disk->queue);
2202
2203         if (blk_queue_is_zoned(ns->queue)) {
2204                 ret = blk_revalidate_disk_zones(ns->disk);
2205                 if (ret && !nvme_first_scan(ns->disk))
2206                         goto out;
2207         }
2208
2209         ret = 0;
2210 out:
2211         kfree(nvm);
2212         kfree(id);
2213         return ret;
2214 }
2215
2216 static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
2217 {
2218         bool unsupported = false;
2219         int ret;
2220
2221         switch (info->ids.csi) {
2222         case NVME_CSI_ZNS:
2223                 if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
2224                         dev_info(ns->ctrl->device,
2225         "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
2226                                 info->nsid);
2227                         ret = nvme_update_ns_info_generic(ns, info);
2228                         break;
2229                 }
2230                 ret = nvme_update_ns_info_block(ns, info);
2231                 break;
2232         case NVME_CSI_NVM:
2233                 ret = nvme_update_ns_info_block(ns, info);
2234                 break;
2235         default:
2236                 dev_info(ns->ctrl->device,
2237                         "block device for nsid %u not supported (csi %u)\n",
2238                         info->nsid, info->ids.csi);
2239                 ret = nvme_update_ns_info_generic(ns, info);
2240                 break;
2241         }
2242
2243         /*
2244          * If probing fails due to an unsupported feature, hide the block device,
2245          * but still allow other access.
2246          */
2247         if (ret == -ENODEV) {
2248                 ns->disk->flags |= GENHD_FL_HIDDEN;
2249                 set_bit(NVME_NS_READY, &ns->flags);
2250                 unsupported = true;
2251                 ret = 0;
2252         }
2253
2254         if (!ret && nvme_ns_head_multipath(ns->head)) {
2255                 struct queue_limits *ns_lim = &ns->disk->queue->limits;
2256                 struct queue_limits lim;
2257
2258                 blk_mq_freeze_queue(ns->head->disk->queue);
2259                 /*
2260                  * queue_limits mixes values that are the hardware limitations
2261                  * for bio splitting with what is the device configuration.
2262                  *
2263                  * For NVMe the device configuration can change after e.g. a
2264                  * Format command, and we really want to pick up the new format
2265                  * value here.  But we must still stack the queue limits to the
2266                  * least common denominator for multipathing to split the bios
2267                  * properly.
2268                  *
2269                  * To work around this, we explicitly set the device
2270                  * configuration to those that we just queried, but only stack
2271                  * the splitting limits in to make sure we still obey possibly
2272                  * lower limitations of other controllers.
2273                  */
2274                 lim = queue_limits_start_update(ns->head->disk->queue);
2275                 lim.logical_block_size = ns_lim->logical_block_size;
2276                 lim.physical_block_size = ns_lim->physical_block_size;
2277                 lim.io_min = ns_lim->io_min;
2278                 lim.io_opt = ns_lim->io_opt;
2279                 queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
2280                                         ns->head->disk->disk_name);
2281                 if (unsupported)
2282                         ns->head->disk->flags |= GENHD_FL_HIDDEN;
2283                 else
2284                         nvme_init_integrity(ns->head, &lim, info);
2285                 ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
2286
2287                 set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
2288                 set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
2289                 nvme_mpath_revalidate_paths(ns);
2290
2291                 blk_mq_unfreeze_queue(ns->head->disk->queue);
2292         }
2293
2294         return ret;
2295 }
2296
2297 int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
2298                 enum blk_unique_id type)
2299 {
2300         struct nvme_ns_ids *ids = &ns->head->ids;
2301
2302         if (type != BLK_UID_EUI64)
2303                 return -EINVAL;
2304
2305         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) {
2306                 memcpy(id, &ids->nguid, sizeof(ids->nguid));
2307                 return sizeof(ids->nguid);
2308         }
2309         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) {
2310                 memcpy(id, &ids->eui64, sizeof(ids->eui64));
2311                 return sizeof(ids->eui64);
2312         }
2313
2314         return -EINVAL;
2315 }
2316
2317 static int nvme_get_unique_id(struct gendisk *disk, u8 id[16],
2318                 enum blk_unique_id type)
2319 {
2320         return nvme_ns_get_unique_id(disk->private_data, id, type);
2321 }
2322
2323 #ifdef CONFIG_BLK_SED_OPAL
2324 static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2325                 bool send)
2326 {
2327         struct nvme_ctrl *ctrl = data;
2328         struct nvme_command cmd = { };
2329
2330         if (send)
2331                 cmd.common.opcode = nvme_admin_security_send;
2332         else
2333                 cmd.common.opcode = nvme_admin_security_recv;
2334         cmd.common.nsid = 0;
2335         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2336         cmd.common.cdw11 = cpu_to_le32(len);
2337
2338         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2339                         NVME_QID_ANY, NVME_SUBMIT_AT_HEAD);
2340 }
2341
2342 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2343 {
2344         if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
2345                 if (!ctrl->opal_dev)
2346                         ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
2347                 else if (was_suspended)
2348                         opal_unlock_from_suspend(ctrl->opal_dev);
2349         } else {
2350                 free_opal_dev(ctrl->opal_dev);
2351                 ctrl->opal_dev = NULL;
2352         }
2353 }
2354 #else
2355 static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
2356 {
2357 }
2358 #endif /* CONFIG_BLK_SED_OPAL */
2359
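/*
 * Editor's note: illustrative sketch, not part of the driver.  For Security
 * Send/Receive the Security Protocol (SECP) goes in bits 31:24 of CDW10 and
 * the Protocol Specific field (SPSP) in bits 23:08, which is the packing
 * nvme_sec_submit() builds above.  Hypothetical helper:
 */
static inline u32 nvme_example_security_cdw10(u8 secp, u16 spsp)
{
        return ((u32)secp << 24) | ((u32)spsp << 8);
}
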
2360 #ifdef CONFIG_BLK_DEV_ZONED
2361 static int nvme_report_zones(struct gendisk *disk, sector_t sector,
2362                 unsigned int nr_zones, report_zones_cb cb, void *data)
2363 {
2364         return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
2365                         data);
2366 }
2367 #else
2368 #define nvme_report_zones       NULL
2369 #endif /* CONFIG_BLK_DEV_ZONED */
2370
2371 const struct block_device_operations nvme_bdev_ops = {
2372         .owner          = THIS_MODULE,
2373         .ioctl          = nvme_ioctl,
2374         .compat_ioctl   = blkdev_compat_ptr_ioctl,
2375         .open           = nvme_open,
2376         .release        = nvme_release,
2377         .getgeo         = nvme_getgeo,
2378         .get_unique_id  = nvme_get_unique_id,
2379         .report_zones   = nvme_report_zones,
2380         .pr_ops         = &nvme_pr_ops,
2381 };
2382
2383 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
2384                 u32 timeout, const char *op)
2385 {
2386         unsigned long timeout_jiffies = jiffies + timeout * HZ;
2387         u32 csts;
2388         int ret;
2389
2390         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2391                 if (csts == ~0)
2392                         return -ENODEV;
2393                 if ((csts & mask) == val)
2394                         break;
2395
2396                 usleep_range(1000, 2000);
2397                 if (fatal_signal_pending(current))
2398                         return -EINTR;
2399                 if (time_after(jiffies, timeout_jiffies)) {
2400                         dev_err(ctrl->device,
2401                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2402                                 op, csts);
2403                         return -ENODEV;
2404                 }
2405         }
2406
2407         return ret;
2408 }
2409
2410 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2411 {
2412         int ret;
2413
2414         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2415         if (shutdown)
2416                 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2417         else
2418                 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2419
2420         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2421         if (ret)
2422                 return ret;
2423
2424         if (shutdown) {
2425                 return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
2426                                        NVME_CSTS_SHST_CMPLT,
2427                                        ctrl->shutdown_timeout, "shutdown");
2428         }
2429         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2430                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2431         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
2432                                (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
2433 }
2434 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2435
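/*
 * Editor's note: worked example only, not part of the driver.  CAP.TO is
 * expressed in 500 ms units, so the "(timeout + 1) / 2" above converts it to
 * whole seconds, rounding up: a CAP.TO of 3 (1.5 s) becomes a 2 second wait.
 * Hypothetical helper:
 */
static inline u32 nvme_example_cap_to_secs(u32 cap_to)
{
        return (cap_to + 1) / 2;
}
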
2436 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2437 {
2438         unsigned dev_page_min;
2439         u32 timeout;
2440         int ret;
2441
2442         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2443         if (ret) {
2444                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2445                 return ret;
2446         }
2447         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2448
2449         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2450                 dev_err(ctrl->device,
2451                         "Minimum device page size %u too large for host (%u)\n",
2452                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2453                 return -ENODEV;
2454         }
2455
2456         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2457                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2458         else
2459                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2460
2461         if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS)
2462                 ctrl->ctrl_config |= NVME_CC_CRIME;
2463
2464         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2465         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2466         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2467         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2468         if (ret)
2469                 return ret;
2470
2471         /* CAP value may change after initial CC write */
2472         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2473         if (ret)
2474                 return ret;
2475
2476         timeout = NVME_CAP_TIMEOUT(ctrl->cap);
2477         if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
2478                 u32 crto, ready_timeout;
2479
2480                 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
2481                 if (ret) {
2482                         dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
2483                                 ret);
2484                         return ret;
2485                 }
2486
2487                 /*
2488                  * CRTO should always be greater than or equal to CAP.TO, but some
2489                  * devices are known to get this wrong. Use the larger of the
2490                  * two values.
2491                  */
2492                 if (ctrl->ctrl_config & NVME_CC_CRIME)
2493                         ready_timeout = NVME_CRTO_CRIMT(crto);
2494                 else
2495                         ready_timeout = NVME_CRTO_CRWMT(crto);
2496
2497                 if (ready_timeout < timeout)
2498                         dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
2499                                       crto, ctrl->cap);
2500                 else
2501                         timeout = ready_timeout;
2502         }
2503
2504         ctrl->ctrl_config |= NVME_CC_ENABLE;
2505         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2506         if (ret)
2507                 return ret;
2508         return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
2509                                (timeout + 1) / 2, "initialisation");
2510 }
2511 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2512
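/*
 * Editor's note: illustrative sketch, not part of the driver.  CC.MPS
 * encodes the host memory page size as a power of two relative to 4 KiB
 * (page size = 2^(12 + MPS)), so the 4 KiB NVME_CTRL_PAGE_SHIFT used above
 * maps to an MPS of 0.  Hypothetical helper:
 */
static inline u32 nvme_example_cc_mps(unsigned int page_shift)
{
        return (page_shift - 12) << NVME_CC_MPS_SHIFT;
}
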
2513 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2514 {
2515         __le64 ts;
2516         int ret;
2517
2518         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2519                 return 0;
2520
2521         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2522         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2523                         NULL);
2524         if (ret)
2525                 dev_warn_once(ctrl->device,
2526                         "could not set timestamp (%d)\n", ret);
2527         return ret;
2528 }
2529
2530 static int nvme_configure_host_options(struct nvme_ctrl *ctrl)
2531 {
2532         struct nvme_feat_host_behavior *host;
2533         u8 acre = 0, lbafee = 0;
2534         int ret;
2535
2536         /* Don't bother enabling the feature if retry delay is not reported */
2537         if (ctrl->crdt[0])
2538                 acre = NVME_ENABLE_ACRE;
2539         if (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)
2540                 lbafee = NVME_ENABLE_LBAFEE;
2541
2542         if (!acre && !lbafee)
2543                 return 0;
2544
2545         host = kzalloc(sizeof(*host), GFP_KERNEL);
2546         if (!host)
2547                 return 0;
2548
2549         host->acre = acre;
2550         host->lbafee = lbafee;
2551         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2552                                 host, sizeof(*host), NULL);
2553         kfree(host);
2554         return ret;
2555 }
2556
2557 /*
2558  * Check whether the given total (exlat + enlat) latency of a power state
2559  * allows that state to be used as an APST transition target.  This is done
2560  * by comparing the latency to the primary and secondary latency tolerances
2561  * defined by the module parameters.  On a match, the corresponding timeout
2562  * value is written to *transition_time and the matching tolerance index
2563  * (1 or 2) is recorded in *last_index.
2564  */
2565 static bool nvme_apst_get_transition_time(u64 total_latency,
2566                 u64 *transition_time, unsigned *last_index)
2567 {
2568         if (total_latency <= apst_primary_latency_tol_us) {
2569                 if (*last_index == 1)
2570                         return false;
2571                 *last_index = 1;
2572                 *transition_time = apst_primary_timeout_ms;
2573                 return true;
2574         }
2575         if (apst_secondary_timeout_ms &&
2576                 total_latency <= apst_secondary_latency_tol_us) {
2577                 if (*last_index <= 2)
2578                         return false;
2579                 *last_index = 2;
2580                 *transition_time = apst_secondary_timeout_ms;
2581                 return true;
2582         }
2583         return false;
2584 }
2585
2586 /*
2587  * APST (Autonomous Power State Transition) lets us program a table of power
2588  * state transitions that the controller will perform automatically.
2589  *
2590  * Depending on module params, one of the two supported techniques will be used:
2591  *
2592  * - If the parameters provide explicit timeouts and tolerances, they will be
2593  *   used to build a table with up to 2 non-operational states to transition to.
2594  *   The default parameter values were selected based on the values used by
2595  *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
2596  *   regeneration of the APST table in the event of switching between external
2597  *   and battery power, the timeouts and tolerances reflect a compromise
2598  *   between values used by Microsoft for AC and battery scenarios.
2599  * - If not, we'll configure the table with a simple heuristic: we are willing
2600  *   to spend at most 2% of the time transitioning between power states.
2601  *   Therefore, when running in any given state, we will enter the next
2602  *   lower-power non-operational state after waiting 50 * (enlat + exlat)
2603  *   microseconds, as long as that state's exit latency is under the requested
2604  *   maximum latency.
2605  *
2606  * We will not autonomously enter any non-operational state for which the total
2607  * latency exceeds ps_max_latency_us.
2608  *
2609  * Users can set ps_max_latency_us to zero to turn off APST.
2610  */
2611 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2612 {
2613         struct nvme_feat_auto_pst *table;
2614         unsigned apste = 0;
2615         u64 max_lat_us = 0;
2616         __le64 target = 0;
2617         int max_ps = -1;
2618         int state;
2619         int ret;
2620         unsigned last_lt_index = UINT_MAX;
2621
2622         /*
2623          * If APST isn't supported or if we haven't been initialized yet,
2624          * then don't do anything.
2625          */
2626         if (!ctrl->apsta)
2627                 return 0;
2628
2629         if (ctrl->npss > 31) {
2630                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2631                 return 0;
2632         }
2633
2634         table = kzalloc(sizeof(*table), GFP_KERNEL);
2635         if (!table)
2636                 return 0;
2637
2638         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2639                 /* Turn off APST. */
2640                 dev_dbg(ctrl->device, "APST disabled\n");
2641                 goto done;
2642         }
2643
2644         /*
2645          * Walk through all states from lowest- to highest-power.
2646          * According to the spec, lower-numbered states use more power.  NPSS,
2647          * despite the name, is the index of the lowest-power state, not the
2648          * number of states.
2649          */
2650         for (state = (int)ctrl->npss; state >= 0; state--) {
2651                 u64 total_latency_us, exit_latency_us, transition_ms;
2652
2653                 if (target)
2654                         table->entries[state] = target;
2655
2656                 /*
2657                  * Don't allow transitions to the deepest state if it's quirked
2658                  * off.
2659                  */
2660                 if (state == ctrl->npss &&
2661                     (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2662                         continue;
2663
2664                 /*
2665                  * Is this state a useful non-operational state for higher-power
2666                  * states to autonomously transition to?
2667                  */
2668                 if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE))
2669                         continue;
2670
2671                 exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2672                 if (exit_latency_us > ctrl->ps_max_latency_us)
2673                         continue;
2674
2675                 total_latency_us = exit_latency_us +
2676                         le32_to_cpu(ctrl->psd[state].entry_lat);
2677
2678                 /*
2679                  * This state is good. It can be used as the APST idle target
2680                  * for higher power states.
2681                  */
2682                 if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
2683                         if (!nvme_apst_get_transition_time(total_latency_us,
2684                                         &transition_ms, &last_lt_index))
2685                                 continue;
2686                 } else {
2687                         transition_ms = total_latency_us + 19;
2688                         do_div(transition_ms, 20);
2689                         if (transition_ms > (1 << 24) - 1)
2690                                 transition_ms = (1 << 24) - 1;
2691                 }
2692
2693                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
2694                 if (max_ps == -1)
2695                         max_ps = state;
2696                 if (total_latency_us > max_lat_us)
2697                         max_lat_us = total_latency_us;
2698         }
2699
2700         if (max_ps == -1)
2701                 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2702         else
2703                 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2704                         max_ps, max_lat_us, (int)sizeof(*table), table);
2705         apste = 1;
2706
2707 done:
2708         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2709                                 table, sizeof(*table), NULL);
2710         if (ret)
2711                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2712         kfree(table);
2713         return ret;
2714 }
2715
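/*
 * Editor's note: worked example only, not part of the driver.  An APST table
 * entry packs the Idle Transition Power State into bits 07:03 and the Idle
 * Time Prior to Transition (in milliseconds) into bits 31:08, which is what
 * the "(state << 3) | (transition_ms << 8)" expression above builds before
 * cpu_to_le64().  Under the default 2% heuristic, a state whose entry plus
 * exit latency totals 10000 us gets an idle time of
 * DIV_ROUND_UP(10000, 20) = 500 ms.  Hypothetical helper:
 */
static inline u64 nvme_example_apst_entry(unsigned int state, u64 idle_ms)
{
        return ((u64)state << 3) | (idle_ms << 8);
}
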
2716 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2717 {
2718         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2719         u64 latency;
2720
2721         switch (val) {
2722         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2723         case PM_QOS_LATENCY_ANY:
2724                 latency = U64_MAX;
2725                 break;
2726
2727         default:
2728                 latency = val;
2729         }
2730
2731         if (ctrl->ps_max_latency_us != latency) {
2732                 ctrl->ps_max_latency_us = latency;
2733                 if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
2734                         nvme_configure_apst(ctrl);
2735         }
2736 }
2737
2738 struct nvme_core_quirk_entry {
2739         /*
2740          * NVMe model and firmware strings are padded with spaces.  For
2741          * simplicity, strings in the quirk table are padded with NULLs
2742          * instead.
2743          */
2744         u16 vid;
2745         const char *mn;
2746         const char *fr;
2747         unsigned long quirks;
2748 };
2749
2750 static const struct nvme_core_quirk_entry core_quirks[] = {
2751         {
2752                 /*
2753                  * This Toshiba device seems to die using any APST states.  See:
2754                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2755                  */
2756                 .vid = 0x1179,
2757                 .mn = "THNSF5256GPUK TOSHIBA",
2758                 .quirks = NVME_QUIRK_NO_APST,
2759         },
2760         {
2761                 /*
2762                  * This LiteON CL1-3D*-Q11 firmware version has a race
2763                  * condition associated with actions related to suspend to idle.
2764                  * LiteON has resolved the problem in later firmware releases.
2765                  */
2766                 .vid = 0x14a4,
2767                 .fr = "22301111",
2768                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2769         },
2770         {
2771                 /*
2772                  * This Kioxia CD6-V Series / HPE PE8030 device times out and
2773                  * aborts I/O during any load, but more easily reproducible
2774                  * with discards (fstrim).
2775                  *
2776                  * The device is left in a state where it is also not possible
2777                  * to use "nvme set-feature" to disable APST, but booting with
2778                  * nvme_core.default_ps_max_latency=0 works.
2779                  */
2780                 .vid = 0x1e0f,
2781                 .mn = "KCD6XVUL6T40",
2782                 .quirks = NVME_QUIRK_NO_APST,
2783         },
2784         {
2785                 /*
2786                  * The external Samsung X5 SSD fails initialization without a
2787                  * delay before checking if it is ready and has a whole set of
2788                  * other problems.  To make this even more interesting, it
2789                  * shares the PCI ID with internal Samsung 970 Evo Plus that
2790                  * does not need or want these quirks.
2791                  */
2792                 .vid = 0x144d,
2793                 .mn = "Samsung Portable SSD X5",
2794                 .quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
2795                           NVME_QUIRK_NO_DEEPEST_PS |
2796                           NVME_QUIRK_IGNORE_DEV_SUBNQN,
2797         }
2798 };
2799
2800 /* match is null-terminated but idstr is space-padded. */
2801 static bool string_matches(const char *idstr, const char *match, size_t len)
2802 {
2803         size_t matchlen;
2804
2805         if (!match)
2806                 return true;
2807
2808         matchlen = strlen(match);
2809         WARN_ON_ONCE(matchlen > len);
2810
2811         if (memcmp(idstr, match, matchlen))
2812                 return false;
2813
2814         for (; matchlen < len; matchlen++)
2815                 if (idstr[matchlen] != ' ')
2816                         return false;
2817
2818         return true;
2819 }
2820
2821 static bool quirk_matches(const struct nvme_id_ctrl *id,
2822                           const struct nvme_core_quirk_entry *q)
2823 {
2824         return q->vid == le16_to_cpu(id->vid) &&
2825                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2826                 string_matches(id->fr, q->fr, sizeof(id->fr));
2827 }
2828
2829 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2830                 struct nvme_id_ctrl *id)
2831 {
2832         size_t nqnlen;
2833         int off;
2834
2835         if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2836                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2837                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2838                         strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2839                         return;
2840                 }
2841
2842                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2843                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2844         }
2845
2846         /*
2847          * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
2848          * Base Specification 2.0.  It is slightly different from the format
2849          * specified there due to historic reasons, and we can't change it now.
2850          */
2851         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2852                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2853                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2854         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2855         off += sizeof(id->sn);
2856         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2857         off += sizeof(id->mn);
2858         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2859 }
2860
2861 static void nvme_release_subsystem(struct device *dev)
2862 {
2863         struct nvme_subsystem *subsys =
2864                 container_of(dev, struct nvme_subsystem, dev);
2865
2866         if (subsys->instance >= 0)
2867                 ida_free(&nvme_instance_ida, subsys->instance);
2868         kfree(subsys);
2869 }
2870
2871 static void nvme_destroy_subsystem(struct kref *ref)
2872 {
2873         struct nvme_subsystem *subsys =
2874                         container_of(ref, struct nvme_subsystem, ref);
2875
2876         mutex_lock(&nvme_subsystems_lock);
2877         list_del(&subsys->entry);
2878         mutex_unlock(&nvme_subsystems_lock);
2879
2880         ida_destroy(&subsys->ns_ida);
2881         device_del(&subsys->dev);
2882         put_device(&subsys->dev);
2883 }
2884
2885 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2886 {
2887         kref_put(&subsys->ref, nvme_destroy_subsystem);
2888 }
2889
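/*
 * Look up a subsystem by NQN and take a reference on it.  The caller must
 * hold nvme_subsystems_lock.  Discovery subsystems never match, see the
 * comment below.
 */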
2890 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2891 {
2892         struct nvme_subsystem *subsys;
2893
2894         lockdep_assert_held(&nvme_subsystems_lock);
2895
2896         /*
2897          * Fail matches for discovery subsystems. This results
2898          * in each discovery controller being bound to a unique subsystem.
2899          * This avoids issues with validating controller values
2900          * that can only be true when there is a single unique subsystem.
2901          * There may be multiple and completely independent entities
2902          * that provide discovery controllers.
2903          */
2904         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2905                 return NULL;
2906
2907         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2908                 if (strcmp(subsys->subnqn, subsysnqn))
2909                         continue;
2910                 if (!kref_get_unless_zero(&subsys->ref))
2911                         continue;
2912                 return subsys;
2913         }
2914
2915         return NULL;
2916 }
2917
2918 static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
2919 {
2920         return ctrl->opts && ctrl->opts->discovery_nqn;
2921 }
2922
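/*
 * Make sure this controller can be added to the subsystem: reject duplicate
 * CNTLIDs, and reject additional controllers when CMIC does not advertise
 * multi-controller support (discovery controllers are exempt).
 */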
2923 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2924                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2925 {
2926         struct nvme_ctrl *tmp;
2927
2928         lockdep_assert_held(&nvme_subsystems_lock);
2929
2930         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2931                 if (nvme_state_terminal(tmp))
2932                         continue;
2933
2934                 if (tmp->cntlid == ctrl->cntlid) {
2935                         dev_err(ctrl->device,
2936                                 "Duplicate cntlid %u with %s, subsys %s, rejecting\n",
2937                                 ctrl->cntlid, dev_name(tmp->device),
2938                                 subsys->subnqn);
2939                         return false;
2940                 }
2941
2942                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2943                     nvme_discovery_ctrl(ctrl))
2944                         continue;
2945
2946                 dev_err(ctrl->device,
2947                         "Subsystem does not support multiple controllers\n");
2948                 return false;
2949         }
2950
2951         return true;
2952 }
2953
2954 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2955 {
2956         struct nvme_subsystem *subsys, *found;
2957         int ret;
2958
2959         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2960         if (!subsys)
2961                 return -ENOMEM;
2962
2963         subsys->instance = -1;
2964         mutex_init(&subsys->lock);
2965         kref_init(&subsys->ref);
2966         INIT_LIST_HEAD(&subsys->ctrls);
2967         INIT_LIST_HEAD(&subsys->nsheads);
2968         nvme_init_subnqn(subsys, ctrl, id);
2969         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2970         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2971         subsys->vendor_id = le16_to_cpu(id->vid);
2972         subsys->cmic = id->cmic;
2973
2974         /* Versions prior to 1.4 don't necessarily report a valid type */
2975         if (id->cntrltype == NVME_CTRL_DISC ||
2976             !strcmp(subsys->subnqn, NVME_DISC_SUBSYS_NAME))
2977                 subsys->subtype = NVME_NQN_DISC;
2978         else
2979                 subsys->subtype = NVME_NQN_NVME;
2980
2981         if (nvme_discovery_ctrl(ctrl) && subsys->subtype != NVME_NQN_DISC) {
2982                 dev_err(ctrl->device,
2983                         "Subsystem %s is not a discovery controller",
2984                         subsys->subnqn);
2985                 kfree(subsys);
2986                 return -EINVAL;
2987         }
2988         subsys->awupf = le16_to_cpu(id->awupf);
2989         nvme_mpath_default_iopolicy(subsys);
2990
2991         subsys->dev.class = &nvme_subsys_class;
2992         subsys->dev.release = nvme_release_subsystem;
2993         subsys->dev.groups = nvme_subsys_attrs_groups;
2994         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2995         device_initialize(&subsys->dev);
2996
2997         mutex_lock(&nvme_subsystems_lock);
2998         found = __nvme_find_get_subsystem(subsys->subnqn);
2999         if (found) {
3000                 put_device(&subsys->dev);
3001                 subsys = found;
3002
3003                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
3004                         ret = -EINVAL;
3005                         goto out_put_subsystem;
3006                 }
3007         } else {
3008                 ret = device_add(&subsys->dev);
3009                 if (ret) {
3010                         dev_err(ctrl->device,
3011                                 "failed to register subsystem device.\n");
3012                         put_device(&subsys->dev);
3013                         goto out_unlock;
3014                 }
3015                 ida_init(&subsys->ns_ida);
3016                 list_add_tail(&subsys->entry, &nvme_subsystems);
3017         }
3018
3019         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
3020                                 dev_name(ctrl->device));
3021         if (ret) {
3022                 dev_err(ctrl->device,
3023                         "failed to create sysfs link from subsystem.\n");
3024                 goto out_put_subsystem;
3025         }
3026
3027         if (!found)
3028                 subsys->instance = ctrl->instance;
3029         ctrl->subsys = subsys;
3030         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
3031         mutex_unlock(&nvme_subsystems_lock);
3032         return 0;
3033
3034 out_put_subsystem:
3035         nvme_put_subsystem(subsys);
3036 out_unlock:
3037         mutex_unlock(&nvme_subsystems_lock);
3038         return ret;
3039 }
3040
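/*
 * Submit a Get Log Page admin command.  The dword count is split across the
 * NUMDL/NUMDU fields and the byte offset across LPOL/LPOU.
 */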
3041 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
3042                 void *log, size_t size, u64 offset)
3043 {
3044         struct nvme_command c = { };
3045         u32 dwlen = nvme_bytes_to_numd(size);
3046
3047         c.get_log_page.opcode = nvme_admin_get_log_page;
3048         c.get_log_page.nsid = cpu_to_le32(nsid);
3049         c.get_log_page.lid = log_page;
3050         c.get_log_page.lsp = lsp;
3051         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
3052         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
3053         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
3054         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
3055         c.get_log_page.csi = csi;
3056
3057         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
3058 }
3059
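/*
 * Return the Commands Supported and Effects log for the given command set,
 * fetching it from the controller on first use and caching it in ctrl->cels.
 */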
3060 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
3061                                 struct nvme_effects_log **log)
3062 {
3063         struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi);
3064         int ret;
3065
3066         if (cel)
3067                 goto out;
3068
3069         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
3070         if (!cel)
3071                 return -ENOMEM;
3072
3073         ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
3074                         cel, sizeof(*cel), 0);
3075         if (ret) {
3076                 kfree(cel);
3077                 return ret;
3078         }
3079
3080         xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
3081 out:
3082         *log = cel;
3083         return 0;
3084 }
3085
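/*
 * Convert a limit expressed as a power of two in units of the controller's
 * minimum memory page size (MPSMIN) into 512-byte sectors, saturating at
 * UINT_MAX on overflow.
 */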
3086 static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)
3087 {
3088         u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val;
3089
3090         if (check_shl_overflow(1U, units + page_shift - 9, &val))
3091                 return UINT_MAX;
3092         return val;
3093 }
3094
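/*
 * Initialize the limits that are not derived from MDTS: clamp Write Zeroes
 * to max_hw_sectors (or disable it when quirked), and read the NVM command
 * set specific Identify Controller data to pick up DMRL, DMRSL and WZSL.
 */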
3095 static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
3096 {
3097         struct nvme_command c = { };
3098         struct nvme_id_ctrl_nvm *id;
3099         int ret;
3100
3101         /*
3102          * Even though the NVMe spec explicitly states that MDTS is not
3103          * applicable to Write Zeroes, we are cautious and limit the size to
3104          * the controller's max_hw_sectors value, which is based on the MDTS
3105          * field and possibly other limiting factors.
3106          */
3107         if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
3108             !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
3109                 ctrl->max_zeroes_sectors = ctrl->max_hw_sectors;
3110         else
3111                 ctrl->max_zeroes_sectors = 0;
3112
3113         if (ctrl->subsys->subtype != NVME_NQN_NVME ||
3114             nvme_ctrl_limited_cns(ctrl) ||
3115             test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags))
3116                 return 0;
3117
3118         id = kzalloc(sizeof(*id), GFP_KERNEL);
3119         if (!id)
3120                 return -ENOMEM;
3121
3122         c.identify.opcode = nvme_admin_identify;
3123         c.identify.cns = NVME_ID_CNS_CS_CTRL;
3124         c.identify.csi = NVME_CSI_NVM;
3125
3126         ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
3127         if (ret)
3128                 goto free_data;
3129
3130         ctrl->dmrl = id->dmrl;
3131         ctrl->dmrsl = le32_to_cpu(id->dmrsl);
3132         if (id->wzsl)
3133                 ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
3134
3135 free_data:
3136         if (ret > 0)
3137                 set_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags);
3138         kfree(id);
3139         return ret;
3140 }
3141
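/*
 * Augment the command effects log with effects the driver relies on
 * regardless of what the controller reports, and mask off the Security
 * Receive CSE bits (see the comment below).
 */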
3142 static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
3143 {
3144         struct nvme_effects_log *log = ctrl->effects;
3145
3146         log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3147                                                 NVME_CMD_EFFECTS_NCC |
3148                                                 NVME_CMD_EFFECTS_CSE_MASK);
3149         log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC |
3150                                                 NVME_CMD_EFFECTS_CSE_MASK);
3151
3152         /*
3153          * The spec says the result of a security receive command depends on
3154          * the previous security send command. As such, many vendors log this
3155          * command as one to be submitted only when no other commands to the same
3156          * namespace are outstanding. The intention is to tell the host to
3157          * prevent mixing security send and receive.
3158          *
3159          * This driver can only enforce such exclusive access against IO
3160          * queues, though. We are not readily able to enforce such a rule for
3161          * two commands to the admin queue, which is the only queue that
3162          * matters for this command.
3163          *
3164          * Rather than blindly freezing the IO queues for this effect that
3165          * doesn't even apply to IO, mask it off.
3166          */
3167         log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK);
3168
3169         log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3170         log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3171         log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC);
3172 }
3173
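/*
 * Set up ctrl->effects from the Commands Supported and Effects log when the
 * controller provides one, falling back to an empty log otherwise, and then
 * apply the known NVM command set effects.
 */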
3174 static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3175 {
3176         int ret = 0;
3177
3178         if (ctrl->effects)
3179                 return 0;
3180
3181         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3182                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3183                 if (ret < 0)
3184                         return ret;
3185         }
3186
3187         if (!ctrl->effects) {
3188                 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
3189                 if (!ctrl->effects)
3190                         return -ENOMEM;
3191                 xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
3192         }
3193
3194         nvme_init_known_nvm_effects(ctrl);
3195         return 0;
3196 }
3197
3198 static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
3199 {
3200         /*
3201          * In fabrics we need to verify the cntlid matches the
3202          * admin connect
3203          */
3204         if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3205                 dev_err(ctrl->device,
3206                         "Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
3207                         ctrl->cntlid, le16_to_cpu(id->cntlid));
3208                 return -EINVAL;
3209         }
3210
3211         if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
3212                 dev_err(ctrl->device,
3213                         "keep-alive support is mandatory for fabrics\n");
3214                 return -EINVAL;
3215         }
3216
3217         if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
3218                 dev_err(ctrl->device,
3219                         "I/O queue command capsule supported size %d < 4\n",
3220                         ctrl->ioccsz);
3221                 return -EINVAL;
3222         }
3223
3224         if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
3225                 dev_err(ctrl->device,
3226                         "I/O queue response capsule supported size %d < 1\n",
3227                         ctrl->iorcsz);
3228                 return -EINVAL;
3229         }
3230
3231         if (!ctrl->maxcmd) {
3232                 dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
3233                 return -EINVAL;
3234         }
3235
3236         return 0;
3237 }
3238
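/*
 * Read the Identify Controller data and cache the controller limits, quirks,
 * subsystem, effects log, power state and fabrics parameters derived from it.
 */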
3239 static int nvme_init_identify(struct nvme_ctrl *ctrl)
3240 {
3241         struct queue_limits lim;
3242         struct nvme_id_ctrl *id;
3243         u32 max_hw_sectors;
3244         bool prev_apst_enabled;
3245         int ret;
3246
3247         ret = nvme_identify_ctrl(ctrl, &id);
3248         if (ret) {
3249                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3250                 return -EIO;
3251         }
3252
3253         if (!(ctrl->ops->flags & NVME_F_FABRICS))
3254                 ctrl->cntlid = le16_to_cpu(id->cntlid);
3255
3256         if (!ctrl->identified) {
3257                 unsigned int i;
3258
3259                 /*
3260                  * Check for quirks.  Quirks can depend on firmware version,
3261                  * so, in principle, the set of quirks present can change
3262                  * across a reset.  As a possible future enhancement, we
3263                  * could re-scan for quirks every time we reinitialize
3264                  * the device, but we'd have to make sure that the driver
3265                  * behaves intelligently if the quirks change.
3266                  */
3267                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3268                         if (quirk_matches(id, &core_quirks[i]))
3269                                 ctrl->quirks |= core_quirks[i].quirks;
3270                 }
3271
3272                 ret = nvme_init_subsystem(ctrl, id);
3273                 if (ret)
3274                         goto out_free;
3275
3276                 ret = nvme_init_effects(ctrl, id);
3277                 if (ret)
3278                         goto out_free;
3279         }
3280         memcpy(ctrl->subsys->firmware_rev, id->fr,
3281                sizeof(ctrl->subsys->firmware_rev));
3282
3283         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3284                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3285                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3286         }
3287
3288         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3289         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3290         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3291
3292         ctrl->oacs = le16_to_cpu(id->oacs);
3293         ctrl->oncs = le16_to_cpu(id->oncs);
3294         ctrl->mtfa = le16_to_cpu(id->mtfa);
3295         ctrl->oaes = le32_to_cpu(id->oaes);
3296         ctrl->wctemp = le16_to_cpu(id->wctemp);
3297         ctrl->cctemp = le16_to_cpu(id->cctemp);
3298
3299         atomic_set(&ctrl->abort_limit, id->acl + 1);
3300         ctrl->vwc = id->vwc;
3301         if (id->mdts)
3302                 max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);
3303         else
3304                 max_hw_sectors = UINT_MAX;
3305         ctrl->max_hw_sectors =
3306                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3307
3308         lim = queue_limits_start_update(ctrl->admin_q);
3309         nvme_set_ctrl_limits(ctrl, &lim);
3310         ret = queue_limits_commit_update(ctrl->admin_q, &lim);
3311         if (ret)
3312                 goto out_free;
3313
3314         ctrl->sgls = le32_to_cpu(id->sgls);
3315         ctrl->kas = le16_to_cpu(id->kas);
3316         ctrl->max_namespaces = le32_to_cpu(id->mnan);
3317         ctrl->ctratt = le32_to_cpu(id->ctratt);
3318
3319         ctrl->cntrltype = id->cntrltype;
3320         ctrl->dctype = id->dctype;
3321
3322         if (id->rtd3e) {
3323                 /* us -> s */
3324                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3325
3326                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3327                                                  shutdown_timeout, 60);
3328
3329                 if (ctrl->shutdown_timeout != shutdown_timeout)
3330                         dev_info(ctrl->device,
3331                                  "D3 entry latency set to %u seconds\n",
3332                                  ctrl->shutdown_timeout);
3333         } else
3334                 ctrl->shutdown_timeout = shutdown_timeout;
3335
3336         ctrl->npss = id->npss;
3337         ctrl->apsta = id->apsta;
3338         prev_apst_enabled = ctrl->apst_enabled;
3339         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3340                 if (force_apst && id->apsta) {
3341                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3342                         ctrl->apst_enabled = true;
3343                 } else {
3344                         ctrl->apst_enabled = false;
3345                 }
3346         } else {
3347                 ctrl->apst_enabled = id->apsta;
3348         }
3349         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3350
3351         if (ctrl->ops->flags & NVME_F_FABRICS) {
3352                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3353                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3354                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3355                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3356
3357                 ret = nvme_check_ctrl_fabric_info(ctrl, id);
3358                 if (ret)
3359                         goto out_free;
3360         } else {
3361                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3362                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3363                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3364                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3365         }
3366
3367         ret = nvme_mpath_init_identify(ctrl, id);
3368         if (ret < 0)
3369                 goto out_free;
3370
3371         if (ctrl->apst_enabled && !prev_apst_enabled)
3372                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3373         else if (!ctrl->apst_enabled && prev_apst_enabled)
3374                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3375
3376 out_free:
3377         kfree(id);
3378         return ret;
3379 }
3380
3381 /*
3382  * Initialize the cached copies of the Identify data and various controller
3383  * registers in our nvme_ctrl structure.  This should be called as soon as
3384  * the admin queue is fully up and running.
3385  */
3386 int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
3387 {
3388         int ret;
3389
3390         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3391         if (ret) {
3392                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3393                 return ret;
3394         }
3395
3396         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3397
3398         if (ctrl->vs >= NVME_VS(1, 1, 0))
3399                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3400
3401         ret = nvme_init_identify(ctrl);
3402         if (ret)
3403                 return ret;
3404
3405         ret = nvme_configure_apst(ctrl);
3406         if (ret < 0)
3407                 return ret;
3408
3409         ret = nvme_configure_timestamp(ctrl);
3410         if (ret < 0)
3411                 return ret;
3412
3413         ret = nvme_configure_host_options(ctrl);
3414         if (ret < 0)
3415                 return ret;
3416
3417         nvme_configure_opal(ctrl, was_suspended);
3418
3419         if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
3420                 /*
3421                  * Do not return errors unless we are in a controller reset;
3422                  * the controller works perfectly fine without hwmon.
3423                  */
3424                 ret = nvme_hwmon_init(ctrl);
3425                 if (ret == -EINTR)
3426                         return ret;
3427         }
3428
3429         clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags);
3430         ctrl->identified = true;
3431
3432         nvme_start_keep_alive(ctrl);
3433
3434         return 0;
3435 }
3436 EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);
3437
3438 static int nvme_dev_open(struct inode *inode, struct file *file)
3439 {
3440         struct nvme_ctrl *ctrl =
3441                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3442
3443         switch (nvme_ctrl_state(ctrl)) {
3444         case NVME_CTRL_LIVE:
3445                 break;
3446         default:
3447                 return -EWOULDBLOCK;
3448         }
3449
3450         nvme_get_ctrl(ctrl);
3451         if (!try_module_get(ctrl->ops->module)) {
3452                 nvme_put_ctrl(ctrl);
3453                 return -EINVAL;
3454         }
3455
3456         file->private_data = ctrl;
3457         return 0;
3458 }
3459
3460 static int nvme_dev_release(struct inode *inode, struct file *file)
3461 {
3462         struct nvme_ctrl *ctrl =
3463                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3464
3465         module_put(ctrl->ops->module);
3466         nvme_put_ctrl(ctrl);
3467         return 0;
3468 }
3469
3470 static const struct file_operations nvme_dev_fops = {
3471         .owner          = THIS_MODULE,
3472         .open           = nvme_dev_open,
3473         .release        = nvme_dev_release,
3474         .unlocked_ioctl = nvme_dev_ioctl,
3475         .compat_ioctl   = compat_ptr_ioctl,
3476         .uring_cmd      = nvme_dev_uring_cmd,
3477 };
3478
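/*
 * Find an existing namespace head for the given NSID and take a reference on
 * it.  The caller must hold the subsystem lock.
 */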
3479 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
3480                 unsigned nsid)
3481 {
3482         struct nvme_ns_head *h;
3483
3484         lockdep_assert_held(&ctrl->subsys->lock);
3485
3486         list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
3487                 /*
3488                  * Private namespaces can share NSIDs under some conditions.
3489                  * In that case we can't use the same ns_head for namespaces
3490                  * with the same NSID.
3491                  */
3492                 if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
3493                         continue;
3494                 if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
3495                         return h;
3496         }
3497
3498         return NULL;
3499 }
3500
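/*
 * Return -EINVAL if any of the namespace identifiers (UUID, NGUID or EUI-64)
 * is already used by another namespace head in this subsystem.  The caller
 * must hold the subsystem lock.
 */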
3501 static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
3502                 struct nvme_ns_ids *ids)
3503 {
3504         bool has_uuid = !uuid_is_null(&ids->uuid);
3505         bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
3506         bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
3507         struct nvme_ns_head *h;
3508
3509         lockdep_assert_held(&subsys->lock);
3510
3511         list_for_each_entry(h, &subsys->nsheads, entry) {
3512                 if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
3513                         return -EINVAL;
3514                 if (has_nguid &&
3515                     memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
3516                         return -EINVAL;
3517                 if (has_eui64 &&
3518                     memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
3519                         return -EINVAL;
3520         }
3521
3522         return 0;
3523 }
3524
3525 static void nvme_cdev_rel(struct device *dev)
3526 {
3527         ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt));
3528 }
3529
3530 void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device)
3531 {
3532         cdev_device_del(cdev, cdev_device);
3533         put_device(cdev_device);
3534 }
3535
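/*
 * Allocate a minor number and register the character device together with
 * its struct device; the minor is released from the device release callback.
 */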
3536 int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
3537                 const struct file_operations *fops, struct module *owner)
3538 {
3539         int minor, ret;
3540
3541         minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL);
3542         if (minor < 0)
3543                 return minor;
3544         cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
3545         cdev_device->class = &nvme_ns_chr_class;
3546         cdev_device->release = nvme_cdev_rel;
3547         device_initialize(cdev_device);
3548         cdev_init(cdev, fops);
3549         cdev->owner = owner;
3550         ret = cdev_device_add(cdev, cdev_device);
3551         if (ret)
3552                 put_device(cdev_device);
3553
3554         return ret;
3555 }
3556
3557 static int nvme_ns_chr_open(struct inode *inode, struct file *file)
3558 {
3559         return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev));
3560 }
3561
3562 static int nvme_ns_chr_release(struct inode *inode, struct file *file)
3563 {
3564         nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev));
3565         return 0;
3566 }
3567
3568 static const struct file_operations nvme_ns_chr_fops = {
3569         .owner          = THIS_MODULE,
3570         .open           = nvme_ns_chr_open,
3571         .release        = nvme_ns_chr_release,
3572         .unlocked_ioctl = nvme_ns_chr_ioctl,
3573         .compat_ioctl   = compat_ptr_ioctl,
3574         .uring_cmd      = nvme_ns_chr_uring_cmd,
3575         .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
3576 };
3577
3578 static int nvme_add_ns_cdev(struct nvme_ns *ns)
3579 {
3580         int ret;
3581
3582         ns->cdev_device.parent = ns->ctrl->device;
3583         ret = dev_set_name(&ns->cdev_device, "ng%dn%d",
3584                            ns->ctrl->instance, ns->head->instance);
3585         if (ret)
3586                 return ret;
3587
3588         return nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops,
3589                              ns->ctrl->ops->module);
3590 }
3591
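/*
 * Allocate a namespace head for the given namespace info, set up its SRCU
 * state and effects log, create the multipath disk if configured, and link
 * the head into the subsystem's nsheads list.
 */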
3592 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3593                 struct nvme_ns_info *info)
3594 {
3595         struct nvme_ns_head *head;
3596         size_t size = sizeof(*head);
3597         int ret = -ENOMEM;
3598
3599 #ifdef CONFIG_NVME_MULTIPATH
3600         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3601 #endif
3602
3603         head = kzalloc(size, GFP_KERNEL);
3604         if (!head)
3605                 goto out;
3606         ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL);
3607         if (ret < 0)
3608                 goto out_free_head;
3609         head->instance = ret;
3610         INIT_LIST_HEAD(&head->list);
3611         ret = init_srcu_struct(&head->srcu);
3612         if (ret)
3613                 goto out_ida_remove;
3614         head->subsys = ctrl->subsys;
3615         head->ns_id = info->nsid;
3616         head->ids = info->ids;
3617         head->shared = info->is_shared;
3618         ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
3619         ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
3620         kref_init(&head->ref);
3621
3622         if (head->ids.csi) {
3623                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3624                 if (ret)
3625                         goto out_cleanup_srcu;
3626         } else
3627                 head->effects = ctrl->effects;
3628
3629         ret = nvme_mpath_alloc_disk(ctrl, head);
3630         if (ret)
3631                 goto out_cleanup_srcu;
3632
3633         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3634
3635         kref_get(&ctrl->subsys->ref);
3636
3637         return head;
3638 out_cleanup_srcu:
3639         cleanup_srcu_struct(&head->srcu);
3640 out_ida_remove:
3641         ida_free(&ctrl->subsys->ns_ida, head->instance);
3642 out_free_head:
3643         kfree(head);
3644 out:
3645         if (ret > 0)
3646                 ret = blk_status_to_errno(nvme_error_status(ret));
3647         return ERR_PTR(ret);
3648 }
3649
3650 static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this,
3651                 struct nvme_ns_ids *ids)
3652 {
3653         struct nvme_subsystem *s;
3654         int ret = 0;
3655
3656         /*
3657          * Note that this check is racy as we try to avoid holding the global
3658          * lock over the whole ns_head creation.  But it is only intended as
3659          * a sanity check anyway.
3660          */
3661         mutex_lock(&nvme_subsystems_lock);
3662         list_for_each_entry(s, &nvme_subsystems, entry) {
3663                 if (s == this)
3664                         continue;
3665                 mutex_lock(&s->lock);
3666                 ret = nvme_subsys_check_duplicate_ids(s, ids);
3667                 mutex_unlock(&s->lock);
3668                 if (ret)
3669                         break;
3670         }
3671         mutex_unlock(&nvme_subsystems_lock);
3672
3673         return ret;
3674 }
3675
3676 static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
3677 {
3678         struct nvme_ctrl *ctrl = ns->ctrl;
3679         struct nvme_ns_head *head = NULL;
3680         int ret;
3681
3682         ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids);
3683         if (ret) {
3684                 /*
3685                  * We've found two different namespaces on two different
3686                  * subsystems that report the same ID.  This is pretty nasty
3687                  * for anything that actually requires unique device
3688                  * identification.  In the kernel we need this for multipathing,
3689                  * and in user space the /dev/disk/by-id/ links rely on it.
3690                  *
3691                  * If the device also claims to be multi-path capable, back off
3692                  * here now and refuse to probe the second device, as this is a
3693                  * recipe for data corruption.  If not, and the device is on the
3694                  * PCIe bus, it is probably a cheap consumer device, so let the
3695                  * user proceed and use the shiny toy, but warn that with a
3696                  * changing probing order (which due to our async probing could
3697                  * just be a device taking longer to start up) the other device
3698                  * could show up at any time.
3699                  */
3700                 nvme_print_device_info(ctrl);
3701                 if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */
3702                     ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
3703                      info->is_shared)) {
3704                         dev_err(ctrl->device,
3705                                 "ignoring nsid %d because of duplicate IDs\n",
3706                                 info->nsid);
3707                         return ret;
3708                 }
3709
3710                 dev_err(ctrl->device,
3711                         "clearing duplicate IDs for nsid %d\n", info->nsid);
3712                 dev_err(ctrl->device,
3713                         "use of /dev/disk/by-id/ may cause data corruption\n");
3714                 memset(&info->ids.nguid, 0, sizeof(info->ids.nguid));
3715                 memset(&info->ids.uuid, 0, sizeof(info->ids.uuid));
3716                 memset(&info->ids.eui64, 0, sizeof(info->ids.eui64));
3717                 ctrl->quirks |= NVME_QUIRK_BOGUS_NID;
3718         }
3719
3720         mutex_lock(&ctrl->subsys->lock);
3721         head = nvme_find_ns_head(ctrl, info->nsid);
3722         if (!head) {
3723                 ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
3724                 if (ret) {
3725                         dev_err(ctrl->device,
3726                                 "duplicate IDs in subsystem for nsid %d\n",
3727                                 info->nsid);
3728                         goto out_unlock;
3729                 }
3730                 head = nvme_alloc_ns_head(ctrl, info);
3731                 if (IS_ERR(head)) {
3732                         ret = PTR_ERR(head);
3733                         goto out_unlock;
3734                 }
3735         } else {
3736                 ret = -EINVAL;
3737                 if (!info->is_shared || !head->shared) {
3738                         dev_err(ctrl->device,
3739                                 "Duplicate unshared namespace %d\n",
3740                                 info->nsid);
3741                         goto out_put_ns_head;
3742                 }
3743                 if (!nvme_ns_ids_equal(&head->ids, &info->ids)) {
3744                         dev_err(ctrl->device,
3745                                 "IDs don't match for shared namespace %d\n",
3746                                         info->nsid);
3747                         goto out_put_ns_head;
3748                 }
3749
3750                 if (!multipath) {
3751                         dev_warn(ctrl->device,
3752                                 "Found shared namespace %d, but multipathing not supported.\n",
3753                                 info->nsid);
3754                         dev_warn_once(ctrl->device,
3755                                 "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
3756                 }
3757         }
3758
3759         list_add_tail_rcu(&ns->siblings, &head->list);
3760         ns->head = head;
3761         mutex_unlock(&ctrl->subsys->lock);
3762         return 0;
3763
3764 out_put_ns_head:
3765         nvme_put_ns_head(head);
3766 out_unlock:
3767         mutex_unlock(&ctrl->subsys->lock);
3768         return ret;
3769 }
3770
3771 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3772 {
3773         struct nvme_ns *ns, *ret = NULL;
3774         int srcu_idx;
3775
3776         srcu_idx = srcu_read_lock(&ctrl->srcu);
3777         list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
3778                 if (ns->head->ns_id == nsid) {
3779                         if (!nvme_get_ns(ns))
3780                                 continue;
3781                         ret = ns;
3782                         break;
3783                 }
3784                 if (ns->head->ns_id > nsid)
3785                         break;
3786         }
3787         srcu_read_unlock(&ctrl->srcu, srcu_idx);
3788         return ret;
3789 }
3790 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3791
3792 /*
3793  * Add the namespace to the controller list while keeping the list ordered.
3794  */
3795 static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
3796 {
3797         struct nvme_ns *tmp;
3798
3799         list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
3800                 if (tmp->head->ns_id < ns->head->ns_id) {
3801                         list_add_rcu(&ns->list, &tmp->list);
3802                         return;
3803                 }
3804         }
3805         list_add(&ns->list, &ns->ctrl->namespaces);
3806 }
3807
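/*
 * Allocate and set up a namespace for the given scan info: create the
 * gendisk, attach the namespace to a head, pick the disk name based on the
 * multipath configuration, and register the block and character devices.
 */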
3808 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
3809 {
3810         struct queue_limits lim = { };
3811         struct nvme_ns *ns;
3812         struct gendisk *disk;
3813         int node = ctrl->numa_node;
3814
3815         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3816         if (!ns)
3817                 return;
3818
3819         if (ctrl->opts && ctrl->opts->data_digest)
3820                 lim.features |= BLK_FEAT_STABLE_WRITES;
3821         if (ctrl->ops->supports_pci_p2pdma &&
3822             ctrl->ops->supports_pci_p2pdma(ctrl))
3823                 lim.features |= BLK_FEAT_PCI_P2PDMA;
3824
3825         disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
3826         if (IS_ERR(disk))
3827                 goto out_free_ns;
3828         disk->fops = &nvme_bdev_ops;
3829         disk->private_data = ns;
3830
3831         ns->disk = disk;
3832         ns->queue = disk->queue;
3833         ns->ctrl = ctrl;
3834         kref_init(&ns->kref);
3835
3836         if (nvme_init_ns_head(ns, info))
3837                 goto out_cleanup_disk;
3838
3839         /*
3840          * If multipathing is enabled, the device name for all disks, not
3841          * just those that represent shared namespaces, needs to be based on the
3842          * subsystem instance.  Using the controller instance for private
3843          * namespaces could lead to naming collisions between shared and private
3844          * namespaces if they don't use a common numbering scheme.
3845          *
3846          * If multipathing is not enabled, disk names must use the controller
3847          * instance as shared namespaces will show up as multiple block
3848          * devices.
3849          */
3850         if (nvme_ns_head_multipath(ns->head)) {
3851                 sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
3852                         ctrl->instance, ns->head->instance);
3853                 disk->flags |= GENHD_FL_HIDDEN;
3854         } else if (multipath) {
3855                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
3856                         ns->head->instance);
3857         } else {
3858                 sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
3859                         ns->head->instance);
3860         }
3861
3862         if (nvme_update_ns_info(ns, info))
3863                 goto out_unlink_ns;
3864
3865         mutex_lock(&ctrl->namespaces_lock);
3866         /*
3867          * Ensure that no namespaces are added to the ctrl list after the queues
3868          * are frozen, thereby avoiding a deadlock between scan and reset.
3869          */
3870         if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
3871                 mutex_unlock(&ctrl->namespaces_lock);
3872                 goto out_unlink_ns;
3873         }
3874         nvme_ns_add_to_ctrl_list(ns);
3875         mutex_unlock(&ctrl->namespaces_lock);
3876         synchronize_srcu(&ctrl->srcu);
3877         nvme_get_ctrl(ctrl);
3878
3879         if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
3880                 goto out_cleanup_ns_from_list;
3881
3882         if (!nvme_ns_head_multipath(ns->head))
3883                 nvme_add_ns_cdev(ns);
3884
3885         nvme_mpath_add_disk(ns, info->anagrpid);
3886         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3887
3888         /*
3889          * Set ns->disk->device->driver_data to ns so we can access
3890          * ns->head->passthru_err_log_enabled in
3891          * nvme_io_passthru_err_log_enabled_[store | show]().
3892          */
3893         dev_set_drvdata(disk_to_dev(ns->disk), ns);
3894
3895         return;
3896
3897  out_cleanup_ns_from_list:
3898         nvme_put_ctrl(ctrl);
3899         mutex_lock(&ctrl->namespaces_lock);
3900         list_del_rcu(&ns->list);
3901         mutex_unlock(&ctrl->namespaces_lock);
3902         synchronize_srcu(&ctrl->srcu);
3903  out_unlink_ns:
3904         mutex_lock(&ctrl->subsys->lock);
3905         list_del_rcu(&ns->siblings);
3906         if (list_empty(&ns->head->list))
3907                 list_del_init(&ns->head->entry);
3908         mutex_unlock(&ctrl->subsys->lock);
3909         nvme_put_ns_head(ns->head);
3910  out_cleanup_disk:
3911         put_disk(disk);
3912  out_free_ns:
3913         kfree(ns);
3914 }
3915
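/*
 * Tear down a namespace: clear NVME_NS_READY, unlink it from its head and
 * the controller list, remove the disk and character device, and shut down
 * the multipath disk when the last path goes away.
 */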
3916 static void nvme_ns_remove(struct nvme_ns *ns)
3917 {
3918         bool last_path = false;
3919
3920         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3921                 return;
3922
3923         clear_bit(NVME_NS_READY, &ns->flags);
3924         set_capacity(ns->disk, 0);
3925         nvme_fault_inject_fini(&ns->fault_inject);
3926
3927         /*
3928          * Ensure that !NVME_NS_READY is seen by other threads to prevent
3929          * this ns from going back into current_path.
3930          */
3931         synchronize_srcu(&ns->head->srcu);
3932
3933         /* wait for concurrent submissions */
3934         if (nvme_mpath_clear_current_path(ns))
3935                 synchronize_srcu(&ns->head->srcu);
3936
3937         mutex_lock(&ns->ctrl->subsys->lock);
3938         list_del_rcu(&ns->siblings);
3939         if (list_empty(&ns->head->list)) {
3940                 list_del_init(&ns->head->entry);
3941                 last_path = true;
3942         }
3943         mutex_unlock(&ns->ctrl->subsys->lock);
3944
3945         /* guarantee not available in head->list */
3946         synchronize_srcu(&ns->head->srcu);
3947
3948         if (!nvme_ns_head_multipath(ns->head))
3949                 nvme_cdev_del(&ns->cdev, &ns->cdev_device);
3950         del_gendisk(ns->disk);
3951
3952         mutex_lock(&ns->ctrl->namespaces_lock);
3953         list_del_rcu(&ns->list);
3954         mutex_unlock(&ns->ctrl->namespaces_lock);
3955         synchronize_srcu(&ns->ctrl->srcu);
3956
3957         if (last_path)
3958                 nvme_mpath_shutdown_disk(ns->head);
3959         nvme_put_ns(ns);
3960 }
3961
3962 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
3963 {
3964         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
3965
3966         if (ns) {
3967                 nvme_ns_remove(ns);
3968                 nvme_put_ns(ns);
3969         }
3970 }
3971
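/*
 * Revalidate a known namespace after a rescan; remove it if its identifiers
 * changed or if updating it failed with a DNR status.
 */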
3972 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
3973 {
3974         int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
3975
3976         if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
3977                 dev_err(ns->ctrl->device,
3978                         "identifiers changed for nsid %d\n", ns->head->ns_id);
3979                 goto out;
3980         }
3981
3982         ret = nvme_update_ns_info(ns, info);
3983 out:
3984         /*
3985          * Only remove the namespace if we got a fatal error back from the
3986          * device, otherwise ignore the error and just move on.
3987          *
3988          * TODO: we should probably schedule a delayed retry here.
3989          */
3990         if (ret > 0 && (ret & NVME_STATUS_DNR))
3991                 nvme_ns_remove(ns);
3992 }
3993
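/*
 * Scan a single NSID: read its identifiers, remove the namespace if the
 * controller reports it as removed, and otherwise validate the existing
 * namespace or allocate a new one once it is ready.
 */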
3994 static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3995 {
3996         struct nvme_ns_info info = { .nsid = nsid };
3997         struct nvme_ns *ns;
3998         int ret;
3999
4000         if (nvme_identify_ns_descs(ctrl, &info))
4001                 return;
4002
4003         if (info.ids.csi != NVME_CSI_NVM && !nvme_multi_css(ctrl)) {
4004                 dev_warn(ctrl->device,
4005                         "command set not reported for nsid: %d\n", nsid);
4006                 return;
4007         }
4008
4009         /*
4010          * If available, try to use the Command Set Independent Identify
4011          * Namespace data structure to find all the generic information that is
4012          * needed to set up a namespace.  If not, fall back to the legacy version.
4013          */
4014         if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
4015             (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS))
4016                 ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
4017         else
4018                 ret = nvme_ns_info_from_identify(ctrl, &info);
4019
4020         if (info.is_removed)
4021                 nvme_ns_remove_by_nsid(ctrl, nsid);
4022
4023         /*
4024          * Ignore the namespace if it is not ready. We will get an AEN once it
4025          * becomes ready and restart the scan.
4026          */
4027         if (ret || !info.is_ready)
4028                 return;
4029
4030         ns = nvme_find_get_ns(ctrl, nsid);
4031         if (ns) {
4032                 nvme_validate_ns(ns, &info);
4033                 nvme_put_ns(ns);
4034         } else {
4035                 nvme_alloc_ns(ctrl, &info);
4036         }
4037 }
4038
4039 /**
4040  * struct async_scan_info - keeps track of controller & NSIDs to scan
4041  * @ctrl:       Controller on which namespaces are being scanned
4042  * @next_nsid:  Index of next NSID to scan in ns_list
4043  * @ns_list:    Pointer to list of NSIDs to scan
4044  *
4045  * Note: There is a single async_scan_info structure shared by all instances
4046  * of nvme_scan_ns_async() scanning a given controller, so the atomic
4047  * operations on next_nsid are critical to ensure each instance scans a unique
4048  * NSID.
4049  */
4050 struct async_scan_info {
4051         struct nvme_ctrl *ctrl;
4052         atomic_t next_nsid;
4053         __le32 *ns_list;
4054 };
4055
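/*
 * Async callback that claims the next NSID from the shared scan_info and
 * scans it; one instance is scheduled per NSID in the current batch.
 */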
4056 static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
4057 {
4058         struct async_scan_info *scan_info = data;
4059         int idx;
4060         u32 nsid;
4061
4062         idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
4063         nsid = le32_to_cpu(scan_info->ns_list[idx]);
4064
4065         nvme_scan_ns(scan_info->ctrl, nsid);
4066 }
4067
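/*
 * Remove all namespaces with an NSID greater than the given one, pruning
 * namespaces beyond the range reported by the controller.
 */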
4068 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
4069                                         unsigned nsid)
4070 {
4071         struct nvme_ns *ns, *next;
4072         LIST_HEAD(rm_list);
4073
4074         mutex_lock(&ctrl->namespaces_lock);
4075         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4076                 if (ns->head->ns_id > nsid) {
4077                         list_del_rcu(&ns->list);
4078                         synchronize_srcu(&ctrl->srcu);
4079                         list_add_tail_rcu(&ns->list, &rm_list);
4080                 }
4081         }
4082         mutex_unlock(&ctrl->namespaces_lock);
4083
4084         list_for_each_entry_safe(ns, next, &rm_list, list)
4085                 nvme_ns_remove(ns);
4086 }
4087
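/*
 * Scan namespaces using the Identify Active Namespace ID list: walk the list
 * in 1024-entry batches, scan each reported NSID asynchronously, and remove
 * namespaces that fall into the gaps between reported NSIDs.
 */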
4088 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4089 {
4090         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4091         __le32 *ns_list;
4092         u32 prev = 0;
4093         int ret = 0, i;
4094         ASYNC_DOMAIN(domain);
4095         struct async_scan_info scan_info;
4096
4097         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4098         if (!ns_list)
4099                 return -ENOMEM;
4100
4101         scan_info.ctrl = ctrl;
4102         scan_info.ns_list = ns_list;
4103         for (;;) {
4104                 struct nvme_command cmd = {
4105                         .identify.opcode        = nvme_admin_identify,
4106                         .identify.cns           = NVME_ID_CNS_NS_ACTIVE_LIST,
4107                         .identify.nsid          = cpu_to_le32(prev),
4108                 };
4109
4110                 ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
4111                                             NVME_IDENTIFY_DATA_SIZE);
4112                 if (ret) {
4113                         dev_warn(ctrl->device,
4114                                 "Identify NS List failed (status=0x%x)\n", ret);
4115                         goto free;
4116                 }
4117
4118                 atomic_set(&scan_info.next_nsid, 0);
4119                 for (i = 0; i < nr_entries; i++) {
4120                         u32 nsid = le32_to_cpu(ns_list[i]);
4121
4122                         if (!nsid)      /* end of the list? */
4123                                 goto out;
4124                         async_schedule_domain(nvme_scan_ns_async, &scan_info,
4125                                                 &domain);
4126                         while (++prev < nsid)
4127                                 nvme_ns_remove_by_nsid(ctrl, prev);
4128                 }
4129                 async_synchronize_full_domain(&domain);
4130         }
4131  out:
4132         nvme_remove_invalid_namespaces(ctrl, prev);
4133  free:
4134         async_synchronize_full_domain(&domain);
4135         kfree(ns_list);
4136         return ret;
4137 }
4138
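/*
 * Fallback scan that probes every NSID from 1 up to the Number of Namespaces
 * (NN) reported by Identify Controller.
 */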
4139 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4140 {
4141         struct nvme_id_ctrl *id;
4142         u32 nn, i;
4143
4144         if (nvme_identify_ctrl(ctrl, &id))
4145                 return;
4146         nn = le32_to_cpu(id->nn);
4147         kfree(id);
4148
4149         for (i = 1; i <= nn; i++)
4150                 nvme_scan_ns(ctrl, i);
4151
4152         nvme_remove_invalid_namespaces(ctrl, nn);
4153 }
4154
4155 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4156 {
4157         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4158         __le32 *log;
4159         int error;
4160
4161         log = kzalloc(log_size, GFP_KERNEL);
4162         if (!log)
4163                 return;
4164
4165         /*
4166          * We need to read the log to clear the AEN, but we don't want to rely
4167          * on it for the changed namespace information as userspace could have
4168          * raced with us in reading the log page, which could cause us to miss
4169          * updates.
4170          */
4171         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4172                         NVME_CSI_NVM, log, log_size, 0);
4173         if (error)
4174                 dev_warn(ctrl->device,
4175                         "reading changed ns log failed: %d\n", error);
4176
4177         kfree(log);
4178 }
4179
4180 static void nvme_scan_work(struct work_struct *work)
4181 {
4182         struct nvme_ctrl *ctrl =
4183                 container_of(work, struct nvme_ctrl, scan_work);
4184         int ret;
4185
4186         /* No tagset on a live ctrl means IO queues could not be created */
4187         if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE || !ctrl->tagset)
4188                 return;
4189
4190         /*
4191          * Identify Controller limits can change across a controller reset due
4192          * to a new firmware download; although uncommon, we cannot ignore such
4193          * a scenario. The controller's non-MDTS limits are reported in units of
4194          * logical blocks, which depend on the format of the attached namespace.
4195          * Hence re-read the limits at the time of namespace allocation.
4196          */
4197         ret = nvme_init_non_mdts_limits(ctrl);
4198         if (ret < 0) {
4199                 dev_warn(ctrl->device,
4200                         "reading non-mdts-limits failed: %d\n", ret);
4201                 return;
4202         }
4203
4204         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4205                 dev_info(ctrl->device, "rescanning namespaces.\n");
4206                 nvme_clear_changed_ns_log(ctrl);
4207         }
4208
4209         mutex_lock(&ctrl->scan_lock);
4210         if (nvme_ctrl_limited_cns(ctrl)) {
4211                 nvme_scan_ns_sequential(ctrl);
4212         } else {
4213                 /*
4214                  * Fall back to sequential scan if DNR is set to handle broken
4215                  * devices which should support Identify NS List (as per the VS
4216                  * they report) but don't actually support it.
4217                  */
4218                 ret = nvme_scan_ns_list(ctrl);
4219                 if (ret > 0 && ret & NVME_STATUS_DNR)
4220                         nvme_scan_ns_sequential(ctrl);
4221         }
4222         mutex_unlock(&ctrl->scan_lock);
4223 }
4224
4225 /*
4226  * This function iterates the namespace list unlocked to allow recovery from
4227  * controller failure. It is up to the caller to ensure the namespace list is
4228  * not modified by scan work while this function is executing.
4229  */
4230 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4231 {
4232         struct nvme_ns *ns, *next;
4233         LIST_HEAD(ns_list);
4234
4235         /*
4236          * make sure to requeue I/O to all namespaces as these
4237          * might result from the scan itself and must complete
4238          * for the scan_work to make progress
4239          */
4240         nvme_mpath_clear_ctrl_paths(ctrl);
4241
4242         /*
4243          * Unquiesce io queues so any pending IO won't hang, especially
4244          * those submitted from scan work
4245          */
4246         nvme_unquiesce_io_queues(ctrl);
4247
4248         /* prevent racing with ns scanning */
4249         flush_work(&ctrl->scan_work);
4250
4251         /*
4252          * The dead state indicates the controller was not gracefully
4253          * disconnected. In that case, we won't be able to flush any data while
4254          * removing the namespaces' disks; fail all the queues now to avoid
4255          * potentially having to clean up the failed sync later.
4256          */
4257         if (nvme_ctrl_state(ctrl) == NVME_CTRL_DEAD)
4258                 nvme_mark_namespaces_dead(ctrl);
4259
4260         /* this is a no-op when called from the controller reset handler */
4261         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4262
4263         mutex_lock(&ctrl->namespaces_lock);
4264         list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
4265         mutex_unlock(&ctrl->namespaces_lock);
4266         synchronize_srcu(&ctrl->srcu);
4267
4268         list_for_each_entry_safe(ns, next, &ns_list, list)
4269                 nvme_ns_remove(ns);
4270 }
4271 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4272
4273 static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env)
4274 {
4275         const struct nvme_ctrl *ctrl =
4276                 container_of(dev, struct nvme_ctrl, ctrl_device);
4277         struct nvmf_ctrl_options *opts = ctrl->opts;
4278         int ret;
4279
4280         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4281         if (ret)
4282                 return ret;
4283
4284         if (opts) {
4285                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4286                 if (ret)
4287                         return ret;
4288
4289                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4290                                 opts->trsvcid ?: "none");
4291                 if (ret)
4292                         return ret;
4293
4294                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4295                                 opts->host_traddr ?: "none");
4296                 if (ret)
4297                         return ret;
4298
4299                 ret = add_uevent_var(env, "NVME_HOST_IFACE=%s",
4300                                 opts->host_iface ?: "none");
4301         }
4302         return ret;
4303 }
4304
4305 static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata)
4306 {
4307         char *envp[2] = { envdata, NULL };
4308
4309         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4310 }
4311
4312 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4313 {
4314         char *envp[2] = { NULL, NULL };
4315         u32 aen_result = ctrl->aen_result;
4316
4317         ctrl->aen_result = 0;
4318         if (!aen_result)
4319                 return;
4320
4321         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4322         if (!envp[0])
4323                 return;
4324         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4325         kfree(envp[0]);
4326 }
4327
4328 static void nvme_async_event_work(struct work_struct *work)
4329 {
4330         struct nvme_ctrl *ctrl =
4331                 container_of(work, struct nvme_ctrl, async_event_work);
4332
4333         nvme_aen_uevent(ctrl);
4334
4335         /*
4336          * The transport drivers must guarantee AER submission here is safe by
4337          * flushing ctrl async_event_work after changing the controller state
4338          * from LIVE and before freeing the admin queue.
4339          */
4340         if (nvme_ctrl_state(ctrl) == NVME_CTRL_LIVE)
4341                 ctrl->ops->submit_async_event(ctrl);
4342 }
4343
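/*
 * CSTS.PP (Processing Paused) is set while the controller pauses command
 * processing during a firmware activation.  A register read of all-ones
 * typically means the device is no longer reachable, so treat that as
 * "not paused".
 */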
4344 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4345 {
4346
4347         u32 csts;
4348
4349         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4350                 return false;
4351
4352         if (csts == ~0)
4353                 return false;
4354
4355         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4356 }
4357
4358 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4359 {
4360         struct nvme_fw_slot_info_log *log;
4361         u8 next_fw_slot, cur_fw_slot;
4362
4363         log = kmalloc(sizeof(*log), GFP_KERNEL);
4364         if (!log)
4365                 return;
4366
4367         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4368                          log, sizeof(*log), 0)) {
4369                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4370                 goto out_free_log;
4371         }
4372
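        /*
         * AFI (byte 0 of the log): bits 2:0 hold the currently active
         * firmware slot, bits 6:4 hold the slot that becomes active at the
         * next controller reset (0 = none pending).
         */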
4373         cur_fw_slot = log->afi & 0x7;
4374         next_fw_slot = (log->afi & 0x70) >> 4;
4375         if (!cur_fw_slot || (next_fw_slot && (cur_fw_slot != next_fw_slot))) {
4376                 dev_info(ctrl->device,
4377                          "Firmware is activated after next Controller Level Reset\n");
4378                 goto out_free_log;
4379         }
4380
4381         memcpy(ctrl->subsys->firmware_rev, &log->frs[cur_fw_slot - 1],
4382                 sizeof(ctrl->subsys->firmware_rev));
4383
4384 out_free_log:
4385         kfree(log);
4386 }
4387
4388 static void nvme_fw_act_work(struct work_struct *work)
4389 {
4390         struct nvme_ctrl *ctrl = container_of(work,
4391                                 struct nvme_ctrl, fw_act_work);
4392         unsigned long fw_act_timeout;
4393
4394         nvme_auth_stop(ctrl);
4395
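        /*
         * MTFA (from Identify Controller) is reported in units of 100 ms; a
         * value of zero means no limit is reported, so fall back to the
         * admin command timeout instead.
         */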
4396         if (ctrl->mtfa)
4397                 fw_act_timeout = jiffies +
4398                                 msecs_to_jiffies(ctrl->mtfa * 100);
4399         else
4400                 fw_act_timeout = jiffies +
4401                                 msecs_to_jiffies(admin_timeout * 1000);
4402
4403         nvme_quiesce_io_queues(ctrl);
4404         while (nvme_ctrl_pp_status(ctrl)) {
4405                 if (time_after(jiffies, fw_act_timeout)) {
4406                         dev_warn(ctrl->device,
4407                                 "Fw activation timeout, reset controller\n");
4408                         nvme_try_sched_reset(ctrl);
4409                         return;
4410                 }
4411                 msleep(100);
4412         }
4413
4414         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4415                 return;
4416
4417         nvme_unquiesce_io_queues(ctrl);
4418         /* read FW slot information to clear the AER */
4419         nvme_get_fw_slot_info(ctrl);
4420
4421         queue_work(nvme_wq, &ctrl->async_event_work);
4422 }
4423
4424 static u32 nvme_aer_type(u32 result)
4425 {
4426         return result & 0x7;
4427 }
4428
4429 static u32 nvme_aer_subtype(u32 result)
4430 {
4431         return (result & 0xff00) >> 8;
4432 }
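/*
 * A worked example (field layout per the NVMe spec): a completion result of
 * 0x00040002 decodes as type 0x2 (NVME_AER_NOTICE), subtype 0x00
 * (NVME_AER_NOTICE_NS_CHANGED), with bits 23:16 carrying the associated log
 * page identifier (0x04, the Changed Namespace List log).
 */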
4433
4434 static bool nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4435 {
4436         u32 aer_notice_type = nvme_aer_subtype(result);
4437         bool requeue = true;
4438
4439         switch (aer_notice_type) {
4440         case NVME_AER_NOTICE_NS_CHANGED:
4441                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4442                 nvme_queue_scan(ctrl);
4443                 break;
4444         case NVME_AER_NOTICE_FW_ACT_STARTING:
4445                 /*
4446                  * We are (ab)using the RESETTING state to prevent subsequent
4447                  * recovery actions from interfering with the controller's
4448                  * firmware activation.
4449                  */
4450                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
4451                         requeue = false;
4452                         queue_work(nvme_wq, &ctrl->fw_act_work);
4453                 }
4454                 break;
4455 #ifdef CONFIG_NVME_MULTIPATH
4456         case NVME_AER_NOTICE_ANA:
4457                 if (!ctrl->ana_log_buf)
4458                         break;
4459                 queue_work(nvme_wq, &ctrl->ana_work);
4460                 break;
4461 #endif
4462         case NVME_AER_NOTICE_DISC_CHANGED:
4463                 ctrl->aen_result = result;
4464                 break;
4465         default:
4466                 dev_warn(ctrl->device, "async event result %08x\n", result);
4467         }
4468         return requeue;
4469 }
4470
4471 static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl)
4472 {
4473         dev_warn(ctrl->device,
4474                 "resetting controller due to persistent internal error\n");
4475         nvme_reset_ctrl(ctrl);
4476 }
4477
4478 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4479                 volatile union nvme_result *res)
4480 {
4481         u32 result = le32_to_cpu(res->u32);
4482         u32 aer_type = nvme_aer_type(result);
4483         u32 aer_subtype = nvme_aer_subtype(result);
4484         bool requeue = true;
4485
4486         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4487                 return;
4488
4489         trace_nvme_async_event(ctrl, result);
4490         switch (aer_type) {
4491         case NVME_AER_NOTICE:
4492                 requeue = nvme_handle_aen_notice(ctrl, result);
4493                 break;
4494         case NVME_AER_ERROR:
4495                 /*
4496                  * For a persistent internal error, don't run async_event_work
4497                  * to submit a new AER. The controller reset will do it.
4498                  */
4499                 if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) {
4500                         nvme_handle_aer_persistent_error(ctrl);
4501                         return;
4502                 }
4503                 fallthrough;
4504         case NVME_AER_SMART:
4505         case NVME_AER_CSS:
4506         case NVME_AER_VS:
4507                 ctrl->aen_result = result;
4508                 break;
4509         default:
4510                 break;
4511         }
4512
4513         if (requeue)
4514                 queue_work(nvme_wq, &ctrl->async_event_work);
4515 }
4516 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
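/*
 * Transports route AER completions here from their CQE handling; a minimal
 * sketch (the local variable names are illustrative):
 *
 *	if (nvme_is_aen_req(qid, command_id))
 *		nvme_complete_async_event(ctrl, cqe->status, &cqe->result);
 */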
4517
4518 int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4519                 const struct blk_mq_ops *ops, unsigned int cmd_size)
4520 {
4521         struct queue_limits lim = {};
4522         int ret;
4523
4524         memset(set, 0, sizeof(*set));
4525         set->ops = ops;
4526         set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
4527         if (ctrl->ops->flags & NVME_F_FABRICS)
4528                 /* Reserved for fabric connect and keep alive */
4529                 set->reserved_tags = 2;
4530         set->numa_node = ctrl->numa_node;
4531         set->flags = BLK_MQ_F_NO_SCHED;
4532         if (ctrl->ops->flags & NVME_F_BLOCKING)
4533                 set->flags |= BLK_MQ_F_BLOCKING;
4534         set->cmd_size = cmd_size;
4535         set->driver_data = ctrl;
4536         set->nr_hw_queues = 1;
4537         set->timeout = NVME_ADMIN_TIMEOUT;
4538         ret = blk_mq_alloc_tag_set(set);
4539         if (ret)
4540                 return ret;
4541
4542         ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
4543         if (IS_ERR(ctrl->admin_q)) {
4544                 ret = PTR_ERR(ctrl->admin_q);
4545                 goto out_free_tagset;
4546         }
4547
4548         if (ctrl->ops->flags & NVME_F_FABRICS) {
4549                 ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
4550                 if (IS_ERR(ctrl->fabrics_q)) {
4551                         ret = PTR_ERR(ctrl->fabrics_q);
4552                         goto out_cleanup_admin_q;
4553                 }
4554         }
4555
4556         ctrl->admin_tagset = set;
4557         return 0;
4558
4559 out_cleanup_admin_q:
4560         blk_mq_destroy_queue(ctrl->admin_q);
4561         blk_put_queue(ctrl->admin_q);
4562 out_free_tagset:
4563         blk_mq_free_tag_set(set);
4564         ctrl->admin_q = NULL;
4565         ctrl->fabrics_q = NULL;
4566         return ret;
4567 }
4568 EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
4569
4570 void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
4571 {
4572         blk_mq_destroy_queue(ctrl->admin_q);
4573         blk_put_queue(ctrl->admin_q);
4574         if (ctrl->ops->flags & NVME_F_FABRICS) {
4575                 blk_mq_destroy_queue(ctrl->fabrics_q);
4576                 blk_put_queue(ctrl->fabrics_q);
4577         }
4578         blk_mq_free_tag_set(ctrl->admin_tagset);
4579 }
4580 EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
4581
4582 int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
4583                 const struct blk_mq_ops *ops, unsigned int nr_maps,
4584                 unsigned int cmd_size)
4585 {
4586         int ret;
4587
4588         memset(set, 0, sizeof(*set));
4589         set->ops = ops;
4590         set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1);
4591         /*
4592          * Some Apple controllers require tags to be unique across admin and
4593          * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
4594          */
4595         if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
4596                 set->reserved_tags = NVME_AQ_DEPTH;
4597         else if (ctrl->ops->flags & NVME_F_FABRICS)
4598                 /* Reserved for fabric connect */
4599                 set->reserved_tags = 1;
4600         set->numa_node = ctrl->numa_node;
4601         set->flags = BLK_MQ_F_SHOULD_MERGE;
4602         if (ctrl->ops->flags & NVME_F_BLOCKING)
4603                 set->flags |= BLK_MQ_F_BLOCKING;
4604         set->cmd_size = cmd_size;
4605         set->driver_data = ctrl;
4606         set->nr_hw_queues = ctrl->queue_count - 1;
4607         set->timeout = NVME_IO_TIMEOUT;
4608         set->nr_maps = nr_maps;
4609         ret = blk_mq_alloc_tag_set(set);
4610         if (ret)
4611                 return ret;
4612
4613         if (ctrl->ops->flags & NVME_F_FABRICS) {
4614                 struct queue_limits lim = {
4615                         .features       = BLK_FEAT_SKIP_TAGSET_QUIESCE,
4616                 };
4617
4618                 ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
4619                 if (IS_ERR(ctrl->connect_q)) {
4620                         ret = PTR_ERR(ctrl->connect_q);
4621                         goto out_free_tag_set;
4622                 }
4623         }
4624
4625         ctrl->tagset = set;
4626         return 0;
4627
4628 out_free_tag_set:
4629         blk_mq_free_tag_set(set);
4630         ctrl->connect_q = NULL;
4631         return ret;
4632 }
4633 EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
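/*
 * A rough usage sketch for a hypothetical fabrics transport "foo" (all foo_*
 * names are made up).  The admin set is allocated first; the I/O set only
 * once the controller's queue_count and sqsize are known:
 *
 *	ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
 *				       &foo_admin_mq_ops,
 *				       sizeof(struct foo_request));
 *	...
 *	ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, &foo_mq_ops,
 *				    2, sizeof(struct foo_request));
 */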
4634
4635 void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
4636 {
4637         if (ctrl->ops->flags & NVME_F_FABRICS) {
4638                 blk_mq_destroy_queue(ctrl->connect_q);
4639                 blk_put_queue(ctrl->connect_q);
4640         }
4641         blk_mq_free_tag_set(ctrl->tagset);
4642 }
4643 EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
4644
4645 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4646 {
4647         nvme_mpath_stop(ctrl);
4648         nvme_auth_stop(ctrl);
4649         nvme_stop_failfast_work(ctrl);
4650         flush_work(&ctrl->async_event_work);
4651         cancel_work_sync(&ctrl->fw_act_work);
4652         if (ctrl->ops->stop_ctrl)
4653                 ctrl->ops->stop_ctrl(ctrl);
4654 }
4655 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4656
4657 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4658 {
4659         nvme_enable_aen(ctrl);
4660
4661         /*
4662          * Persistent discovery controllers need to send an indication to
4663          * userspace to re-read the discovery log page and learn about possibly
4664          * missed changes. We identify persistent discovery controllers by
4665          * checking that they have started once before, hence are reconnecting.
4666          */
4667         if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
4668             nvme_discovery_ctrl(ctrl))
4669                 nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
4670
4671         if (ctrl->queue_count > 1) {
4672                 nvme_queue_scan(ctrl);
4673                 nvme_unquiesce_io_queues(ctrl);
4674                 nvme_mpath_update(ctrl);
4675         }
4676
4677         nvme_change_uevent(ctrl, "NVME_EVENT=connected");
4678         set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags);
4679 }
4680 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4681
4682 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4683 {
4684         nvme_stop_keep_alive(ctrl);
4685         nvme_hwmon_exit(ctrl);
4686         nvme_fault_inject_fini(&ctrl->fault_inject);
4687         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4688         cdev_device_del(&ctrl->cdev, ctrl->device);
4689         nvme_put_ctrl(ctrl);
4690 }
4691 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4692
4693 static void nvme_free_cels(struct nvme_ctrl *ctrl)
4694 {
4695         struct nvme_effects_log *cel;
4696         unsigned long i;
4697
4698         xa_for_each(&ctrl->cels, i, cel) {
4699                 xa_erase(&ctrl->cels, i);
4700                 kfree(cel);
4701         }
4702
4703         xa_destroy(&ctrl->cels);
4704 }
4705
4706 static void nvme_free_ctrl(struct device *dev)
4707 {
4708         struct nvme_ctrl *ctrl =
4709                 container_of(dev, struct nvme_ctrl, ctrl_device);
4710         struct nvme_subsystem *subsys = ctrl->subsys;
4711
4712         if (!subsys || ctrl->instance != subsys->instance)
4713                 ida_free(&nvme_instance_ida, ctrl->instance);
4714         nvme_free_cels(ctrl);
4715         nvme_mpath_uninit(ctrl);
4716         cleanup_srcu_struct(&ctrl->srcu);
4717         nvme_auth_stop(ctrl);
4718         nvme_auth_free(ctrl);
4719         __free_page(ctrl->discard_page);
4720         free_opal_dev(ctrl->opal_dev);
4721
4722         if (subsys) {
4723                 mutex_lock(&nvme_subsystems_lock);
4724                 list_del(&ctrl->subsys_entry);
4725                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4726                 mutex_unlock(&nvme_subsystems_lock);
4727         }
4728
4729         ctrl->ops->free_ctrl(ctrl);
4730
4731         if (subsys)
4732                 nvme_put_subsystem(subsys);
4733 }
4734
4735 /*
4736  * Initialize an NVMe controller structure.  This needs to be called during
4737  * the earliest initialization so that we have the initialized structure
4738  * around during probing.
4739  *
4740  * On success, the caller must use nvme_put_ctrl() to release the controller
4741  * when it is no longer needed, which also invokes the ops->free_ctrl() callback.
4742  */
4743 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4744                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4745 {
4746         int ret;
4747
4748         WRITE_ONCE(ctrl->state, NVME_CTRL_NEW);
4749         ctrl->passthru_err_log_enabled = false;
4750         clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
4751         spin_lock_init(&ctrl->lock);
4752         mutex_init(&ctrl->namespaces_lock);
4753
4754         ret = init_srcu_struct(&ctrl->srcu);
4755         if (ret)
4756                 return ret;
4757
4758         mutex_init(&ctrl->scan_lock);
4759         INIT_LIST_HEAD(&ctrl->namespaces);
4760         xa_init(&ctrl->cels);
4761         ctrl->dev = dev;
4762         ctrl->ops = ops;
4763         ctrl->quirks = quirks;
4764         ctrl->numa_node = NUMA_NO_NODE;
4765         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4766         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4767         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4768         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4769         init_waitqueue_head(&ctrl->state_wq);
4770
4771         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4772         INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
4773         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4774         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4775         ctrl->ka_last_check_time = jiffies;
4776
4777         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4778                         PAGE_SIZE);
4779         ctrl->discard_page = alloc_page(GFP_KERNEL);
4780         if (!ctrl->discard_page) {
4781                 ret = -ENOMEM;
4782                 goto out;
4783         }
4784
4785         ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL);
4786         if (ret < 0)
4787                 goto out;
4788         ctrl->instance = ret;
4789
4790         ret = nvme_auth_init_ctrl(ctrl);
4791         if (ret)
4792                 goto out_release_instance;
4793
4794         nvme_mpath_init_ctrl(ctrl);
4795
4796         device_initialize(&ctrl->ctrl_device);
4797         ctrl->device = &ctrl->ctrl_device;
4798         ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
4799                         ctrl->instance);
4800         ctrl->device->class = &nvme_class;
4801         ctrl->device->parent = ctrl->dev;
4802         if (ops->dev_attr_groups)
4803                 ctrl->device->groups = ops->dev_attr_groups;
4804         else
4805                 ctrl->device->groups = nvme_dev_attr_groups;
4806         ctrl->device->release = nvme_free_ctrl;
4807         dev_set_drvdata(ctrl->device, ctrl);
4808
4809         return ret;
4810
4811 out_release_instance:
4812         ida_free(&nvme_instance_ida, ctrl->instance);
4813 out:
4814         if (ctrl->discard_page)
4815                 __free_page(ctrl->discard_page);
4816         cleanup_srcu_struct(&ctrl->srcu);
4817         return ret;
4818 }
4819 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4820
4821 /*
4822  * On success, returns with an elevated controller reference and caller must
4823  * use nvme_uninit_ctrl() to properly free resources associated with the ctrl.
4824  */
4825 int nvme_add_ctrl(struct nvme_ctrl *ctrl)
4826 {
4827         int ret;
4828
4829         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4830         if (ret)
4831                 return ret;
4832
4833         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4834         ctrl->cdev.owner = ctrl->ops->module;
4835         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4836         if (ret)
4837                 return ret;
4838
4839         /*
4840          * Initialize latency tolerance controls.  The sysfs files won't
4841          * be visible to userspace unless the device actually supports APST.
4842          */
4843         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4844         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4845                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4846
4847         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4848         nvme_get_ctrl(ctrl);
4849
4850         return 0;
4851 }
4852 EXPORT_SYMBOL_GPL(nvme_add_ctrl);
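/*
 * Controller setup in a transport driver roughly follows this sequence (the
 * "foo" names are hypothetical; see the fabrics drivers for real examples):
 *
 *	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &foo_ctrl_ops, quirks);
 *	ret = nvme_add_ctrl(&ctrl->ctrl);
 *	... bring up the admin queue and identify the controller ...
 *	nvme_start_ctrl(&ctrl->ctrl);
 *
 * Teardown reverses it: nvme_stop_ctrl(), nvme_remove_namespaces(),
 * nvme_uninit_ctrl() and finally nvme_put_ctrl() to drop the last reference.
 */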
4853
4854 /* let I/O to all namespaces fail in preparation for surprise removal */
4855 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
4856 {
4857         struct nvme_ns *ns;
4858         int srcu_idx;
4859
4860         srcu_idx = srcu_read_lock(&ctrl->srcu);
4861         list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
4862                 blk_mark_disk_dead(ns->disk);
4863         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4864 }
4865 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
4866
4867 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4868 {
4869         struct nvme_ns *ns;
4870         int srcu_idx;
4871
4872         srcu_idx = srcu_read_lock(&ctrl->srcu);
4873         list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
4874                 blk_mq_unfreeze_queue(ns->queue);
4875         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4876         clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4877 }
4878 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4879
4880 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4881 {
4882         struct nvme_ns *ns;
4883         int srcu_idx;
4884
4885         srcu_idx = srcu_read_lock(&ctrl->srcu);
4886         list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
4887                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4888                 if (timeout <= 0)
4889                         break;
4890         }
4891         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4892         return timeout;
4893 }
4894 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4895
4896 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4897 {
4898         struct nvme_ns *ns;
4899         int srcu_idx;
4900
4901         srcu_idx = srcu_read_lock(&ctrl->srcu);
4902         list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
4903                 blk_mq_freeze_queue_wait(ns->queue);
4904         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4905 }
4906 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4907
4908 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4909 {
4910         struct nvme_ns *ns;
4911         int srcu_idx;
4912
4913         set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
4914         srcu_idx = srcu_read_lock(&ctrl->srcu);
4915         list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
4916                 blk_freeze_queue_start(ns->queue);
4917         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4918 }
4919 EXPORT_SYMBOL_GPL(nvme_start_freeze);
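/*
 * Freeze/unfreeze are typically used around a controller reset, roughly:
 *
 *	nvme_start_freeze(ctrl);
 *	nvme_quiesce_io_queues(ctrl);
 *	... tear down and re-establish the I/O queues ...
 *	nvme_unquiesce_io_queues(ctrl);
 *	nvme_wait_freeze(ctrl);
 *	nvme_unfreeze(ctrl);
 *
 * This is only a sketch; the exact ordering differs between transports.
 */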
4920
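/*
 * Only the first caller actually quiesces the tagset; subsequent callers just
 * wait for the quiesce to complete.  The NVME_CTRL_STOPPED flag keeps the
 * quiesce/unquiesce calls balanced across repeated invocations.
 */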
4921 void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
4922 {
4923         if (!ctrl->tagset)
4924                 return;
4925         if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
4926                 blk_mq_quiesce_tagset(ctrl->tagset);
4927         else
4928                 blk_mq_wait_quiesce_done(ctrl->tagset);
4929 }
4930 EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
4931
4932 void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
4933 {
4934         if (!ctrl->tagset)
4935                 return;
4936         if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
4937                 blk_mq_unquiesce_tagset(ctrl->tagset);
4938 }
4939 EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
4940
4941 void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
4942 {
4943         if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4944                 blk_mq_quiesce_queue(ctrl->admin_q);
4945         else
4946                 blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
4947 }
4948 EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
4949
4950 void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
4951 {
4952         if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
4953                 blk_mq_unquiesce_queue(ctrl->admin_q);
4954 }
4955 EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
4956
4957 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
4958 {
4959         struct nvme_ns *ns;
4960         int srcu_idx;
4961
4962         srcu_idx = srcu_read_lock(&ctrl->srcu);
4963         list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
4964                 blk_sync_queue(ns->queue);
4965         srcu_read_unlock(&ctrl->srcu, srcu_idx);
4966 }
4967 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
4968
4969 void nvme_sync_queues(struct nvme_ctrl *ctrl)
4970 {
4971         nvme_sync_io_queues(ctrl);
4972         if (ctrl->admin_q)
4973                 blk_sync_queue(ctrl->admin_q);
4974 }
4975 EXPORT_SYMBOL_GPL(nvme_sync_queues);
4976
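/*
 * Translate an open file on an nvme controller character device back into the
 * owning struct nvme_ctrl, or return NULL if the file is something else.
 * Exported in the NVME_TARGET_PASSTHRU namespace for use by the NVMe target
 * passthru code.
 */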
4977 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
4978 {
4979         if (file->f_op != &nvme_dev_fops)
4980                 return NULL;
4981         return file->private_data;
4982 }
4983 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
4984
4985 /*
4986  * Check we didn't inadvertently grow the command structure sizes:
4987  */
4988 static inline void _nvme_check_size(void)
4989 {
4990         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4991         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4992         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4993         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4994         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4995         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4996         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4997         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4998         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4999         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
5000         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
5001         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
5002         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
5003         BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
5004                         NVME_IDENTIFY_DATA_SIZE);
5005         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
5006         BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
5007         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
5008         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
5009         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
5010         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
5011         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
5012         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
5013         BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
5014 }
5015
5016
5017 static int __init nvme_core_init(void)
5018 {
5019         int result = -ENOMEM;
5020
5021         _nvme_check_size();
5022
5023         nvme_wq = alloc_workqueue("nvme-wq",
5024                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
5025         if (!nvme_wq)
5026                 goto out;
5027
5028         nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
5029                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
5030         if (!nvme_reset_wq)
5031                 goto destroy_wq;
5032
5033         nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
5034                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
5035         if (!nvme_delete_wq)
5036                 goto destroy_reset_wq;
5037
5038         result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0,
5039                         NVME_MINORS, "nvme");
5040         if (result < 0)
5041                 goto destroy_delete_wq;
5042
5043         result = class_register(&nvme_class);
5044         if (result)
5045                 goto unregister_chrdev;
5046
5047         result = class_register(&nvme_subsys_class);
5048         if (result)
5049                 goto destroy_class;
5050
5051         result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
5052                                      "nvme-generic");
5053         if (result < 0)
5054                 goto destroy_subsys_class;
5055
5056         result = class_register(&nvme_ns_chr_class);
5057         if (result)
5058                 goto unregister_generic_ns;
5059
5060         result = nvme_init_auth();
5061         if (result)
5062                 goto destroy_ns_chr;
5063         return 0;
5064
5065 destroy_ns_chr:
5066         class_unregister(&nvme_ns_chr_class);
5067 unregister_generic_ns:
5068         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5069 destroy_subsys_class:
5070         class_unregister(&nvme_subsys_class);
5071 destroy_class:
5072         class_unregister(&nvme_class);
5073 unregister_chrdev:
5074         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5075 destroy_delete_wq:
5076         destroy_workqueue(nvme_delete_wq);
5077 destroy_reset_wq:
5078         destroy_workqueue(nvme_reset_wq);
5079 destroy_wq:
5080         destroy_workqueue(nvme_wq);
5081 out:
5082         return result;
5083 }
5084
5085 static void __exit nvme_core_exit(void)
5086 {
5087         nvme_exit_auth();
5088         class_unregister(&nvme_ns_chr_class);
5089         class_unregister(&nvme_subsys_class);
5090         class_unregister(&nvme_class);
5091         unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
5092         unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
5093         destroy_workqueue(nvme_delete_wq);
5094         destroy_workqueue(nvme_reset_wq);
5095         destroy_workqueue(nvme_wq);
5096         ida_destroy(&nvme_ns_chr_minor_ida);
5097         ida_destroy(&nvme_instance_ida);
5098 }
5099
5100 MODULE_LICENSE("GPL");
5101 MODULE_VERSION("1.0");
5102 MODULE_DESCRIPTION("NVMe host core framework");
5103 module_init(nvme_core_init);
5104 module_exit(nvme_core_exit);