1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVM Express device driver
4  * Copyright (c) 2011-2014, Intel Corporation.
5  */
6
7 #include <linux/blkdev.h>
8 #include <linux/blk-mq.h>
9 #include <linux/compat.h>
10 #include <linux/delay.h>
11 #include <linux/errno.h>
12 #include <linux/hdreg.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/backing-dev.h>
16 #include <linux/list_sort.h>
17 #include <linux/slab.h>
18 #include <linux/types.h>
19 #include <linux/pr.h>
20 #include <linux/ptrace.h>
21 #include <linux/nvme_ioctl.h>
22 #include <linux/pm_qos.h>
23 #include <asm/unaligned.h>
24
25 #include "nvme.h"
26 #include "fabrics.h"
27
28 #define CREATE_TRACE_POINTS
29 #include "trace.h"
30
31 #define NVME_MINORS             (1U << MINORBITS)
32
33 unsigned int admin_timeout = 60;
34 module_param(admin_timeout, uint, 0644);
35 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
36 EXPORT_SYMBOL_GPL(admin_timeout);
37
38 unsigned int nvme_io_timeout = 30;
39 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
40 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
41 EXPORT_SYMBOL_GPL(nvme_io_timeout);
42
43 static unsigned char shutdown_timeout = 5;
44 module_param(shutdown_timeout, byte, 0644);
45 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
46
47 static u8 nvme_max_retries = 5;
48 module_param_named(max_retries, nvme_max_retries, byte, 0644);
49 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
50
51 static unsigned long default_ps_max_latency_us = 100000;
52 module_param(default_ps_max_latency_us, ulong, 0644);
53 MODULE_PARM_DESC(default_ps_max_latency_us,
54                  "max power saving latency for new devices; use PM QOS to change per device");
55
56 static bool force_apst;
57 module_param(force_apst, bool, 0644);
58 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
59
60 static bool streams;
61 module_param(streams, bool, 0644);
62 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
63
64 /*
65  * nvme_wq - hosts nvme related works that are not reset or delete
66  * nvme_reset_wq - hosts nvme reset works
67  * nvme_delete_wq - hosts nvme delete works
68  *
69  * nvme_wq will host works such as scan, aen handling, fw activation,
70  * keep-alive, periodic reconnects etc. nvme_reset_wq
71  * runs reset works which also flush works hosted on nvme_wq for
72  * serialization purposes. nvme_delete_wq hosts controller deletion
73  * works, which flush reset works for serialization.
74  */
75 struct workqueue_struct *nvme_wq;
76 EXPORT_SYMBOL_GPL(nvme_wq);
77
78 struct workqueue_struct *nvme_reset_wq;
79 EXPORT_SYMBOL_GPL(nvme_reset_wq);
80
81 struct workqueue_struct *nvme_delete_wq;
82 EXPORT_SYMBOL_GPL(nvme_delete_wq);
83
84 static LIST_HEAD(nvme_subsystems);
85 static DEFINE_MUTEX(nvme_subsystems_lock);
86
87 static DEFINE_IDA(nvme_instance_ida);
88 static dev_t nvme_chr_devt;
89 static struct class *nvme_class;
90 static struct class *nvme_subsys_class;
91
92 static int _nvme_revalidate_disk(struct gendisk *disk);
93 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
94 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
95                                            unsigned nsid);
96
97 static void nvme_set_queue_dying(struct nvme_ns *ns)
98 {
99         /*
100          * Revalidating a dead namespace sets capacity to 0. This will stop
101          * buffered writers from dirtying pages that can't be synced.
102          */
103         if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
104                 return;
105         blk_set_queue_dying(ns->queue);
106         /* Forcibly unquiesce queues to avoid blocking dispatch */
107         blk_mq_unquiesce_queue(ns->queue);
108         /*
109          * Revalidate after unblocking dispatchers that may be holding bd_mutex
110          */
111         revalidate_disk(ns->disk);
112 }
113
114 static void nvme_queue_scan(struct nvme_ctrl *ctrl)
115 {
116         /*
117          * Only queue new scan work when admin and IO queues are both alive
118          */
119         if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
120                 queue_work(nvme_wq, &ctrl->scan_work);
121 }
122
123 /*
124  * Use this function to proceed with scheduling reset_work for a controller
125  * that had previously been set to the resetting state. This is intended for
126  * code paths that can't be interrupted by other reset attempts. A hot removal
127  * may prevent this from succeeding.
128  */
129 int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
130 {
131         if (ctrl->state != NVME_CTRL_RESETTING)
132                 return -EBUSY;
133         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
134                 return -EBUSY;
135         return 0;
136 }
137 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
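/*
 * Usage sketch (hypothetical caller, not code from this excerpt): a path
 * that has already won the transition to NVME_CTRL_RESETTING, e.g. via
 * nvme_wait_reset() below, finishes by scheduling the actual reset work:
 *
 *	if (nvme_try_sched_reset(ctrl))
 *		return;		(another path, e.g. hot removal, owns the state)
 *
 * The -EBUSY return only means the reset could not be scheduled here; it
 * does not by itself indicate a controller failure.
 */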
138
139 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
140 {
141         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
142                 return -EBUSY;
143         if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
144                 return -EBUSY;
145         return 0;
146 }
147 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
148
149 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
150 {
151         int ret;
152
153         ret = nvme_reset_ctrl(ctrl);
154         if (!ret) {
155                 flush_work(&ctrl->reset_work);
156                 if (ctrl->state != NVME_CTRL_LIVE)
157                         ret = -ENETRESET;
158         }
159
160         return ret;
161 }
162 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
163
164 static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
165 {
166         dev_info(ctrl->device,
167                  "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
168
169         flush_work(&ctrl->reset_work);
170         nvme_stop_ctrl(ctrl);
171         nvme_remove_namespaces(ctrl);
172         ctrl->ops->delete_ctrl(ctrl);
173         nvme_uninit_ctrl(ctrl);
174 }
175
176 static void nvme_delete_ctrl_work(struct work_struct *work)
177 {
178         struct nvme_ctrl *ctrl =
179                 container_of(work, struct nvme_ctrl, delete_work);
180
181         nvme_do_delete_ctrl(ctrl);
182 }
183
184 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
185 {
186         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
187                 return -EBUSY;
188         if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
189                 return -EBUSY;
190         return 0;
191 }
192 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
193
194 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
195 {
196         /*
197          * Keep a reference until nvme_do_delete_ctrl() completes,
198          * since ->delete_ctrl can free the controller.
199          */
200         nvme_get_ctrl(ctrl);
201         if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
202                 nvme_do_delete_ctrl(ctrl);
203         nvme_put_ctrl(ctrl);
204 }
205
206 static blk_status_t nvme_error_status(u16 status)
207 {
208         switch (status & 0x7ff) {
209         case NVME_SC_SUCCESS:
210                 return BLK_STS_OK;
211         case NVME_SC_CAP_EXCEEDED:
212                 return BLK_STS_NOSPC;
213         case NVME_SC_LBA_RANGE:
214         case NVME_SC_CMD_INTERRUPTED:
215         case NVME_SC_NS_NOT_READY:
216                 return BLK_STS_TARGET;
217         case NVME_SC_BAD_ATTRIBUTES:
218         case NVME_SC_ONCS_NOT_SUPPORTED:
219         case NVME_SC_INVALID_OPCODE:
220         case NVME_SC_INVALID_FIELD:
221         case NVME_SC_INVALID_NS:
222                 return BLK_STS_NOTSUPP;
223         case NVME_SC_WRITE_FAULT:
224         case NVME_SC_READ_ERROR:
225         case NVME_SC_UNWRITTEN_BLOCK:
226         case NVME_SC_ACCESS_DENIED:
227         case NVME_SC_READ_ONLY:
228         case NVME_SC_COMPARE_FAILED:
229                 return BLK_STS_MEDIUM;
230         case NVME_SC_GUARD_CHECK:
231         case NVME_SC_APPTAG_CHECK:
232         case NVME_SC_REFTAG_CHECK:
233         case NVME_SC_INVALID_PI:
234                 return BLK_STS_PROTECTION;
235         case NVME_SC_RESERVATION_CONFLICT:
236                 return BLK_STS_NEXUS;
237         case NVME_SC_HOST_PATH_ERROR:
238                 return BLK_STS_TRANSPORT;
239         default:
240                 return BLK_STS_IOERR;
241         }
242 }
243
244 static inline bool nvme_req_needs_retry(struct request *req)
245 {
246         if (blk_noretry_request(req))
247                 return false;
248         if (nvme_req(req)->status & NVME_SC_DNR)
249                 return false;
250         if (nvme_req(req)->retries >= nvme_max_retries)
251                 return false;
252         return true;
253 }
254
255 static void nvme_retry_req(struct request *req)
256 {
257         struct nvme_ns *ns = req->q->queuedata;
258         unsigned long delay = 0;
259         u16 crd;
260
261         /* The mask and shift result must be <= 3 */
262         crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
263         if (ns && crd)
264                 delay = ns->ctrl->crdt[crd - 1] * 100;
265
266         nvme_req(req)->retries++;
267         blk_mq_requeue_request(req, false);
268         blk_mq_delay_kick_requeue_list(req->q, delay);
269 }
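/*
 * Note on the delay computed above: crdt[] caches the controller's Command
 * Retry Delay Times, which the NVMe spec expresses in units of 100 ms, so
 * crdt[crd - 1] * 100 is the delay in milliseconds expected by
 * blk_mq_delay_kick_requeue_list().  A CRD field of 0 keeps the delay at 0,
 * i.e. requeue immediately.
 */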
270
271 void nvme_complete_rq(struct request *req)
272 {
273         blk_status_t status = nvme_error_status(nvme_req(req)->status);
274
275         trace_nvme_complete_rq(req);
276
277         nvme_cleanup_cmd(req);
278
279         if (nvme_req(req)->ctrl->kas)
280                 nvme_req(req)->ctrl->comp_seen = true;
281
282         if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
283                 if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
284                         return;
285
286                 if (!blk_queue_dying(req->q)) {
287                         nvme_retry_req(req);
288                         return;
289                 }
290         } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
291                    req_op(req) == REQ_OP_ZONE_APPEND) {
292                 req->__sector = nvme_lba_to_sect(req->q->queuedata,
293                         le64_to_cpu(nvme_req(req)->result.u64));
294         }
295
296         nvme_trace_bio_complete(req, status);
297         blk_mq_end_request(req, status);
298 }
299 EXPORT_SYMBOL_GPL(nvme_complete_rq);
300
301 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
302 {
303         dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
304                                 "Cancelling I/O %d", req->tag);
305
306         /* don't abort one completed request */
307         if (blk_mq_request_completed(req))
308                 return true;
309
310         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
311         blk_mq_complete_request(req);
312         return true;
313 }
314 EXPORT_SYMBOL_GPL(nvme_cancel_request);
315
316 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
317                 enum nvme_ctrl_state new_state)
318 {
319         enum nvme_ctrl_state old_state;
320         unsigned long flags;
321         bool changed = false;
322
323         spin_lock_irqsave(&ctrl->lock, flags);
324
325         old_state = ctrl->state;
326         switch (new_state) {
327         case NVME_CTRL_LIVE:
328                 switch (old_state) {
329                 case NVME_CTRL_NEW:
330                 case NVME_CTRL_RESETTING:
331                 case NVME_CTRL_CONNECTING:
332                         changed = true;
333                         /* FALLTHRU */
334                 default:
335                         break;
336                 }
337                 break;
338         case NVME_CTRL_RESETTING:
339                 switch (old_state) {
340                 case NVME_CTRL_NEW:
341                 case NVME_CTRL_LIVE:
342                         changed = true;
343                         /* FALLTHRU */
344                 default:
345                         break;
346                 }
347                 break;
348         case NVME_CTRL_CONNECTING:
349                 switch (old_state) {
350                 case NVME_CTRL_NEW:
351                 case NVME_CTRL_RESETTING:
352                         changed = true;
353                         /* FALLTHRU */
354                 default:
355                         break;
356                 }
357                 break;
358         case NVME_CTRL_DELETING:
359                 switch (old_state) {
360                 case NVME_CTRL_LIVE:
361                 case NVME_CTRL_RESETTING:
362                 case NVME_CTRL_CONNECTING:
363                         changed = true;
364                         /* FALLTHRU */
365                 default:
366                         break;
367                 }
368                 break;
369         case NVME_CTRL_DELETING_NOIO:
370                 switch (old_state) {
371                 case NVME_CTRL_DELETING:
372                 case NVME_CTRL_DEAD:
373                         changed = true;
374                         /* FALLTHRU */
375                 default:
376                         break;
377                 }
378                 break;
379         case NVME_CTRL_DEAD:
380                 switch (old_state) {
381                 case NVME_CTRL_DELETING:
382                         changed = true;
383                         /* FALLTHRU */
384                 default:
385                         break;
386                 }
387                 break;
388         default:
389                 break;
390         }
391
392         if (changed) {
393                 ctrl->state = new_state;
394                 wake_up_all(&ctrl->state_wq);
395         }
396
397         spin_unlock_irqrestore(&ctrl->lock, flags);
398         if (changed && ctrl->state == NVME_CTRL_LIVE)
399                 nvme_kick_requeue_lists(ctrl);
400         return changed;
401 }
402 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
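/*
 * Summary of the transitions accepted by the switch above; every other
 * combination is rejected and leaves ctrl->state untouched:
 *
 *	NEW        -> LIVE, RESETTING, CONNECTING
 *	LIVE       -> RESETTING, DELETING
 *	RESETTING  -> LIVE, CONNECTING, DELETING
 *	CONNECTING -> LIVE, DELETING
 *	DELETING   -> DELETING_NOIO, DEAD
 *	DEAD       -> DELETING_NOIO
 */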
403
404 /*
405  * Returns true for sink states that can't ever transition back to live.
406  */
407 static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
408 {
409         switch (ctrl->state) {
410         case NVME_CTRL_NEW:
411         case NVME_CTRL_LIVE:
412         case NVME_CTRL_RESETTING:
413         case NVME_CTRL_CONNECTING:
414                 return false;
415         case NVME_CTRL_DELETING:
416         case NVME_CTRL_DELETING_NOIO:
417         case NVME_CTRL_DEAD:
418                 return true;
419         default:
420                 WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
421                 return true;
422         }
423 }
424
425 /*
426  * Waits for the controller state to be resetting, or returns false if it is
427  * not possible to ever transition to that state.
428  */
429 bool nvme_wait_reset(struct nvme_ctrl *ctrl)
430 {
431         wait_event(ctrl->state_wq,
432                    nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
433                    nvme_state_terminal(ctrl));
434         return ctrl->state == NVME_CTRL_RESETTING;
435 }
436 EXPORT_SYMBOL_GPL(nvme_wait_reset);
437
438 static void nvme_free_ns_head(struct kref *ref)
439 {
440         struct nvme_ns_head *head =
441                 container_of(ref, struct nvme_ns_head, ref);
442
443         nvme_mpath_remove_disk(head);
444         ida_simple_remove(&head->subsys->ns_ida, head->instance);
445         cleanup_srcu_struct(&head->srcu);
446         nvme_put_subsystem(head->subsys);
447         kfree(head);
448 }
449
450 static void nvme_put_ns_head(struct nvme_ns_head *head)
451 {
452         kref_put(&head->ref, nvme_free_ns_head);
453 }
454
455 static void nvme_free_ns(struct kref *kref)
456 {
457         struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
458
459         if (ns->ndev)
460                 nvme_nvm_unregister(ns);
461
462         put_disk(ns->disk);
463         nvme_put_ns_head(ns->head);
464         nvme_put_ctrl(ns->ctrl);
465         kfree(ns);
466 }
467
468 void nvme_put_ns(struct nvme_ns *ns)
469 {
470         kref_put(&ns->kref, nvme_free_ns);
471 }
472 EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
473
474 static inline void nvme_clear_nvme_request(struct request *req)
475 {
476         if (!(req->rq_flags & RQF_DONTPREP)) {
477                 nvme_req(req)->retries = 0;
478                 nvme_req(req)->flags = 0;
479                 req->rq_flags |= RQF_DONTPREP;
480         }
481 }
482
483 struct request *nvme_alloc_request(struct request_queue *q,
484                 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
485 {
486         unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
487         struct request *req;
488
489         if (qid == NVME_QID_ANY) {
490                 req = blk_mq_alloc_request(q, op, flags);
491         } else {
492                 req = blk_mq_alloc_request_hctx(q, op, flags,
493                                 qid ? qid - 1 : 0);
494         }
495         if (IS_ERR(req))
496                 return req;
497
498         req->cmd_flags |= REQ_FAILFAST_DRIVER;
499         nvme_clear_nvme_request(req);
500         nvme_req(req)->cmd = cmd;
501
502         return req;
503 }
504 EXPORT_SYMBOL_GPL(nvme_alloc_request);
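/*
 * Queue selection above: NVME_QID_ANY lets blk-mq pick any hardware context
 * (the usual case for admin and passthrough commands), while an explicit
 * qid maps to hardware context qid - 1, so e.g. a fabrics connect for I/O
 * queue 1 is issued on hctx 0.  Sketch of a typical admin-path caller (see
 * __nvme_submit_sync_cmd() below for the real thing):
 *
 *	req = nvme_alloc_request(ctrl->admin_q, &cmd, 0, NVME_QID_ANY);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 */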
505
506 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
507 {
508         struct nvme_command c;
509
510         memset(&c, 0, sizeof(c));
511
512         c.directive.opcode = nvme_admin_directive_send;
513         c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
514         c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
515         c.directive.dtype = NVME_DIR_IDENTIFY;
516         c.directive.tdtype = NVME_DIR_STREAMS;
517         c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
518
519         return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
520 }
521
522 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
523 {
524         return nvme_toggle_streams(ctrl, false);
525 }
526
527 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
528 {
529         return nvme_toggle_streams(ctrl, true);
530 }
531
532 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
533                                   struct streams_directive_params *s, u32 nsid)
534 {
535         struct nvme_command c;
536
537         memset(&c, 0, sizeof(c));
538         memset(s, 0, sizeof(*s));
539
540         c.directive.opcode = nvme_admin_directive_recv;
541         c.directive.nsid = cpu_to_le32(nsid);
542         c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
543         c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
544         c.directive.dtype = NVME_DIR_STREAMS;
545
546         return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
547 }
548
549 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
550 {
551         struct streams_directive_params s;
552         int ret;
553
554         if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
555                 return 0;
556         if (!streams)
557                 return 0;
558
559         ret = nvme_enable_streams(ctrl);
560         if (ret)
561                 return ret;
562
563         ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
564         if (ret)
565                 goto out_disable_stream;
566
567         ctrl->nssa = le16_to_cpu(s.nssa);
568         if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
569                 dev_info(ctrl->device, "too few streams (%u) available\n",
570                                         ctrl->nssa);
571                 goto out_disable_stream;
572         }
573
574         ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
575         dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
576         return 0;
577
578 out_disable_stream:
579         nvme_disable_streams(ctrl);
580         return ret;
581 }
582
583 /*
584  * Check if 'req' has a write hint associated with it. If it does, assign
585  * a valid namespace stream to the write.
586  */
587 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
588                                      struct request *req, u16 *control,
589                                      u32 *dsmgmt)
590 {
591         enum rw_hint streamid = req->write_hint;
592
593         if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
594                 streamid = 0;
595         else {
596                 streamid--;
597                 if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
598                         return;
599
600                 *control |= NVME_RW_DTYPE_STREAMS;
601                 *dsmgmt |= streamid << 16;
602         }
603
604         if (streamid < ARRAY_SIZE(req->q->write_hints))
605                 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
606 }
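/*
 * Worked example for the mapping above: a request carrying WRITE_LIFE_SHORT
 * (enum rw_hint value 2) becomes stream ID 1 after the decrement; the
 * control word gains NVME_RW_DTYPE_STREAMS and the stream ID lands in the
 * upper half of dsmgmt (the directive specific field) via "streamid << 16".
 */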
607
608 static void nvme_setup_passthrough(struct request *req,
609                 struct nvme_command *cmd)
610 {
611         memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
612         /* passthru commands should let the driver set the SGL flags */
613         cmd->common.flags &= ~NVME_CMD_SGL_ALL;
614 }
615
616 static inline void nvme_setup_flush(struct nvme_ns *ns,
617                 struct nvme_command *cmnd)
618 {
619         cmnd->common.opcode = nvme_cmd_flush;
620         cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
621 }
622
623 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
624                 struct nvme_command *cmnd)
625 {
626         unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
627         struct nvme_dsm_range *range;
628         struct bio *bio;
629
630         /*
631          * Some devices do not consider the DSM 'Number of Ranges' field when
632          * determining how much data to DMA. Always allocate memory for maximum
633          * number of segments to prevent device reading beyond end of buffer.
634          */
635         static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
636
637         range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
638         if (!range) {
639                 /*
640                  * If we fail to allocate our range, fall back to the controller
641                  * discard page. If that's also busy, it's safe to return
642                  * busy, as we know we can make progress once that's freed.
643                  */
644                 if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
645                         return BLK_STS_RESOURCE;
646
647                 range = page_address(ns->ctrl->discard_page);
648         }
649
650         __rq_for_each_bio(bio, req) {
651                 u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
652                 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
653
654                 if (n < segments) {
655                         range[n].cattr = cpu_to_le32(0);
656                         range[n].nlb = cpu_to_le32(nlb);
657                         range[n].slba = cpu_to_le64(slba);
658                 }
659                 n++;
660         }
661
662         if (WARN_ON_ONCE(n != segments)) {
663                 if (virt_to_page(range) == ns->ctrl->discard_page)
664                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
665                 else
666                         kfree(range);
667                 return BLK_STS_IOERR;
668         }
669
670         cmnd->dsm.opcode = nvme_cmd_dsm;
671         cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
672         cmnd->dsm.nr = cpu_to_le32(segments - 1);
673         cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
674
675         req->special_vec.bv_page = virt_to_page(range);
676         req->special_vec.bv_offset = offset_in_page(range);
677         req->special_vec.bv_len = alloc_size;
678         req->rq_flags |= RQF_SPECIAL_PAYLOAD;
679
680         return BLK_STS_OK;
681 }
682
683 static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
684                 struct request *req, struct nvme_command *cmnd)
685 {
686         if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
687                 return nvme_setup_discard(ns, req, cmnd);
688
689         cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
690         cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
691         cmnd->write_zeroes.slba =
692                 cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
693         cmnd->write_zeroes.length =
694                 cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
695         cmnd->write_zeroes.control = 0;
696         return BLK_STS_OK;
697 }
698
699 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
700                 struct request *req, struct nvme_command *cmnd,
701                 enum nvme_opcode op)
702 {
703         struct nvme_ctrl *ctrl = ns->ctrl;
704         u16 control = 0;
705         u32 dsmgmt = 0;
706
707         if (req->cmd_flags & REQ_FUA)
708                 control |= NVME_RW_FUA;
709         if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
710                 control |= NVME_RW_LR;
711
712         if (req->cmd_flags & REQ_RAHEAD)
713                 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
714
715         cmnd->rw.opcode = op;
716         cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
717         cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
718         cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
719
720         if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
721                 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
722
723         if (ns->ms) {
724                 /*
725                  * If formatted with metadata, the block layer always provides a
726                  * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
727                  * we enable the PRACT bit for protection information or set the
728                  * namespace capacity to zero to prevent any I/O.
729                  */
730                 if (!blk_integrity_rq(req)) {
731                         if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
732                                 return BLK_STS_NOTSUPP;
733                         control |= NVME_RW_PRINFO_PRACT;
734                 }
735
736                 switch (ns->pi_type) {
737                 case NVME_NS_DPS_PI_TYPE3:
738                         control |= NVME_RW_PRINFO_PRCHK_GUARD;
739                         break;
740                 case NVME_NS_DPS_PI_TYPE1:
741                 case NVME_NS_DPS_PI_TYPE2:
742                         control |= NVME_RW_PRINFO_PRCHK_GUARD |
743                                         NVME_RW_PRINFO_PRCHK_REF;
744                         if (op == nvme_cmd_zone_append)
745                                 control |= NVME_RW_APPEND_PIREMAP;
746                         cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
747                         break;
748                 }
749         }
750
751         cmnd->rw.control = cpu_to_le16(control);
752         cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
753         return 0;
754 }
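/*
 * The slba/length encoding above is 0's based, as the NVMe read/write
 * commands require: e.g. a 4 KiB write at byte offset 8192 on a namespace
 * formatted with 512-byte LBAs (lba_shift == 9) yields slba = 16 and
 * length = (4096 >> 9) - 1 = 7, i.e. 8 logical blocks.
 */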
755
756 void nvme_cleanup_cmd(struct request *req)
757 {
758         if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
759                 struct nvme_ns *ns = req->rq_disk->private_data;
760                 struct page *page = req->special_vec.bv_page;
761
762                 if (page == ns->ctrl->discard_page)
763                         clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
764                 else
765                         kfree(page_address(page) + req->special_vec.bv_offset);
766         }
767 }
768 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
769
770 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
771                 struct nvme_command *cmd)
772 {
773         blk_status_t ret = BLK_STS_OK;
774
775         nvme_clear_nvme_request(req);
776
777         memset(cmd, 0, sizeof(*cmd));
778         switch (req_op(req)) {
779         case REQ_OP_DRV_IN:
780         case REQ_OP_DRV_OUT:
781                 nvme_setup_passthrough(req, cmd);
782                 break;
783         case REQ_OP_FLUSH:
784                 nvme_setup_flush(ns, cmd);
785                 break;
786         case REQ_OP_ZONE_RESET_ALL:
787         case REQ_OP_ZONE_RESET:
788                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
789                 break;
790         case REQ_OP_ZONE_OPEN:
791                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
792                 break;
793         case REQ_OP_ZONE_CLOSE:
794                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
795                 break;
796         case REQ_OP_ZONE_FINISH:
797                 ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
798                 break;
799         case REQ_OP_WRITE_ZEROES:
800                 ret = nvme_setup_write_zeroes(ns, req, cmd);
801                 break;
802         case REQ_OP_DISCARD:
803                 ret = nvme_setup_discard(ns, req, cmd);
804                 break;
805         case REQ_OP_READ:
806                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
807                 break;
808         case REQ_OP_WRITE:
809                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
810                 break;
811         case REQ_OP_ZONE_APPEND:
812                 ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
813                 break;
814         default:
815                 WARN_ON_ONCE(1);
816                 return BLK_STS_IOERR;
817         }
818
819         cmd->common.command_id = req->tag;
820         trace_nvme_setup_cmd(req, cmd);
821         return ret;
822 }
823 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
824
825 static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
826 {
827         struct completion *waiting = rq->end_io_data;
828
829         rq->end_io_data = NULL;
830         complete(waiting);
831 }
832
833 static void nvme_execute_rq_polled(struct request_queue *q,
834                 struct gendisk *bd_disk, struct request *rq, int at_head)
835 {
836         DECLARE_COMPLETION_ONSTACK(wait);
837
838         WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
839
840         rq->cmd_flags |= REQ_HIPRI;
841         rq->end_io_data = &wait;
842         blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
843
844         while (!completion_done(&wait)) {
845                 blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
846                 cond_resched();
847         }
848 }
849
850 /*
851  * Returns 0 on success.  If the result is negative, it's a Linux error code;
852  * if the result is positive, it's an NVM Express status code
853  */
854 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
855                 union nvme_result *result, void *buffer, unsigned bufflen,
856                 unsigned timeout, int qid, int at_head,
857                 blk_mq_req_flags_t flags, bool poll)
858 {
859         struct request *req;
860         int ret;
861
862         req = nvme_alloc_request(q, cmd, flags, qid);
863         if (IS_ERR(req))
864                 return PTR_ERR(req);
865
866         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
867
868         if (buffer && bufflen) {
869                 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
870                 if (ret)
871                         goto out;
872         }
873
874         if (poll)
875                 nvme_execute_rq_polled(req->q, NULL, req, at_head);
876         else
877                 blk_execute_rq(req->q, NULL, req, at_head);
878         if (result)
879                 *result = nvme_req(req)->result;
880         if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
881                 ret = -EINTR;
882         else
883                 ret = nvme_req(req)->status;
884  out:
885         blk_mq_free_request(req);
886         return ret;
887 }
888 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
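/*
 * Callers distinguish the two error classes per the convention documented
 * above: a positive return is an NVMe status code (e.g. NVME_SC_INVALID_FIELD,
 * often only logged), a negative return is a Linux errno (e.g. -EINTR when
 * the request was cancelled) and is usually propagated as-is, and 0 means
 * the command completed successfully.
 */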
889
890 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
891                 void *buffer, unsigned bufflen)
892 {
893         return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
894                         NVME_QID_ANY, 0, 0, false);
895 }
896 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
897
898 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
899                 unsigned len, u32 seed, bool write)
900 {
901         struct bio_integrity_payload *bip;
902         int ret = -ENOMEM;
903         void *buf;
904
905         buf = kmalloc(len, GFP_KERNEL);
906         if (!buf)
907                 goto out;
908
909         ret = -EFAULT;
910         if (write && copy_from_user(buf, ubuf, len))
911                 goto out_free_meta;
912
913         bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
914         if (IS_ERR(bip)) {
915                 ret = PTR_ERR(bip);
916                 goto out_free_meta;
917         }
918
919         bip->bip_iter.bi_size = len;
920         bip->bip_iter.bi_sector = seed;
921         ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
922                         offset_in_page(buf));
923         if (ret == len)
924                 return buf;
925         ret = -ENOMEM;
926 out_free_meta:
927         kfree(buf);
928 out:
929         return ERR_PTR(ret);
930 }
931
932 static u32 nvme_known_admin_effects(u8 opcode)
933 {
934         switch (opcode) {
935         case nvme_admin_format_nvm:
936                 return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
937                         NVME_CMD_EFFECTS_CSE_MASK;
938         case nvme_admin_sanitize_nvm:
939                 return NVME_CMD_EFFECTS_CSE_MASK;
940         default:
941                 break;
942         }
943         return 0;
944 }
945
946 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
947 {
948         u32 effects = 0;
949
950         if (ns) {
951                 if (ns->head->effects)
952                         effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
953                 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
954                         dev_warn(ctrl->device,
955                                  "IO command:%02x has unhandled effects:%08x\n",
956                                  opcode, effects);
957                 return 0;
958         }
959
960         if (ctrl->effects)
961                 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
962         effects |= nvme_known_admin_effects(opcode);
963
964         return effects;
965 }
966 EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
967
968 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
969                                u8 opcode)
970 {
971         u32 effects = nvme_command_effects(ctrl, ns, opcode);
972
973         /*
974          * For simplicity, IO to all namespaces is quiesced even if the command
975          * effects say only one namespace is affected.
976          */
977         if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
978                 mutex_lock(&ctrl->scan_lock);
979                 mutex_lock(&ctrl->subsys->lock);
980                 nvme_mpath_start_freeze(ctrl->subsys);
981                 nvme_mpath_wait_freeze(ctrl->subsys);
982                 nvme_start_freeze(ctrl);
983                 nvme_wait_freeze(ctrl);
984         }
985         return effects;
986 }
987
988 static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
989 {
990         struct nvme_ns *ns;
991
992         down_read(&ctrl->namespaces_rwsem);
993         list_for_each_entry(ns, &ctrl->namespaces, list)
994                 if (_nvme_revalidate_disk(ns->disk))
995                         nvme_set_queue_dying(ns);
996                 else if (blk_queue_is_zoned(ns->disk->queue)) {
997                         /*
998                          * IO commands are required to fully revalidate a zoned
999                          * device. Force the command effects to trigger rescan
1000                          * work so report zones can run in a context with
1001                          * unfrozen IO queues.
1002                          */
1003                         *effects |= NVME_CMD_EFFECTS_NCC;
1004                 }
1005         up_read(&ctrl->namespaces_rwsem);
1006 }
1007
1008 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1009 {
1010         /*
1011          * Revalidate LBA changes prior to unfreezing. This is necessary to
1012          * prevent memory corruption if a logical block size was changed by
1013          * this command.
1014          */
1015         if (effects & NVME_CMD_EFFECTS_LBCC)
1016                 nvme_update_formats(ctrl, &effects);
1017         if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1018                 nvme_unfreeze(ctrl);
1019                 nvme_mpath_unfreeze(ctrl->subsys);
1020                 mutex_unlock(&ctrl->subsys->lock);
1021                 nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
1022                 mutex_unlock(&ctrl->scan_lock);
1023         }
1024         if (effects & NVME_CMD_EFFECTS_CCC)
1025                 nvme_init_identify(ctrl);
1026         if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
1027                 nvme_queue_scan(ctrl);
1028                 flush_work(&ctrl->scan_work);
1029         }
1030 }
1031
1032 void nvme_execute_passthru_rq(struct request *rq)
1033 {
1034         struct nvme_command *cmd = nvme_req(rq)->cmd;
1035         struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
1036         struct nvme_ns *ns = rq->q->queuedata;
1037         struct gendisk *disk = ns ? ns->disk : NULL;
1038         u32 effects;
1039
1040         effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
1041         blk_execute_rq(rq->q, disk, rq, 0);
1042         nvme_passthru_end(ctrl, effects);
1043 }
1044 EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
1045
1046 static int nvme_submit_user_cmd(struct request_queue *q,
1047                 struct nvme_command *cmd, void __user *ubuffer,
1048                 unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
1049                 u32 meta_seed, u64 *result, unsigned timeout)
1050 {
1051         bool write = nvme_is_write(cmd);
1052         struct nvme_ns *ns = q->queuedata;
1053         struct gendisk *disk = ns ? ns->disk : NULL;
1054         struct request *req;
1055         struct bio *bio = NULL;
1056         void *meta = NULL;
1057         int ret;
1058
1059         req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
1060         if (IS_ERR(req))
1061                 return PTR_ERR(req);
1062
1063         req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
1064         nvme_req(req)->flags |= NVME_REQ_USERCMD;
1065
1066         if (ubuffer && bufflen) {
1067                 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
1068                                 GFP_KERNEL);
1069                 if (ret)
1070                         goto out;
1071                 bio = req->bio;
1072                 bio->bi_disk = disk;
1073                 if (disk && meta_buffer && meta_len) {
1074                         meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
1075                                         meta_seed, write);
1076                         if (IS_ERR(meta)) {
1077                                 ret = PTR_ERR(meta);
1078                                 goto out_unmap;
1079                         }
1080                         req->cmd_flags |= REQ_INTEGRITY;
1081                 }
1082         }
1083
1084         nvme_execute_passthru_rq(req);
1085         if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
1086                 ret = -EINTR;
1087         else
1088                 ret = nvme_req(req)->status;
1089         if (result)
1090                 *result = le64_to_cpu(nvme_req(req)->result.u64);
1091         if (meta && !ret && !write) {
1092                 if (copy_to_user(meta_buffer, meta, meta_len))
1093                         ret = -EFAULT;
1094         }
1095         kfree(meta);
1096  out_unmap:
1097         if (bio)
1098                 blk_rq_unmap_user(bio);
1099  out:
1100         blk_mq_free_request(req);
1101         return ret;
1102 }
1103
1104 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
1105 {
1106         struct nvme_ctrl *ctrl = rq->end_io_data;
1107         unsigned long flags;
1108         bool startka = false;
1109
1110         blk_mq_free_request(rq);
1111
1112         if (status) {
1113                 dev_err(ctrl->device,
1114                         "failed nvme_keep_alive_end_io error=%d\n",
1115                                 status);
1116                 return;
1117         }
1118
1119         ctrl->comp_seen = false;
1120         spin_lock_irqsave(&ctrl->lock, flags);
1121         if (ctrl->state == NVME_CTRL_LIVE ||
1122             ctrl->state == NVME_CTRL_CONNECTING)
1123                 startka = true;
1124         spin_unlock_irqrestore(&ctrl->lock, flags);
1125         if (startka)
1126                 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1127 }
1128
1129 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
1130 {
1131         struct request *rq;
1132
1133         rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
1134                         NVME_QID_ANY);
1135         if (IS_ERR(rq))
1136                 return PTR_ERR(rq);
1137
1138         rq->timeout = ctrl->kato * HZ;
1139         rq->end_io_data = ctrl;
1140
1141         blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
1142
1143         return 0;
1144 }
1145
1146 static void nvme_keep_alive_work(struct work_struct *work)
1147 {
1148         struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
1149                         struct nvme_ctrl, ka_work);
1150         bool comp_seen = ctrl->comp_seen;
1151
1152         if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
1153                 dev_dbg(ctrl->device,
1154                         "reschedule traffic based keep-alive timer\n");
1155                 ctrl->comp_seen = false;
1156                 queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1157                 return;
1158         }
1159
1160         if (nvme_keep_alive(ctrl)) {
1161                 /* allocation failure, reset the controller */
1162                 dev_err(ctrl->device, "keep-alive failed\n");
1163                 nvme_reset_ctrl(ctrl);
1164                 return;
1165         }
1166 }
1167
1168 static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
1169 {
1170         if (unlikely(ctrl->kato == 0))
1171                 return;
1172
1173         queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
1174 }
1175
1176 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
1177 {
1178         if (unlikely(ctrl->kato == 0))
1179                 return;
1180
1181         cancel_delayed_work_sync(&ctrl->ka_work);
1182 }
1183 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
1184
1185 /*
1186  * In NVMe 1.0 the CNS field was just a binary controller or namespace
1187  * flag, thus sending any new CNS opcodes has a big chance of not working.
1188  * Qemu unfortunately had that bug after reporting a 1.1 version compliance
1189  * (but not for any later version).
1190  */
1191 static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
1192 {
1193         if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
1194                 return ctrl->vs < NVME_VS(1, 2, 0);
1195         return ctrl->vs < NVME_VS(1, 1, 0);
1196 }
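/*
 * NVME_VS() used above packs a spec version as
 * (major << 16) | (minor << 8) | tertiary, so NVME_VS(1, 2, 0) == 0x10200
 * and the checks are plain integer comparisons against the VS register
 * value cached in ctrl->vs.
 */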
1197
1198 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
1199 {
1200         struct nvme_command c = { };
1201         int error;
1202
1203         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1204         c.identify.opcode = nvme_admin_identify;
1205         c.identify.cns = NVME_ID_CNS_CTRL;
1206
1207         *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
1208         if (!*id)
1209                 return -ENOMEM;
1210
1211         error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
1212                         sizeof(struct nvme_id_ctrl));
1213         if (error)
1214                 kfree(*id);
1215         return error;
1216 }
1217
1218 static bool nvme_multi_css(struct nvme_ctrl *ctrl)
1219 {
1220         return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
1221 }
1222
1223 static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
1224                 struct nvme_ns_id_desc *cur, bool *csi_seen)
1225 {
1226         const char *warn_str = "ctrl returned bogus length:";
1227         void *data = cur;
1228
1229         switch (cur->nidt) {
1230         case NVME_NIDT_EUI64:
1231                 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
1232                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
1233                                  warn_str, cur->nidl);
1234                         return -1;
1235                 }
1236                 memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
1237                 return NVME_NIDT_EUI64_LEN;
1238         case NVME_NIDT_NGUID:
1239                 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
1240                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
1241                                  warn_str, cur->nidl);
1242                         return -1;
1243                 }
1244                 memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
1245                 return NVME_NIDT_NGUID_LEN;
1246         case NVME_NIDT_UUID:
1247                 if (cur->nidl != NVME_NIDT_UUID_LEN) {
1248                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
1249                                  warn_str, cur->nidl);
1250                         return -1;
1251                 }
1252                 uuid_copy(&ids->uuid, data + sizeof(*cur));
1253                 return NVME_NIDT_UUID_LEN;
1254         case NVME_NIDT_CSI:
1255                 if (cur->nidl != NVME_NIDT_CSI_LEN) {
1256                         dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
1257                                  warn_str, cur->nidl);
1258                         return -1;
1259                 }
1260                 memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
1261                 *csi_seen = true;
1262                 return NVME_NIDT_CSI_LEN;
1263         default:
1264                 /* Skip unknown types */
1265                 return cur->nidl;
1266         }
1267 }
1268
1269 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
1270                 struct nvme_ns_ids *ids)
1271 {
1272         struct nvme_command c = { };
1273         bool csi_seen = false;
1274         int status, pos, len;
1275         void *data;
1276
1277         if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
1278                 return 0;
1279
1280         c.identify.opcode = nvme_admin_identify;
1281         c.identify.nsid = cpu_to_le32(nsid);
1282         c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
1283
1284         data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
1285         if (!data)
1286                 return -ENOMEM;
1287
1288         status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
1289                                       NVME_IDENTIFY_DATA_SIZE);
1290         if (status) {
1291                 dev_warn(ctrl->device,
1292                         "Identify Descriptors failed (%d)\n", status);
1293                 goto free_data;
1294         }
1295
1296         for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
1297                 struct nvme_ns_id_desc *cur = data + pos;
1298
1299                 if (cur->nidl == 0)
1300                         break;
1301
1302                 len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
1303                 if (len < 0)
1304                         break;
1305
1306                 len += sizeof(*cur);
1307         }
1308
1309         if (nvme_multi_css(ctrl) && !csi_seen) {
1310                 dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
1311                          nsid);
1312                 status = -EINVAL;
1313         }
1314
1315 free_data:
1316         kfree(data);
1317         return status;
1318 }
1319
1320 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
1321 {
1322         struct nvme_command c = { };
1323
1324         c.identify.opcode = nvme_admin_identify;
1325         c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
1326         c.identify.nsid = cpu_to_le32(nsid);
1327         return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list,
1328                                     NVME_IDENTIFY_DATA_SIZE);
1329 }
1330
1331 static int nvme_identify_ns(struct nvme_ctrl *ctrl,
1332                 unsigned nsid, struct nvme_id_ns **id)
1333 {
1334         struct nvme_command c = { };
1335         int error;
1336
1337         /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
1338         c.identify.opcode = nvme_admin_identify;
1339         c.identify.nsid = cpu_to_le32(nsid);
1340         c.identify.cns = NVME_ID_CNS_NS;
1341
1342         *id = kmalloc(sizeof(**id), GFP_KERNEL);
1343         if (!*id)
1344                 return -ENOMEM;
1345
1346         error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
1347         if (error) {
1348                 dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
1349                 kfree(*id);
1350         }
1351
1352         return error;
1353 }
1354
1355 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
1356                 unsigned int dword11, void *buffer, size_t buflen, u32 *result)
1357 {
1358         union nvme_result res = { 0 };
1359         struct nvme_command c;
1360         int ret;
1361
1362         memset(&c, 0, sizeof(c));
1363         c.features.opcode = op;
1364         c.features.fid = cpu_to_le32(fid);
1365         c.features.dword11 = cpu_to_le32(dword11);
1366
1367         ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
1368                         buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
1369         if (ret >= 0 && result)
1370                 *result = le32_to_cpu(res.u32);
1371         return ret;
1372 }
1373
1374 int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
1375                       unsigned int dword11, void *buffer, size_t buflen,
1376                       u32 *result)
1377 {
1378         return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
1379                              buflen, result);
1380 }
1381 EXPORT_SYMBOL_GPL(nvme_set_features);
1382
1383 int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
1384                       unsigned int dword11, void *buffer, size_t buflen,
1385                       u32 *result)
1386 {
1387         return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
1388                              buflen, result);
1389 }
1390 EXPORT_SYMBOL_GPL(nvme_get_features);
1391
1392 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
1393 {
1394         u32 q_count = (*count - 1) | ((*count - 1) << 16);
1395         u32 result;
1396         int status, nr_io_queues;
1397
1398         status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
1399                         &result);
1400         if (status < 0)
1401                 return status;
1402
1403         /*
1404          * Degraded controllers might return an error when setting the queue
1405          * count.  We still want to be able to bring them online and offer
1406          * access to the admin queue, as that might be only way to fix them up.
1407          */
1408         if (status > 0) {
1409                 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
1410                 *count = 0;
1411         } else {
1412                 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
1413                 *count = min(*count, nr_io_queues);
1414         }
1415
1416         return 0;
1417 }
1418 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
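/*
 * Worked example for the encoding above: requesting *count == 8 I/O queues
 * gives q_count = 7 | (7 << 16) = 0x00070007, i.e. the 0's based NSQR/NCQR
 * pair in dword11 of Set Features (Number of Queues).  If the controller
 * grants, say, 4 submission and 6 completion queues, the result decodes as
 * min(3, 5) + 1 = 4 and *count is clamped to 4.
 */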
1419
1420 #define NVME_AEN_SUPPORTED \
1421         (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
1422          NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
1423
1424 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
1425 {
1426         u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
1427         int status;
1428
1429         if (!supported_aens)
1430                 return;
1431
1432         status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
1433                         NULL, 0, &result);
1434         if (status)
1435                 dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
1436                          supported_aens);
1437
1438         queue_work(nvme_wq, &ctrl->async_event_work);
1439 }
1440
1441 /*
1442  * Convert integer values from ioctl structures to user pointers, silently
1443  * ignoring the upper bits in the compat case to match behaviour of 32-bit
1444  * kernels.
1445  */
1446 static void __user *nvme_to_user_ptr(uintptr_t ptrval)
1447 {
1448         if (in_compat_syscall())
1449                 ptrval = (compat_uptr_t)ptrval;
1450         return (void __user *)ptrval;
1451 }
1452
1453 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
1454 {
1455         struct nvme_user_io io;
1456         struct nvme_command c;
1457         unsigned length, meta_len;
1458         void __user *metadata;
1459
1460         if (copy_from_user(&io, uio, sizeof(io)))
1461                 return -EFAULT;
1462         if (io.flags)
1463                 return -EINVAL;
1464
1465         switch (io.opcode) {
1466         case nvme_cmd_write:
1467         case nvme_cmd_read:
1468         case nvme_cmd_compare:
1469                 break;
1470         default:
1471                 return -EINVAL;
1472         }
1473
1474         length = (io.nblocks + 1) << ns->lba_shift;
1475         meta_len = (io.nblocks + 1) * ns->ms;
1476         metadata = nvme_to_user_ptr(io.metadata);
1477
1478         if (ns->features & NVME_NS_EXT_LBAS) {
1479                 length += meta_len;
1480                 meta_len = 0;
1481         } else if (meta_len) {
1482                 if ((io.metadata & 3) || !io.metadata)
1483                         return -EINVAL;
1484         }
1485
1486         memset(&c, 0, sizeof(c));
1487         c.rw.opcode = io.opcode;
1488         c.rw.flags = io.flags;
1489         c.rw.nsid = cpu_to_le32(ns->head->ns_id);
1490         c.rw.slba = cpu_to_le64(io.slba);
1491         c.rw.length = cpu_to_le16(io.nblocks);
1492         c.rw.control = cpu_to_le16(io.control);
1493         c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1494         c.rw.reftag = cpu_to_le32(io.reftag);
1495         c.rw.apptag = cpu_to_le16(io.apptag);
1496         c.rw.appmask = cpu_to_le16(io.appmask);
1497
1498         return nvme_submit_user_cmd(ns->queue, &c,
1499                         nvme_to_user_ptr(io.addr), length,
1500                         metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
1501 }
1502
1503 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1504                         struct nvme_passthru_cmd __user *ucmd)
1505 {
1506         struct nvme_passthru_cmd cmd;
1507         struct nvme_command c;
1508         unsigned timeout = 0;
1509         u64 result;
1510         int status;
1511
1512         if (!capable(CAP_SYS_ADMIN))
1513                 return -EACCES;
1514         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1515                 return -EFAULT;
1516         if (cmd.flags)
1517                 return -EINVAL;
1518
1519         memset(&c, 0, sizeof(c));
1520         c.common.opcode = cmd.opcode;
1521         c.common.flags = cmd.flags;
1522         c.common.nsid = cpu_to_le32(cmd.nsid);
1523         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1524         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1525         c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1526         c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1527         c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1528         c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1529         c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1530         c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1531
1532         if (cmd.timeout_ms)
1533                 timeout = msecs_to_jiffies(cmd.timeout_ms);
1534
1535         status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1536                         nvme_to_user_ptr(cmd.addr), cmd.data_len,
1537                         nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1538                         0, &result, timeout);
1539
1540         if (status >= 0) {
1541                 if (put_user(result, &ucmd->result))
1542                         return -EFAULT;
1543         }
1544
1545         return status;
1546 }
1547
1548 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1549                         struct nvme_passthru_cmd64 __user *ucmd)
1550 {
1551         struct nvme_passthru_cmd64 cmd;
1552         struct nvme_command c;
1553         unsigned timeout = 0;
1554         int status;
1555
1556         if (!capable(CAP_SYS_ADMIN))
1557                 return -EACCES;
1558         if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1559                 return -EFAULT;
1560         if (cmd.flags)
1561                 return -EINVAL;
1562
1563         memset(&c, 0, sizeof(c));
1564         c.common.opcode = cmd.opcode;
1565         c.common.flags = cmd.flags;
1566         c.common.nsid = cpu_to_le32(cmd.nsid);
1567         c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1568         c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1569         c.common.cdw10 = cpu_to_le32(cmd.cdw10);
1570         c.common.cdw11 = cpu_to_le32(cmd.cdw11);
1571         c.common.cdw12 = cpu_to_le32(cmd.cdw12);
1572         c.common.cdw13 = cpu_to_le32(cmd.cdw13);
1573         c.common.cdw14 = cpu_to_le32(cmd.cdw14);
1574         c.common.cdw15 = cpu_to_le32(cmd.cdw15);
1575
1576         if (cmd.timeout_ms)
1577                 timeout = msecs_to_jiffies(cmd.timeout_ms);
1578
1579         status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1580                         nvme_to_user_ptr(cmd.addr), cmd.data_len,
1581                         nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
1582                         0, &cmd.result, timeout);
1583
1584         if (status >= 0) {
1585                 if (put_user(cmd.result, &ucmd->result))
1586                         return -EFAULT;
1587         }
1588
1589         return status;
1590 }
1591
1592 /*
1593  * Issue ioctl requests on the first available path.  Note that unlike normal
1594  * block layer requests we will not retry a failed request on another controller.
1595  */
1596 struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1597                 struct nvme_ns_head **head, int *srcu_idx)
1598 {
1599 #ifdef CONFIG_NVME_MULTIPATH
1600         if (disk->fops == &nvme_ns_head_ops) {
1601                 struct nvme_ns *ns;
1602
1603                 *head = disk->private_data;
1604                 *srcu_idx = srcu_read_lock(&(*head)->srcu);
1605                 ns = nvme_find_path(*head);
1606                 if (!ns)
1607                         srcu_read_unlock(&(*head)->srcu, *srcu_idx);
1608                 return ns;
1609         }
1610 #endif
1611         *head = NULL;
1612         *srcu_idx = -1;
1613         return disk->private_data;
1614 }
1615
1616 void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1617 {
1618         if (head)
1619                 srcu_read_unlock(&head->srcu, idx);
1620 }
1621
1622 static bool is_ctrl_ioctl(unsigned int cmd)
1623 {
1624         if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
1625                 return true;
1626         if (is_sed_ioctl(cmd))
1627                 return true;
1628         return false;
1629 }
1630
1631 static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
1632                                   void __user *argp,
1633                                   struct nvme_ns_head *head,
1634                                   int srcu_idx)
1635 {
1636         struct nvme_ctrl *ctrl = ns->ctrl;
1637         int ret;
1638
1639         nvme_get_ctrl(ns->ctrl);
1640         nvme_put_ns_from_disk(head, srcu_idx);
1641
1642         switch (cmd) {
1643         case NVME_IOCTL_ADMIN_CMD:
1644                 ret = nvme_user_cmd(ctrl, NULL, argp);
1645                 break;
1646         case NVME_IOCTL_ADMIN64_CMD:
1647                 ret = nvme_user_cmd64(ctrl, NULL, argp);
1648                 break;
1649         default:
1650                 ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
1651                 break;
1652         }
1653         nvme_put_ctrl(ctrl);
1654         return ret;
1655 }
1656
1657 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1658                 unsigned int cmd, unsigned long arg)
1659 {
1660         struct nvme_ns_head *head = NULL;
1661         void __user *argp = (void __user *)arg;
1662         struct nvme_ns *ns;
1663         int srcu_idx, ret;
1664
1665         ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1666         if (unlikely(!ns))
1667                 return -EWOULDBLOCK;
1668
1669         /*
1670          * Handle ioctls that apply to the controller instead of the namespace
1671          * separately and drop the ns SRCU reference early.  This avoids a
1672          * deadlock when deleting namespaces using the passthrough interface.
1673          */
1674         if (is_ctrl_ioctl(cmd))
1675                 return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
1676
1677         switch (cmd) {
1678         case NVME_IOCTL_ID:
1679                 force_successful_syscall_return();
1680                 ret = ns->head->ns_id;
1681                 break;
1682         case NVME_IOCTL_IO_CMD:
1683                 ret = nvme_user_cmd(ns->ctrl, ns, argp);
1684                 break;
1685         case NVME_IOCTL_SUBMIT_IO:
1686                 ret = nvme_submit_io(ns, argp);
1687                 break;
1688         case NVME_IOCTL_IO64_CMD:
1689                 ret = nvme_user_cmd64(ns->ctrl, ns, argp);
1690                 break;
1691         default:
1692                 if (ns->ndev)
1693                         ret = nvme_nvm_ioctl(ns, cmd, arg);
1694                 else
1695                         ret = -ENOTTY;
1696         }
1697
1698         nvme_put_ns_from_disk(head, srcu_idx);
1699         return ret;
1700 }
1701
1702 #ifdef CONFIG_COMPAT
1703 struct nvme_user_io32 {
1704         __u8    opcode;
1705         __u8    flags;
1706         __u16   control;
1707         __u16   nblocks;
1708         __u16   rsvd;
1709         __u64   metadata;
1710         __u64   addr;
1711         __u64   slba;
1712         __u32   dsmgmt;
1713         __u32   reftag;
1714         __u16   apptag;
1715         __u16   appmask;
1716 } __attribute__((__packed__));
1717
1718 #define NVME_IOCTL_SUBMIT_IO32  _IOW('N', 0x42, struct nvme_user_io32)
1719
1720 static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
1721                 unsigned int cmd, unsigned long arg)
1722 {
1723         /*
1724          * NVME_IOCTL_SUBMIT_IO takes a different value for 32-bit programs
1725          * than for a 64-bit kernel.  The cause is that
1726          * sizeof(struct nvme_user_io), which is encoded into
1727          * NVME_IOCTL_SUBMIT_IO, is not the same when computed by a 32-bit
1728          * compiler and by a 64-bit compiler.
1729          * NVME_IOCTL_SUBMIT_IO32 lets a 64-bit kernel handle
1730          * NVME_IOCTL_SUBMIT_IO as issued by 32-bit programs.
1731          * All other ioctl numbers are identical for 32-bit and 64-bit
1732          * userspace, so nothing needs to be done for them.
1733          */
1734         if (cmd == NVME_IOCTL_SUBMIT_IO32)
1735                 return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
1736
1737         return nvme_ioctl(bdev, mode, cmd, arg);
1738 }
1739 #else
1740 #define nvme_compat_ioctl       NULL
1741 #endif /* CONFIG_COMPAT */
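/*
 * Size illustration (x86 only; other ABIs not checked here): a 64-bit build
 * pads struct nvme_user_io from 44 to 48 bytes because __u64 is 8-byte
 * aligned, while the i386 ABI only 4-byte aligns __u64 and keeps the 44-byte
 * layout that the packed struct nvme_user_io32 above reproduces.  Since
 * _IOW() encodes sizeof() into the ioctl number, the two values differ.
 */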
1742
1743 static int nvme_open(struct block_device *bdev, fmode_t mode)
1744 {
1745         struct nvme_ns *ns = bdev->bd_disk->private_data;
1746
1747 #ifdef CONFIG_NVME_MULTIPATH
1748         /* should never be called due to GENHD_FL_HIDDEN */
1749         if (WARN_ON_ONCE(ns->head->disk))
1750                 goto fail;
1751 #endif
1752         if (!kref_get_unless_zero(&ns->kref))
1753                 goto fail;
1754         if (!try_module_get(ns->ctrl->ops->module))
1755                 goto fail_put_ns;
1756
1757         return 0;
1758
1759 fail_put_ns:
1760         nvme_put_ns(ns);
1761 fail:
1762         return -ENXIO;
1763 }
1764
1765 static void nvme_release(struct gendisk *disk, fmode_t mode)
1766 {
1767         struct nvme_ns *ns = disk->private_data;
1768
1769         module_put(ns->ctrl->ops->module);
1770         nvme_put_ns(ns);
1771 }
1772
1773 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1774 {
1775         /* some standard values */
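        /* 64 heads * 32 sectors = 2048 sectors per cylinder, hence the >> 11 below */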
1776         geo->heads = 1 << 6;
1777         geo->sectors = 1 << 5;
1778         geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1779         return 0;
1780 }
1781
1782 #ifdef CONFIG_BLK_DEV_INTEGRITY
1783 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1784                                 u32 max_integrity_segments)
1785 {
1786         struct blk_integrity integrity;
1787
1788         memset(&integrity, 0, sizeof(integrity));
1789         switch (pi_type) {
1790         case NVME_NS_DPS_PI_TYPE3:
1791                 integrity.profile = &t10_pi_type3_crc;
1792                 integrity.tag_size = sizeof(u16) + sizeof(u32);
1793                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1794                 break;
1795         case NVME_NS_DPS_PI_TYPE1:
1796         case NVME_NS_DPS_PI_TYPE2:
1797                 integrity.profile = &t10_pi_type1_crc;
1798                 integrity.tag_size = sizeof(u16);
1799                 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1800                 break;
1801         default:
1802                 integrity.profile = NULL;
1803                 break;
1804         }
1805         integrity.tuple_size = ms;
1806         blk_integrity_register(disk, &integrity);
1807         blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
1808 }
1809 #else
1810 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
1811                                 u32 max_integrity_segments)
1812 {
1813 }
1814 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1815
1816 static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
1817 {
1818         struct nvme_ctrl *ctrl = ns->ctrl;
1819         struct request_queue *queue = disk->queue;
1820         u32 size = queue_logical_block_size(queue);
1821
1822         if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
1823                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
1824                 return;
1825         }
1826
1827         if (ctrl->nr_streams && ns->sws && ns->sgs)
1828                 size *= ns->sws * ns->sgs;
1829
1830         BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1831                         NVME_DSM_MAX_RANGES);
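        /*
         * Sanity check above: each struct nvme_dsm_range is 16 bytes, so a
         * discard payload of up to NVME_DSM_MAX_RANGES ranges (4 KiB for the
         * default 256) is expected to fit within a single page.
         */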
1832
1833         queue->limits.discard_alignment = 0;
1834         queue->limits.discard_granularity = size;
1835
1836         /* If discard is already enabled, don't reset queue limits */
1837         if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
1838                 return;
1839
1840         blk_queue_max_discard_sectors(queue, UINT_MAX);
1841         blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1842
1843         if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1844                 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1845 }
1846
1847 static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
1848 {
1849         u64 max_blocks;
1850
1851         if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
1852             (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
1853                 return;
1854         /*
1855          * Even though the NVMe spec explicitly states that MDTS is not
1856          * applicable to Write Zeroes ("The restriction does not apply to
1857          * commands that do not transfer data between the host and the
1858          * controller (e.g., Write Uncorrectable or Write Zeroes command)."),
1859          * be more cautious and bound the maximum sectors for Write Zeroes
1860          * by the controller's max_hw_sectors value, which is configured
1861          * based on the controller's MDTS field in nvme_init_identify()
1862          * if available.
1863          */
1864         if (ns->ctrl->max_hw_sectors == UINT_MAX)
1865                 max_blocks = (u64)USHRT_MAX + 1;
1866         else
1867                 max_blocks = ns->ctrl->max_hw_sectors + 1;
1868
1869         blk_queue_max_write_zeroes_sectors(disk->queue,
1870                                            nvme_lba_to_sect(ns, max_blocks));
1871 }
1872
1873 static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1874                 struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1875 {
1876         memset(ids, 0, sizeof(*ids));
1877
1878         if (ctrl->vs >= NVME_VS(1, 1, 0))
1879                 memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1880         if (ctrl->vs >= NVME_VS(1, 2, 0))
1881                 memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1882         if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
1883                 return nvme_identify_ns_descs(ctrl, nsid, ids);
1884         return 0;
1885 }
1886
1887 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1888 {
1889         return !uuid_is_null(&ids->uuid) ||
1890                 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1891                 memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1892 }
1893
1894 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1895 {
1896         return uuid_equal(&a->uuid, &b->uuid) &&
1897                 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1898                 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
1899                 a->csi == b->csi;
1900 }
1901
1902 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1903                                  u32 *phys_bs, u32 *io_opt)
1904 {
1905         struct streams_directive_params s;
1906         int ret;
1907
1908         if (!ctrl->nr_streams)
1909                 return 0;
1910
1911         ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
1912         if (ret)
1913                 return ret;
1914
1915         ns->sws = le32_to_cpu(s.sws);
1916         ns->sgs = le16_to_cpu(s.sgs);
1917
1918         if (ns->sws) {
1919                 *phys_bs = ns->sws * (1 << ns->lba_shift);
1920                 if (ns->sgs)
1921                         *io_opt = *phys_bs * ns->sgs;
1922         }
1923
1924         return 0;
1925 }
1926
1927 static void nvme_update_disk_info(struct gendisk *disk,
1928                 struct nvme_ns *ns, struct nvme_id_ns *id)
1929 {
1930         sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
1931         unsigned short bs = 1 << ns->lba_shift;
1932         u32 atomic_bs, phys_bs, io_opt = 0;
1933
1934         if (ns->lba_shift > PAGE_SHIFT) {
1935                 /* unsupported block size, set capacity to 0 later */
1936                 bs = (1 << 9);
1937         }
1938         blk_mq_freeze_queue(disk->queue);
1939         blk_integrity_unregister(disk);
1940
1941         atomic_bs = phys_bs = bs;
1942         nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
1943         if (id->nabo == 0) {
1944                 /*
1945                  * NSFEAT bit 1 indicates whether NAWUPF is defined for this namespace
1946                  * and whether it should be used instead of AWUPF. If NAWUPF ==
1947                  * 0 then AWUPF must be used instead.
1948                  */
1949                 if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
1950                         atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
1951                 else
1952                         atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
1953         }
1954
1955         if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
1956                 /* NPWG = Namespace Preferred Write Granularity */
1957                 phys_bs = bs * (1 + le16_to_cpu(id->npwg));
1958                 /* NOWS = Namespace Optimal Write Size */
1959                 io_opt = bs * (1 + le16_to_cpu(id->nows));
1960         }
1961
1962         blk_queue_logical_block_size(disk->queue, bs);
1963         /*
1964          * Linux filesystems assume writing a single physical block is
1965          * an atomic operation. Hence limit the physical block size to the
1966          * value of the Atomic Write Unit Power Fail parameter.
1967          */
1968         blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
1969         blk_queue_io_min(disk->queue, phys_bs);
1970         blk_queue_io_opt(disk->queue, io_opt);
1971
1972         /*
1973          * The block layer can't support LBA sizes larger than the page size
1974          * yet, so catch this early and don't allow block I/O.
1975          */
1976         if (ns->lba_shift > PAGE_SHIFT)
1977                 capacity = 0;
1978
1979         /*
1980          * Register a metadata profile for PI, or for plain non-integrity NVMe
1981          * metadata masquerading as Type 0, if supported.  Otherwise reject
1982          * block I/O to namespaces with metadata, except when the namespace
1983          * supports PI, as the controller can strip/insert it in that case.
1984          */
1985         if (ns->ms) {
1986                 if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
1987                     (ns->features & NVME_NS_METADATA_SUPPORTED))
1988                         nvme_init_integrity(disk, ns->ms, ns->pi_type,
1989                                             ns->ctrl->max_integrity_segments);
1990                 else if (!nvme_ns_has_pi(ns))
1991                         capacity = 0;
1992         }
1993
1994         set_capacity_revalidate_and_notify(disk, capacity, false);
1995
1996         nvme_config_discard(disk, ns);
1997         nvme_config_write_zeroes(disk, ns);
1998
1999         if (id->nsattr & NVME_NS_ATTR_RO)
2000                 set_disk_ro(disk, true);
2001         else
2002                 set_disk_ro(disk, false);
2003
2004         blk_mq_unfreeze_queue(disk->queue);
2005 }
2006
2007 static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
2008 {
2009         unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
2010         struct nvme_ns *ns = disk->private_data;
2011         struct nvme_ctrl *ctrl = ns->ctrl;
2012         int ret;
2013         u32 iob;
2014
2015         /*
2016          * If Identify Namespace failed, fall back to a default 512 byte block
2017          * size so the block layer remains usable until I/O fails for 0 capacity.
2018          */
2019         ns->lba_shift = id->lbaf[lbaf].ds;
2020         if (ns->lba_shift == 0)
2021                 ns->lba_shift = 9;
2022
2023         switch (ns->head->ids.csi) {
2024         case NVME_CSI_NVM:
2025                 break;
2026         case NVME_CSI_ZNS:
2027                 ret = nvme_update_zone_info(disk, ns, lbaf);
2028                 if (ret) {
2029                         dev_warn(ctrl->device,
2030                                 "failed to add zoned namespace:%u ret:%d\n",
2031                                 ns->head->ns_id, ret);
2032                         return ret;
2033                 }
2034                 break;
2035         default:
2036                 dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
2037                         ns->head->ids.csi, ns->head->ns_id);
2038                 return -ENODEV;
2039         }
2040
2041         if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
2042             is_power_of_2(ctrl->max_hw_sectors))
2043                 iob = ctrl->max_hw_sectors;
2044         else
2045                 iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
2046
2047         ns->features = 0;
2048         ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
2049         /* the PI implementation requires metadata equal to the t10 pi tuple size */
2050         if (ns->ms == sizeof(struct t10_pi_tuple))
2051                 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
2052         else
2053                 ns->pi_type = 0;
2054
2055         if (ns->ms) {
2056                 /*
2057                  * For PCIe only the separate metadata pointer is supported,
2058                  * as the block layer supplies metadata in a separate bio_vec
2059                  * chain. For Fabrics, only metadata as part of extended data
2060                  * LBA is supported on the wire per the Fabrics specification,
2061                  * but the HBA/HCA will do the remapping from the separate
2062                  * metadata buffers for us.
2063                  */
2064                 if (id->flbas & NVME_NS_FLBAS_META_EXT) {
2065                         ns->features |= NVME_NS_EXT_LBAS;
2066                         if ((ctrl->ops->flags & NVME_F_FABRICS) &&
2067                             (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) &&
2068                             ctrl->max_integrity_segments)
2069                                 ns->features |= NVME_NS_METADATA_SUPPORTED;
2070                 } else {
2071                         if (WARN_ON_ONCE(ctrl->ops->flags & NVME_F_FABRICS))
2072                                 return -EINVAL;
2073                         if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
2074                                 ns->features |= NVME_NS_METADATA_SUPPORTED;
2075                 }
2076         }
2077
2078         if (iob)
2079                 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
2080         nvme_update_disk_info(disk, ns, id);
2081 #ifdef CONFIG_NVME_MULTIPATH
2082         if (ns->head->disk) {
2083                 nvme_update_disk_info(ns->head->disk, ns, id);
2084                 blk_stack_limits(&ns->head->disk->queue->limits,
2085                                  &ns->queue->limits, 0);
2086                 nvme_mpath_update_disk_size(ns->head->disk);
2087         }
2088 #endif
2089         return 0;
2090 }
2091
2092 static int _nvme_revalidate_disk(struct gendisk *disk)
2093 {
2094         struct nvme_ns *ns = disk->private_data;
2095         struct nvme_ctrl *ctrl = ns->ctrl;
2096         struct nvme_id_ns *id;
2097         struct nvme_ns_ids ids;
2098         int ret = 0;
2099
2100         if (test_bit(NVME_NS_DEAD, &ns->flags)) {
2101                 set_capacity(disk, 0);
2102                 return -ENODEV;
2103         }
2104
2105         ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
2106         if (ret)
2107                 goto out;
2108
2109         if (id->ncap == 0) {
2110                 ret = -ENODEV;
2111                 goto free_id;
2112         }
2113
2114         ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
2115         if (ret)
2116                 goto free_id;
2117
2118         if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
2119                 dev_err(ctrl->device,
2120                         "identifiers changed for nsid %d\n", ns->head->ns_id);
2121                 ret = -ENODEV;
2122                 goto free_id;
2123         }
2124
2125         ret = __nvme_revalidate_disk(disk, id);
2126 free_id:
2127         kfree(id);
2128 out:
2129         /*
2130          * Only fail the function if we got a fatal error back from the
2131          * device, otherwise ignore the error and just move on.
2132          */
2133         if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
2134                 ret = 0;
2135         else if (ret > 0)
2136                 ret = blk_status_to_errno(nvme_error_status(ret));
2137         return ret;
2138 }
2139
2140 static int nvme_revalidate_disk(struct gendisk *disk)
2141 {
2142         int ret;
2143
2144         ret = _nvme_revalidate_disk(disk);
2145         if (ret)
2146                 return ret;
2147
2148 #ifdef CONFIG_BLK_DEV_ZONED
2149         if (blk_queue_is_zoned(disk->queue)) {
2150                 struct nvme_ns *ns = disk->private_data;
2151                 struct nvme_ctrl *ctrl = ns->ctrl;
2152
2153                 ret = blk_revalidate_disk_zones(disk, NULL);
2154                 if (!ret)
2155                         blk_queue_max_zone_append_sectors(disk->queue,
2156                                                           ctrl->max_zone_append);
2157         }
2158 #endif
2159         return ret;
2160 }
2161
2162 static char nvme_pr_type(enum pr_type type)
2163 {
2164         switch (type) {
2165         case PR_WRITE_EXCLUSIVE:
2166                 return 1;
2167         case PR_EXCLUSIVE_ACCESS:
2168                 return 2;
2169         case PR_WRITE_EXCLUSIVE_REG_ONLY:
2170                 return 3;
2171         case PR_EXCLUSIVE_ACCESS_REG_ONLY:
2172                 return 4;
2173         case PR_WRITE_EXCLUSIVE_ALL_REGS:
2174                 return 5;
2175         case PR_EXCLUSIVE_ACCESS_ALL_REGS:
2176                 return 6;
2177         default:
2178                 return 0;
2179         }
2180 }
2181
2182 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
2183                                 u64 key, u64 sa_key, u8 op)
2184 {
2185         struct nvme_ns_head *head = NULL;
2186         struct nvme_ns *ns;
2187         struct nvme_command c;
2188         int srcu_idx, ret;
2189         u8 data[16] = { 0, };
2190
2191         ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
2192         if (unlikely(!ns))
2193                 return -EWOULDBLOCK;
2194
2195         put_unaligned_le64(key, &data[0]);
2196         put_unaligned_le64(sa_key, &data[8]);
2197
2198         memset(&c, 0, sizeof(c));
2199         c.common.opcode = op;
2200         c.common.nsid = cpu_to_le32(ns->head->ns_id);
2201         c.common.cdw10 = cpu_to_le32(cdw10);
2202
2203         ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
2204         nvme_put_ns_from_disk(head, srcu_idx);
2205         return ret;
2206 }
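/*
 * cdw10 layout used by the persistent reservation helpers below (per the
 * NVMe Reservation Register/Acquire/Release commands): bits 2:0 carry the
 * action, bit 3 is IEKEY ("ignore existing key"), bits 15:8 carry the
 * reservation type where applicable, and bits 31:30 of Reservation Register
 * select the persist-through-power-loss (PTPL) behaviour.
 */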
2207
2208 static int nvme_pr_register(struct block_device *bdev, u64 old,
2209                 u64 new, unsigned flags)
2210 {
2211         u32 cdw10;
2212
2213         if (flags & ~PR_FL_IGNORE_KEY)
2214                 return -EOPNOTSUPP;
2215
2216         cdw10 = old ? 2 : 0;
2217         cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
2218         cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
2219         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
2220 }
2221
2222 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
2223                 enum pr_type type, unsigned flags)
2224 {
2225         u32 cdw10;
2226
2227         if (flags & ~PR_FL_IGNORE_KEY)
2228                 return -EOPNOTSUPP;
2229
2230         cdw10 = nvme_pr_type(type) << 8;
2231         cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
2232         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
2233 }
2234
2235 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
2236                 enum pr_type type, bool abort)
2237 {
2238         u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
2239         return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
2240 }
2241
2242 static int nvme_pr_clear(struct block_device *bdev, u64 key)
2243 {
2244         u32 cdw10 = 1 | (key ? 1 << 3 : 0);
2245         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
2246 }
2247
2248 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2249 {
2250         u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
2251         return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
2252 }
2253
2254 static const struct pr_ops nvme_pr_ops = {
2255         .pr_register    = nvme_pr_register,
2256         .pr_reserve     = nvme_pr_reserve,
2257         .pr_release     = nvme_pr_release,
2258         .pr_preempt     = nvme_pr_preempt,
2259         .pr_clear       = nvme_pr_clear,
2260 };
2261
2262 #ifdef CONFIG_BLK_SED_OPAL
2263 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
2264                 bool send)
2265 {
2266         struct nvme_ctrl *ctrl = data;
2267         struct nvme_command cmd;
2268
2269         memset(&cmd, 0, sizeof(cmd));
2270         if (send)
2271                 cmd.common.opcode = nvme_admin_security_send;
2272         else
2273                 cmd.common.opcode = nvme_admin_security_recv;
2274         cmd.common.nsid = 0;
2275         cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
2276         cmd.common.cdw11 = cpu_to_le32(len);
2277
2278         return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
2279                                       ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
2280 }
2281 EXPORT_SYMBOL_GPL(nvme_sec_submit);
2282 #endif /* CONFIG_BLK_SED_OPAL */
2283
2284 static const struct block_device_operations nvme_fops = {
2285         .owner          = THIS_MODULE,
2286         .ioctl          = nvme_ioctl,
2287         .compat_ioctl   = nvme_compat_ioctl,
2288         .open           = nvme_open,
2289         .release        = nvme_release,
2290         .getgeo         = nvme_getgeo,
2291         .revalidate_disk= nvme_revalidate_disk,
2292         .report_zones   = nvme_report_zones,
2293         .pr_ops         = &nvme_pr_ops,
2294 };
2295
2296 #ifdef CONFIG_NVME_MULTIPATH
2297 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
2298 {
2299         struct nvme_ns_head *head = bdev->bd_disk->private_data;
2300
2301         if (!kref_get_unless_zero(&head->ref))
2302                 return -ENXIO;
2303         return 0;
2304 }
2305
2306 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
2307 {
2308         nvme_put_ns_head(disk->private_data);
2309 }
2310
2311 const struct block_device_operations nvme_ns_head_ops = {
2312         .owner          = THIS_MODULE,
2313         .submit_bio     = nvme_ns_head_submit_bio,
2314         .open           = nvme_ns_head_open,
2315         .release        = nvme_ns_head_release,
2316         .ioctl          = nvme_ioctl,
2317         .compat_ioctl   = nvme_compat_ioctl,
2318         .getgeo         = nvme_getgeo,
2319         .report_zones   = nvme_report_zones,
2320         .pr_ops         = &nvme_pr_ops,
2321 };
2322 #endif /* CONFIG_NVME_MULTIPATH */
2323
2324 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
2325 {
2326         unsigned long timeout =
2327                 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
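        /*
         * CAP.TO is reported in 500 ms units, hence the "* HZ / 2" above:
         * e.g. a CAP.TO of 30 allows roughly 15.5 seconds for CSTS.RDY to
         * reach the expected value.
         */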
2328         u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
2329         int ret;
2330
2331         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2332                 if (csts == ~0)
2333                         return -ENODEV;
2334                 if ((csts & NVME_CSTS_RDY) == bit)
2335                         break;
2336
2337                 usleep_range(1000, 2000);
2338                 if (fatal_signal_pending(current))
2339                         return -EINTR;
2340                 if (time_after(jiffies, timeout)) {
2341                         dev_err(ctrl->device,
2342                                 "Device not ready; aborting %s, CSTS=0x%x\n",
2343                                 enabled ? "initialisation" : "reset", csts);
2344                         return -ENODEV;
2345                 }
2346         }
2347
2348         return ret;
2349 }
2350
2351 /*
2352  * If the device has been passed off to us in an enabled state, just clear
2353  * the enabled bit.  The spec says we should set the 'shutdown notification
2354  * bits', but doing so may cause the device to complete commands to the
2355  * admin queue ... and we don't know what memory that might be pointing at!
2356  */
2357 int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
2358 {
2359         int ret;
2360
2361         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2362         ctrl->ctrl_config &= ~NVME_CC_ENABLE;
2363
2364         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2365         if (ret)
2366                 return ret;
2367
2368         if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
2369                 msleep(NVME_QUIRK_DELAY_AMOUNT);
2370
2371         return nvme_wait_ready(ctrl, ctrl->cap, false);
2372 }
2373 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
2374
2375 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
2376 {
2377         unsigned dev_page_min;
2378         int ret;
2379
2380         ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
2381         if (ret) {
2382                 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2383                 return ret;
2384         }
2385         dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
2386
2387         if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
2388                 dev_err(ctrl->device,
2389                         "Minimum device page size %u too large for host (%u)\n",
2390                         1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
2391                 return -ENODEV;
2392         }
2393
2394         if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
2395                 ctrl->ctrl_config = NVME_CC_CSS_CSI;
2396         else
2397                 ctrl->ctrl_config = NVME_CC_CSS_NVM;
2398         ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
2399         ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
2400         ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
2401         ctrl->ctrl_config |= NVME_CC_ENABLE;
2402
2403         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2404         if (ret)
2405                 return ret;
2406         return nvme_wait_ready(ctrl, ctrl->cap, true);
2407 }
2408 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
2409
2410 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
2411 {
2412         unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
2413         u32 csts;
2414         int ret;
2415
2416         ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
2417         ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
2418
2419         ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
2420         if (ret)
2421                 return ret;
2422
2423         while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
2424                 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
2425                         break;
2426
2427                 msleep(100);
2428                 if (fatal_signal_pending(current))
2429                         return -EINTR;
2430                 if (time_after(jiffies, timeout)) {
2431                         dev_err(ctrl->device,
2432                                 "Device shutdown incomplete; abort shutdown\n");
2433                         return -ENODEV;
2434                 }
2435         }
2436
2437         return ret;
2438 }
2439 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
2440
2441 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
2442                 struct request_queue *q)
2443 {
2444         bool vwc = false;
2445
2446         if (ctrl->max_hw_sectors) {
2447                 u32 max_segments =
2448                         (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
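                /*
                 * One segment per controller page of data plus one extra,
                 * presumably to cover an unaligned first page.  Illustration
                 * with made-up numbers: 4 KiB controller pages hold 8 sectors,
                 * so max_hw_sectors == 256 (128 KiB) yields 33 segments.
                 */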
2449
2450                 max_segments = min_not_zero(max_segments, ctrl->max_segments);
2451                 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
2452                 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
2453         }
2454         blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
2455         blk_queue_dma_alignment(q, 7);
2456         if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
2457                 vwc = true;
2458         blk_queue_write_cache(q, vwc, vwc);
2459 }
2460
2461 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
2462 {
2463         __le64 ts;
2464         int ret;
2465
2466         if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
2467                 return 0;
2468
2469         ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
2470         ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
2471                         NULL);
2472         if (ret)
2473                 dev_warn_once(ctrl->device,
2474                         "could not set timestamp (%d)\n", ret);
2475         return ret;
2476 }
2477
2478 static int nvme_configure_acre(struct nvme_ctrl *ctrl)
2479 {
2480         struct nvme_feat_host_behavior *host;
2481         int ret;
2482
2483         /* Don't bother enabling the feature if retry delay is not reported */
2484         if (!ctrl->crdt[0])
2485                 return 0;
2486
2487         host = kzalloc(sizeof(*host), GFP_KERNEL);
2488         if (!host)
2489                 return 0;
2490
2491         host->acre = NVME_ENABLE_ACRE;
2492         ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
2493                                 host, sizeof(*host), NULL);
2494         kfree(host);
2495         return ret;
2496 }
2497
2498 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
2499 {
2500         /*
2501          * APST (Autonomous Power State Transition) lets us program a
2502          * table of power state transitions that the controller will
2503          * perform automatically.  We configure it with a simple
2504          * heuristic: we are willing to spend at most 2% of the time
2505          * transitioning between power states.  Therefore, when running
2506          * in any given state, we will enter the next lower-power
2507          * non-operational state after waiting 50 * (enlat + exlat)
2508          * microseconds, as long as that state's exit latency is under
2509          * the requested maximum latency.
2510          *
2511          * We will not autonomously enter any non-operational state for
2512          * which the total latency exceeds ps_max_latency_us.  Users
2513          * can set ps_max_latency_us to zero to turn off APST.
2514          */
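        /*
         * Worked example with made-up latencies: a non-operational state with
         * enlat == exlat == 10000us has a total latency of 20000us, so the
         * loop below programs an idle transition time of
         * DIV_ROUND_UP(20000, 20) == 1000ms into the higher-power entries,
         * i.e. the 50 * (enlat + exlat) rule described above.
         */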
2515
2516         unsigned apste;
2517         struct nvme_feat_auto_pst *table;
2518         u64 max_lat_us = 0;
2519         int max_ps = -1;
2520         int ret;
2521
2522         /*
2523          * If APST isn't supported or if we haven't been initialized yet,
2524          * then don't do anything.
2525          */
2526         if (!ctrl->apsta)
2527                 return 0;
2528
2529         if (ctrl->npss > 31) {
2530                 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
2531                 return 0;
2532         }
2533
2534         table = kzalloc(sizeof(*table), GFP_KERNEL);
2535         if (!table)
2536                 return 0;
2537
2538         if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
2539                 /* Turn off APST. */
2540                 apste = 0;
2541                 dev_dbg(ctrl->device, "APST disabled\n");
2542         } else {
2543                 __le64 target = cpu_to_le64(0);
2544                 int state;
2545
2546                 /*
2547                  * Walk through all states from lowest- to highest-power.
2548                  * According to the spec, lower-numbered states use more
2549                  * power.  NPSS, despite the name, is the index of the
2550                  * lowest-power state, not the number of states.
2551                  */
2552                 for (state = (int)ctrl->npss; state >= 0; state--) {
2553                         u64 total_latency_us, exit_latency_us, transition_ms;
2554
2555                         if (target)
2556                                 table->entries[state] = target;
2557
2558                         /*
2559                          * Don't allow transitions to the deepest state
2560                          * if it's quirked off.
2561                          */
2562                         if (state == ctrl->npss &&
2563                             (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
2564                                 continue;
2565
2566                         /*
2567                          * Is this state a useful non-operational state for
2568                          * higher-power states to autonomously transition to?
2569                          */
2570                         if (!(ctrl->psd[state].flags &
2571                               NVME_PS_FLAGS_NON_OP_STATE))
2572                                 continue;
2573
2574                         exit_latency_us =
2575                                 (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
2576                         if (exit_latency_us > ctrl->ps_max_latency_us)
2577                                 continue;
2578
2579                         total_latency_us =
2580                                 exit_latency_us +
2581                                 le32_to_cpu(ctrl->psd[state].entry_lat);
2582
2583                         /*
2584                          * This state is good.  Use it as the APST idle
2585                          * target for higher power states.
2586                          */
2587                         transition_ms = total_latency_us + 19;
2588                         do_div(transition_ms, 20);
2589                         if (transition_ms > (1 << 24) - 1)
2590                                 transition_ms = (1 << 24) - 1;
2591
2592                         target = cpu_to_le64((state << 3) |
2593                                              (transition_ms << 8));
2594
2595                         if (max_ps == -1)
2596                                 max_ps = state;
2597
2598                         if (total_latency_us > max_lat_us)
2599                                 max_lat_us = total_latency_us;
2600                 }
2601
2602                 apste = 1;
2603
2604                 if (max_ps == -1) {
2605                         dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
2606                 } else {
2607                         dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
2608                                 max_ps, max_lat_us, (int)sizeof(*table), table);
2609                 }
2610         }
2611
2612         ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
2613                                 table, sizeof(*table), NULL);
2614         if (ret)
2615                 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
2616
2617         kfree(table);
2618         return ret;
2619 }
2620
2621 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
2622 {
2623         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2624         u64 latency;
2625
2626         switch (val) {
2627         case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
2628         case PM_QOS_LATENCY_ANY:
2629                 latency = U64_MAX;
2630                 break;
2631
2632         default:
2633                 latency = val;
2634         }
2635
2636         if (ctrl->ps_max_latency_us != latency) {
2637                 ctrl->ps_max_latency_us = latency;
2638                 nvme_configure_apst(ctrl);
2639         }
2640 }
2641
2642 struct nvme_core_quirk_entry {
2643         /*
2644          * NVMe model and firmware strings are padded with spaces.  For
2645          * simplicity, strings in the quirk table are padded with NULs
2646          * instead.
2647          */
2648         u16 vid;
2649         const char *mn;
2650         const char *fr;
2651         unsigned long quirks;
2652 };
2653
2654 static const struct nvme_core_quirk_entry core_quirks[] = {
2655         {
2656                 /*
2657                  * This Toshiba device seems to die using any APST states.  See:
2658                  * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
2659                  */
2660                 .vid = 0x1179,
2661                 .mn = "THNSF5256GPUK TOSHIBA",
2662                 .quirks = NVME_QUIRK_NO_APST,
2663         },
2664         {
2665                 /*
2666                  * This LiteON CL1-3D*-Q11 firmware version has a race
2667                  * condition associated with actions related to suspend to idle.
2668                  * LiteON has resolved the problem in future firmware.
2669                  */
2670                 .vid = 0x14a4,
2671                 .fr = "22301111",
2672                 .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
2673         }
2674 };
2675
2676 /* match is null-terminated but idstr is space-padded. */
2677 static bool string_matches(const char *idstr, const char *match, size_t len)
2678 {
2679         size_t matchlen;
2680
2681         if (!match)
2682                 return true;
2683
2684         matchlen = strlen(match);
2685         WARN_ON_ONCE(matchlen > len);
2686
2687         if (memcmp(idstr, match, matchlen))
2688                 return false;
2689
2690         for (; matchlen < len; matchlen++)
2691                 if (idstr[matchlen] != ' ')
2692                         return false;
2693
2694         return true;
2695 }
2696
2697 static bool quirk_matches(const struct nvme_id_ctrl *id,
2698                           const struct nvme_core_quirk_entry *q)
2699 {
2700         return q->vid == le16_to_cpu(id->vid) &&
2701                 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
2702                 string_matches(id->fr, q->fr, sizeof(id->fr));
2703 }
2704
2705 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
2706                 struct nvme_id_ctrl *id)
2707 {
2708         size_t nqnlen;
2709         int off;
2710
2711         if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
2712                 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
2713                 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
2714                         strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
2715                         return;
2716                 }
2717
2718                 if (ctrl->vs >= NVME_VS(1, 2, 1))
2719                         dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
2720         }
2721
2722         /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
2723         off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
2724                         "nqn.2014.08.org.nvmexpress:%04x%04x",
2725                         le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
2726         memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
2727         off += sizeof(id->sn);
2728         memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
2729         off += sizeof(id->mn);
2730         memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
2731 }
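/*
 * The generated fallback NQN has the form
 *   nqn.2014.08.org.nvmexpress:<vid><ssvid><sn><mn>
 * with vid/ssvid printed as four hex digits each and the serial number and
 * model number copied in verbatim, still space-padded.
 */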
2732
2733 static void nvme_release_subsystem(struct device *dev)
2734 {
2735         struct nvme_subsystem *subsys =
2736                 container_of(dev, struct nvme_subsystem, dev);
2737
2738         if (subsys->instance >= 0)
2739                 ida_simple_remove(&nvme_instance_ida, subsys->instance);
2740         kfree(subsys);
2741 }
2742
2743 static void nvme_destroy_subsystem(struct kref *ref)
2744 {
2745         struct nvme_subsystem *subsys =
2746                         container_of(ref, struct nvme_subsystem, ref);
2747
2748         mutex_lock(&nvme_subsystems_lock);
2749         list_del(&subsys->entry);
2750         mutex_unlock(&nvme_subsystems_lock);
2751
2752         ida_destroy(&subsys->ns_ida);
2753         device_del(&subsys->dev);
2754         put_device(&subsys->dev);
2755 }
2756
2757 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
2758 {
2759         kref_put(&subsys->ref, nvme_destroy_subsystem);
2760 }
2761
2762 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
2763 {
2764         struct nvme_subsystem *subsys;
2765
2766         lockdep_assert_held(&nvme_subsystems_lock);
2767
2768         /*
2769          * Fail matches for discovery subsystems. This results in each
2770          * discovery controller being bound to a unique subsystem.
2771          * This avoids issues with validating controller values
2772          * that can only be true when there is a single unique subsystem.
2773          * There may be multiple and completely independent entities
2774          * that provide discovery controllers.
2775          */
2776         if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
2777                 return NULL;
2778
2779         list_for_each_entry(subsys, &nvme_subsystems, entry) {
2780                 if (strcmp(subsys->subnqn, subsysnqn))
2781                         continue;
2782                 if (!kref_get_unless_zero(&subsys->ref))
2783                         continue;
2784                 return subsys;
2785         }
2786
2787         return NULL;
2788 }
2789
2790 #define SUBSYS_ATTR_RO(_name, _mode, _show)                     \
2791         struct device_attribute subsys_attr_##_name = \
2792                 __ATTR(_name, _mode, _show, NULL)
2793
2794 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2795                                     struct device_attribute *attr,
2796                                     char *buf)
2797 {
2798         struct nvme_subsystem *subsys =
2799                 container_of(dev, struct nvme_subsystem, dev);
2800
2801         return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2802 }
2803 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2804
2805 #define nvme_subsys_show_str_function(field)                            \
2806 static ssize_t subsys_##field##_show(struct device *dev,                \
2807                             struct device_attribute *attr, char *buf)   \
2808 {                                                                       \
2809         struct nvme_subsystem *subsys =                                 \
2810                 container_of(dev, struct nvme_subsystem, dev);          \
2811         return sprintf(buf, "%.*s\n",                                   \
2812                        (int)sizeof(subsys->field), subsys->field);      \
2813 }                                                                       \
2814 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2815
2816 nvme_subsys_show_str_function(model);
2817 nvme_subsys_show_str_function(serial);
2818 nvme_subsys_show_str_function(firmware_rev);
2819
2820 static struct attribute *nvme_subsys_attrs[] = {
2821         &subsys_attr_model.attr,
2822         &subsys_attr_serial.attr,
2823         &subsys_attr_firmware_rev.attr,
2824         &subsys_attr_subsysnqn.attr,
2825 #ifdef CONFIG_NVME_MULTIPATH
2826         &subsys_attr_iopolicy.attr,
2827 #endif
2828         NULL,
2829 };
2830
2831 static struct attribute_group nvme_subsys_attrs_group = {
2832         .attrs = nvme_subsys_attrs,
2833 };
2834
2835 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2836         &nvme_subsys_attrs_group,
2837         NULL,
2838 };
2839
2840 static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
2841                 struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2842 {
2843         struct nvme_ctrl *tmp;
2844
2845         lockdep_assert_held(&nvme_subsystems_lock);
2846
2847         list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
2848                 if (nvme_state_terminal(tmp))
2849                         continue;
2850
2851                 if (tmp->cntlid == ctrl->cntlid) {
2852                         dev_err(ctrl->device,
2853                                 "Duplicate cntlid %u with %s, rejecting\n",
2854                                 ctrl->cntlid, dev_name(tmp->device));
2855                         return false;
2856                 }
2857
2858                 if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
2859                     (ctrl->opts && ctrl->opts->discovery_nqn))
2860                         continue;
2861
2862                 dev_err(ctrl->device,
2863                         "Subsystem does not support multiple controllers\n");
2864                 return false;
2865         }
2866
2867         return true;
2868 }
2869
2870 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2871 {
2872         struct nvme_subsystem *subsys, *found;
2873         int ret;
2874
2875         subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2876         if (!subsys)
2877                 return -ENOMEM;
2878
2879         subsys->instance = -1;
2880         mutex_init(&subsys->lock);
2881         kref_init(&subsys->ref);
2882         INIT_LIST_HEAD(&subsys->ctrls);
2883         INIT_LIST_HEAD(&subsys->nsheads);
2884         nvme_init_subnqn(subsys, ctrl, id);
2885         memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2886         memcpy(subsys->model, id->mn, sizeof(subsys->model));
2887         memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2888         subsys->vendor_id = le16_to_cpu(id->vid);
2889         subsys->cmic = id->cmic;
2890         subsys->awupf = le16_to_cpu(id->awupf);
2891 #ifdef CONFIG_NVME_MULTIPATH
2892         subsys->iopolicy = NVME_IOPOLICY_NUMA;
2893 #endif
2894
2895         subsys->dev.class = nvme_subsys_class;
2896         subsys->dev.release = nvme_release_subsystem;
2897         subsys->dev.groups = nvme_subsys_attrs_groups;
2898         dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
2899         device_initialize(&subsys->dev);
2900
2901         mutex_lock(&nvme_subsystems_lock);
2902         found = __nvme_find_get_subsystem(subsys->subnqn);
2903         if (found) {
2904                 put_device(&subsys->dev);
2905                 subsys = found;
2906
2907                 if (!nvme_validate_cntlid(subsys, ctrl, id)) {
2908                         ret = -EINVAL;
2909                         goto out_put_subsystem;
2910                 }
2911         } else {
2912                 ret = device_add(&subsys->dev);
2913                 if (ret) {
2914                         dev_err(ctrl->device,
2915                                 "failed to register subsystem device.\n");
2916                         put_device(&subsys->dev);
2917                         goto out_unlock;
2918                 }
2919                 ida_init(&subsys->ns_ida);
2920                 list_add_tail(&subsys->entry, &nvme_subsystems);
2921         }
2922
2923         ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2924                                 dev_name(ctrl->device));
2925         if (ret) {
2926                 dev_err(ctrl->device,
2927                         "failed to create sysfs link from subsystem.\n");
2928                 goto out_put_subsystem;
2929         }
2930
2931         if (!found)
2932                 subsys->instance = ctrl->instance;
2933         ctrl->subsys = subsys;
2934         list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2935         mutex_unlock(&nvme_subsystems_lock);
2936         return 0;
2937
2938 out_put_subsystem:
2939         nvme_put_subsystem(subsys);
2940 out_unlock:
2941         mutex_unlock(&nvme_subsystems_lock);
2942         return ret;
2943 }
2944
2945 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
2946                 void *log, size_t size, u64 offset)
2947 {
2948         struct nvme_command c = { };
2949         u32 dwlen = nvme_bytes_to_numd(size);
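        /*
         * NUMD is a zero-based dword count split across NUMDL/NUMDU below,
         * e.g. a 4096 byte log gives dwlen == 1023 (numdl 1023, numdu 0).
         */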
2950
2951         c.get_log_page.opcode = nvme_admin_get_log_page;
2952         c.get_log_page.nsid = cpu_to_le32(nsid);
2953         c.get_log_page.lid = log_page;
2954         c.get_log_page.lsp = lsp;
2955         c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
2956         c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
2957         c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
2958         c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
2959         c.get_log_page.csi = csi;
2960
2961         return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2962 }
2963
2964 static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
2965 {
2966         struct nvme_cel *cel, *ret = NULL;
2967
2968         spin_lock(&ctrl->lock);
2969         list_for_each_entry(cel, &ctrl->cels, entry) {
2970                 if (cel->csi == csi) {
2971                         ret = cel;
2972                         break;
2973                 }
2974         }
2975         spin_unlock(&ctrl->lock);
2976
2977         return ret;
2978 }
2979
2980 static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
2981                                 struct nvme_effects_log **log)
2982 {
2983         struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
2984         int ret;
2985
2986         if (cel)
2987                 goto out;
2988
2989         cel = kzalloc(sizeof(*cel), GFP_KERNEL);
2990         if (!cel)
2991                 return -ENOMEM;
2992
2993         ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
2994                         &cel->log, sizeof(cel->log), 0);
2995         if (ret) {
2996                 kfree(cel);
2997                 return ret;
2998         }
2999
3000         cel->csi = csi;
3001
3002         spin_lock(&ctrl->lock);
3003         list_add_tail(&cel->entry, &ctrl->cels);
3004         spin_unlock(&ctrl->lock);
3005 out:
3006         *log = &cel->log;
3007         return 0;
3008 }
3009
3010 /*
3011  * Initialize the cached copies of the Identify data and various controller
3012  * registers in our nvme_ctrl structure.  This should be called as soon as
3013  * the admin queue is fully up and running.
3014  */
3015 int nvme_init_identify(struct nvme_ctrl *ctrl)
3016 {
3017         struct nvme_id_ctrl *id;
3018         int ret, page_shift;
3019         u32 max_hw_sectors;
3020         bool prev_apst_enabled;
3021
3022         ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
3023         if (ret) {
3024                 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
3025                 return ret;
3026         }
3027         page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
3028         ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
3029
3030         if (ctrl->vs >= NVME_VS(1, 1, 0))
3031                 ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
3032
3033         ret = nvme_identify_ctrl(ctrl, &id);
3034         if (ret) {
3035                 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
3036                 return -EIO;
3037         }
3038
3039         if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
3040                 ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
3041                 if (ret < 0)
3042                         goto out_free;
3043         }
3044
3045         if (!(ctrl->ops->flags & NVME_F_FABRICS))
3046                 ctrl->cntlid = le16_to_cpu(id->cntlid);
3047
3048         if (!ctrl->identified) {
3049                 int i;
3050
3051                 ret = nvme_init_subsystem(ctrl, id);
3052                 if (ret)
3053                         goto out_free;
3054
3055                 /*
3056                  * Check for quirks.  Quirks can depend on the firmware version,
3057                  * so, in principle, the set of quirks present can change
3058                  * across a reset.  As a possible future enhancement, we
3059                  * could re-scan for quirks every time we reinitialize
3060                  * the device, but we'd have to make sure that the driver
3061                  * behaves intelligently if the quirks change.
3062                  */
3063                 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
3064                         if (quirk_matches(id, &core_quirks[i]))
3065                                 ctrl->quirks |= core_quirks[i].quirks;
3066                 }
3067         }
3068
3069         if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
3070                 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
3071                 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
3072         }
3073
3074         ctrl->crdt[0] = le16_to_cpu(id->crdt1);
3075         ctrl->crdt[1] = le16_to_cpu(id->crdt2);
3076         ctrl->crdt[2] = le16_to_cpu(id->crdt3);
3077
3078         ctrl->oacs = le16_to_cpu(id->oacs);
3079         ctrl->oncs = le16_to_cpu(id->oncs);
3080         ctrl->mtfa = le16_to_cpu(id->mtfa);
3081         ctrl->oaes = le32_to_cpu(id->oaes);
3082         ctrl->wctemp = le16_to_cpu(id->wctemp);
3083         ctrl->cctemp = le16_to_cpu(id->cctemp);
3084
3085         atomic_set(&ctrl->abort_limit, id->acl + 1);
3086         ctrl->vwc = id->vwc;
3087         if (id->mdts)
3088                 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
3089         else
3090                 max_hw_sectors = UINT_MAX;
3091         ctrl->max_hw_sectors =
3092                 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
3093
3094         nvme_set_queue_limits(ctrl, ctrl->admin_q);
3095         ctrl->sgls = le32_to_cpu(id->sgls);
3096         ctrl->kas = le16_to_cpu(id->kas);
3097         ctrl->max_namespaces = le32_to_cpu(id->mnan);
3098         ctrl->ctratt = le32_to_cpu(id->ctratt);
3099
3100         if (id->rtd3e) {
3101                 /* us -> s */
3102                 u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
3103
3104                 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
3105                                                  shutdown_timeout, 60);
3106
3107                 if (ctrl->shutdown_timeout != shutdown_timeout)
3108                         dev_info(ctrl->device,
3109                                  "Shutdown timeout set to %u seconds\n",
3110                                  ctrl->shutdown_timeout);
3111         } else
3112                 ctrl->shutdown_timeout = shutdown_timeout;
3113
3114         ctrl->npss = id->npss;
3115         ctrl->apsta = id->apsta;
3116         prev_apst_enabled = ctrl->apst_enabled;
3117         if (ctrl->quirks & NVME_QUIRK_NO_APST) {
3118                 if (force_apst && id->apsta) {
3119                         dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
3120                         ctrl->apst_enabled = true;
3121                 } else {
3122                         ctrl->apst_enabled = false;
3123                 }
3124         } else {
3125                 ctrl->apst_enabled = id->apsta;
3126         }
3127         memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
3128
3129         if (ctrl->ops->flags & NVME_F_FABRICS) {
3130                 ctrl->icdoff = le16_to_cpu(id->icdoff);
3131                 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
3132                 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
3133                 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
3134
3135                 /*
3136                  * In fabrics we need to verify that the cntlid matches the
3137                  * one returned by the admin connect.
3138                  */
3139                 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
3140                         dev_err(ctrl->device,
3141                                 "Mismatching cntlid: Connect %u vs Identify "
3142                                 "%u, rejecting\n",
3143                                 ctrl->cntlid, le16_to_cpu(id->cntlid));
3144                         ret = -EINVAL;
3145                         goto out_free;
3146                 }
3147
3148                 if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
3149                         dev_err(ctrl->device,
3150                                 "keep-alive support is mandatory for fabrics\n");
3151                         ret = -EINVAL;
3152                         goto out_free;
3153                 }
3154         } else {
3155                 ctrl->hmpre = le32_to_cpu(id->hmpre);
3156                 ctrl->hmmin = le32_to_cpu(id->hmmin);
3157                 ctrl->hmminds = le32_to_cpu(id->hmminds);
3158                 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
3159         }
3160
3161         ret = nvme_mpath_init(ctrl, id);
3162         kfree(id);
3163
3164         if (ret < 0)
3165                 return ret;
3166
3167         if (ctrl->apst_enabled && !prev_apst_enabled)
3168                 dev_pm_qos_expose_latency_tolerance(ctrl->device);
3169         else if (!ctrl->apst_enabled && prev_apst_enabled)
3170                 dev_pm_qos_hide_latency_tolerance(ctrl->device);
3171
3172         ret = nvme_configure_apst(ctrl);
3173         if (ret < 0)
3174                 return ret;
3175
3176         ret = nvme_configure_timestamp(ctrl);
3177         if (ret < 0)
3178                 return ret;
3179
3180         ret = nvme_configure_directives(ctrl);
3181         if (ret < 0)
3182                 return ret;
3183
3184         ret = nvme_configure_acre(ctrl);
3185         if (ret < 0)
3186                 return ret;
3187
3188         if (!ctrl->identified)
3189                 nvme_hwmon_init(ctrl);
3190
3191         ctrl->identified = true;
3192
3193         return 0;
3194
3195 out_free:
3196         kfree(id);
3197         return ret;
3198 }
3199 EXPORT_SYMBOL_GPL(nvme_init_identify);
3200
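/*
 * Open of the controller character device is only allowed while the
 * controller is live.
 */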
3201 static int nvme_dev_open(struct inode *inode, struct file *file)
3202 {
3203         struct nvme_ctrl *ctrl =
3204                 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
3205
3206         switch (ctrl->state) {
3207         case NVME_CTRL_LIVE:
3208                 break;
3209         default:
3210                 return -EWOULDBLOCK;
3211         }
3212
3213         file->private_data = ctrl;
3214         return 0;
3215 }
3216
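/*
 * Handle the deprecated NVME_IOCTL_IO_CMD on the controller character
 * device: it is only supported when the controller has exactly one
 * namespace, which then receives the passthrough command.
 */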
3217 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
3218 {
3219         struct nvme_ns *ns;
3220         int ret;
3221
3222         down_read(&ctrl->namespaces_rwsem);
3223         if (list_empty(&ctrl->namespaces)) {
3224                 ret = -ENOTTY;
3225                 goto out_unlock;
3226         }
3227
3228         ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
3229         if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
3230                 dev_warn(ctrl->device,
3231                         "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
3232                 ret = -EINVAL;
3233                 goto out_unlock;
3234         }
3235
3236         dev_warn(ctrl->device,
3237                 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
3238         kref_get(&ns->kref);
3239         up_read(&ctrl->namespaces_rwsem);
3240
3241         ret = nvme_user_cmd(ctrl, ns, argp);
3242         nvme_put_ns(ns);
3243         return ret;
3244
3245 out_unlock:
3246         up_read(&ctrl->namespaces_rwsem);
3247         return ret;
3248 }
3249
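/* ioctl handler for the controller character device (/dev/nvmeX). */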
3250 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
3251                 unsigned long arg)
3252 {
3253         struct nvme_ctrl *ctrl = file->private_data;
3254         void __user *argp = (void __user *)arg;
3255
3256         switch (cmd) {
3257         case NVME_IOCTL_ADMIN_CMD:
3258                 return nvme_user_cmd(ctrl, NULL, argp);
3259         case NVME_IOCTL_ADMIN64_CMD:
3260                 return nvme_user_cmd64(ctrl, NULL, argp);
3261         case NVME_IOCTL_IO_CMD:
3262                 return nvme_dev_user_cmd(ctrl, argp);
3263         case NVME_IOCTL_RESET:
3264                 dev_warn(ctrl->device, "resetting controller\n");
3265                 return nvme_reset_ctrl_sync(ctrl);
3266         case NVME_IOCTL_SUBSYS_RESET:
3267                 return nvme_reset_subsystem(ctrl);
3268         case NVME_IOCTL_RESCAN:
3269                 nvme_queue_scan(ctrl);
3270                 return 0;
3271         default:
3272                 return -ENOTTY;
3273         }
3274 }
3275
3276 static const struct file_operations nvme_dev_fops = {
3277         .owner          = THIS_MODULE,
3278         .open           = nvme_dev_open,
3279         .unlocked_ioctl = nvme_dev_ioctl,
3280         .compat_ioctl   = compat_ptr_ioctl,
3281 };
3282
3283 static ssize_t nvme_sysfs_reset(struct device *dev,
3284                                 struct device_attribute *attr, const char *buf,
3285                                 size_t count)
3286 {
3287         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3288         int ret;
3289
3290         ret = nvme_reset_ctrl_sync(ctrl);
3291         if (ret < 0)
3292                 return ret;
3293         return count;
3294 }
3295 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
3296
3297 static ssize_t nvme_sysfs_rescan(struct device *dev,
3298                                 struct device_attribute *attr, const char *buf,
3299                                 size_t count)
3300 {
3301         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3302
3303         nvme_queue_scan(ctrl);
3304         return count;
3305 }
3306 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
3307
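/*
 * Map a sysfs device back to its namespace head.  Per-path (and
 * non-multipath) disks use nvme_fops and reach the head via their nvme_ns,
 * while the multipath head disk stores the head in disk->private_data.
 */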
3308 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
3309 {
3310         struct gendisk *disk = dev_to_disk(dev);
3311
3312         if (disk->fops == &nvme_fops)
3313                 return nvme_get_ns_from_dev(dev)->head;
3314         else
3315                 return disk->private_data;
3316 }
3317
3318 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
3319                 char *buf)
3320 {
3321         struct nvme_ns_head *head = dev_to_ns_head(dev);
3322         struct nvme_ns_ids *ids = &head->ids;
3323         struct nvme_subsystem *subsys = head->subsys;
3324         int serial_len = sizeof(subsys->serial);
3325         int model_len = sizeof(subsys->model);
3326
3327         if (!uuid_is_null(&ids->uuid))
3328                 return sprintf(buf, "uuid.%pU\n", &ids->uuid);
3329
3330         if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3331                 return sprintf(buf, "eui.%16phN\n", ids->nguid);
3332
3333         if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3334                 return sprintf(buf, "eui.%8phN\n", ids->eui64);
3335
3336         while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
3337                                   subsys->serial[serial_len - 1] == '\0'))
3338                 serial_len--;
3339         while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
3340                                  subsys->model[model_len - 1] == '\0'))
3341                 model_len--;
3342
3343         return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
3344                 serial_len, subsys->serial, model_len, subsys->model,
3345                 head->ns_id);
3346 }
3347 static DEVICE_ATTR_RO(wwid);
3348
3349 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
3350                 char *buf)
3351 {
3352         return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
3353 }
3354 static DEVICE_ATTR_RO(nguid);
3355
3356 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
3357                 char *buf)
3358 {
3359         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3360
3361         /* For backward compatibility, expose the NGUID to userspace if
3362          * we have no UUID set.
3363          */
3364         if (uuid_is_null(&ids->uuid)) {
3365                 printk_ratelimited(KERN_WARNING
3366                                    "No UUID available, providing old NGUID\n");
3367                 return sprintf(buf, "%pU\n", ids->nguid);
3368         }
3369         return sprintf(buf, "%pU\n", &ids->uuid);
3370 }
3371 static DEVICE_ATTR_RO(uuid);
3372
3373 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
3374                 char *buf)
3375 {
3376         return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
3377 }
3378 static DEVICE_ATTR_RO(eui);
3379
3380 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
3381                 char *buf)
3382 {
3383         return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
3384 }
3385 static DEVICE_ATTR_RO(nsid);
3386
3387 static struct attribute *nvme_ns_id_attrs[] = {
3388         &dev_attr_wwid.attr,
3389         &dev_attr_uuid.attr,
3390         &dev_attr_nguid.attr,
3391         &dev_attr_eui.attr,
3392         &dev_attr_nsid.attr,
3393 #ifdef CONFIG_NVME_MULTIPATH
3394         &dev_attr_ana_grpid.attr,
3395         &dev_attr_ana_state.attr,
3396 #endif
3397         NULL,
3398 };
3399
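/*
 * Hide identifier attributes that the namespace does not actually report,
 * and only expose the ANA attributes on per-path devices of controllers
 * that use ANA.
 */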
3400 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
3401                 struct attribute *a, int n)
3402 {
3403         struct device *dev = container_of(kobj, struct device, kobj);
3404         struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
3405
3406         if (a == &dev_attr_uuid.attr) {
3407                 if (uuid_is_null(&ids->uuid) &&
3408                     !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3409                         return 0;
3410         }
3411         if (a == &dev_attr_nguid.attr) {
3412                 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
3413                         return 0;
3414         }
3415         if (a == &dev_attr_eui.attr) {
3416                 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
3417                         return 0;
3418         }
3419 #ifdef CONFIG_NVME_MULTIPATH
3420         if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
3421                 if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
3422                         return 0;
3423                 if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
3424                         return 0;
3425         }
3426 #endif
3427         return a->mode;
3428 }
3429
3430 static const struct attribute_group nvme_ns_id_attr_group = {
3431         .attrs          = nvme_ns_id_attrs,
3432         .is_visible     = nvme_ns_id_attrs_are_visible,
3433 };
3434
3435 const struct attribute_group *nvme_ns_id_attr_groups[] = {
3436         &nvme_ns_id_attr_group,
3437 #ifdef CONFIG_NVM
3438         &nvme_nvm_attr_group,
3439 #endif
3440         NULL,
3441 };
3442
3443 #define nvme_show_str_function(field)                                           \
3444 static ssize_t  field##_show(struct device *dev,                                \
3445                             struct device_attribute *attr, char *buf)           \
3446 {                                                                               \
3447         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3448         return sprintf(buf, "%.*s\n",                                           \
3449                 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field);         \
3450 }                                                                               \
3451 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3452
3453 nvme_show_str_function(model);
3454 nvme_show_str_function(serial);
3455 nvme_show_str_function(firmware_rev);
3456
3457 #define nvme_show_int_function(field)                                           \
3458 static ssize_t  field##_show(struct device *dev,                                \
3459                             struct device_attribute *attr, char *buf)           \
3460 {                                                                               \
3461         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);                          \
3462         return sprintf(buf, "%d\n", ctrl->field);                               \
3463 }                                                                               \
3464 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
3465
3466 nvme_show_int_function(cntlid);
3467 nvme_show_int_function(numa_node);
3468 nvme_show_int_function(queue_count);
3469 nvme_show_int_function(sqsize);
3470
3471 static ssize_t nvme_sysfs_delete(struct device *dev,
3472                                 struct device_attribute *attr, const char *buf,
3473                                 size_t count)
3474 {
3475         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3476
3477         /* Can't delete non-created controllers */
3478         if (!ctrl->created)
3479                 return -EBUSY;
3480
3481         if (device_remove_file_self(dev, attr))
3482                 nvme_delete_ctrl_sync(ctrl);
3483         return count;
3484 }
3485 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
3486
3487 static ssize_t nvme_sysfs_show_transport(struct device *dev,
3488                                          struct device_attribute *attr,
3489                                          char *buf)
3490 {
3491         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3492
3493         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
3494 }
3495 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
3496
3497 static ssize_t nvme_sysfs_show_state(struct device *dev,
3498                                      struct device_attribute *attr,
3499                                      char *buf)
3500 {
3501         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3502         static const char *const state_name[] = {
3503                 [NVME_CTRL_NEW]         = "new",
3504                 [NVME_CTRL_LIVE]        = "live",
3505                 [NVME_CTRL_RESETTING]   = "resetting",
3506                 [NVME_CTRL_CONNECTING]  = "connecting",
3507                 [NVME_CTRL_DELETING]    = "deleting",
3508                 [NVME_CTRL_DELETING_NOIO] = "deleting (no IO)",
3509                 [NVME_CTRL_DEAD]        = "dead",
3510         };
3511
3512         if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
3513             state_name[ctrl->state])
3514                 return sprintf(buf, "%s\n", state_name[ctrl->state]);
3515
3516         return sprintf(buf, "unknown state\n");
3517 }
3518
3519 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
3520
3521 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
3522                                          struct device_attribute *attr,
3523                                          char *buf)
3524 {
3525         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3526
3527         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
3528 }
3529 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
3530
3531 static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
3532                                         struct device_attribute *attr,
3533                                         char *buf)
3534 {
3535         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3536
3537         return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
3538 }
3539 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
3540
3541 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
3542                                         struct device_attribute *attr,
3543                                         char *buf)
3544 {
3545         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3546
3547         return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
3548 }
3549 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
3550
3551 static ssize_t nvme_sysfs_show_address(struct device *dev,
3552                                          struct device_attribute *attr,
3553                                          char *buf)
3554 {
3555         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3556
3557         return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
3558 }
3559 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
3560
3561 static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
3562                 struct device_attribute *attr, char *buf)
3563 {
3564         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3565         struct nvmf_ctrl_options *opts = ctrl->opts;
3566
3567         if (ctrl->opts->max_reconnects == -1)
3568                 return sprintf(buf, "off\n");
3569         return sprintf(buf, "%d\n",
3570                         opts->max_reconnects * opts->reconnect_delay);
3571 }
3572
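/*
 * ctrl_loss_tmo is exposed in seconds; a negative value disables the limit,
 * otherwise it is converted into a reconnect attempt budget based on
 * reconnect_delay.
 */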
3573 static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
3574                 struct device_attribute *attr, const char *buf, size_t count)
3575 {
3576         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3577         struct nvmf_ctrl_options *opts = ctrl->opts;
3578         int ctrl_loss_tmo, err;
3579
3580         err = kstrtoint(buf, 10, &ctrl_loss_tmo);
3581         if (err)
3582                 return -EINVAL;
3583
3584         if (ctrl_loss_tmo < 0)
3585                 opts->max_reconnects = -1;
3586         else
3587                 opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
3588                                                 opts->reconnect_delay);
3589         return count;
3590 }
3591 static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
3592         nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
3593
3594 static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
3595                 struct device_attribute *attr, char *buf)
3596 {
3597         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3598
3599         if (ctrl->opts->reconnect_delay == -1)
3600                 return sprintf(buf, "off\n");
3601         return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
3602 }
3603
3604 static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
3605                 struct device_attribute *attr, const char *buf, size_t count)
3606 {
3607         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3608         unsigned int v;
3609         int err;
3610
3611         err = kstrtou32(buf, 10, &v);
3612         if (err)
3613                 return err;
3614
3615         ctrl->opts->reconnect_delay = v;
3616         return count;
3617 }
3618 static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
3619         nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
3620
3621 static struct attribute *nvme_dev_attrs[] = {
3622         &dev_attr_reset_controller.attr,
3623         &dev_attr_rescan_controller.attr,
3624         &dev_attr_model.attr,
3625         &dev_attr_serial.attr,
3626         &dev_attr_firmware_rev.attr,
3627         &dev_attr_cntlid.attr,
3628         &dev_attr_delete_controller.attr,
3629         &dev_attr_transport.attr,
3630         &dev_attr_subsysnqn.attr,
3631         &dev_attr_address.attr,
3632         &dev_attr_state.attr,
3633         &dev_attr_numa_node.attr,
3634         &dev_attr_queue_count.attr,
3635         &dev_attr_sqsize.attr,
3636         &dev_attr_hostnqn.attr,
3637         &dev_attr_hostid.attr,
3638         &dev_attr_ctrl_loss_tmo.attr,
3639         &dev_attr_reconnect_delay.attr,
3640         NULL
3641 };
3642
3643 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
3644                 struct attribute *a, int n)
3645 {
3646         struct device *dev = container_of(kobj, struct device, kobj);
3647         struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
3648
3649         if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
3650                 return 0;
3651         if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
3652                 return 0;
3653         if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
3654                 return 0;
3655         if (a == &dev_attr_hostid.attr && !ctrl->opts)
3656                 return 0;
3657
3658         return a->mode;
3659 }
3660
3661 static struct attribute_group nvme_dev_attrs_group = {
3662         .attrs          = nvme_dev_attrs,
3663         .is_visible     = nvme_dev_attrs_are_visible,
3664 };
3665
3666 static const struct attribute_group *nvme_dev_attr_groups[] = {
3667         &nvme_dev_attrs_group,
3668         NULL,
3669 };
3670
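/*
 * Find an existing namespace head for @nsid in the subsystem and take a
 * reference on it.  The caller must hold subsys->lock.
 */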
3671 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
3672                 unsigned nsid)
3673 {
3674         struct nvme_ns_head *h;
3675
3676         lockdep_assert_held(&subsys->lock);
3677
3678         list_for_each_entry(h, &subsys->nsheads, entry) {
3679                 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
3680                         return h;
3681         }
3682
3683         return NULL;
3684 }
3685
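/*
 * Reject a new namespace head whose identifiers collide with those of a
 * head already known to the subsystem.  The caller must hold subsys->lock.
 */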
3686 static int __nvme_check_ids(struct nvme_subsystem *subsys,
3687                 struct nvme_ns_head *new)
3688 {
3689         struct nvme_ns_head *h;
3690
3691         lockdep_assert_held(&subsys->lock);
3692
3693         list_for_each_entry(h, &subsys->nsheads, entry) {
3694                 if (nvme_ns_ids_valid(&new->ids) &&
3695                     nvme_ns_ids_equal(&new->ids, &h->ids))
3696                         return -EINVAL;
3697         }
3698
3699         return 0;
3700 }
3701
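/*
 * Allocate and initialize a namespace head: instance number, SRCU state,
 * identifiers, per-CSI effects log and (for multipath) the head disk.  On
 * success the head is linked into the subsystem's nsheads list.
 */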
3702 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
3703                 unsigned nsid, struct nvme_ns_ids *ids)
3704 {
3705         struct nvme_ns_head *head;
3706         size_t size = sizeof(*head);
3707         int ret = -ENOMEM;
3708
3709 #ifdef CONFIG_NVME_MULTIPATH
3710         size += num_possible_nodes() * sizeof(struct nvme_ns *);
3711 #endif
3712
3713         head = kzalloc(size, GFP_KERNEL);
3714         if (!head)
3715                 goto out;
3716         ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
3717         if (ret < 0)
3718                 goto out_free_head;
3719         head->instance = ret;
3720         INIT_LIST_HEAD(&head->list);
3721         ret = init_srcu_struct(&head->srcu);
3722         if (ret)
3723                 goto out_ida_remove;
3724         head->subsys = ctrl->subsys;
3725         head->ns_id = nsid;
3726         head->ids = *ids;
3727         kref_init(&head->ref);
3728
3729         ret = __nvme_check_ids(ctrl->subsys, head);
3730         if (ret) {
3731                 dev_err(ctrl->device,
3732                         "duplicate IDs for nsid %d\n", nsid);
3733                 goto out_cleanup_srcu;
3734         }
3735
3736         if (head->ids.csi) {
3737                 ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
3738                 if (ret)
3739                         goto out_cleanup_srcu;
3740         } else
3741                 head->effects = ctrl->effects;
3742
3743         ret = nvme_mpath_alloc_disk(ctrl, head);
3744         if (ret)
3745                 goto out_cleanup_srcu;
3746
3747         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
3748
3749         kref_get(&ctrl->subsys->ref);
3750
3751         return head;
3752 out_cleanup_srcu:
3753         cleanup_srcu_struct(&head->srcu);
3754 out_ida_remove:
3755         ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
3756 out_free_head:
3757         kfree(head);
3758 out:
3759         if (ret > 0)
3760                 ret = blk_status_to_errno(nvme_error_status(ret));
3761         return ERR_PTR(ret);
3762 }
3763
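/*
 * Attach @ns to a namespace head: reuse an existing head for shared
 * namespaces with matching identifiers, otherwise allocate a new one.
 * Duplicate unshared namespaces and identifier mismatches are rejected.
 */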
3764 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
3765                 struct nvme_id_ns *id)
3766 {
3767         struct nvme_ctrl *ctrl = ns->ctrl;
3768         bool is_shared = id->nmic & NVME_NS_NMIC_SHARED;
3769         struct nvme_ns_head *head = NULL;
3770         struct nvme_ns_ids ids;
3771         int ret = 0;
3772
3773         ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
3774         if (ret) {
3775                 if (ret < 0)
3776                         return ret;
3777                 return blk_status_to_errno(nvme_error_status(ret));
3778         }
3779
3780         mutex_lock(&ctrl->subsys->lock);
3781         head = nvme_find_ns_head(ctrl->subsys, nsid);
3782         if (!head) {
3783                 head = nvme_alloc_ns_head(ctrl, nsid, &ids);
3784                 if (IS_ERR(head)) {
3785                         ret = PTR_ERR(head);
3786                         goto out_unlock;
3787                 }
3788                 head->shared = is_shared;
3789         } else {
3790                 ret = -EINVAL;
3791                 if (!is_shared || !head->shared) {
3792                         dev_err(ctrl->device,
3793                                 "Duplicate unshared namespace %d\n", nsid);
3794                         goto out_put_ns_head;
3795                 }
3796                 if (!nvme_ns_ids_equal(&head->ids, &ids)) {
3797                         dev_err(ctrl->device,
3798                                 "IDs don't match for shared namespace %d\n",
3799                                         nsid);
3800                         goto out_put_ns_head;
3801                 }
3802         }
3803
3804         list_add_tail(&ns->siblings, &head->list);
3805         ns->head = head;
3806         mutex_unlock(&ctrl->subsys->lock);
3807         return 0;
3808
3809 out_put_ns_head:
3810         nvme_put_ns_head(head);
3811 out_unlock:
3812         mutex_unlock(&ctrl->subsys->lock);
3813         return ret;
3814 }
3815
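/* list_sort() comparator: keep ctrl->namespaces ordered by ascending NSID. */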
3816 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
3817 {
3818         struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
3819         struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
3820
3821         return nsa->head->ns_id - nsb->head->ns_id;
3822 }
3823
3824 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3825 {
3826         struct nvme_ns *ns, *ret = NULL;
3827
3828         down_read(&ctrl->namespaces_rwsem);
3829         list_for_each_entry(ns, &ctrl->namespaces, list) {
3830                 if (ns->head->ns_id == nsid) {
3831                         if (!kref_get_unless_zero(&ns->kref))
3832                                 continue;
3833                         ret = ns;
3834                         break;
3835                 }
3836                 if (ns->head->ns_id > nsid)
3837                         break;
3838         }
3839         up_read(&ctrl->namespaces_rwsem);
3840         return ret;
3841 }
3842 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
3843
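/*
 * Allocate a request queue and gendisk for a newly discovered NSID,
 * identify the namespace, attach it to its head and register the disk
 * (including the LightNVM interface for quirked devices).
 */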
3844 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3845 {
3846         struct nvme_ns *ns;
3847         struct gendisk *disk;
3848         struct nvme_id_ns *id;
3849         char disk_name[DISK_NAME_LEN];
3850         int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
3851
3852         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
3853         if (!ns)
3854                 return;
3855
3856         ns->queue = blk_mq_init_queue(ctrl->tagset);
3857         if (IS_ERR(ns->queue))
3858                 goto out_free_ns;
3859
3860         if (ctrl->opts && ctrl->opts->data_digest)
3861                 ns->queue->backing_dev_info->capabilities
3862                         |= BDI_CAP_STABLE_WRITES;
3863
3864         blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
3865         if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
3866                 blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
3867
3868         ns->queue->queuedata = ns;
3869         ns->ctrl = ctrl;
3870
3871         kref_init(&ns->kref);
3872         ns->lba_shift = 9; /* default to 512-byte blocks until the disk is validated */
3873
3874         blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
3875         nvme_set_queue_limits(ctrl, ns->queue);
3876
3877         ret = nvme_identify_ns(ctrl, nsid, &id);
3878         if (ret)
3879                 goto out_free_queue;
3880
3881         if (id->ncap == 0)      /* no namespace (legacy quirk) */
3882                 goto out_free_id;
3883
3884         ret = nvme_init_ns_head(ns, nsid, id);
3885         if (ret)
3886                 goto out_free_id;
3887         nvme_set_disk_name(disk_name, ns, ctrl, &flags);
3888
3889         disk = alloc_disk_node(0, node);
3890         if (!disk)
3891                 goto out_unlink_ns;
3892
3893         disk->fops = &nvme_fops;
3894         disk->private_data = ns;
3895         disk->queue = ns->queue;
3896         disk->flags = flags;
3897         memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
3898         ns->disk = disk;
3899
3900         if (__nvme_revalidate_disk(disk, id))
3901                 goto out_put_disk;
3902
3903         if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
3904                 ret = nvme_nvm_register(ns, disk_name, node);
3905                 if (ret) {
3906                         dev_warn(ctrl->device, "LightNVM init failure\n");
3907                         goto out_put_disk;
3908                 }
3909         }
3910
3911         down_write(&ctrl->namespaces_rwsem);
3912         list_add_tail(&ns->list, &ctrl->namespaces);
3913         up_write(&ctrl->namespaces_rwsem);
3914
3915         nvme_get_ctrl(ctrl);
3916
3917         device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
3918
3919         nvme_mpath_add_disk(ns, id);
3920         nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
3921         kfree(id);
3922
3923         return;
3924  out_put_disk:
3925         /* prevent double queue cleanup */
3926         ns->disk->queue = NULL;
3927         put_disk(ns->disk);
3928  out_unlink_ns:
3929         mutex_lock(&ctrl->subsys->lock);
3930         list_del_rcu(&ns->siblings);
3931         if (list_empty(&ns->head->list))
3932                 list_del_init(&ns->head->entry);
3933         mutex_unlock(&ctrl->subsys->lock);
3934         nvme_put_ns_head(ns->head);
3935  out_free_id:
3936         kfree(id);
3937  out_free_queue:
3938         blk_cleanup_queue(ns->queue);
3939  out_free_ns:
3940         kfree(ns);
3941 }
3942
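/*
 * Tear down a namespace: unlink it from its head and from the controller's
 * list, wait for concurrent submissions through the head, then delete the
 * gendisk and clean up its queue.
 */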
3943 static void nvme_ns_remove(struct nvme_ns *ns)
3944 {
3945         if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
3946                 return;
3947
3948         nvme_fault_inject_fini(&ns->fault_inject);
3949
3950         mutex_lock(&ns->ctrl->subsys->lock);
3951         list_del_rcu(&ns->siblings);
3952         if (list_empty(&ns->head->list))
3953                 list_del_init(&ns->head->entry);
3954         mutex_unlock(&ns->ctrl->subsys->lock);
3955
3956         synchronize_rcu(); /* guarantee not available in head->list */
3957         nvme_mpath_clear_current_path(ns);
3958         synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
3959
3960         if (ns->disk->flags & GENHD_FL_UP) {
3961                 del_gendisk(ns->disk);
3962                 blk_cleanup_queue(ns->queue);
3963                 if (blk_get_integrity(ns->disk))
3964                         blk_integrity_unregister(ns->disk);
3965         }
3966
3967         down_write(&ns->ctrl->namespaces_rwsem);
3968         list_del_init(&ns->list);
3969         up_write(&ns->ctrl->namespaces_rwsem);
3970
3971         nvme_mpath_check_last_path(ns);
3972         nvme_put_ns(ns);
3973 }
3974
3975 static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
3976 {
3977         struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
3978
3979         if (ns) {
3980                 nvme_ns_remove(ns);
3981                 nvme_put_ns(ns);
3982         }
3983 }
3984
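/*
 * Revalidate an already known namespace, removing it if revalidation
 * fails, or allocate a new one for a previously unseen NSID.
 */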
3985 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
3986 {
3987         struct nvme_ns *ns;
3988
3989         ns = nvme_find_get_ns(ctrl, nsid);
3990         if (ns) {
3991                 if (revalidate_disk(ns->disk))
3992                         nvme_ns_remove(ns);
3993                 nvme_put_ns(ns);
3994         } else
3995                 nvme_alloc_ns(ctrl, nsid);
3996 }
3997
3998 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
3999                                         unsigned nsid)
4000 {
4001         struct nvme_ns *ns, *next;
4002         LIST_HEAD(rm_list);
4003
4004         down_write(&ctrl->namespaces_rwsem);
4005         list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
4006                 if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
4007                         list_move_tail(&ns->list, &rm_list);
4008         }
4009         up_write(&ctrl->namespaces_rwsem);
4010
4011         list_for_each_entry_safe(ns, next, &rm_list, list)
4012                 nvme_ns_remove(ns);
4013
4014 }
4015
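/*
 * Scan namespaces using the Identify active namespace ID list, validating
 * each reported NSID and removing namespaces whose NSIDs fall into the
 * gaps between consecutive entries.
 */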
4016 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
4017 {
4018         const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
4019         __le32 *ns_list;
4020         u32 prev = 0;
4021         int ret = 0, i;
4022
4023         if (nvme_ctrl_limited_cns(ctrl))
4024                 return -EOPNOTSUPP;
4025
4026         ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
4027         if (!ns_list)
4028                 return -ENOMEM;
4029
4030         for (;;) {
4031                 ret = nvme_identify_ns_list(ctrl, prev, ns_list);
4032                 if (ret)
4033                         goto free;
4034
4035                 for (i = 0; i < nr_entries; i++) {
4036                         u32 nsid = le32_to_cpu(ns_list[i]);
4037
4038                         if (!nsid)      /* end of the list? */
4039                                 goto out;
4040                         nvme_validate_ns(ctrl, nsid);
4041                         while (++prev < nsid)
4042                                 nvme_ns_remove_by_nsid(ctrl, prev);
4043                 }
4044         }
4045  out:
4046         nvme_remove_invalid_namespaces(ctrl, prev);
4047  free:
4048         kfree(ns_list);
4049         return ret;
4050 }
4051
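/*
 * Fallback scan for controllers without namespace list support: walk every
 * NSID from 1 up to the controller's reported namespace count (NN).
 */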
4052 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
4053 {
4054         struct nvme_id_ctrl *id;
4055         u32 nn, i;
4056
4057         if (nvme_identify_ctrl(ctrl, &id))
4058                 return;
4059         nn = le32_to_cpu(id->nn);
4060         kfree(id);
4061
4062         for (i = 1; i <= nn; i++)
4063                 nvme_validate_ns(ctrl, i);
4064
4065         nvme_remove_invalid_namespaces(ctrl, nn);
4066 }
4067
4068 static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
4069 {
4070         size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
4071         __le32 *log;
4072         int error;
4073
4074         log = kzalloc(log_size, GFP_KERNEL);
4075         if (!log)
4076                 return;
4077
4078         /*
4079          * We need to read the log to clear the AEN, but we don't want to rely
4080          * on it for the changed namespace information as userspace could have
4081          * raced with us in reading the log page, which could cause us to miss
4082          * updates.
4083          */
4084         error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
4085                         NVME_CSI_NVM, log, log_size, 0);
4086         if (error)
4087                 dev_warn(ctrl->device,
4088                         "reading changed ns log failed: %d\n", error);
4089
4090         kfree(log);
4091 }
4092
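/*
 * Namespace scan worker: clear a pending namespace-changed AEN, rescan via
 * the namespace list (or sequentially as a fallback) and keep the
 * ctrl->namespaces list sorted by NSID.
 */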
4093 static void nvme_scan_work(struct work_struct *work)
4094 {
4095         struct nvme_ctrl *ctrl =
4096                 container_of(work, struct nvme_ctrl, scan_work);
4097
4098         /* No tagset on a live ctrl means IO queues could not be created */
4099         if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
4100                 return;
4101
4102         if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
4103                 dev_info(ctrl->device, "rescanning namespaces.\n");
4104                 nvme_clear_changed_ns_log(ctrl);
4105         }
4106
4107         mutex_lock(&ctrl->scan_lock);
4108         if (nvme_scan_ns_list(ctrl) != 0)
4109                 nvme_scan_ns_sequential(ctrl);
4110         mutex_unlock(&ctrl->scan_lock);
4111
4112         down_write(&ctrl->namespaces_rwsem);
4113         list_sort(NULL, &ctrl->namespaces, ns_cmp);
4114         up_write(&ctrl->namespaces_rwsem);
4115 }
4116
4117 /*
4118  * This function iterates the namespace list unlocked to allow recovery from
4119  * controller failure. It is up to the caller to ensure the namespace list is
4120  * not modified by scan work while this function is executing.
4121  */
4122 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
4123 {
4124         struct nvme_ns *ns, *next;
4125         LIST_HEAD(ns_list);
4126
4127         /*
4128          * Make sure to requeue I/O to all namespaces, as these requests
4129          * might result from the scan itself and must complete for the
4130          * scan_work to make progress.
4131          */
4132         nvme_mpath_clear_ctrl_paths(ctrl);
4133
4134         /* prevent racing with ns scanning */
4135         flush_work(&ctrl->scan_work);
4136
4137         /*
4138          * The dead state indicates the controller was not gracefully
4139          * disconnected. In that case, we won't be able to flush any data while
4140          * removing the namespaces' disks; fail all the queues now to avoid
4141          * potentially having to clean up the failed sync later.
4142          */
4143         if (ctrl->state == NVME_CTRL_DEAD)
4144                 nvme_kill_queues(ctrl);
4145
4146         /* this is a no-op when called from the controller reset handler */
4147         nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
4148
4149         down_write(&ctrl->namespaces_rwsem);
4150         list_splice_init(&ctrl->namespaces, &ns_list);
4151         up_write(&ctrl->namespaces_rwsem);
4152
4153         list_for_each_entry_safe(ns, next, &ns_list, list)
4154                 nvme_ns_remove(ns);
4155 }
4156 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
4157
4158 static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
4159 {
4160         struct nvme_ctrl *ctrl =
4161                 container_of(dev, struct nvme_ctrl, ctrl_device);
4162         struct nvmf_ctrl_options *opts = ctrl->opts;
4163         int ret;
4164
4165         ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
4166         if (ret)
4167                 return ret;
4168
4169         if (opts) {
4170                 ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
4171                 if (ret)
4172                         return ret;
4173
4174                 ret = add_uevent_var(env, "NVME_TRSVCID=%s",
4175                                 opts->trsvcid ?: "none");
4176                 if (ret)
4177                         return ret;
4178
4179                 ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
4180                                 opts->host_traddr ?: "none");
4181         }
4182         return ret;
4183 }
4184
4185 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
4186 {
4187         char *envp[2] = { NULL, NULL };
4188         u32 aen_result = ctrl->aen_result;
4189
4190         ctrl->aen_result = 0;
4191         if (!aen_result)
4192                 return;
4193
4194         envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
4195         if (!envp[0])
4196                 return;
4197         kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
4198         kfree(envp[0]);
4199 }
4200
4201 static void nvme_async_event_work(struct work_struct *work)
4202 {
4203         struct nvme_ctrl *ctrl =
4204                 container_of(work, struct nvme_ctrl, async_event_work);
4205
4206         nvme_aen_uevent(ctrl);
4207         ctrl->ops->submit_async_event(ctrl);
4208 }
4209
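/*
 * Check whether an enabled controller currently reports Processing Paused
 * (CSTS.PP), i.e. a firmware activation is still in progress.
 */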
4210 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
4211 {
4213         u32 csts;
4214
4215         if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
4216                 return false;
4217
4218         if (csts == ~0)
4219                 return false;
4220
4221         return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
4222 }
4223
4224 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
4225 {
4226         struct nvme_fw_slot_info_log *log;
4227
4228         log = kmalloc(sizeof(*log), GFP_KERNEL);
4229         if (!log)
4230                 return;
4231
4232         if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
4233                         log, sizeof(*log), 0))
4234                 dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
4235         kfree(log);
4236 }
4237
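/*
 * Firmware activation worker: quiesce I/O while the controller reports
 * Processing Paused, bounded by MTFA (in 100ms units) or the admin timeout,
 * then resume I/O and read the firmware slot log to clear the AER.
 */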
4238 static void nvme_fw_act_work(struct work_struct *work)
4239 {
4240         struct nvme_ctrl *ctrl = container_of(work,
4241                                 struct nvme_ctrl, fw_act_work);
4242         unsigned long fw_act_timeout;
4243
4244         if (ctrl->mtfa)
4245                 fw_act_timeout = jiffies +
4246                                 msecs_to_jiffies(ctrl->mtfa * 100);
4247         else
4248                 fw_act_timeout = jiffies +
4249                                 msecs_to_jiffies(admin_timeout * 1000);
4250
4251         nvme_stop_queues(ctrl);
4252         while (nvme_ctrl_pp_status(ctrl)) {
4253                 if (time_after(jiffies, fw_act_timeout)) {
4254                         dev_warn(ctrl->device,
4255                                 "Fw activation timeout, reset controller\n");
4256                         nvme_try_sched_reset(ctrl);
4257                         return;
4258                 }
4259                 msleep(100);
4260         }
4261
4262         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
4263                 return;
4264
4265         nvme_start_queues(ctrl);
4266         /* read FW slot information to clear the AER */
4267         nvme_get_fw_slot_info(ctrl);
4268 }
4269
4270 static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
4271 {
4272         u32 aer_notice_type = (result & 0xff00) >> 8;
4273
4274         trace_nvme_async_event(ctrl, aer_notice_type);
4275
4276         switch (aer_notice_type) {
4277         case NVME_AER_NOTICE_NS_CHANGED:
4278                 set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
4279                 nvme_queue_scan(ctrl);
4280                 break;
4281         case NVME_AER_NOTICE_FW_ACT_STARTING:
4282                 /*
4283                  * We are (ab)using the RESETTING state to prevent subsequent
4284                  * recovery actions from interfering with the controller's
4285                  * firmware activation.
4286                  */
4287                 if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
4288                         queue_work(nvme_wq, &ctrl->fw_act_work);
4289                 break;
4290 #ifdef CONFIG_NVME_MULTIPATH
4291         case NVME_AER_NOTICE_ANA:
4292                 if (!ctrl->ana_log_buf)
4293                         break;
4294                 queue_work(nvme_wq, &ctrl->ana_work);
4295                 break;
4296 #endif
4297         case NVME_AER_NOTICE_DISC_CHANGED:
4298                 ctrl->aen_result = result;
4299                 break;
4300         default:
4301                 dev_warn(ctrl->device, "async event result %08x\n", result);
4302         }
4303 }
4304
4305 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
4306                 volatile union nvme_result *res)
4307 {
4308         u32 result = le32_to_cpu(res->u32);
4309         u32 aer_type = result & 0x07;
4310
4311         if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
4312                 return;
4313
4314         switch (aer_type) {
4315         case NVME_AER_NOTICE:
4316                 nvme_handle_aen_notice(ctrl, result);
4317                 break;
4318         case NVME_AER_ERROR:
4319         case NVME_AER_SMART:
4320         case NVME_AER_CSS:
4321         case NVME_AER_VS:
4322                 trace_nvme_async_event(ctrl, aer_type);
4323                 ctrl->aen_result = result;
4324                 break;
4325         default:
4326                 break;
4327         }
4328         queue_work(nvme_wq, &ctrl->async_event_work);
4329 }
4330 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
4331
4332 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
4333 {
4334         nvme_mpath_stop(ctrl);
4335         nvme_stop_keep_alive(ctrl);
4336         flush_work(&ctrl->async_event_work);
4337         cancel_work_sync(&ctrl->fw_act_work);
4338 }
4339 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
4340
4341 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
4342 {
4343         nvme_start_keep_alive(ctrl);
4344
4345         nvme_enable_aen(ctrl);
4346
4347         if (ctrl->queue_count > 1) {
4348                 nvme_queue_scan(ctrl);
4349                 nvme_start_queues(ctrl);
4350         }
4351         ctrl->created = true;
4352 }
4353 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
4354
4355 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
4356 {
4357         nvme_fault_inject_fini(&ctrl->fault_inject);
4358         dev_pm_qos_hide_latency_tolerance(ctrl->device);
4359         cdev_device_del(&ctrl->cdev, ctrl->device);
4360         nvme_put_ctrl(ctrl);
4361 }
4362 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
4363
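/*
 * Final release callback for the controller device: free the cached effects
 * logs, release the instance number (unless it is shared with the
 * subsystem), drop the subsystem linkage and hand off to the transport's
 * ->free_ctrl().
 */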
4364 static void nvme_free_ctrl(struct device *dev)
4365 {
4366         struct nvme_ctrl *ctrl =
4367                 container_of(dev, struct nvme_ctrl, ctrl_device);
4368         struct nvme_subsystem *subsys = ctrl->subsys;
4369         struct nvme_cel *cel, *next;
4370
4371         if (subsys && ctrl->instance != subsys->instance)
4372                 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4373
4374         list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
4375                 list_del(&cel->entry);
4376                 kfree(cel);
4377         }
4378
4379         nvme_mpath_uninit(ctrl);
4380         __free_page(ctrl->discard_page);
4381
4382         if (subsys) {
4383                 mutex_lock(&nvme_subsystems_lock);
4384                 list_del(&ctrl->subsys_entry);
4385                 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
4386                 mutex_unlock(&nvme_subsystems_lock);
4387         }
4388
4389         ctrl->ops->free_ctrl(ctrl);
4390
4391         if (subsys)
4392                 nvme_put_subsystem(subsys);
4393 }
4394
4395 /*
4396  * Initialize an NVMe controller structure.  This needs to be called during
4397  * earliest initialization so that we have the initialized structures around
4398  * during probing.
4399  */
4400 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
4401                 const struct nvme_ctrl_ops *ops, unsigned long quirks)
4402 {
4403         int ret;
4404
4405         ctrl->state = NVME_CTRL_NEW;
4406         spin_lock_init(&ctrl->lock);
4407         mutex_init(&ctrl->scan_lock);
4408         INIT_LIST_HEAD(&ctrl->namespaces);
4409         INIT_LIST_HEAD(&ctrl->cels);
4410         init_rwsem(&ctrl->namespaces_rwsem);
4411         ctrl->dev = dev;
4412         ctrl->ops = ops;
4413         ctrl->quirks = quirks;
4414         ctrl->numa_node = NUMA_NO_NODE;
4415         INIT_WORK(&ctrl->scan_work, nvme_scan_work);
4416         INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
4417         INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
4418         INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
4419         init_waitqueue_head(&ctrl->state_wq);
4420
4421         INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
4422         memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
4423         ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
4424
4425         BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
4426                         PAGE_SIZE);
4427         ctrl->discard_page = alloc_page(GFP_KERNEL);
4428         if (!ctrl->discard_page) {
4429                 ret = -ENOMEM;
4430                 goto out;
4431         }
4432
4433         ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
4434         if (ret < 0)
4435                 goto out;
4436         ctrl->instance = ret;
4437
4438         device_initialize(&ctrl->ctrl_device);
4439         ctrl->device = &ctrl->ctrl_device;
4440         ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
4441         ctrl->device->class = nvme_class;
4442         ctrl->device->parent = ctrl->dev;
4443         ctrl->device->groups = nvme_dev_attr_groups;
4444         ctrl->device->release = nvme_free_ctrl;
4445         dev_set_drvdata(ctrl->device, ctrl);
4446         ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
4447         if (ret)
4448                 goto out_release_instance;
4449
4450         nvme_get_ctrl(ctrl);
4451         cdev_init(&ctrl->cdev, &nvme_dev_fops);
4452         ctrl->cdev.owner = ops->module;
4453         ret = cdev_device_add(&ctrl->cdev, ctrl->device);
4454         if (ret)
4455                 goto out_free_name;
4456
4457         /*
4458          * Initialize latency tolerance controls.  The sysfs files won't
4459          * be visible to userspace unless the device actually supports APST.
4460          */
4461         ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
4462         dev_pm_qos_update_user_latency_tolerance(ctrl->device,
4463                 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
4464
4465         nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
4466
4467         return 0;
4468 out_free_name:
4469         nvme_put_ctrl(ctrl);
4470         kfree_const(ctrl->device->kobj.name);
4471 out_release_instance:
4472         ida_simple_remove(&nvme_instance_ida, ctrl->instance);
4473 out:
4474         if (ctrl->discard_page)
4475                 __free_page(ctrl->discard_page);
4476         return ret;
4477 }
4478 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
4479
4480 /**
4481  * nvme_kill_queues(): Ends all namespace queues
4482  * @ctrl: the dead controller whose queues need to end
4483  *
4484  * Call this function when the driver determines it is unable to get the
4485  * controller into a state capable of servicing I/O.
4486  */
4487 void nvme_kill_queues(struct nvme_ctrl *ctrl)
4488 {
4489         struct nvme_ns *ns;
4490
4491         down_read(&ctrl->namespaces_rwsem);
4492
4493         /* Forcibly unquiesce queues to avoid blocking dispatch */
4494         if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
4495                 blk_mq_unquiesce_queue(ctrl->admin_q);
4496
4497         list_for_each_entry(ns, &ctrl->namespaces, list)
4498                 nvme_set_queue_dying(ns);
4499
4500         up_read(&ctrl->namespaces_rwsem);
4501 }
4502 EXPORT_SYMBOL_GPL(nvme_kill_queues);
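/*
 * Example: a transport would typically call nvme_kill_queues() once it has
 * decided the controller cannot be recovered, so that outstanding and future
 * requests fail instead of blocking forever.  Illustrative sketch only; the
 * surrounding error handling and follow-up steps are transport specific:
 *
 *	if (controller_is_unrecoverable) {
 *		nvme_kill_queues(ctrl);
 *		nvme_remove_namespaces(ctrl);
 *	}
 */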
4503
4504 void nvme_unfreeze(struct nvme_ctrl *ctrl)
4505 {
4506         struct nvme_ns *ns;
4507
4508         down_read(&ctrl->namespaces_rwsem);
4509         list_for_each_entry(ns, &ctrl->namespaces, list)
4510                 blk_mq_unfreeze_queue(ns->queue);
4511         up_read(&ctrl->namespaces_rwsem);
4512 }
4513 EXPORT_SYMBOL_GPL(nvme_unfreeze);
4514
4515 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
4516 {
4517         struct nvme_ns *ns;
4518
4519         down_read(&ctrl->namespaces_rwsem);
4520         list_for_each_entry(ns, &ctrl->namespaces, list) {
4521                 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
4522                 if (timeout <= 0)
4523                         break;
4524         }
4525         up_read(&ctrl->namespaces_rwsem);
4526 }
4527 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
4528
4529 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
4530 {
4531         struct nvme_ns *ns;
4532
4533         down_read(&ctrl->namespaces_rwsem);
4534         list_for_each_entry(ns, &ctrl->namespaces, list)
4535                 blk_mq_freeze_queue_wait(ns->queue);
4536         up_read(&ctrl->namespaces_rwsem);
4537 }
4538 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
4539
4540 void nvme_start_freeze(struct nvme_ctrl *ctrl)
4541 {
4542         struct nvme_ns *ns;
4543
4544         down_read(&ctrl->namespaces_rwsem);
4545         list_for_each_entry(ns, &ctrl->namespaces, list)
4546                 blk_freeze_queue_start(ns->queue);
4547         up_read(&ctrl->namespaces_rwsem);
4548 }
4549 EXPORT_SYMBOL_GPL(nvme_start_freeze);
4550
4551 void nvme_stop_queues(struct nvme_ctrl *ctrl)
4552 {
4553         struct nvme_ns *ns;
4554
4555         down_read(&ctrl->namespaces_rwsem);
4556         list_for_each_entry(ns, &ctrl->namespaces, list)
4557                 blk_mq_quiesce_queue(ns->queue);
4558         up_read(&ctrl->namespaces_rwsem);
4559 }
4560 EXPORT_SYMBOL_GPL(nvme_stop_queues);
4561
4562 void nvme_start_queues(struct nvme_ctrl *ctrl)
4563 {
4564         struct nvme_ns *ns;
4565
4566         down_read(&ctrl->namespaces_rwsem);
4567         list_for_each_entry(ns, &ctrl->namespaces, list)
4568                 blk_mq_unquiesce_queue(ns->queue);
4569         up_read(&ctrl->namespaces_rwsem);
4570 }
4571 EXPORT_SYMBOL_GPL(nvme_start_queues);
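/*
 * The freeze and quiesce helpers above are normally used together when a
 * transport resets a controller.  A rough sketch of the common pattern; the
 * exact ordering and whether a timeout is used are transport specific:
 *
 *	nvme_start_freeze(ctrl);
 *	nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT);
 *	nvme_stop_queues(ctrl);
 *	...tear down and re-create the I/O queues...
 *	nvme_start_queues(ctrl);
 *	nvme_wait_freeze(ctrl);
 *	nvme_unfreeze(ctrl);
 *
 * Freezing prevents new requests from entering the queues and waits for
 * in-flight ones to complete; quiescing only stops dispatch to the driver.
 */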
4572
4573
4574 void nvme_sync_queues(struct nvme_ctrl *ctrl)
4575 {
4576         struct nvme_ns *ns;
4577
4578         down_read(&ctrl->namespaces_rwsem);
4579         list_for_each_entry(ns, &ctrl->namespaces, list)
4580                 blk_sync_queue(ns->queue);
4581         up_read(&ctrl->namespaces_rwsem);
4582
4583         if (ctrl->admin_q)
4584                 blk_sync_queue(ctrl->admin_q);
4585 }
4586 EXPORT_SYMBOL_GPL(nvme_sync_queues);
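/*
 * nvme_sync_queues() is typically called with the queues already quiesced,
 * e.g. early in a reset, so that no queue timeout work is still running when
 * the transport tears down its resources.  Illustrative sketch only:
 *
 *	nvme_stop_queues(ctrl);
 *	nvme_sync_queues(ctrl);
 *	...release transport resources without racing against timeouts...
 */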
4587
4588 struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
4589 {
4590         struct nvme_ctrl *ctrl;
4591         struct file *f;
4592
4593         f = filp_open(path, O_RDWR, 0);
4594         if (IS_ERR(f))
4595                 return ERR_CAST(f);
4596
4597         if (f->f_op != &nvme_dev_fops) {
4598                 ctrl = ERR_PTR(-EINVAL);
4599                 goto out_close;
4600         }
4601
4602         ctrl = f->private_data;
4603         nvme_get_ctrl(ctrl);
4604
4605 out_close:
4606         filp_close(f, NULL);
4607         return ctrl;
4608 }
4609 EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
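/*
 * Example: this helper is intended for the NVMe target passthru code (note
 * the NVME_TARGET_PASSTHRU symbol namespace) to resolve a controller from
 * its character device path.  The returned controller carries a reference
 * that the caller must drop with nvme_put_ctrl().  Sketch only; the path
 * shown is illustrative:
 *
 *	struct nvme_ctrl *ctrl;
 *
 *	ctrl = nvme_ctrl_get_by_path("/dev/nvme0");
 *	if (IS_ERR(ctrl))
 *		return PTR_ERR(ctrl);
 *	...issue passthru commands via ctrl...
 *	nvme_put_ctrl(ctrl);
 */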
4610
4611 /*
4612  * Check that we didn't inadvertently grow the command structure sizes:
4613  */
4614 static inline void _nvme_check_size(void)
4615 {
4616         BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
4617         BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
4618         BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
4619         BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
4620         BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
4621         BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
4622         BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
4623         BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
4624         BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
4625         BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
4626         BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
4627         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
4628         BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
4629         BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
4630         BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
4631         BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
4632         BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
4633         BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
4634         BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
4635 }
4636
4637
4638 static int __init nvme_core_init(void)
4639 {
4640         int result = -ENOMEM;
4641
4642         _nvme_check_size();
4643
4644         nvme_wq = alloc_workqueue("nvme-wq",
4645                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4646         if (!nvme_wq)
4647                 goto out;
4648
4649         nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
4650                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4651         if (!nvme_reset_wq)
4652                 goto destroy_wq;
4653
4654         nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
4655                         WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
4656         if (!nvme_delete_wq)
4657                 goto destroy_reset_wq;
4658
4659         result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
4660         if (result < 0)
4661                 goto destroy_delete_wq;
4662
4663         nvme_class = class_create(THIS_MODULE, "nvme");
4664         if (IS_ERR(nvme_class)) {
4665                 result = PTR_ERR(nvme_class);
4666                 goto unregister_chrdev;
4667         }
4668         nvme_class->dev_uevent = nvme_class_uevent;
4669
4670         nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
4671         if (IS_ERR(nvme_subsys_class)) {
4672                 result = PTR_ERR(nvme_subsys_class);
4673                 goto destroy_class;
4674         }
4675         return 0;
4676
4677 destroy_class:
4678         class_destroy(nvme_class);
4679 unregister_chrdev:
4680         unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
4681 destroy_delete_wq:
4682         destroy_workqueue(nvme_delete_wq);
4683 destroy_reset_wq:
4684         destroy_workqueue(nvme_reset_wq);
4685 destroy_wq:
4686         destroy_workqueue(nvme_wq);
4687 out:
4688         return result;
4689 }
4690
4691 static void __exit nvme_core_exit(void)
4692 {
4693         class_destroy(nvme_subsys_class);
4694         class_destroy(nvme_class);
4695         unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
4696         destroy_workqueue(nvme_delete_wq);
4697         destroy_workqueue(nvme_reset_wq);
4698         destroy_workqueue(nvme_wq);
4699         ida_destroy(&nvme_instance_ida);
4700 }
4701
4702 MODULE_LICENSE("GPL");
4703 MODULE_VERSION("1.0");
4704 module_init(nvme_core_init);
4705 module_exit(nvme_core_exit);