1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/blk-mq.h>
15 #include <crypto/hash.h>
16 #include <net/busy_poll.h>
17
18 #include "nvme.h"
19 #include "fabrics.h"
20
21 struct nvme_tcp_queue;
22
23 /* Define the socket priority to use for connections where it is desirable
24  * that the NIC consider performing optimized packet processing or filtering.
25  * A non-zero value is sufficient to indicate general consideration of any
26  * possible optimization.  Making it a module param allows for alternative
27  * values that may be unique for some NIC implementations.
28  */
29 static int so_priority;
30 module_param(so_priority, int, 0644);
31 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimization priority");
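/*
 * Usage sketch (assuming the module is built as nvme-tcp): the priority can be
 * set at load time, e.g. "modprobe nvme-tcp so_priority=6", or changed later
 * through /sys/module/nvme_tcp/parameters/so_priority (mode 0644 above).
 */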
32
33 enum nvme_tcp_send_state {
34         NVME_TCP_SEND_CMD_PDU = 0,
35         NVME_TCP_SEND_H2C_PDU,
36         NVME_TCP_SEND_DATA,
37         NVME_TCP_SEND_DDGST,
38 };
39
40 struct nvme_tcp_request {
41         struct nvme_request     req;
42         void                    *pdu;
43         struct nvme_tcp_queue   *queue;
44         u32                     data_len;
45         u32                     pdu_len;
46         u32                     pdu_sent;
47         u16                     ttag;
48         struct list_head        entry;
49         struct llist_node       lentry;
50         __le32                  ddgst;
51
52         struct bio              *curr_bio;
53         struct iov_iter         iter;
54
55         /* send state */
56         size_t                  offset;
57         size_t                  data_sent;
58         enum nvme_tcp_send_state state;
59 };
60
61 enum nvme_tcp_queue_flags {
62         NVME_TCP_Q_ALLOCATED    = 0,
63         NVME_TCP_Q_LIVE         = 1,
64         NVME_TCP_Q_POLLING      = 2,
65 };
66
67 enum nvme_tcp_recv_state {
68         NVME_TCP_RECV_PDU = 0,
69         NVME_TCP_RECV_DATA,
70         NVME_TCP_RECV_DDGST,
71 };
72
73 struct nvme_tcp_ctrl;
74 struct nvme_tcp_queue {
75         struct socket           *sock;
76         struct work_struct      io_work;
77         int                     io_cpu;
78
79         struct mutex            send_mutex;
80         struct llist_head       req_list;
81         struct list_head        send_list;
82         bool                    more_requests;
83
84         /* recv state */
85         void                    *pdu;
86         int                     pdu_remaining;
87         int                     pdu_offset;
88         size_t                  data_remaining;
89         size_t                  ddgst_remaining;
90         unsigned int            nr_cqe;
91
92         /* send state */
93         struct nvme_tcp_request *request;
94
95         int                     queue_size;
96         size_t                  cmnd_capsule_len;
97         struct nvme_tcp_ctrl    *ctrl;
98         unsigned long           flags;
99         bool                    rd_enabled;
100
101         bool                    hdr_digest;
102         bool                    data_digest;
103         struct ahash_request    *rcv_hash;
104         struct ahash_request    *snd_hash;
105         __le32                  exp_ddgst;
106         __le32                  recv_ddgst;
107
108         struct page_frag_cache  pf_cache;
109
110         void (*state_change)(struct sock *);
111         void (*data_ready)(struct sock *);
112         void (*write_space)(struct sock *);
113 };
114
115 struct nvme_tcp_ctrl {
116         /* read only in the hot path */
117         struct nvme_tcp_queue   *queues;
118         struct blk_mq_tag_set   tag_set;
119
120         /* other member variables */
121         struct list_head        list;
122         struct blk_mq_tag_set   admin_tag_set;
123         struct sockaddr_storage addr;
124         struct sockaddr_storage src_addr;
125         struct nvme_ctrl        ctrl;
126
127         struct work_struct      err_work;
128         struct delayed_work     connect_work;
129         struct nvme_tcp_request async_req;
130         u32                     io_queues[HCTX_MAX_TYPES];
131 };
132
133 static LIST_HEAD(nvme_tcp_ctrl_list);
134 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
135 static struct workqueue_struct *nvme_tcp_wq;
136 static const struct blk_mq_ops nvme_tcp_mq_ops;
137 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
138 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
139
140 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
141 {
142         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
143 }
144
145 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
146 {
147         return queue - queue->ctrl->queues;
148 }
149
150 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
151 {
152         u32 queue_idx = nvme_tcp_queue_id(queue);
153
154         if (queue_idx == 0)
155                 return queue->ctrl->admin_tag_set.tags[queue_idx];
156         return queue->ctrl->tag_set.tags[queue_idx - 1];
157 }
158
159 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
160 {
161         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
162 }
163
164 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
165 {
166         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
167 }
168
169 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
170 {
171         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
172 }
173
174 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
175 {
176         return req == &req->queue->ctrl->async_req;
177 }
178
179 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
180 {
181         struct request *rq;
182
183         if (unlikely(nvme_tcp_async_req(req)))
184                 return false; /* async events don't have a request */
185
186         rq = blk_mq_rq_from_pdu(req);
187
188         return rq_data_dir(rq) == WRITE && req->data_len &&
189                 req->data_len <= nvme_tcp_inline_data_size(req->queue);
190 }
191
192 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
193 {
194         return req->iter.bvec->bv_page;
195 }
196
197 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
198 {
199         return req->iter.bvec->bv_offset + req->iter.iov_offset;
200 }
201
202 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
203 {
204         return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
205                         req->pdu_len - req->pdu_sent);
206 }
207
208 static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
209 {
210         return req->iter.iov_offset;
211 }
212
213 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
214 {
215         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
216                         req->pdu_len - req->pdu_sent : 0;
217 }
218
219 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
220                 int len)
221 {
222         return nvme_tcp_pdu_data_left(req) <= len;
223 }
224
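/*
 * Set up req->iter as a bvec iterator over the request's data in the given
 * direction: either the single special payload vector or the bvecs of the
 * current bio, starting at the bio's current offset.
 */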
225 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
226                 unsigned int dir)
227 {
228         struct request *rq = blk_mq_rq_from_pdu(req);
229         struct bio_vec *vec;
230         unsigned int size;
231         int nsegs;
232         size_t offset;
233
234         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
235                 vec = &rq->special_vec;
236                 nsegs = 1;
237                 size = blk_rq_payload_bytes(rq);
238                 offset = 0;
239         } else {
240                 struct bio *bio = req->curr_bio;
241
242                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
243                 nsegs = bio_segments(bio);
244                 size = bio->bi_iter.bi_size;
245                 offset = bio->bi_iter.bi_bvec_done;
246         }
247
248         iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
249         req->iter.iov_offset = offset;
250 }
251
252 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
253                 int len)
254 {
255         req->data_sent += len;
256         req->pdu_sent += len;
257         iov_iter_advance(&req->iter, len);
258         if (!iov_iter_count(&req->iter) &&
259             req->data_sent < req->data_len) {
260                 req->curr_bio = req->curr_bio->bi_next;
261                 nvme_tcp_init_iter(req, WRITE);
262         }
263 }
264
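/*
 * Queue a request for transmission: add it to the lockless req_list and
 * either try to send it directly (only when invoked synchronously on the
 * queue's io_cpu while the queue is otherwise idle) or kick io_work to do
 * the send from the workqueue.
 */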
265 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
266                 bool sync, bool last)
267 {
268         struct nvme_tcp_queue *queue = req->queue;
269         bool empty;
270
271         empty = llist_add(&req->lentry, &queue->req_list) &&
272                 list_empty(&queue->send_list) && !queue->request;
273
274         /*
275          * If we're the first on the send_list we can try to send
276          * directly, otherwise queue io_work. Also, only do that if we
277          * are on the same cpu, so we don't introduce contention.
278          */
279         if (queue->io_cpu == smp_processor_id() &&
280             sync && empty && mutex_trylock(&queue->send_mutex)) {
281                 queue->more_requests = !last;
282                 nvme_tcp_try_send(queue);
283                 queue->more_requests = false;
284                 mutex_unlock(&queue->send_mutex);
285         } else if (last) {
286                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
287         }
288 }
289
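/*
 * Splice everything that accumulated on the lockless req_list over to
 * send_list, from which nvme_tcp_fetch_request() picks the next request.
 */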
290 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
291 {
292         struct nvme_tcp_request *req;
293         struct llist_node *node;
294
295         for (node = llist_del_all(&queue->req_list); node; node = node->next) {
296                 req = llist_entry(node, struct nvme_tcp_request, lentry);
297                 list_add(&req->entry, &queue->send_list);
298         }
299 }
300
301 static inline struct nvme_tcp_request *
302 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
303 {
304         struct nvme_tcp_request *req;
305
306         req = list_first_entry_or_null(&queue->send_list,
307                         struct nvme_tcp_request, entry);
308         if (!req) {
309                 nvme_tcp_process_req_list(queue);
310                 req = list_first_entry_or_null(&queue->send_list,
311                                 struct nvme_tcp_request, entry);
312                 if (unlikely(!req))
313                         return NULL;
314         }
315
316         list_del(&req->entry);
317         return req;
318 }
319
320 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
321                 __le32 *dgst)
322 {
323         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
324         crypto_ahash_final(hash);
325 }
326
327 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
328                 struct page *page, off_t off, size_t len)
329 {
330         struct scatterlist sg;
331
332         sg_init_marker(&sg, 1);
333         sg_set_page(&sg, page, len, off);
334         ahash_request_set_crypt(hash, &sg, NULL, len);
335         crypto_ahash_update(hash);
336 }
337
338 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
339                 void *pdu, size_t len)
340 {
341         struct scatterlist sg;
342
343         sg_init_one(&sg, pdu, len);
344         ahash_request_set_crypt(hash, &sg, pdu + len, len);
345         crypto_ahash_digest(hash);
346 }
347
348 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
349                 void *pdu, size_t pdu_len)
350 {
351         struct nvme_tcp_hdr *hdr = pdu;
352         __le32 recv_digest;
353         __le32 exp_digest;
354
355         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
356                 dev_err(queue->ctrl->ctrl.device,
357                         "queue %d: header digest flag is cleared\n",
358                         nvme_tcp_queue_id(queue));
359                 return -EPROTO;
360         }
361
362         recv_digest = *(__le32 *)(pdu + hdr->hlen);
363         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
364         exp_digest = *(__le32 *)(pdu + hdr->hlen);
365         if (recv_digest != exp_digest) {
366                 dev_err(queue->ctrl->ctrl.device,
367                         "header digest error: recv %#x expected %#x\n",
368                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
369                 return -EIO;
370         }
371
372         return 0;
373 }
374
375 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
376 {
377         struct nvme_tcp_hdr *hdr = pdu;
378         u8 digest_len = nvme_tcp_hdgst_len(queue);
379         u32 len;
380
381         len = le32_to_cpu(hdr->plen) - hdr->hlen -
382                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
383
384         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
385                 dev_err(queue->ctrl->ctrl.device,
386                         "queue %d: data digest flag is cleared\n",
387                         nvme_tcp_queue_id(queue));
388                 return -EPROTO;
389         }
390         crypto_ahash_init(queue->rcv_hash);
391
392         return 0;
393 }
394
395 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
396                 struct request *rq, unsigned int hctx_idx)
397 {
398         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
399
400         page_frag_free(req->pdu);
401 }
402
403 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
404                 struct request *rq, unsigned int hctx_idx,
405                 unsigned int numa_node)
406 {
407         struct nvme_tcp_ctrl *ctrl = set->driver_data;
408         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
409         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
410         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
411         u8 hdgst = nvme_tcp_hdgst_len(queue);
412
413         req->pdu = page_frag_alloc(&queue->pf_cache,
414                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
415                 GFP_KERNEL | __GFP_ZERO);
416         if (!req->pdu)
417                 return -ENOMEM;
418
419         req->queue = queue;
420         nvme_req(rq)->ctrl = &ctrl->ctrl;
421
422         return 0;
423 }
424
425 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
426                 unsigned int hctx_idx)
427 {
428         struct nvme_tcp_ctrl *ctrl = data;
429         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
430
431         hctx->driver_data = queue;
432         return 0;
433 }
434
435 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
436                 unsigned int hctx_idx)
437 {
438         struct nvme_tcp_ctrl *ctrl = data;
439         struct nvme_tcp_queue *queue = &ctrl->queues[0];
440
441         hctx->driver_data = queue;
442         return 0;
443 }
444
445 static enum nvme_tcp_recv_state
446 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
447 {
448         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
449                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
450                 NVME_TCP_RECV_DATA;
451 }
452
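/*
 * Reset the receive state machine to expect a fresh PDU: a full header
 * (plus header digest if enabled) and no pending data or data digest.
 */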
453 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
454 {
455         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
456                                 nvme_tcp_hdgst_len(queue);
457         queue->pdu_offset = 0;
458         queue->data_remaining = -1;
459         queue->ddgst_remaining = 0;
460 }
461
462 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
463 {
464         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
465                 return;
466
467         queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
468 }
469
470 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
471                 struct nvme_completion *cqe)
472 {
473         struct request *rq;
474
475         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
476         if (!rq) {
477                 dev_err(queue->ctrl->ctrl.device,
478                         "queue %d tag 0x%x not found\n",
479                         nvme_tcp_queue_id(queue), cqe->command_id);
480                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
481                 return -EINVAL;
482         }
483
484         if (!nvme_end_request(rq, cqe->status, cqe->result))
485                 nvme_complete_rq(rq);
486         queue->nr_cqe++;
487
488         return 0;
489 }
490
491 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
492                 struct nvme_tcp_data_pdu *pdu)
493 {
494         struct request *rq;
495
496         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
497         if (!rq) {
498                 dev_err(queue->ctrl->ctrl.device,
499                         "queue %d tag %#x not found\n",
500                         nvme_tcp_queue_id(queue), pdu->command_id);
501                 return -ENOENT;
502         }
503
504         if (!blk_rq_payload_bytes(rq)) {
505                 dev_err(queue->ctrl->ctrl.device,
506                         "queue %d tag %#x unexpected data\n",
507                         nvme_tcp_queue_id(queue), rq->tag);
508                 return -EIO;
509         }
510
511         queue->data_remaining = le32_to_cpu(pdu->data_length);
512
513         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
514             unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
515                 dev_err(queue->ctrl->ctrl.device,
516                         "queue %d tag %#x SUCCESS set but not last PDU\n",
517                         nvme_tcp_queue_id(queue), rq->tag);
518                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
519                 return -EPROTO;
520         }
521
522         return 0;
523 }
524
525 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
526                 struct nvme_tcp_rsp_pdu *pdu)
527 {
528         struct nvme_completion *cqe = &pdu->cqe;
529         int ret = 0;
530
531         /*
532          * AEN requests are special as they don't time out and can
533          * survive any kind of queue freeze and often don't respond to
534          * aborts.  We don't even bother to allocate a struct request
535          * for them but rather special case them here.
536          */
537         if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
538                                      cqe->command_id)))
539                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
540                                 &cqe->result);
541         else
542                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
543
544         return ret;
545 }
546
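/*
 * Validate the controller's R2T against what has already been sent for this
 * request, then build the H2C data PDU header used to carry the solicited
 * data.
 */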
547 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
548                 struct nvme_tcp_r2t_pdu *pdu)
549 {
550         struct nvme_tcp_data_pdu *data = req->pdu;
551         struct nvme_tcp_queue *queue = req->queue;
552         struct request *rq = blk_mq_rq_from_pdu(req);
553         u8 hdgst = nvme_tcp_hdgst_len(queue);
554         u8 ddgst = nvme_tcp_ddgst_len(queue);
555
556         req->pdu_len = le32_to_cpu(pdu->r2t_length);
557         req->pdu_sent = 0;
558
559         if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
560                 dev_err(queue->ctrl->ctrl.device,
561                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
562                         rq->tag, req->pdu_len, req->data_len,
563                         req->data_sent);
564                 return -EPROTO;
565         }
566
567         if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
568                 dev_err(queue->ctrl->ctrl.device,
569                         "req %d unexpected r2t offset %u (expected %zu)\n",
570                         rq->tag, le32_to_cpu(pdu->r2t_offset),
571                         req->data_sent);
572                 return -EPROTO;
573         }
574
575         memset(data, 0, sizeof(*data));
576         data->hdr.type = nvme_tcp_h2c_data;
577         data->hdr.flags = NVME_TCP_F_DATA_LAST;
578         if (queue->hdr_digest)
579                 data->hdr.flags |= NVME_TCP_F_HDGST;
580         if (queue->data_digest)
581                 data->hdr.flags |= NVME_TCP_F_DDGST;
582         data->hdr.hlen = sizeof(*data);
583         data->hdr.pdo = data->hdr.hlen + hdgst;
584         data->hdr.plen =
585                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
586         data->ttag = pdu->ttag;
587         data->command_id = rq->tag;
588         data->data_offset = cpu_to_le32(req->data_sent);
589         data->data_length = cpu_to_le32(req->pdu_len);
590         return 0;
591 }
592
593 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
594                 struct nvme_tcp_r2t_pdu *pdu)
595 {
596         struct nvme_tcp_request *req;
597         struct request *rq;
598         int ret;
599
600         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
601         if (!rq) {
602                 dev_err(queue->ctrl->ctrl.device,
603                         "queue %d tag %#x not found\n",
604                         nvme_tcp_queue_id(queue), pdu->command_id);
605                 return -ENOENT;
606         }
607         req = blk_mq_rq_to_pdu(rq);
608
609         ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
610         if (unlikely(ret))
611                 return ret;
612
613         req->state = NVME_TCP_SEND_H2C_PDU;
614         req->offset = 0;
615
616         nvme_tcp_queue_request(req, false, true);
617
618         return 0;
619 }
620
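/*
 * Accumulate PDU header bytes from the skb into queue->pdu; once the header
 * is complete, verify the header digest, prime the data digest if needed and
 * dispatch on the PDU type (c2h_data, rsp or r2t).
 */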
621 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
622                 unsigned int *offset, size_t *len)
623 {
624         struct nvme_tcp_hdr *hdr;
625         char *pdu = queue->pdu;
626         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
627         int ret;
628
629         ret = skb_copy_bits(skb, *offset,
630                 &pdu[queue->pdu_offset], rcv_len);
631         if (unlikely(ret))
632                 return ret;
633
634         queue->pdu_remaining -= rcv_len;
635         queue->pdu_offset += rcv_len;
636         *offset += rcv_len;
637         *len -= rcv_len;
638         if (queue->pdu_remaining)
639                 return 0;
640
641         hdr = queue->pdu;
642         if (queue->hdr_digest) {
643                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
644                 if (unlikely(ret))
645                         return ret;
646         }
647
648
649         if (queue->data_digest) {
650                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
651                 if (unlikely(ret))
652                         return ret;
653         }
654
655         switch (hdr->type) {
656         case nvme_tcp_c2h_data:
657                 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
658         case nvme_tcp_rsp:
659                 nvme_tcp_init_recv_ctx(queue);
660                 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
661         case nvme_tcp_r2t:
662                 nvme_tcp_init_recv_ctx(queue);
663                 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
664         default:
665                 dev_err(queue->ctrl->ctrl.device,
666                         "unsupported pdu type (%d)\n", hdr->type);
667                 return -EINVAL;
668         }
669 }
670
671 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
672 {
673         union nvme_result res = {};
674
675         if (!nvme_end_request(rq, cpu_to_le16(status << 1), res))
676                 nvme_complete_rq(rq);
677 }
678
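/*
 * Copy C2H data from the skb into the request's bio pages, updating the
 * receive data digest when enabled, until this PDU's payload is consumed.
 */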
679 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
680                               unsigned int *offset, size_t *len)
681 {
682         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
683         struct nvme_tcp_request *req;
684         struct request *rq;
685
686         rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
687         if (!rq) {
688                 dev_err(queue->ctrl->ctrl.device,
689                         "queue %d tag %#x not found\n",
690                         nvme_tcp_queue_id(queue), pdu->command_id);
691                 return -ENOENT;
692         }
693         req = blk_mq_rq_to_pdu(rq);
694
695         while (true) {
696                 int recv_len, ret;
697
698                 recv_len = min_t(size_t, *len, queue->data_remaining);
699                 if (!recv_len)
700                         break;
701
702                 if (!iov_iter_count(&req->iter)) {
703                         req->curr_bio = req->curr_bio->bi_next;
704
705                         /*
706                          * If we don't have any bios it means that the controller
707                          * sent more data than we requested, hence error
708                          */
709                         if (!req->curr_bio) {
710                                 dev_err(queue->ctrl->ctrl.device,
711                                         "queue %d no space in request %#x",
712                                         nvme_tcp_queue_id(queue), rq->tag);
713                                 nvme_tcp_init_recv_ctx(queue);
714                                 return -EIO;
715                         }
716                         nvme_tcp_init_iter(req, READ);
717                 }
718
719                 /* we can read only from what is left in this bio */
720                 recv_len = min_t(size_t, recv_len,
721                                 iov_iter_count(&req->iter));
722
723                 if (queue->data_digest)
724                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
725                                 &req->iter, recv_len, queue->rcv_hash);
726                 else
727                         ret = skb_copy_datagram_iter(skb, *offset,
728                                         &req->iter, recv_len);
729                 if (ret) {
730                         dev_err(queue->ctrl->ctrl.device,
731                                 "queue %d failed to copy request %#x data",
732                                 nvme_tcp_queue_id(queue), rq->tag);
733                         return ret;
734                 }
735
736                 *len -= recv_len;
737                 *offset += recv_len;
738                 queue->data_remaining -= recv_len;
739         }
740
741         if (!queue->data_remaining) {
742                 if (queue->data_digest) {
743                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
744                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
745                 } else {
746                         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
747                                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
748                                 queue->nr_cqe++;
749                         }
750                         nvme_tcp_init_recv_ctx(queue);
751                 }
752         }
753
754         return 0;
755 }
756
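/*
 * Receive the data digest that trails a C2H data PDU and compare it against
 * the digest calculated over the received data.
 */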
757 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
758                 struct sk_buff *skb, unsigned int *offset, size_t *len)
759 {
760         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
761         char *ddgst = (char *)&queue->recv_ddgst;
762         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
763         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
764         int ret;
765
766         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
767         if (unlikely(ret))
768                 return ret;
769
770         queue->ddgst_remaining -= recv_len;
771         *offset += recv_len;
772         *len -= recv_len;
773         if (queue->ddgst_remaining)
774                 return 0;
775
776         if (queue->recv_ddgst != queue->exp_ddgst) {
777                 dev_err(queue->ctrl->ctrl.device,
778                         "data digest error: recv %#x expected %#x\n",
779                         le32_to_cpu(queue->recv_ddgst),
780                         le32_to_cpu(queue->exp_ddgst));
781                 return -EIO;
782         }
783
784         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
785                 struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
786                                                 pdu->command_id);
787
788                 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
789                 queue->nr_cqe++;
790         }
791
792         nvme_tcp_init_recv_ctx(queue);
793         return 0;
794 }
795
796 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
797                              unsigned int offset, size_t len)
798 {
799         struct nvme_tcp_queue *queue = desc->arg.data;
800         size_t consumed = len;
801         int result;
802
803         while (len) {
804                 switch (nvme_tcp_recv_state(queue)) {
805                 case NVME_TCP_RECV_PDU:
806                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
807                         break;
808                 case NVME_TCP_RECV_DATA:
809                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
810                         break;
811                 case NVME_TCP_RECV_DDGST:
812                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
813                         break;
814                 default:
815                         result = -EFAULT;
816                 }
817                 if (result) {
818                         dev_err(queue->ctrl->ctrl.device,
819                                 "receive failed: %d\n", result);
820                         queue->rd_enabled = false;
821                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
822                         return result;
823                 }
824         }
825
826         return consumed;
827 }
828
829 static void nvme_tcp_data_ready(struct sock *sk)
830 {
831         struct nvme_tcp_queue *queue;
832
833         read_lock_bh(&sk->sk_callback_lock);
834         queue = sk->sk_user_data;
835         if (likely(queue && queue->rd_enabled) &&
836             !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
837                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
838         read_unlock_bh(&sk->sk_callback_lock);
839 }
840
841 static void nvme_tcp_write_space(struct sock *sk)
842 {
843         struct nvme_tcp_queue *queue;
844
845         read_lock_bh(&sk->sk_callback_lock);
846         queue = sk->sk_user_data;
847         if (likely(queue && sk_stream_is_writeable(sk))) {
848                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
849                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
850         }
851         read_unlock_bh(&sk->sk_callback_lock);
852 }
853
854 static void nvme_tcp_state_change(struct sock *sk)
855 {
856         struct nvme_tcp_queue *queue;
857
858         read_lock(&sk->sk_callback_lock);
859         queue = sk->sk_user_data;
860         if (!queue)
861                 goto done;
862
863         switch (sk->sk_state) {
864         case TCP_CLOSE:
865         case TCP_CLOSE_WAIT:
866         case TCP_LAST_ACK:
867         case TCP_FIN_WAIT1:
868         case TCP_FIN_WAIT2:
869
870                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
871                 break;
872         default:
873                 dev_info(queue->ctrl->ctrl.device,
874                         "queue %d socket state %d\n",
875                         nvme_tcp_queue_id(queue), sk->sk_state);
876         }
877
878         queue->state_change(sk);
879 done:
880         read_unlock(&sk->sk_callback_lock);
881 }
882
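/*
 * True when more requests are (or will be) queued for sending, which lets
 * the senders keep MSG_MORE/MSG_SENDPAGE_NOTLAST set instead of MSG_EOR.
 */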
883 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
884 {
885         return !list_empty(&queue->send_list) ||
886                 !llist_empty(&queue->req_list) || queue->more_requests;
887 }
888
889 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
890 {
891         queue->request = NULL;
892 }
893
894 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
895 {
896         nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
897 }
898
899 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
900 {
901         struct nvme_tcp_queue *queue = req->queue;
902
903         while (true) {
904                 struct page *page = nvme_tcp_req_cur_page(req);
905                 size_t offset = nvme_tcp_req_cur_offset(req);
906                 size_t len = nvme_tcp_req_cur_length(req);
907                 bool last = nvme_tcp_pdu_last_send(req, len);
908                 int ret, flags = MSG_DONTWAIT;
909
910                 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
911                         flags |= MSG_EOR;
912                 else
913                         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
914
915                 /* can't zcopy slab pages */
916                 if (unlikely(PageSlab(page))) {
917                         ret = sock_no_sendpage(queue->sock, page, offset, len,
918                                         flags);
919                 } else {
920                         ret = kernel_sendpage(queue->sock, page, offset, len,
921                                         flags);
922                 }
923                 if (ret <= 0)
924                         return ret;
925
926                 nvme_tcp_advance_req(req, ret);
927                 if (queue->data_digest)
928                         nvme_tcp_ddgst_update(queue->snd_hash, page,
929                                         offset, ret);
930
931                 /* fully successful last write */
932                 if (last && ret == len) {
933                         if (queue->data_digest) {
934                                 nvme_tcp_ddgst_final(queue->snd_hash,
935                                         &req->ddgst);
936                                 req->state = NVME_TCP_SEND_DDGST;
937                                 req->offset = 0;
938                         } else {
939                                 nvme_tcp_done_send_req(queue);
940                         }
941                         return 1;
942                 }
943         }
944         return -EAGAIN;
945 }
946
947 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
948 {
949         struct nvme_tcp_queue *queue = req->queue;
950         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
951         bool inline_data = nvme_tcp_has_inline_data(req);
952         u8 hdgst = nvme_tcp_hdgst_len(queue);
953         int len = sizeof(*pdu) + hdgst - req->offset;
954         int flags = MSG_DONTWAIT;
955         int ret;
956
957         if (inline_data || nvme_tcp_queue_more(queue))
958                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
959         else
960                 flags |= MSG_EOR;
961
962         if (queue->hdr_digest && !req->offset)
963                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
964
965         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
966                         offset_in_page(pdu) + req->offset, len, flags);
967         if (unlikely(ret <= 0))
968                 return ret;
969
970         len -= ret;
971         if (!len) {
972                 if (inline_data) {
973                         req->state = NVME_TCP_SEND_DATA;
974                         if (queue->data_digest)
975                                 crypto_ahash_init(queue->snd_hash);
976                         nvme_tcp_init_iter(req, WRITE);
977                 } else {
978                         nvme_tcp_done_send_req(queue);
979                 }
980                 return 1;
981         }
982         req->offset += ret;
983
984         return -EAGAIN;
985 }
986
987 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
988 {
989         struct nvme_tcp_queue *queue = req->queue;
990         struct nvme_tcp_data_pdu *pdu = req->pdu;
991         u8 hdgst = nvme_tcp_hdgst_len(queue);
992         int len = sizeof(*pdu) - req->offset + hdgst;
993         int ret;
994
995         if (queue->hdr_digest && !req->offset)
996                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
997
998         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
999                         offset_in_page(pdu) + req->offset, len,
1000                         MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1001         if (unlikely(ret <= 0))
1002                 return ret;
1003
1004         len -= ret;
1005         if (!len) {
1006                 req->state = NVME_TCP_SEND_DATA;
1007                 if (queue->data_digest)
1008                         crypto_ahash_init(queue->snd_hash);
1009                 if (!req->data_sent)
1010                         nvme_tcp_init_iter(req, WRITE);
1011                 return 1;
1012         }
1013         req->offset += ret;
1014
1015         return -EAGAIN;
1016 }
1017
1018 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1019 {
1020         struct nvme_tcp_queue *queue = req->queue;
1021         int ret;
1022         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1023         struct kvec iov = {
1024                 .iov_base = &req->ddgst + req->offset,
1025                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1026         };
1027
1028         if (nvme_tcp_queue_more(queue))
1029                 msg.msg_flags |= MSG_MORE;
1030         else
1031                 msg.msg_flags |= MSG_EOR;
1032
1033         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1034         if (unlikely(ret <= 0))
1035                 return ret;
1036
1037         if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1038                 nvme_tcp_done_send_req(queue);
1039                 return 1;
1040         }
1041
1042         req->offset += ret;
1043         return -EAGAIN;
1044 }
1045
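/*
 * Drive the current request through its send state machine (command PDU,
 * H2C data PDU, data, data digest).  Returns >0 if progress was made, 0 if
 * there is nothing to send or the socket is full, and a negative errno on a
 * send failure.
 */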
1046 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1047 {
1048         struct nvme_tcp_request *req;
1049         int ret = 1;
1050
1051         if (!queue->request) {
1052                 queue->request = nvme_tcp_fetch_request(queue);
1053                 if (!queue->request)
1054                         return 0;
1055         }
1056         req = queue->request;
1057
1058         if (req->state == NVME_TCP_SEND_CMD_PDU) {
1059                 ret = nvme_tcp_try_send_cmd_pdu(req);
1060                 if (ret <= 0)
1061                         goto done;
1062                 if (!nvme_tcp_has_inline_data(req))
1063                         return ret;
1064         }
1065
1066         if (req->state == NVME_TCP_SEND_H2C_PDU) {
1067                 ret = nvme_tcp_try_send_data_pdu(req);
1068                 if (ret <= 0)
1069                         goto done;
1070         }
1071
1072         if (req->state == NVME_TCP_SEND_DATA) {
1073                 ret = nvme_tcp_try_send_data(req);
1074                 if (ret <= 0)
1075                         goto done;
1076         }
1077
1078         if (req->state == NVME_TCP_SEND_DDGST)
1079                 ret = nvme_tcp_try_send_ddgst(req);
1080 done:
1081         if (ret == -EAGAIN) {
1082                 ret = 0;
1083         } else if (ret < 0) {
1084                 dev_err(queue->ctrl->ctrl.device,
1085                         "failed to send request %d\n", ret);
1086                 if (ret != -EPIPE && ret != -ECONNRESET)
1087                         nvme_tcp_fail_request(queue->request);
1088                 nvme_tcp_done_send_req(queue);
1089         }
1090         return ret;
1091 }
1092
1093 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1094 {
1095         struct socket *sock = queue->sock;
1096         struct sock *sk = sock->sk;
1097         read_descriptor_t rd_desc;
1098         int consumed;
1099
1100         rd_desc.arg.data = queue;
1101         rd_desc.count = 1;
1102         lock_sock(sk);
1103         queue->nr_cqe = 0;
1104         consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1105         release_sock(sk);
1106         return consumed;
1107 }
1108
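/*
 * Per-queue worker: alternate between sending queued requests and receiving
 * from the socket for roughly one millisecond, then reschedule itself if
 * there is still work pending when the time quota runs out.
 */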
1109 static void nvme_tcp_io_work(struct work_struct *w)
1110 {
1111         struct nvme_tcp_queue *queue =
1112                 container_of(w, struct nvme_tcp_queue, io_work);
1113         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1114
1115         do {
1116                 bool pending = false;
1117                 int result;
1118
1119                 if (mutex_trylock(&queue->send_mutex)) {
1120                         result = nvme_tcp_try_send(queue);
1121                         mutex_unlock(&queue->send_mutex);
1122                         if (result > 0)
1123                                 pending = true;
1124                         else if (unlikely(result < 0))
1125                                 break;
1126                 }
1127
1128                 result = nvme_tcp_try_recv(queue);
1129                 if (result > 0)
1130                         pending = true;
1131                 else if (unlikely(result < 0))
1132                         return;
1133
1134                 if (!pending)
1135                         return;
1136
1137         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1138
1139         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1140 }
1141
1142 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1143 {
1144         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1145
1146         ahash_request_free(queue->rcv_hash);
1147         ahash_request_free(queue->snd_hash);
1148         crypto_free_ahash(tfm);
1149 }
1150
1151 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1152 {
1153         struct crypto_ahash *tfm;
1154
1155         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1156         if (IS_ERR(tfm))
1157                 return PTR_ERR(tfm);
1158
1159         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1160         if (!queue->snd_hash)
1161                 goto free_tfm;
1162         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1163
1164         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1165         if (!queue->rcv_hash)
1166                 goto free_snd_hash;
1167         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1168
1169         return 0;
1170 free_snd_hash:
1171         ahash_request_free(queue->snd_hash);
1172 free_tfm:
1173         crypto_free_ahash(tfm);
1174         return -ENOMEM;
1175 }
1176
1177 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1178 {
1179         struct nvme_tcp_request *async = &ctrl->async_req;
1180
1181         page_frag_free(async->pdu);
1182 }
1183
1184 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1185 {
1186         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1187         struct nvme_tcp_request *async = &ctrl->async_req;
1188         u8 hdgst = nvme_tcp_hdgst_len(queue);
1189
1190         async->pdu = page_frag_alloc(&queue->pf_cache,
1191                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1192                 GFP_KERNEL | __GFP_ZERO);
1193         if (!async->pdu)
1194                 return -ENOMEM;
1195
1196         async->queue = &ctrl->queues[0];
1197         return 0;
1198 }
1199
1200 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1201 {
1202         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1203         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1204
1205         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1206                 return;
1207
1208         if (queue->hdr_digest || queue->data_digest)
1209                 nvme_tcp_free_crypto(queue);
1210
1211         sock_release(queue->sock);
1212         kfree(queue->pdu);
1213 }
1214
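/*
 * NVMe/TCP connection initialization: send an ICReq PDU and validate the
 * controller's ICResp (type, PDU length, PFV, digest settings and CPDA)
 * against what the host requested.
 */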
1215 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1216 {
1217         struct nvme_tcp_icreq_pdu *icreq;
1218         struct nvme_tcp_icresp_pdu *icresp;
1219         struct msghdr msg = {};
1220         struct kvec iov;
1221         bool ctrl_hdgst, ctrl_ddgst;
1222         int ret;
1223
1224         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1225         if (!icreq)
1226                 return -ENOMEM;
1227
1228         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1229         if (!icresp) {
1230                 ret = -ENOMEM;
1231                 goto free_icreq;
1232         }
1233
1234         icreq->hdr.type = nvme_tcp_icreq;
1235         icreq->hdr.hlen = sizeof(*icreq);
1236         icreq->hdr.pdo = 0;
1237         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1238         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1239         icreq->maxr2t = 0; /* single inflight r2t supported */
1240         icreq->hpda = 0; /* no alignment constraint */
1241         if (queue->hdr_digest)
1242                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1243         if (queue->data_digest)
1244                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1245
1246         iov.iov_base = icreq;
1247         iov.iov_len = sizeof(*icreq);
1248         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1249         if (ret < 0)
1250                 goto free_icresp;
1251
1252         memset(&msg, 0, sizeof(msg));
1253         iov.iov_base = icresp;
1254         iov.iov_len = sizeof(*icresp);
1255         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1256                         iov.iov_len, msg.msg_flags);
1257         if (ret < 0)
1258                 goto free_icresp;
1259
1260         ret = -EINVAL;
1261         if (icresp->hdr.type != nvme_tcp_icresp) {
1262                 pr_err("queue %d: bad type returned %d\n",
1263                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1264                 goto free_icresp;
1265         }
1266
1267         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1268                 pr_err("queue %d: bad pdu length returned %d\n",
1269                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1270                 goto free_icresp;
1271         }
1272
1273         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1274                 pr_err("queue %d: bad pfv returned %d\n",
1275                         nvme_tcp_queue_id(queue), icresp->pfv);
1276                 goto free_icresp;
1277         }
1278
1279         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1280         if ((queue->data_digest && !ctrl_ddgst) ||
1281             (!queue->data_digest && ctrl_ddgst)) {
1282                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1283                         nvme_tcp_queue_id(queue),
1284                         queue->data_digest ? "enabled" : "disabled",
1285                         ctrl_ddgst ? "enabled" : "disabled");
1286                 goto free_icresp;
1287         }
1288
1289         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1290         if ((queue->hdr_digest && !ctrl_hdgst) ||
1291             (!queue->hdr_digest && ctrl_hdgst)) {
1292                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1293                         nvme_tcp_queue_id(queue),
1294                         queue->hdr_digest ? "enabled" : "disabled",
1295                         ctrl_hdgst ? "enabled" : "disabled");
1296                 goto free_icresp;
1297         }
1298
1299         if (icresp->cpda != 0) {
1300                 pr_err("queue %d: unsupported cpda returned %d\n",
1301                         nvme_tcp_queue_id(queue), icresp->cpda);
1302                 goto free_icresp;
1303         }
1304
1305         ret = 0;
1306 free_icresp:
1307         kfree(icresp);
1308 free_icreq:
1309         kfree(icreq);
1310         return ret;
1311 }
1312
1313 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1314 {
1315         return nvme_tcp_queue_id(queue) == 0;
1316 }
1317
1318 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1319 {
1320         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1321         int qid = nvme_tcp_queue_id(queue);
1322
1323         return !nvme_tcp_admin_queue(queue) &&
1324                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1325 }
1326
1327 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1328 {
1329         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1330         int qid = nvme_tcp_queue_id(queue);
1331
1332         return !nvme_tcp_admin_queue(queue) &&
1333                 !nvme_tcp_default_queue(queue) &&
1334                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1335                           ctrl->io_queues[HCTX_TYPE_READ];
1336 }
1337
1338 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1339 {
1340         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1341         int qid = nvme_tcp_queue_id(queue);
1342
1343         return !nvme_tcp_admin_queue(queue) &&
1344                 !nvme_tcp_default_queue(queue) &&
1345                 !nvme_tcp_read_queue(queue) &&
1346                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1347                           ctrl->io_queues[HCTX_TYPE_READ] +
1348                           ctrl->io_queues[HCTX_TYPE_POLL];
1349 }
1350
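/*
 * Pin the queue's io_work to a cpu: default, read and poll queues are each
 * spread across the online cpus according to their index within their class.
 */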
1351 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1352 {
1353         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1354         int qid = nvme_tcp_queue_id(queue);
1355         int n = 0;
1356
1357         if (nvme_tcp_default_queue(queue))
1358                 n = qid - 1;
1359         else if (nvme_tcp_read_queue(queue))
1360                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1361         else if (nvme_tcp_poll_queue(queue))
1362                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1363                                 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1364         queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1365 }
1366
1367 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1368                 int qid, size_t queue_size)
1369 {
1370         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1371         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1372         int ret, rcv_pdu_size;
1373
1374         queue->ctrl = ctrl;
1375         init_llist_head(&queue->req_list);
1376         INIT_LIST_HEAD(&queue->send_list);
1377         mutex_init(&queue->send_mutex);
1378         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1379         queue->queue_size = queue_size;
1380
1381         if (qid > 0)
1382                 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1383         else
1384                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1385                                                 NVME_TCP_ADMIN_CCSZ;
1386
1387         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1388                         IPPROTO_TCP, &queue->sock);
1389         if (ret) {
1390                 dev_err(nctrl->device,
1391                         "failed to create socket: %d\n", ret);
1392                 return ret;
1393         }
1394
1395         /* Single SYN retry */
1396         tcp_sock_set_syncnt(queue->sock->sk, 1);
1397
1398         /* Set TCP no delay */
1399         tcp_sock_set_nodelay(queue->sock->sk);
1400
1401         /*
1402          * Cleanup whatever is sitting in the TCP transmit queue on socket
1403          * close. This is done to prevent stale data from being sent should
1404          * the network connection be restored before TCP times out.
1405          */
1406         sock_no_linger(queue->sock->sk);
1407
1408         if (so_priority > 0)
1409                 sock_set_priority(queue->sock->sk, so_priority);
1410
1411         /* Set socket type of service */
1412         if (nctrl->opts->tos >= 0)
1413                 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1414
1415         /* Set a 10 second timeout for icresp recvmsg */
1416         queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1417
1418         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1419         nvme_tcp_set_queue_io_cpu(queue);
1420         queue->request = NULL;
1421         queue->data_remaining = 0;
1422         queue->ddgst_remaining = 0;
1423         queue->pdu_remaining = 0;
1424         queue->pdu_offset = 0;
1425         sk_set_memalloc(queue->sock->sk);
1426
1427         if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1428                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1429                         sizeof(ctrl->src_addr));
1430                 if (ret) {
1431                         dev_err(nctrl->device,
1432                                 "failed to bind queue %d socket %d\n",
1433                                 qid, ret);
1434                         goto err_sock;
1435                 }
1436         }
1437
1438         queue->hdr_digest = nctrl->opts->hdr_digest;
1439         queue->data_digest = nctrl->opts->data_digest;
1440         if (queue->hdr_digest || queue->data_digest) {
1441                 ret = nvme_tcp_alloc_crypto(queue);
1442                 if (ret) {
1443                         dev_err(nctrl->device,
1444                                 "failed to allocate queue %d crypto\n", qid);
1445                         goto err_sock;
1446                 }
1447         }
1448
1449         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1450                         nvme_tcp_hdgst_len(queue);
1451         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1452         if (!queue->pdu) {
1453                 ret = -ENOMEM;
1454                 goto err_crypto;
1455         }
1456
1457         dev_dbg(nctrl->device, "connecting queue %d\n",
1458                         nvme_tcp_queue_id(queue));
1459
1460         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1461                 sizeof(ctrl->addr), 0);
1462         if (ret) {
1463                 dev_err(nctrl->device,
1464                         "failed to connect socket: %d\n", ret);
1465                 goto err_rcv_pdu;
1466         }
1467
1468         ret = nvme_tcp_init_connection(queue);
1469         if (ret)
1470                 goto err_init_connect;
1471
1472         queue->rd_enabled = true;
1473         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1474         nvme_tcp_init_recv_ctx(queue);
1475
1476         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1477         queue->sock->sk->sk_user_data = queue;
1478         queue->state_change = queue->sock->sk->sk_state_change;
1479         queue->data_ready = queue->sock->sk->sk_data_ready;
1480         queue->write_space = queue->sock->sk->sk_write_space;
1481         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1482         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1483         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1484 #ifdef CONFIG_NET_RX_BUSY_POLL
1485         queue->sock->sk->sk_ll_usec = 1;
1486 #endif
1487         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1488
1489         return 0;
1490
1491 err_init_connect:
1492         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1493 err_rcv_pdu:
1494         kfree(queue->pdu);
1495 err_crypto:
1496         if (queue->hdr_digest || queue->data_digest)
1497                 nvme_tcp_free_crypto(queue);
1498 err_sock:
1499         sock_release(queue->sock);
1500         queue->sock = NULL;
1501         return ret;
1502 }
1503
1504 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1505 {
1506         struct socket *sock = queue->sock;
1507
1508         write_lock_bh(&sock->sk->sk_callback_lock);
1509         sock->sk->sk_user_data  = NULL;
1510         sock->sk->sk_data_ready = queue->data_ready;
1511         sock->sk->sk_state_change = queue->state_change;
1512         sock->sk->sk_write_space  = queue->write_space;
1513         write_unlock_bh(&sock->sk->sk_callback_lock);
1514 }
1515
1516 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1517 {
1518         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1519         nvme_tcp_restore_sock_calls(queue);
1520         cancel_work_sync(&queue->io_work);
1521 }
1522
1523 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1524 {
1525         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1526         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1527
1528         if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1529                 return;
1530
1531         __nvme_tcp_stop_queue(queue);
1532 }
1533
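     /*
      * Issue the fabrics connect command for the admin queue (idx == 0) or an
      * I/O queue and mark the queue LIVE on success.  On failure an allocated
      * queue is stopped again so later teardown never touches a half-connected
      * socket.
      */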
1534 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1535 {
1536         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1537         int ret;
1538
1539         if (idx)
1540                 ret = nvmf_connect_io_queue(nctrl, idx, false);
1541         else
1542                 ret = nvmf_connect_admin_queue(nctrl);
1543
1544         if (!ret) {
1545                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1546         } else {
1547                 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1548                         __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1549                 dev_err(nctrl->device,
1550                         "failed to connect queue: %d ret=%d\n", idx, ret);
1551         }
1552         return ret;
1553 }
1554
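     /*
      * Allocate the blk-mq tag set for either the admin or the I/O queues.
      * The admin set uses a single hw queue with tags reserved for the fabrics
      * connect and keep-alive commands; the I/O set sizes its hw queues from
      * the controller queue count and adds extra maps when poll queues were
      * requested.
      */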
1555 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1556                 bool admin)
1557 {
1558         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1559         struct blk_mq_tag_set *set;
1560         int ret;
1561
1562         if (admin) {
1563                 set = &ctrl->admin_tag_set;
1564                 memset(set, 0, sizeof(*set));
1565                 set->ops = &nvme_tcp_admin_mq_ops;
1566                 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1567                 set->reserved_tags = 2; /* connect + keep-alive */
1568                 set->numa_node = nctrl->numa_node;
1569                 set->flags = BLK_MQ_F_BLOCKING;
1570                 set->cmd_size = sizeof(struct nvme_tcp_request);
1571                 set->driver_data = ctrl;
1572                 set->nr_hw_queues = 1;
1573                 set->timeout = ADMIN_TIMEOUT;
1574         } else {
1575                 set = &ctrl->tag_set;
1576                 memset(set, 0, sizeof(*set));
1577                 set->ops = &nvme_tcp_mq_ops;
1578                 set->queue_depth = nctrl->sqsize + 1;
1579                 set->reserved_tags = 1; /* fabric connect */
1580                 set->numa_node = nctrl->numa_node;
1581                 set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1582                 set->cmd_size = sizeof(struct nvme_tcp_request);
1583                 set->driver_data = ctrl;
1584                 set->nr_hw_queues = nctrl->queue_count - 1;
1585                 set->timeout = NVME_IO_TIMEOUT;
1586                 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1587         }
1588
1589         ret = blk_mq_alloc_tag_set(set);
1590         if (ret)
1591                 return ERR_PTR(ret);
1592
1593         return set;
1594 }
1595
1596 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1597 {
1598         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1599                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1600                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1601         }
1602
1603         nvme_tcp_free_queue(ctrl, 0);
1604 }
1605
1606 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1607 {
1608         int i;
1609
1610         for (i = 1; i < ctrl->queue_count; i++)
1611                 nvme_tcp_free_queue(ctrl, i);
1612 }
1613
1614 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1615 {
1616         int i;
1617
1618         for (i = 1; i < ctrl->queue_count; i++)
1619                 nvme_tcp_stop_queue(ctrl, i);
1620 }
1621
1622 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1623 {
1624         int i, ret = 0;
1625
1626         for (i = 1; i < ctrl->queue_count; i++) {
1627                 ret = nvme_tcp_start_queue(ctrl, i);
1628                 if (ret)
1629                         goto out_stop_queues;
1630         }
1631
1632         return 0;
1633
1634 out_stop_queues:
1635         for (i--; i >= 1; i--)
1636                 nvme_tcp_stop_queue(ctrl, i);
1637         return ret;
1638 }
1639
1640 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1641 {
1642         int ret;
1643
1644         ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1645         if (ret)
1646                 return ret;
1647
1648         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1649         if (ret)
1650                 goto out_free_queue;
1651
1652         return 0;
1653
1654 out_free_queue:
1655         nvme_tcp_free_queue(ctrl, 0);
1656         return ret;
1657 }
1658
1659 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1660 {
1661         int i, ret;
1662
1663         for (i = 1; i < ctrl->queue_count; i++) {
1664                 ret = nvme_tcp_alloc_queue(ctrl, i,
1665                                 ctrl->sqsize + 1);
1666                 if (ret)
1667                         goto out_free_queues;
1668         }
1669
1670         return 0;
1671
1672 out_free_queues:
1673         for (i--; i >= 1; i--)
1674                 nvme_tcp_free_queue(ctrl, i);
1675
1676         return ret;
1677 }
1678
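     /*
      * Number of I/O queues to request from the controller: each requested
      * class (io, write and poll queues) is individually capped at the number
      * of online CPUs.
      */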
1679 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1680 {
1681         unsigned int nr_io_queues;
1682
1683         nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1684         nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1685         nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1686
1687         return nr_io_queues;
1688 }
1689
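     /*
      * Split the granted I/O queues between the HCTX types.  For example,
      * with nr_io_queues=4, nr_write_queues=2, nr_poll_queues=1 and 7 granted
      * queues this yields 4 read, 2 default and 1 poll queue; when there are
      * not enough queues for a dedicated split, reads and writes share the
      * default queues.
      */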
1690 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1691                 unsigned int nr_io_queues)
1692 {
1693         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1694         struct nvmf_ctrl_options *opts = nctrl->opts;
1695
1696         if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1697                 /*
1698                  * Separate read/write queues: hand out dedicated
1699                  * default queues only after we have sufficient
1700                  * read queues.
1701                  */
1702                 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1703                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1704                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1705                         min(opts->nr_write_queues, nr_io_queues);
1706                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1707         } else {
1708                 /*
1709                  * Shared read/write queues: either no write queues were
1710                  * requested, or we don't have sufficient queue count to
1711                  * have dedicated default queues.
1712                  */
1713                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1714                         min(opts->nr_io_queues, nr_io_queues);
1715                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1716         }
1717
1718         if (opts->nr_poll_queues && nr_io_queues) {
1719                 /* map dedicated poll queues only if we have queues left */
1720                 ctrl->io_queues[HCTX_TYPE_POLL] =
1721                         min(opts->nr_poll_queues, nr_io_queues);
1722         }
1723 }
1724
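     /*
      * Negotiate the I/O queue count with the controller, record the split
      * between default/read/poll queues and allocate the per-queue resources
      * (queue 0, the admin queue, is handled separately).
      */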
1725 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1726 {
1727         unsigned int nr_io_queues;
1728         int ret;
1729
1730         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1731         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1732         if (ret)
1733                 return ret;
1734
1735         ctrl->queue_count = nr_io_queues + 1;
1736         if (ctrl->queue_count < 2)
1737                 return 0;
1738
1739         dev_info(ctrl->device,
1740                 "creating %d I/O queues.\n", nr_io_queues);
1741
1742         nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1743
1744         return __nvme_tcp_alloc_io_queues(ctrl);
1745 }
1746
1747 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1748 {
1749         nvme_tcp_stop_io_queues(ctrl);
1750         if (remove) {
1751                 blk_cleanup_queue(ctrl->connect_q);
1752                 blk_mq_free_tag_set(ctrl->tagset);
1753         }
1754         nvme_tcp_free_io_queues(ctrl);
1755 }
1756
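     /*
      * Allocate and start all I/O queues.  For a new controller this also
      * creates the I/O tag set and the connect_q request queue; on a
      * reset/reconnect the existing tag set is resized to the (possibly
      * changed) hw queue count while the namespace queues are frozen.
      */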
1757 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1758 {
1759         int ret;
1760
1761         ret = nvme_tcp_alloc_io_queues(ctrl);
1762         if (ret)
1763                 return ret;
1764
1765         if (new) {
1766                 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1767                 if (IS_ERR(ctrl->tagset)) {
1768                         ret = PTR_ERR(ctrl->tagset);
1769                         goto out_free_io_queues;
1770                 }
1771
1772                 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1773                 if (IS_ERR(ctrl->connect_q)) {
1774                         ret = PTR_ERR(ctrl->connect_q);
1775                         goto out_free_tag_set;
1776                 }
1777         }
1778
1779         ret = nvme_tcp_start_io_queues(ctrl);
1780         if (ret)
1781                 goto out_cleanup_connect_q;
1782
1783         if (!new) {
1784                 nvme_start_queues(ctrl);
1785                 nvme_wait_freeze(ctrl);
1786                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1787                         ctrl->queue_count - 1);
1788                 nvme_unfreeze(ctrl);
1789         }
1790
1791         return 0;
1792
1793 out_cleanup_connect_q:
1794         if (new)
1795                 blk_cleanup_queue(ctrl->connect_q);
1796 out_free_tag_set:
1797         if (new)
1798                 blk_mq_free_tag_set(ctrl->tagset);
1799 out_free_io_queues:
1800         nvme_tcp_free_io_queues(ctrl);
1801         return ret;
1802 }
1803
1804 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1805 {
1806         nvme_tcp_stop_queue(ctrl, 0);
1807         if (remove) {
1808                 blk_cleanup_queue(ctrl->admin_q);
1809                 blk_cleanup_queue(ctrl->fabrics_q);
1810                 blk_mq_free_tag_set(ctrl->admin_tagset);
1811         }
1812         nvme_tcp_free_admin_queue(ctrl);
1813 }
1814
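     /*
      * Bring up the admin queue: allocate it, create the admin tag set and
      * the fabrics/admin request queues for a new controller, connect queue 0,
      * enable the controller and run the identify sequence.
      */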
1815 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1816 {
1817         int error;
1818
1819         error = nvme_tcp_alloc_admin_queue(ctrl);
1820         if (error)
1821                 return error;
1822
1823         if (new) {
1824                 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1825                 if (IS_ERR(ctrl->admin_tagset)) {
1826                         error = PTR_ERR(ctrl->admin_tagset);
1827                         goto out_free_queue;
1828                 }
1829
1830                 ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1831                 if (IS_ERR(ctrl->fabrics_q)) {
1832                         error = PTR_ERR(ctrl->fabrics_q);
1833                         goto out_free_tagset;
1834                 }
1835
1836                 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1837                 if (IS_ERR(ctrl->admin_q)) {
1838                         error = PTR_ERR(ctrl->admin_q);
1839                         goto out_cleanup_fabrics_q;
1840                 }
1841         }
1842
1843         error = nvme_tcp_start_queue(ctrl, 0);
1844         if (error)
1845                 goto out_cleanup_queue;
1846
1847         error = nvme_enable_ctrl(ctrl);
1848         if (error)
1849                 goto out_stop_queue;
1850
1851         blk_mq_unquiesce_queue(ctrl->admin_q);
1852
1853         error = nvme_init_identify(ctrl);
1854         if (error)
1855                 goto out_stop_queue;
1856
1857         return 0;
1858
1859 out_stop_queue:
1860         nvme_tcp_stop_queue(ctrl, 0);
1861 out_cleanup_queue:
1862         if (new)
1863                 blk_cleanup_queue(ctrl->admin_q);
1864 out_cleanup_fabrics_q:
1865         if (new)
1866                 blk_cleanup_queue(ctrl->fabrics_q);
1867 out_free_tagset:
1868         if (new)
1869                 blk_mq_free_tag_set(ctrl->admin_tagset);
1870 out_free_queue:
1871         nvme_tcp_free_admin_queue(ctrl);
1872         return error;
1873 }
1874
1875 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1876                 bool remove)
1877 {
1878         blk_mq_quiesce_queue(ctrl->admin_q);
1879         nvme_tcp_stop_queue(ctrl, 0);
1880         if (ctrl->admin_tagset) {
1881                 blk_mq_tagset_busy_iter(ctrl->admin_tagset,
1882                         nvme_cancel_request, ctrl);
1883                 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
1884         }
1885         if (remove)
1886                 blk_mq_unquiesce_queue(ctrl->admin_q);
1887         nvme_tcp_destroy_admin_queue(ctrl, remove);
1888 }
1889
1890 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1891                 bool remove)
1892 {
1893         if (ctrl->queue_count <= 1)
1894                 return;
1895         nvme_start_freeze(ctrl);
1896         nvme_stop_queues(ctrl);
1897         nvme_tcp_stop_io_queues(ctrl);
1898         if (ctrl->tagset) {
1899                 blk_mq_tagset_busy_iter(ctrl->tagset,
1900                         nvme_cancel_request, ctrl);
1901                 blk_mq_tagset_wait_completed_request(ctrl->tagset);
1902         }
1903         if (remove)
1904                 nvme_start_queues(ctrl);
1905         nvme_tcp_destroy_io_queues(ctrl, remove);
1906 }
1907
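     /*
      * After a failed (re)connect attempt or after error recovery: either
      * schedule another reconnect after reconnect_delay, or delete the
      * controller once nvmf_should_reconnect() decides to give up.
      */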
1908 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1909 {
1910         /* If we are resetting/deleting then do nothing */
1911         if (ctrl->state != NVME_CTRL_CONNECTING) {
1912                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1913                         ctrl->state == NVME_CTRL_LIVE);
1914                 return;
1915         }
1916
1917         if (nvmf_should_reconnect(ctrl)) {
1918                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1919                         ctrl->opts->reconnect_delay);
1920                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1921                                 ctrl->opts->reconnect_delay * HZ);
1922         } else {
1923                 dev_info(ctrl->device, "Removing controller...\n");
1924                 nvme_delete_ctrl(ctrl);
1925         }
1926 }
1927
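     /*
      * Common bring-up path shared by create, reset and reconnect: configure
      * the admin queue, validate controller parameters (ICDOFF is not
      * supported, sqsize is clamped to MAXCMD), configure the I/O queues and
      * move the controller to LIVE.
      */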
1928 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1929 {
1930         struct nvmf_ctrl_options *opts = ctrl->opts;
1931         int ret;
1932
1933         ret = nvme_tcp_configure_admin_queue(ctrl, new);
1934         if (ret)
1935                 return ret;
1936
1937         if (ctrl->icdoff) {
                     ret = -EOPNOTSUPP;
1938                 dev_err(ctrl->device, "icdoff is not supported!\n");
1939                 goto destroy_admin;
1940         }
1941
1942         if (opts->queue_size > ctrl->sqsize + 1)
1943                 dev_warn(ctrl->device,
1944                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
1945                         opts->queue_size, ctrl->sqsize + 1);
1946
1947         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1948                 dev_warn(ctrl->device,
1949                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
1950                         ctrl->sqsize + 1, ctrl->maxcmd);
1951                 ctrl->sqsize = ctrl->maxcmd - 1;
1952         }
1953
1954         if (ctrl->queue_count > 1) {
1955                 ret = nvme_tcp_configure_io_queues(ctrl, new);
1956                 if (ret)
1957                         goto destroy_admin;
1958         }
1959
1960         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1961                 /*
1962                  * state change failure is ok if we started ctrl delete,
1963                  * unless we're in the middle of creating a new controller,
1964                  * to avoid races with the teardown flow.
1965                  */
1966                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
1967                              ctrl->state != NVME_CTRL_DELETING_NOIO);
1968                 WARN_ON_ONCE(new);
1969                 ret = -EINVAL;
1970                 goto destroy_io;
1971         }
1972
1973         nvme_start_ctrl(ctrl);
1974         return 0;
1975
1976 destroy_io:
1977         if (ctrl->queue_count > 1)
1978                 nvme_tcp_destroy_io_queues(ctrl, new);
1979 destroy_admin:
1980         nvme_tcp_stop_queue(ctrl, 0);
1981         nvme_tcp_destroy_admin_queue(ctrl, new);
1982         return ret;
1983 }
1984
1985 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1986 {
1987         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1988                         struct nvme_tcp_ctrl, connect_work);
1989         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1990
1991         ++ctrl->nr_reconnects;
1992
1993         if (nvme_tcp_setup_ctrl(ctrl, false))
1994                 goto requeue;
1995
1996         dev_info(ctrl->device, "Successfully reconnected (attempt %d)\n",
1997                         ctrl->nr_reconnects);
1998
1999         ctrl->nr_reconnects = 0;
2000
2001         return;
2002
2003 requeue:
2004         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2005                         ctrl->nr_reconnects);
2006         nvme_tcp_reconnect_or_remove(ctrl);
2007 }
2008
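     /*
      * Error recovery: stop keep-alive, tear down all queues while fast-
      * failing pending requests, move the controller to CONNECTING, then
      * reconnect or remove it.
      */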
2009 static void nvme_tcp_error_recovery_work(struct work_struct *work)
2010 {
2011         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2012                                 struct nvme_tcp_ctrl, err_work);
2013         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2014
2015         nvme_stop_keep_alive(ctrl);
2016         nvme_tcp_teardown_io_queues(ctrl, false);
2017         /* unquiesce to fast-fail pending requests */
2018         nvme_start_queues(ctrl);
2019         nvme_tcp_teardown_admin_queue(ctrl, false);
2020         blk_mq_unquiesce_queue(ctrl->admin_q);
2021
2022         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2023                 /* state change failure is ok if we started ctrl delete */
2024                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2025                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2026                 return;
2027         }
2028
2029         nvme_tcp_reconnect_or_remove(ctrl);
2030 }
2031
2032 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2033 {
2034         cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2035         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2036
2037         nvme_tcp_teardown_io_queues(ctrl, shutdown);
2038         blk_mq_quiesce_queue(ctrl->admin_q);
2039         if (shutdown)
2040                 nvme_shutdown_ctrl(ctrl);
2041         else
2042                 nvme_disable_ctrl(ctrl);
2043         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2044 }
2045
2046 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2047 {
2048         nvme_tcp_teardown_ctrl(ctrl, true);
2049 }
2050
2051 static void nvme_reset_ctrl_work(struct work_struct *work)
2052 {
2053         struct nvme_ctrl *ctrl =
2054                 container_of(work, struct nvme_ctrl, reset_work);
2055
2056         nvme_stop_ctrl(ctrl);
2057         nvme_tcp_teardown_ctrl(ctrl, false);
2058
2059         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2060                 /* state change failure is ok if we started ctrl delete */
2061                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2062                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2063                 return;
2064         }
2065
2066         if (nvme_tcp_setup_ctrl(ctrl, false))
2067                 goto out_fail;
2068
2069         return;
2070
2071 out_fail:
2072         ++ctrl->nr_reconnects;
2073         nvme_tcp_reconnect_or_remove(ctrl);
2074 }
2075
2076 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2077 {
2078         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2079
2080         if (list_empty(&ctrl->list))
2081                 goto free_ctrl;
2082
2083         mutex_lock(&nvme_tcp_ctrl_mutex);
2084         list_del(&ctrl->list);
2085         mutex_unlock(&nvme_tcp_ctrl_mutex);
2086
2087         nvmf_free_options(nctrl->opts);
2088 free_ctrl:
2089         kfree(ctrl->queues);
2090         kfree(ctrl);
2091 }
2092
2093 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2094 {
2095         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2096
2097         sg->addr = 0;
2098         sg->length = 0;
2099         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2100                         NVME_SGL_FMT_TRANSPORT_A;
2101 }
2102
2103 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2104                 struct nvme_command *c, u32 data_len)
2105 {
2106         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2107
2108         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2109         sg->length = cpu_to_le32(data_len);
2110         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2111 }
2112
2113 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2114                 u32 data_len)
2115 {
2116         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2117
2118         sg->addr = 0;
2119         sg->length = cpu_to_le32(data_len);
2120         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2121                         NVME_SGL_FMT_TRANSPORT_A;
2122 }
2123
2124 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2125 {
2126         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2127         struct nvme_tcp_queue *queue = &ctrl->queues[0];
2128         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2129         struct nvme_command *cmd = &pdu->cmd;
2130         u8 hdgst = nvme_tcp_hdgst_len(queue);
2131
2132         memset(pdu, 0, sizeof(*pdu));
2133         pdu->hdr.type = nvme_tcp_cmd;
2134         if (queue->hdr_digest)
2135                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2136         pdu->hdr.hlen = sizeof(*pdu);
2137         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2138
2139         cmd->common.opcode = nvme_admin_async_event;
2140         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2141         cmd->common.flags |= NVME_CMD_SGL_METABUF;
2142         nvme_tcp_set_sg_null(cmd);
2143
2144         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2145         ctrl->async_req.offset = 0;
2146         ctrl->async_req.curr_bio = NULL;
2147         ctrl->async_req.data_len = 0;
2148
2149         nvme_tcp_queue_request(&ctrl->async_req, true, true);
2150 }
2151
2152 static enum blk_eh_timer_return
2153 nvme_tcp_timeout(struct request *rq, bool reserved)
2154 {
2155         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2156         struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
2157         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2158
2159         /*
2160          * Restart the timer if a controller reset is already scheduled. Any
2161          * timed out commands would be handled before entering the connecting
2162          * state.
2163          */
2164         if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
2165                 return BLK_EH_RESET_TIMER;
2166
2167         dev_warn(ctrl->ctrl.device,
2168                 "queue %d: timeout request %#x type %d\n",
2169                 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2170
2171         if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
2172                 /*
2173                  * Teardown immediately if the controller times out while starting
2174                  * or while error recovery is already in progress. All outstanding
2175                  * requests are completed on shutdown, so we return BLK_EH_DONE.
2176                  */
2177                 flush_work(&ctrl->err_work);
2178                 nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
2179                 nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
2180                 return BLK_EH_DONE;
2181         }
2182
2183         dev_warn(ctrl->ctrl.device, "starting error recovery\n");
2184         nvme_tcp_error_recovery(&ctrl->ctrl);
2185
2186         return BLK_EH_RESET_TIMER;
2187 }
2188
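     /*
      * Fill in the command SGL: writes that fit within the in-capsule data
      * size are sent inline with the command PDU, all other transfers use
      * host-resident data buffers moved via H2CData/C2HData PDUs.
      */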
2189 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2190                         struct request *rq)
2191 {
2192         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2193         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2194         struct nvme_command *c = &pdu->cmd;
2195
2196         c->common.flags |= NVME_CMD_SGL_METABUF;
2197
2198         if (!blk_rq_nr_phys_segments(rq))
2199                 nvme_tcp_set_sg_null(c);
2200         else if (rq_data_dir(rq) == WRITE &&
2201             req->data_len <= nvme_tcp_inline_data_size(queue))
2202                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2203         else
2204                 nvme_tcp_set_sg_host_data(c, req->data_len);
2205
2206         return 0;
2207 }
2208
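     /*
      * Build the command PDU for a request: reset the per-request send state,
      * decide whether write data goes inline, and set the header/data digest
      * flags and the PDU length fields accordingly.
      */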
2209 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2210                 struct request *rq)
2211 {
2212         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2213         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2214         struct nvme_tcp_queue *queue = req->queue;
2215         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2216         blk_status_t ret;
2217
2218         ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2219         if (ret)
2220                 return ret;
2221
2222         req->state = NVME_TCP_SEND_CMD_PDU;
2223         req->offset = 0;
2224         req->data_sent = 0;
2225         req->pdu_len = 0;
2226         req->pdu_sent = 0;
2227         req->data_len = blk_rq_nr_phys_segments(rq) ?
2228                                 blk_rq_payload_bytes(rq) : 0;
2229         req->curr_bio = rq->bio;
2230
2231         if (rq_data_dir(rq) == WRITE &&
2232             req->data_len <= nvme_tcp_inline_data_size(queue))
2233                 req->pdu_len = req->data_len;
2234         else if (req->curr_bio)
2235                 nvme_tcp_init_iter(req, READ);
2236
2237         pdu->hdr.type = nvme_tcp_cmd;
2238         pdu->hdr.flags = 0;
2239         if (queue->hdr_digest)
2240                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2241         if (queue->data_digest && req->pdu_len) {
2242                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2243                 ddgst = nvme_tcp_ddgst_len(queue);
2244         }
2245         pdu->hdr.hlen = sizeof(*pdu);
2246         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2247         pdu->hdr.plen =
2248                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2249
2250         ret = nvme_tcp_map_data(queue, rq);
2251         if (unlikely(ret)) {
2252                 nvme_cleanup_cmd(rq);
2253                 dev_err(queue->ctrl->ctrl.device,
2254                         "Failed to map data (%d)\n", ret);
2255                 return ret;
2256         }
2257
2258         return 0;
2259 }
2260
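     /*
      * blk-mq calls .commit_rqs when it ends a dispatch batch without having
      * set bd->last on the final request; kick io_work if requests are still
      * sitting on the lockless req_list.
      */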
2261 static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2262 {
2263         struct nvme_tcp_queue *queue = hctx->driver_data;
2264
2265         if (!llist_empty(&queue->req_list))
2266                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2267 }
2268
2269 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2270                 const struct blk_mq_queue_data *bd)
2271 {
2272         struct nvme_ns *ns = hctx->queue->queuedata;
2273         struct nvme_tcp_queue *queue = hctx->driver_data;
2274         struct request *rq = bd->rq;
2275         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2276         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2277         blk_status_t ret;
2278
2279         if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2280                 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2281
2282         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2283         if (unlikely(ret))
2284                 return ret;
2285
2286         blk_mq_start_request(rq);
2287
2288         nvme_tcp_queue_request(req, true, bd->last);
2289
2290         return BLK_STS_OK;
2291 }
2292
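     /*
      * Map blk-mq hw contexts onto the queues laid out by
      * nvme_tcp_set_io_queues(): default queues first, then read queues (or
      * shared with default when no dedicated write queues exist), with poll
      * queues at the end.
      */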
2293 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2294 {
2295         struct nvme_tcp_ctrl *ctrl = set->driver_data;
2296         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2297
2298         if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2299                 /* separate read/write queues */
2300                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2301                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2302                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2303                 set->map[HCTX_TYPE_READ].nr_queues =
2304                         ctrl->io_queues[HCTX_TYPE_READ];
2305                 set->map[HCTX_TYPE_READ].queue_offset =
2306                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2307         } else {
2308                 /* shared read/write queues */
2309                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2310                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2311                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2312                 set->map[HCTX_TYPE_READ].nr_queues =
2313                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2314                 set->map[HCTX_TYPE_READ].queue_offset = 0;
2315         }
2316         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2317         blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2318
2319         if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2320                 /* map dedicated poll queues only if we have queues left */
2321                 set->map[HCTX_TYPE_POLL].nr_queues =
2322                                 ctrl->io_queues[HCTX_TYPE_POLL];
2323                 set->map[HCTX_TYPE_POLL].queue_offset =
2324                         ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2325                         ctrl->io_queues[HCTX_TYPE_READ];
2326                 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2327         }
2328
2329         dev_info(ctrl->ctrl.device,
2330                 "mapped %d/%d/%d default/read/poll queues.\n",
2331                 ctrl->io_queues[HCTX_TYPE_DEFAULT],
2332                 ctrl->io_queues[HCTX_TYPE_READ],
2333                 ctrl->io_queues[HCTX_TYPE_POLL]);
2334
2335         return 0;
2336 }
2337
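     /*
      * Polled completion path: busy-poll the socket when possible, then
      * receive and process pending PDUs directly; returns the number of
      * completions reaped.
      */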
2338 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2339 {
2340         struct nvme_tcp_queue *queue = hctx->driver_data;
2341         struct sock *sk = queue->sock->sk;
2342
2343         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2344                 return 0;
2345
2346         set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2347         if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2348                 sk_busy_loop(sk, true);
2349         nvme_tcp_try_recv(queue);
2350         clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2351         return queue->nr_cqe;
2352 }
2353
2354 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2355         .queue_rq       = nvme_tcp_queue_rq,
2356         .commit_rqs     = nvme_tcp_commit_rqs,
2357         .complete       = nvme_complete_rq,
2358         .init_request   = nvme_tcp_init_request,
2359         .exit_request   = nvme_tcp_exit_request,
2360         .init_hctx      = nvme_tcp_init_hctx,
2361         .timeout        = nvme_tcp_timeout,
2362         .map_queues     = nvme_tcp_map_queues,
2363         .poll           = nvme_tcp_poll,
2364 };
2365
2366 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2367         .queue_rq       = nvme_tcp_queue_rq,
2368         .complete       = nvme_complete_rq,
2369         .init_request   = nvme_tcp_init_request,
2370         .exit_request   = nvme_tcp_exit_request,
2371         .init_hctx      = nvme_tcp_init_admin_hctx,
2372         .timeout        = nvme_tcp_timeout,
2373 };
2374
2375 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2376         .name                   = "tcp",
2377         .module                 = THIS_MODULE,
2378         .flags                  = NVME_F_FABRICS,
2379         .reg_read32             = nvmf_reg_read32,
2380         .reg_read64             = nvmf_reg_read64,
2381         .reg_write32            = nvmf_reg_write32,
2382         .free_ctrl              = nvme_tcp_free_ctrl,
2383         .submit_async_event     = nvme_tcp_submit_async_event,
2384         .delete_ctrl            = nvme_tcp_delete_ctrl,
2385         .get_address            = nvmf_get_address,
2386 };
2387
2388 static bool
2389 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2390 {
2391         struct nvme_tcp_ctrl *ctrl;
2392         bool found = false;
2393
2394         mutex_lock(&nvme_tcp_ctrl_mutex);
2395         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2396                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2397                 if (found)
2398                         break;
2399         }
2400         mutex_unlock(&nvme_tcp_ctrl_mutex);
2401
2402         return found;
2403 }
2404
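     /*
      * Transport .create_ctrl entry point: parse the target (and optional
      * host) address, reject duplicate connections, allocate the controller
      * and its queue array, and run the full setup sequence before publishing
      * the controller on nvme_tcp_ctrl_list.
      */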
2405 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2406                 struct nvmf_ctrl_options *opts)
2407 {
2408         struct nvme_tcp_ctrl *ctrl;
2409         int ret;
2410
2411         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2412         if (!ctrl)
2413                 return ERR_PTR(-ENOMEM);
2414
2415         INIT_LIST_HEAD(&ctrl->list);
2416         ctrl->ctrl.opts = opts;
2417         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2418                                 opts->nr_poll_queues + 1;
2419         ctrl->ctrl.sqsize = opts->queue_size - 1;
2420         ctrl->ctrl.kato = opts->kato;
2421
2422         INIT_DELAYED_WORK(&ctrl->connect_work,
2423                         nvme_tcp_reconnect_ctrl_work);
2424         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2425         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2426
2427         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2428                 opts->trsvcid =
2429                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2430                 if (!opts->trsvcid) {
2431                         ret = -ENOMEM;
2432                         goto out_free_ctrl;
2433                 }
2434                 opts->mask |= NVMF_OPT_TRSVCID;
2435         }
2436
2437         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2438                         opts->traddr, opts->trsvcid, &ctrl->addr);
2439         if (ret) {
2440                 pr_err("malformed address passed: %s:%s\n",
2441                         opts->traddr, opts->trsvcid);
2442                 goto out_free_ctrl;
2443         }
2444
2445         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2446                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2447                         opts->host_traddr, NULL, &ctrl->src_addr);
2448                 if (ret) {
2449                         pr_err("malformed src address passed: %s\n",
2450                                opts->host_traddr);
2451                         goto out_free_ctrl;
2452                 }
2453         }
2454
2455         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2456                 ret = -EALREADY;
2457                 goto out_free_ctrl;
2458         }
2459
2460         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2461                                 GFP_KERNEL);
2462         if (!ctrl->queues) {
2463                 ret = -ENOMEM;
2464                 goto out_free_ctrl;
2465         }
2466
2467         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2468         if (ret)
2469                 goto out_kfree_queues;
2470
2471         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2472                 WARN_ON_ONCE(1);
2473                 ret = -EINTR;
2474                 goto out_uninit_ctrl;
2475         }
2476
2477         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2478         if (ret)
2479                 goto out_uninit_ctrl;
2480
2481         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2482                 ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2483
2484         mutex_lock(&nvme_tcp_ctrl_mutex);
2485         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2486         mutex_unlock(&nvme_tcp_ctrl_mutex);
2487
2488         return &ctrl->ctrl;
2489
2490 out_uninit_ctrl:
2491         nvme_uninit_ctrl(&ctrl->ctrl);
2492         nvme_put_ctrl(&ctrl->ctrl);
2493         if (ret > 0)
2494                 ret = -EIO;
2495         return ERR_PTR(ret);
2496 out_kfree_queues:
2497         kfree(ctrl->queues);
2498 out_free_ctrl:
2499         kfree(ctrl);
2500         return ERR_PTR(ret);
2501 }
2502
2503 static struct nvmf_transport_ops nvme_tcp_transport = {
2504         .name           = "tcp",
2505         .module         = THIS_MODULE,
2506         .required_opts  = NVMF_OPT_TRADDR,
2507         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2508                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2509                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2510                           NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2511                           NVMF_OPT_TOS,
2512         .create_ctrl    = nvme_tcp_create_ctrl,
2513 };
2514
2515 static int __init nvme_tcp_init_module(void)
2516 {
2517         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2518                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2519         if (!nvme_tcp_wq)
2520                 return -ENOMEM;
2521
2522         nvmf_register_transport(&nvme_tcp_transport);
2523         return 0;
2524 }
2525
2526 static void __exit nvme_tcp_cleanup_module(void)
2527 {
2528         struct nvme_tcp_ctrl *ctrl;
2529
2530         nvmf_unregister_transport(&nvme_tcp_transport);
2531
2532         mutex_lock(&nvme_tcp_ctrl_mutex);
2533         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2534                 nvme_delete_ctrl(&ctrl->ctrl);
2535         mutex_unlock(&nvme_tcp_ctrl_mutex);
2536         flush_workqueue(nvme_delete_wq);
2537
2538         destroy_workqueue(nvme_tcp_wq);
2539 }
2540
2541 module_init(nvme_tcp_init_module);
2542 module_exit(nvme_tcp_cleanup_module);
2543
2544 MODULE_LICENSE("GPL v2");