[linux.git] drivers/nvme/host/tcp.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/key.h>
12 #include <linux/nvme-tcp.h>
13 #include <linux/nvme-keyring.h>
14 #include <net/sock.h>
15 #include <net/tcp.h>
16 #include <net/tls.h>
17 #include <net/tls_prot.h>
18 #include <net/handshake.h>
19 #include <linux/blk-mq.h>
20 #include <crypto/hash.h>
21 #include <net/busy_poll.h>
22 #include <trace/events/sock.h>
23
24 #include "nvme.h"
25 #include "fabrics.h"
26
27 struct nvme_tcp_queue;
28
29 /* Define the socket priority to use for connections where it is desirable
30  * that the NIC consider performing optimized packet processing or filtering.
31  * A non-zero value is sufficient to indicate general consideration of any
32  * possible optimization.  Making it a module param allows for alternative
33  * values that may be unique for some NIC implementations.
34  */
35 static int so_priority;
36 module_param(so_priority, int, 0644);
37 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
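/*
 * For example, with the 0644 permission above the priority can be given at
 * load time or changed at runtime:
 *
 *   modprobe nvme_tcp so_priority=6
 *   echo 6 > /sys/module/nvme_tcp/parameters/so_priority
 *
 * (the sysfs path assumes the usual /sys/module/<module>/parameters layout
 * for this module.)
 */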
38
39 /*
40  * Use the unbound workqueue for nvme_tcp_wq so that the cpu affinity can be
41  * set from sysfs.
42  */
43 static bool wq_unbound;
44 module_param(wq_unbound, bool, 0644);
45 MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
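/*
 * For example, to confine nvme-tcp I/O work to CPUs 0-3:
 *
 *   modprobe nvme_tcp wq_unbound=1
 *   echo 0-3 > /sys/devices/virtual/workqueue/nvme_tcp_wq/cpumask
 *
 * (the cpumask attribute assumes nvme_tcp_wq is registered with WQ_SYSFS
 * under that name, as the comment above implies.)
 */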
46
47 /*
48  * TLS handshake timeout
49  */
50 static int tls_handshake_timeout = 10;
51 #ifdef CONFIG_NVME_TCP_TLS
52 module_param(tls_handshake_timeout, int, 0644);
53 MODULE_PARM_DESC(tls_handshake_timeout,
54                  "nvme TLS handshake timeout in seconds (default 10)");
55 #endif
56
57 #ifdef CONFIG_DEBUG_LOCK_ALLOC
58 /* lockdep can detect a circular dependency of the form
59  *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
60  * because dependencies are tracked for both nvme-tcp and user contexts. Using
61  * a separate class prevents lockdep from conflating nvme-tcp socket use with
62  * user-space socket API use.
63  */
64 static struct lock_class_key nvme_tcp_sk_key[2];
65 static struct lock_class_key nvme_tcp_slock_key[2];
66
67 static void nvme_tcp_reclassify_socket(struct socket *sock)
68 {
69         struct sock *sk = sock->sk;
70
71         if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
72                 return;
73
74         switch (sk->sk_family) {
75         case AF_INET:
76                 sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
77                                               &nvme_tcp_slock_key[0],
78                                               "sk_lock-AF_INET-NVME",
79                                               &nvme_tcp_sk_key[0]);
80                 break;
81         case AF_INET6:
82                 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
83                                               &nvme_tcp_slock_key[1],
84                                               "sk_lock-AF_INET6-NVME",
85                                               &nvme_tcp_sk_key[1]);
86                 break;
87         default:
88                 WARN_ON_ONCE(1);
89         }
90 }
91 #else
92 static void nvme_tcp_reclassify_socket(struct socket *sock) { }
93 #endif
94
95 enum nvme_tcp_send_state {
96         NVME_TCP_SEND_CMD_PDU = 0,
97         NVME_TCP_SEND_H2C_PDU,
98         NVME_TCP_SEND_DATA,
99         NVME_TCP_SEND_DDGST,
100 };
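/*
 * A request normally walks these states in order: NVME_TCP_SEND_CMD_PDU,
 * then NVME_TCP_SEND_DATA for inline writes and, if data digest is enabled,
 * NVME_TCP_SEND_DDGST.  Solicited (R2T) writes instead loop through
 * NVME_TCP_SEND_H2C_PDU -> NVME_TCP_SEND_DATA [-> NVME_TCP_SEND_DDGST]
 * once per MAXH2CDATA-sized chunk, see nvme_tcp_try_send().
 */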
101
102 struct nvme_tcp_request {
103         struct nvme_request     req;
104         void                    *pdu;
105         struct nvme_tcp_queue   *queue;
106         u32                     data_len;
107         u32                     pdu_len;
108         u32                     pdu_sent;
109         u32                     h2cdata_left;
110         u32                     h2cdata_offset;
111         u16                     ttag;
112         __le16                  status;
113         struct list_head        entry;
114         struct llist_node       lentry;
115         __le32                  ddgst;
116
117         struct bio              *curr_bio;
118         struct iov_iter         iter;
119
120         /* send state */
121         size_t                  offset;
122         size_t                  data_sent;
123         enum nvme_tcp_send_state state;
124 };
125
126 enum nvme_tcp_queue_flags {
127         NVME_TCP_Q_ALLOCATED    = 0,
128         NVME_TCP_Q_LIVE         = 1,
129         NVME_TCP_Q_POLLING      = 2,
130 };
131
132 enum nvme_tcp_recv_state {
133         NVME_TCP_RECV_PDU = 0,
134         NVME_TCP_RECV_DATA,
135         NVME_TCP_RECV_DDGST,
136 };
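/*
 * The receive state is not stored explicitly; it is derived from the
 * pdu_remaining/data_remaining/ddgst_remaining counters in
 * struct nvme_tcp_queue, see nvme_tcp_recv_state().
 */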
137
138 struct nvme_tcp_ctrl;
139 struct nvme_tcp_queue {
140         struct socket           *sock;
141         struct work_struct      io_work;
142         int                     io_cpu;
143
144         struct mutex            queue_lock;
145         struct mutex            send_mutex;
146         struct llist_head       req_list;
147         struct list_head        send_list;
148
149         /* recv state */
150         void                    *pdu;
151         int                     pdu_remaining;
152         int                     pdu_offset;
153         size_t                  data_remaining;
154         size_t                  ddgst_remaining;
155         unsigned int            nr_cqe;
156
157         /* send state */
158         struct nvme_tcp_request *request;
159
160         u32                     maxh2cdata;
161         size_t                  cmnd_capsule_len;
162         struct nvme_tcp_ctrl    *ctrl;
163         unsigned long           flags;
164         bool                    rd_enabled;
165
166         bool                    hdr_digest;
167         bool                    data_digest;
168         struct ahash_request    *rcv_hash;
169         struct ahash_request    *snd_hash;
170         __le32                  exp_ddgst;
171         __le32                  recv_ddgst;
172         struct completion       tls_complete;
173         int                     tls_err;
174         struct page_frag_cache  pf_cache;
175
176         void (*state_change)(struct sock *);
177         void (*data_ready)(struct sock *);
178         void (*write_space)(struct sock *);
179 };
180
181 struct nvme_tcp_ctrl {
182         /* read only in the hot path */
183         struct nvme_tcp_queue   *queues;
184         struct blk_mq_tag_set   tag_set;
185
186         /* other member variables */
187         struct list_head        list;
188         struct blk_mq_tag_set   admin_tag_set;
189         struct sockaddr_storage addr;
190         struct sockaddr_storage src_addr;
191         struct nvme_ctrl        ctrl;
192
193         struct work_struct      err_work;
194         struct delayed_work     connect_work;
195         struct nvme_tcp_request async_req;
196         u32                     io_queues[HCTX_MAX_TYPES];
197 };
198
199 static LIST_HEAD(nvme_tcp_ctrl_list);
200 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
201 static struct workqueue_struct *nvme_tcp_wq;
202 static const struct blk_mq_ops nvme_tcp_mq_ops;
203 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
204 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
205
206 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
207 {
208         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
209 }
210
211 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
212 {
213         return queue - queue->ctrl->queues;
214 }
215
216 static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
217 {
218         if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
219                 return false;
220
221         return ctrl->opts->tls;
222 }
223
224 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
225 {
226         u32 queue_idx = nvme_tcp_queue_id(queue);
227
228         if (queue_idx == 0)
229                 return queue->ctrl->admin_tag_set.tags[queue_idx];
230         return queue->ctrl->tag_set.tags[queue_idx - 1];
231 }
232
233 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
234 {
235         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
236 }
237
238 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
239 {
240         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
241 }
242
243 static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
244 {
245         return req->pdu;
246 }
247
248 static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
249 {
250         /* use the pdu space in the back for the data pdu */
251         return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
252                 sizeof(struct nvme_tcp_data_pdu);
253 }
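/*
 * req->pdu is a single fragment of sizeof(struct nvme_tcp_cmd_pdu) + hdgst
 * bytes (see nvme_tcp_init_request()), and the data PDU is placed so that
 * it ends exactly where the command PDU ends.  By the time an H2CData
 * header is built the command PDU has already been sent, so reusing the
 * tail of the same buffer is safe and avoids a second allocation.
 */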
254
255 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
256 {
257         if (nvme_is_fabrics(req->req.cmd))
258                 return NVME_TCP_ADMIN_CCSZ;
259         return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
260 }
261
262 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
263 {
264         return req == &req->queue->ctrl->async_req;
265 }
266
267 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
268 {
269         struct request *rq;
270
271         if (unlikely(nvme_tcp_async_req(req)))
272                 return false; /* async events don't have a request */
273
274         rq = blk_mq_rq_from_pdu(req);
275
276         return rq_data_dir(rq) == WRITE && req->data_len &&
277                 req->data_len <= nvme_tcp_inline_data_size(req);
278 }
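/*
 * Fabrics commands may carry up to NVME_TCP_ADMIN_CCSZ of inline data; for
 * I/O commands the limit is the advertised command capsule size
 * (cmnd_capsule_len, typically derived from the controller's IOCCSZ) minus
 * the 64-byte SQE.  Writes that fit are sent right after the command PDU
 * and never wait for an R2T.
 */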
279
280 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
281 {
282         return req->iter.bvec->bv_page;
283 }
284
285 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
286 {
287         return req->iter.bvec->bv_offset + req->iter.iov_offset;
288 }
289
290 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
291 {
292         return min_t(size_t, iov_iter_single_seg_count(&req->iter),
293                         req->pdu_len - req->pdu_sent);
294 }
295
296 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
297 {
298         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
299                         req->pdu_len - req->pdu_sent : 0;
300 }
301
302 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
303                 int len)
304 {
305         return nvme_tcp_pdu_data_left(req) <= len;
306 }
307
308 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
309                 unsigned int dir)
310 {
311         struct request *rq = blk_mq_rq_from_pdu(req);
312         struct bio_vec *vec;
313         unsigned int size;
314         int nr_bvec;
315         size_t offset;
316
317         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
318                 vec = &rq->special_vec;
319                 nr_bvec = 1;
320                 size = blk_rq_payload_bytes(rq);
321                 offset = 0;
322         } else {
323                 struct bio *bio = req->curr_bio;
324                 struct bvec_iter bi;
325                 struct bio_vec bv;
326
327                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
328                 nr_bvec = 0;
329                 bio_for_each_bvec(bv, bio, bi) {
330                         nr_bvec++;
331                 }
332                 size = bio->bi_iter.bi_size;
333                 offset = bio->bi_iter.bi_bvec_done;
334         }
335
336         iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
337         req->iter.iov_offset = offset;
338 }
339
340 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
341                 int len)
342 {
343         req->data_sent += len;
344         req->pdu_sent += len;
345         iov_iter_advance(&req->iter, len);
346         if (!iov_iter_count(&req->iter) &&
347             req->data_sent < req->data_len) {
348                 req->curr_bio = req->curr_bio->bi_next;
349                 nvme_tcp_init_iter(req, ITER_SOURCE);
350         }
351 }
352
353 static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
354 {
355         int ret;
356
357         /* drain the send queue as much as we can... */
358         do {
359                 ret = nvme_tcp_try_send(queue);
360         } while (ret > 0);
361 }
362
363 static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
364 {
365         return !list_empty(&queue->send_list) ||
366                 !llist_empty(&queue->req_list);
367 }
368
369 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
370 {
371         return !nvme_tcp_tls(&queue->ctrl->ctrl) &&
372                 nvme_tcp_queue_has_pending(queue);
373 }
374
375 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
376                 bool sync, bool last)
377 {
378         struct nvme_tcp_queue *queue = req->queue;
379         bool empty;
380
381         empty = llist_add(&req->lentry, &queue->req_list) &&
382                 list_empty(&queue->send_list) && !queue->request;
383
384         /*
385          * If we are the first on the send_list, try to send directly;
386          * otherwise queue io_work. Also, only do that if we are on the
387          * same cpu, so we don't introduce contention.
388          */
389         if (queue->io_cpu == raw_smp_processor_id() &&
390             sync && empty && mutex_trylock(&queue->send_mutex)) {
391                 nvme_tcp_send_all(queue);
392                 mutex_unlock(&queue->send_mutex);
393         }
394
395         if (last && nvme_tcp_queue_has_pending(queue))
396                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
397 }
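/*
 * Submission in short: producers push onto the lock-free req_list from any
 * context; the consumer (io_work or the direct-send path above, both
 * serialized by send_mutex) splices req_list into send_list in
 * nvme_tcp_process_req_list() and pulls requests off send_list in
 * nvme_tcp_fetch_request().  llist_del_all() returns entries newest-first
 * and list_add() prepends, so the double reversal restores submission order.
 */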
398
399 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
400 {
401         struct nvme_tcp_request *req;
402         struct llist_node *node;
403
404         for (node = llist_del_all(&queue->req_list); node; node = node->next) {
405                 req = llist_entry(node, struct nvme_tcp_request, lentry);
406                 list_add(&req->entry, &queue->send_list);
407         }
408 }
409
410 static inline struct nvme_tcp_request *
411 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
412 {
413         struct nvme_tcp_request *req;
414
415         req = list_first_entry_or_null(&queue->send_list,
416                         struct nvme_tcp_request, entry);
417         if (!req) {
418                 nvme_tcp_process_req_list(queue);
419                 req = list_first_entry_or_null(&queue->send_list,
420                                 struct nvme_tcp_request, entry);
421                 if (unlikely(!req))
422                         return NULL;
423         }
424
425         list_del(&req->entry);
426         return req;
427 }
428
429 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
430                 __le32 *dgst)
431 {
432         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
433         crypto_ahash_final(hash);
434 }
435
436 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
437                 struct page *page, off_t off, size_t len)
438 {
439         struct scatterlist sg;
440
441         sg_init_table(&sg, 1);
442         sg_set_page(&sg, page, len, off);
443         ahash_request_set_crypt(hash, &sg, NULL, len);
444         crypto_ahash_update(hash);
445 }
446
447 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
448                 void *pdu, size_t len)
449 {
450         struct scatterlist sg;
451
452         sg_init_one(&sg, pdu, len);
453         ahash_request_set_crypt(hash, &sg, pdu + len, len);
454         crypto_ahash_digest(hash);
455 }
456
457 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
458                 void *pdu, size_t pdu_len)
459 {
460         struct nvme_tcp_hdr *hdr = pdu;
461         __le32 recv_digest;
462         __le32 exp_digest;
463
464         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
465                 dev_err(queue->ctrl->ctrl.device,
466                         "queue %d: header digest flag is cleared\n",
467                         nvme_tcp_queue_id(queue));
468                 return -EPROTO;
469         }
470
471         recv_digest = *(__le32 *)(pdu + hdr->hlen);
472         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
473         exp_digest = *(__le32 *)(pdu + hdr->hlen);
474         if (recv_digest != exp_digest) {
475                 dev_err(queue->ctrl->ctrl.device,
476                         "header digest error: recv %#x expected %#x\n",
477                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
478                 return -EIO;
479         }
480
481         return 0;
482 }
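/*
 * Note that nvme_tcp_hdgst() writes the freshly computed digest back into
 * the PDU buffer at pdu + hlen, i.e. over the received digest, which is why
 * the received value is saved in recv_digest before recomputing.
 */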
483
484 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
485 {
486         struct nvme_tcp_hdr *hdr = pdu;
487         u8 digest_len = nvme_tcp_hdgst_len(queue);
488         u32 len;
489
490         len = le32_to_cpu(hdr->plen) - hdr->hlen -
491                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
492
493         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
494                 dev_err(queue->ctrl->ctrl.device,
495                         "queue %d: data digest flag is cleared\n",
496                         nvme_tcp_queue_id(queue));
497                 return -EPROTO;
498         }
499         crypto_ahash_init(queue->rcv_hash);
500
501         return 0;
502 }
503
504 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
505                 struct request *rq, unsigned int hctx_idx)
506 {
507         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
508
509         page_frag_free(req->pdu);
510 }
511
512 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
513                 struct request *rq, unsigned int hctx_idx,
514                 unsigned int numa_node)
515 {
516         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
517         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
518         struct nvme_tcp_cmd_pdu *pdu;
519         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
520         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
521         u8 hdgst = nvme_tcp_hdgst_len(queue);
522
523         req->pdu = page_frag_alloc(&queue->pf_cache,
524                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
525                 GFP_KERNEL | __GFP_ZERO);
526         if (!req->pdu)
527                 return -ENOMEM;
528
529         pdu = req->pdu;
530         req->queue = queue;
531         nvme_req(rq)->ctrl = &ctrl->ctrl;
532         nvme_req(rq)->cmd = &pdu->cmd;
533
534         return 0;
535 }
536
537 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
538                 unsigned int hctx_idx)
539 {
540         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
541         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
542
543         hctx->driver_data = queue;
544         return 0;
545 }
546
547 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
548                 unsigned int hctx_idx)
549 {
550         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
551         struct nvme_tcp_queue *queue = &ctrl->queues[0];
552
553         hctx->driver_data = queue;
554         return 0;
555 }
556
557 static enum nvme_tcp_recv_state
558 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
559 {
560         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
561                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
562                 NVME_TCP_RECV_DATA;
563 }
564
565 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
566 {
567         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
568                                 nvme_tcp_hdgst_len(queue);
569         queue->pdu_offset = 0;
570         queue->data_remaining = -1;
571         queue->ddgst_remaining = 0;
572 }
573
574 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
575 {
576         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
577                 return;
578
579         dev_warn(ctrl->device, "starting error recovery\n");
580         queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
581 }
582
583 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
584                 struct nvme_completion *cqe)
585 {
586         struct nvme_tcp_request *req;
587         struct request *rq;
588
589         rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
590         if (!rq) {
591                 dev_err(queue->ctrl->ctrl.device,
592                         "got bad cqe.command_id %#x on queue %d\n",
593                         cqe->command_id, nvme_tcp_queue_id(queue));
594                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
595                 return -EINVAL;
596         }
597
598         req = blk_mq_rq_to_pdu(rq);
599         if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
600                 req->status = cqe->status;
601
602         if (!nvme_try_complete_req(rq, req->status, cqe->result))
603                 nvme_complete_rq(rq);
604         queue->nr_cqe++;
605
606         return 0;
607 }
608
609 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
610                 struct nvme_tcp_data_pdu *pdu)
611 {
612         struct request *rq;
613
614         rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
615         if (!rq) {
616                 dev_err(queue->ctrl->ctrl.device,
617                         "got bad c2hdata.command_id %#x on queue %d\n",
618                         pdu->command_id, nvme_tcp_queue_id(queue));
619                 return -ENOENT;
620         }
621
622         if (!blk_rq_payload_bytes(rq)) {
623                 dev_err(queue->ctrl->ctrl.device,
624                         "queue %d tag %#x unexpected data\n",
625                         nvme_tcp_queue_id(queue), rq->tag);
626                 return -EIO;
627         }
628
629         queue->data_remaining = le32_to_cpu(pdu->data_length);
630
631         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
632             unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
633                 dev_err(queue->ctrl->ctrl.device,
634                         "queue %d tag %#x SUCCESS set but not last PDU\n",
635                         nvme_tcp_queue_id(queue), rq->tag);
636                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
637                 return -EPROTO;
638         }
639
640         return 0;
641 }
642
643 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
644                 struct nvme_tcp_rsp_pdu *pdu)
645 {
646         struct nvme_completion *cqe = &pdu->cqe;
647         int ret = 0;
648
649         /*
650          * AEN requests are special as they don't time out and can
651          * survive any kind of queue freeze and often don't respond to
652          * aborts.  We don't even bother to allocate a struct request
653          * for them but rather special case them here.
654          */
655         if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
656                                      cqe->command_id)))
657                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
658                                 &cqe->result);
659         else
660                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
661
662         return ret;
663 }
664
665 static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
666 {
667         struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
668         struct nvme_tcp_queue *queue = req->queue;
669         struct request *rq = blk_mq_rq_from_pdu(req);
670         u32 h2cdata_sent = req->pdu_len;
671         u8 hdgst = nvme_tcp_hdgst_len(queue);
672         u8 ddgst = nvme_tcp_ddgst_len(queue);
673
674         req->state = NVME_TCP_SEND_H2C_PDU;
675         req->offset = 0;
676         req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
677         req->pdu_sent = 0;
678         req->h2cdata_left -= req->pdu_len;
679         req->h2cdata_offset += h2cdata_sent;
680
681         memset(data, 0, sizeof(*data));
682         data->hdr.type = nvme_tcp_h2c_data;
683         if (!req->h2cdata_left)
684                 data->hdr.flags = NVME_TCP_F_DATA_LAST;
685         if (queue->hdr_digest)
686                 data->hdr.flags |= NVME_TCP_F_HDGST;
687         if (queue->data_digest)
688                 data->hdr.flags |= NVME_TCP_F_DDGST;
689         data->hdr.hlen = sizeof(*data);
690         data->hdr.pdo = data->hdr.hlen + hdgst;
691         data->hdr.plen =
692                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
693         data->ttag = req->ttag;
694         data->command_id = nvme_cid(rq);
695         data->data_offset = cpu_to_le32(req->h2cdata_offset);
696         data->data_length = cpu_to_le32(req->pdu_len);
697 }
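/*
 * For example, if the controller sends an R2T for 256k but advertised a
 * MAXH2CDATA of 64k in the ICResp, the transfer is carved into four
 * H2CData PDUs of 64k each; h2cdata_left reaches zero on the last chunk,
 * which is the only PDU that carries NVME_TCP_F_DATA_LAST.
 */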
698
699 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
700                 struct nvme_tcp_r2t_pdu *pdu)
701 {
702         struct nvme_tcp_request *req;
703         struct request *rq;
704         u32 r2t_length = le32_to_cpu(pdu->r2t_length);
705         u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
706
707         rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
708         if (!rq) {
709                 dev_err(queue->ctrl->ctrl.device,
710                         "got bad r2t.command_id %#x on queue %d\n",
711                         pdu->command_id, nvme_tcp_queue_id(queue));
712                 return -ENOENT;
713         }
714         req = blk_mq_rq_to_pdu(rq);
715
716         if (unlikely(!r2t_length)) {
717                 dev_err(queue->ctrl->ctrl.device,
718                         "req %d r2t len is %u, probably a bug...\n",
719                         rq->tag, r2t_length);
720                 return -EPROTO;
721         }
722
723         if (unlikely(req->data_sent + r2t_length > req->data_len)) {
724                 dev_err(queue->ctrl->ctrl.device,
725                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
726                         rq->tag, r2t_length, req->data_len, req->data_sent);
727                 return -EPROTO;
728         }
729
730         if (unlikely(r2t_offset < req->data_sent)) {
731                 dev_err(queue->ctrl->ctrl.device,
732                         "req %d unexpected r2t offset %u (expected %zu)\n",
733                         rq->tag, r2t_offset, req->data_sent);
734                 return -EPROTO;
735         }
736
737         req->pdu_len = 0;
738         req->h2cdata_left = r2t_length;
739         req->h2cdata_offset = r2t_offset;
740         req->ttag = pdu->ttag;
741
742         nvme_tcp_setup_h2c_data_pdu(req);
743         nvme_tcp_queue_request(req, false, true);
744
745         return 0;
746 }
747
748 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
749                 unsigned int *offset, size_t *len)
750 {
751         struct nvme_tcp_hdr *hdr;
752         char *pdu = queue->pdu;
753         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
754         int ret;
755
756         ret = skb_copy_bits(skb, *offset,
757                 &pdu[queue->pdu_offset], rcv_len);
758         if (unlikely(ret))
759                 return ret;
760
761         queue->pdu_remaining -= rcv_len;
762         queue->pdu_offset += rcv_len;
763         *offset += rcv_len;
764         *len -= rcv_len;
765         if (queue->pdu_remaining)
766                 return 0;
767
768         hdr = queue->pdu;
769         if (queue->hdr_digest) {
770                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
771                 if (unlikely(ret))
772                         return ret;
773         }
774
775
776         if (queue->data_digest) {
777                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
778                 if (unlikely(ret))
779                         return ret;
780         }
781
782         switch (hdr->type) {
783         case nvme_tcp_c2h_data:
784                 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
785         case nvme_tcp_rsp:
786                 nvme_tcp_init_recv_ctx(queue);
787                 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
788         case nvme_tcp_r2t:
789                 nvme_tcp_init_recv_ctx(queue);
790                 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
791         default:
792                 dev_err(queue->ctrl->ctrl.device,
793                         "unsupported pdu type (%d)\n", hdr->type);
794                 return -EINVAL;
795         }
796 }
797
798 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
799 {
800         union nvme_result res = {};
801
802         if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
803                 nvme_complete_rq(rq);
804 }
805
806 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
807                               unsigned int *offset, size_t *len)
808 {
809         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
810         struct request *rq =
811                 nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
812         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
813
814         while (true) {
815                 int recv_len, ret;
816
817                 recv_len = min_t(size_t, *len, queue->data_remaining);
818                 if (!recv_len)
819                         break;
820
821                 if (!iov_iter_count(&req->iter)) {
822                         req->curr_bio = req->curr_bio->bi_next;
823
824                         /*
825                          * If we don't have any bios it means that the controller
826                          * sent more data than we requested, hence this is an error.
827                          */
828                         if (!req->curr_bio) {
829                                 dev_err(queue->ctrl->ctrl.device,
830                                         "queue %d no space in request %#x",
831                                         nvme_tcp_queue_id(queue), rq->tag);
832                                 nvme_tcp_init_recv_ctx(queue);
833                                 return -EIO;
834                         }
835                         nvme_tcp_init_iter(req, ITER_DEST);
836                 }
837
838                 /* we can read only from what is left in this bio */
839                 recv_len = min_t(size_t, recv_len,
840                                 iov_iter_count(&req->iter));
841
842                 if (queue->data_digest)
843                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
844                                 &req->iter, recv_len, queue->rcv_hash);
845                 else
846                         ret = skb_copy_datagram_iter(skb, *offset,
847                                         &req->iter, recv_len);
848                 if (ret) {
849                         dev_err(queue->ctrl->ctrl.device,
850                                 "queue %d failed to copy request %#x data",
851                                 nvme_tcp_queue_id(queue), rq->tag);
852                         return ret;
853                 }
854
855                 *len -= recv_len;
856                 *offset += recv_len;
857                 queue->data_remaining -= recv_len;
858         }
859
860         if (!queue->data_remaining) {
861                 if (queue->data_digest) {
862                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
863                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
864                 } else {
865                         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
866                                 nvme_tcp_end_request(rq,
867                                                 le16_to_cpu(req->status));
868                                 queue->nr_cqe++;
869                         }
870                         nvme_tcp_init_recv_ctx(queue);
871                 }
872         }
873
874         return 0;
875 }
876
877 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
878                 struct sk_buff *skb, unsigned int *offset, size_t *len)
879 {
880         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
881         char *ddgst = (char *)&queue->recv_ddgst;
882         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
883         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
884         int ret;
885
886         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
887         if (unlikely(ret))
888                 return ret;
889
890         queue->ddgst_remaining -= recv_len;
891         *offset += recv_len;
892         *len -= recv_len;
893         if (queue->ddgst_remaining)
894                 return 0;
895
896         if (queue->recv_ddgst != queue->exp_ddgst) {
897                 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
898                                         pdu->command_id);
899                 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
900
901                 req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
902
903                 dev_err(queue->ctrl->ctrl.device,
904                         "data digest error: recv %#x expected %#x\n",
905                         le32_to_cpu(queue->recv_ddgst),
906                         le32_to_cpu(queue->exp_ddgst));
907         }
908
909         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
910                 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
911                                         pdu->command_id);
912                 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
913
914                 nvme_tcp_end_request(rq, le16_to_cpu(req->status));
915                 queue->nr_cqe++;
916         }
917
918         nvme_tcp_init_recv_ctx(queue);
919         return 0;
920 }
921
922 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
923                              unsigned int offset, size_t len)
924 {
925         struct nvme_tcp_queue *queue = desc->arg.data;
926         size_t consumed = len;
927         int result;
928
929         if (unlikely(!queue->rd_enabled))
930                 return -EFAULT;
931
932         while (len) {
933                 switch (nvme_tcp_recv_state(queue)) {
934                 case NVME_TCP_RECV_PDU:
935                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
936                         break;
937                 case NVME_TCP_RECV_DATA:
938                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
939                         break;
940                 case NVME_TCP_RECV_DDGST:
941                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
942                         break;
943                 default:
944                         result = -EFAULT;
945                 }
946                 if (result) {
947                         dev_err(queue->ctrl->ctrl.device,
948                                 "receive failed:  %d\n", result);
949                         queue->rd_enabled = false;
950                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
951                         return result;
952                 }
953         }
954
955         return consumed;
956 }
957
958 static void nvme_tcp_data_ready(struct sock *sk)
959 {
960         struct nvme_tcp_queue *queue;
961
962         trace_sk_data_ready(sk);
963
964         read_lock_bh(&sk->sk_callback_lock);
965         queue = sk->sk_user_data;
966         if (likely(queue && queue->rd_enabled) &&
967             !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
968                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
969         read_unlock_bh(&sk->sk_callback_lock);
970 }
971
972 static void nvme_tcp_write_space(struct sock *sk)
973 {
974         struct nvme_tcp_queue *queue;
975
976         read_lock_bh(&sk->sk_callback_lock);
977         queue = sk->sk_user_data;
978         if (likely(queue && sk_stream_is_writeable(sk))) {
979                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
980                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
981         }
982         read_unlock_bh(&sk->sk_callback_lock);
983 }
984
985 static void nvme_tcp_state_change(struct sock *sk)
986 {
987         struct nvme_tcp_queue *queue;
988
989         read_lock_bh(&sk->sk_callback_lock);
990         queue = sk->sk_user_data;
991         if (!queue)
992                 goto done;
993
994         switch (sk->sk_state) {
995         case TCP_CLOSE:
996         case TCP_CLOSE_WAIT:
997         case TCP_LAST_ACK:
998         case TCP_FIN_WAIT1:
999         case TCP_FIN_WAIT2:
1000                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
1001                 break;
1002         default:
1003                 dev_info(queue->ctrl->ctrl.device,
1004                         "queue %d socket state %d\n",
1005                         nvme_tcp_queue_id(queue), sk->sk_state);
1006         }
1007
1008         queue->state_change(sk);
1009 done:
1010         read_unlock_bh(&sk->sk_callback_lock);
1011 }
1012
1013 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
1014 {
1015         queue->request = NULL;
1016 }
1017
1018 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
1019 {
1020         if (nvme_tcp_async_req(req)) {
1021                 union nvme_result res = {};
1022
1023                 nvme_complete_async_event(&req->queue->ctrl->ctrl,
1024                                 cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
1025         } else {
1026                 nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
1027                                 NVME_SC_HOST_PATH_ERROR);
1028         }
1029 }
1030
1031 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
1032 {
1033         struct nvme_tcp_queue *queue = req->queue;
1034         int req_data_len = req->data_len;
1035         u32 h2cdata_left = req->h2cdata_left;
1036
1037         while (true) {
1038                 struct bio_vec bvec;
1039                 struct msghdr msg = {
1040                         .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
1041                 };
1042                 struct page *page = nvme_tcp_req_cur_page(req);
1043                 size_t offset = nvme_tcp_req_cur_offset(req);
1044                 size_t len = nvme_tcp_req_cur_length(req);
1045                 bool last = nvme_tcp_pdu_last_send(req, len);
1046                 int req_data_sent = req->data_sent;
1047                 int ret;
1048
1049                 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
1050                         msg.msg_flags |= MSG_EOR;
1051                 else
1052                         msg.msg_flags |= MSG_MORE;
1053
1054                 if (!sendpage_ok(page))
1055                         msg.msg_flags &= ~MSG_SPLICE_PAGES;
1056
1057                 bvec_set_page(&bvec, page, len, offset);
1058                 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1059                 ret = sock_sendmsg(queue->sock, &msg);
1060                 if (ret <= 0)
1061                         return ret;
1062
1063                 if (queue->data_digest)
1064                         nvme_tcp_ddgst_update(queue->snd_hash, page,
1065                                         offset, ret);
1066
1067                 /*
1068                  * update the request iterator except for the last payload send
1069                  * in the request where we don't want to modify it as we may
1070                  * compete with the RX path completing the request.
1071                  */
1072                 if (req_data_sent + ret < req_data_len)
1073                         nvme_tcp_advance_req(req, ret);
1074
1075                 /* fully successful last send in current PDU */
1076                 if (last && ret == len) {
1077                         if (queue->data_digest) {
1078                                 nvme_tcp_ddgst_final(queue->snd_hash,
1079                                         &req->ddgst);
1080                                 req->state = NVME_TCP_SEND_DDGST;
1081                                 req->offset = 0;
1082                         } else {
1083                                 if (h2cdata_left)
1084                                         nvme_tcp_setup_h2c_data_pdu(req);
1085                                 else
1086                                         nvme_tcp_done_send_req(queue);
1087                         }
1088                         return 1;
1089                 }
1090         }
1091         return -EAGAIN;
1092 }
1093
1094 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1095 {
1096         struct nvme_tcp_queue *queue = req->queue;
1097         struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
1098         struct bio_vec bvec;
1099         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
1100         bool inline_data = nvme_tcp_has_inline_data(req);
1101         u8 hdgst = nvme_tcp_hdgst_len(queue);
1102         int len = sizeof(*pdu) + hdgst - req->offset;
1103         int ret;
1104
1105         if (inline_data || nvme_tcp_queue_more(queue))
1106                 msg.msg_flags |= MSG_MORE;
1107         else
1108                 msg.msg_flags |= MSG_EOR;
1109
1110         if (queue->hdr_digest && !req->offset)
1111                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1112
1113         bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
1114         iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1115         ret = sock_sendmsg(queue->sock, &msg);
1116         if (unlikely(ret <= 0))
1117                 return ret;
1118
1119         len -= ret;
1120         if (!len) {
1121                 if (inline_data) {
1122                         req->state = NVME_TCP_SEND_DATA;
1123                         if (queue->data_digest)
1124                                 crypto_ahash_init(queue->snd_hash);
1125                 } else {
1126                         nvme_tcp_done_send_req(queue);
1127                 }
1128                 return 1;
1129         }
1130         req->offset += ret;
1131
1132         return -EAGAIN;
1133 }
1134
1135 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1136 {
1137         struct nvme_tcp_queue *queue = req->queue;
1138         struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
1139         struct bio_vec bvec;
1140         struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
1141         u8 hdgst = nvme_tcp_hdgst_len(queue);
1142         int len = sizeof(*pdu) - req->offset + hdgst;
1143         int ret;
1144
1145         if (queue->hdr_digest && !req->offset)
1146                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1147
1148         if (!req->h2cdata_left)
1149                 msg.msg_flags |= MSG_SPLICE_PAGES;
1150
1151         bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
1152         iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
1153         ret = sock_sendmsg(queue->sock, &msg);
1154         if (unlikely(ret <= 0))
1155                 return ret;
1156
1157         len -= ret;
1158         if (!len) {
1159                 req->state = NVME_TCP_SEND_DATA;
1160                 if (queue->data_digest)
1161                         crypto_ahash_init(queue->snd_hash);
1162                 return 1;
1163         }
1164         req->offset += ret;
1165
1166         return -EAGAIN;
1167 }
1168
1169 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1170 {
1171         struct nvme_tcp_queue *queue = req->queue;
1172         size_t offset = req->offset;
1173         u32 h2cdata_left = req->h2cdata_left;
1174         int ret;
1175         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1176         struct kvec iov = {
1177                 .iov_base = (u8 *)&req->ddgst + req->offset,
1178                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1179         };
1180
1181         if (nvme_tcp_queue_more(queue))
1182                 msg.msg_flags |= MSG_MORE;
1183         else
1184                 msg.msg_flags |= MSG_EOR;
1185
1186         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1187         if (unlikely(ret <= 0))
1188                 return ret;
1189
1190         if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1191                 if (h2cdata_left)
1192                         nvme_tcp_setup_h2c_data_pdu(req);
1193                 else
1194                         nvme_tcp_done_send_req(queue);
1195                 return 1;
1196         }
1197
1198         req->offset += ret;
1199         return -EAGAIN;
1200 }
1201
1202 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1203 {
1204         struct nvme_tcp_request *req;
1205         unsigned int noreclaim_flag;
1206         int ret = 1;
1207
1208         if (!queue->request) {
1209                 queue->request = nvme_tcp_fetch_request(queue);
1210                 if (!queue->request)
1211                         return 0;
1212         }
1213         req = queue->request;
1214
1215         noreclaim_flag = memalloc_noreclaim_save();
1216         if (req->state == NVME_TCP_SEND_CMD_PDU) {
1217                 ret = nvme_tcp_try_send_cmd_pdu(req);
1218                 if (ret <= 0)
1219                         goto done;
1220                 if (!nvme_tcp_has_inline_data(req))
1221                         goto out;
1222         }
1223
1224         if (req->state == NVME_TCP_SEND_H2C_PDU) {
1225                 ret = nvme_tcp_try_send_data_pdu(req);
1226                 if (ret <= 0)
1227                         goto done;
1228         }
1229
1230         if (req->state == NVME_TCP_SEND_DATA) {
1231                 ret = nvme_tcp_try_send_data(req);
1232                 if (ret <= 0)
1233                         goto done;
1234         }
1235
1236         if (req->state == NVME_TCP_SEND_DDGST)
1237                 ret = nvme_tcp_try_send_ddgst(req);
1238 done:
1239         if (ret == -EAGAIN) {
1240                 ret = 0;
1241         } else if (ret < 0) {
1242                 dev_err(queue->ctrl->ctrl.device,
1243                         "failed to send request %d\n", ret);
1244                 nvme_tcp_fail_request(queue->request);
1245                 nvme_tcp_done_send_req(queue);
1246         }
1247 out:
1248         memalloc_noreclaim_restore(noreclaim_flag);
1249         return ret;
1250 }
1251
1252 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1253 {
1254         struct socket *sock = queue->sock;
1255         struct sock *sk = sock->sk;
1256         read_descriptor_t rd_desc;
1257         int consumed;
1258
1259         rd_desc.arg.data = queue;
1260         rd_desc.count = 1;
1261         lock_sock(sk);
1262         queue->nr_cqe = 0;
1263         consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1264         release_sock(sk);
1265         return consumed;
1266 }
1267
1268 static void nvme_tcp_io_work(struct work_struct *w)
1269 {
1270         struct nvme_tcp_queue *queue =
1271                 container_of(w, struct nvme_tcp_queue, io_work);
1272         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1273
1274         do {
1275                 bool pending = false;
1276                 int result;
1277
1278                 if (mutex_trylock(&queue->send_mutex)) {
1279                         result = nvme_tcp_try_send(queue);
1280                         mutex_unlock(&queue->send_mutex);
1281                         if (result > 0)
1282                                 pending = true;
1283                         else if (unlikely(result < 0))
1284                                 break;
1285                 }
1286
1287                 result = nvme_tcp_try_recv(queue);
1288                 if (result > 0)
1289                         pending = true;
1290                 else if (unlikely(result < 0))
1291                         return;
1292
1293                 if (!pending || !queue->rd_enabled)
1294                         return;
1295
1296         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1297
1298         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1299 }
1300
1301 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1302 {
1303         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1304
1305         ahash_request_free(queue->rcv_hash);
1306         ahash_request_free(queue->snd_hash);
1307         crypto_free_ahash(tfm);
1308 }
1309
1310 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1311 {
1312         struct crypto_ahash *tfm;
1313
1314         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1315         if (IS_ERR(tfm))
1316                 return PTR_ERR(tfm);
1317
1318         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1319         if (!queue->snd_hash)
1320                 goto free_tfm;
1321         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1322
1323         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1324         if (!queue->rcv_hash)
1325                 goto free_snd_hash;
1326         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1327
1328         return 0;
1329 free_snd_hash:
1330         ahash_request_free(queue->snd_hash);
1331 free_tfm:
1332         crypto_free_ahash(tfm);
1333         return -ENOMEM;
1334 }
1335
1336 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1337 {
1338         struct nvme_tcp_request *async = &ctrl->async_req;
1339
1340         page_frag_free(async->pdu);
1341 }
1342
1343 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1344 {
1345         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1346         struct nvme_tcp_request *async = &ctrl->async_req;
1347         u8 hdgst = nvme_tcp_hdgst_len(queue);
1348
1349         async->pdu = page_frag_alloc(&queue->pf_cache,
1350                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1351                 GFP_KERNEL | __GFP_ZERO);
1352         if (!async->pdu)
1353                 return -ENOMEM;
1354
1355         async->queue = &ctrl->queues[0];
1356         return 0;
1357 }
1358
1359 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1360 {
1361         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1362         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1363         unsigned int noreclaim_flag;
1364
1365         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1366                 return;
1367
1368         if (queue->hdr_digest || queue->data_digest)
1369                 nvme_tcp_free_crypto(queue);
1370
1371         page_frag_cache_drain(&queue->pf_cache);
1372
1373         noreclaim_flag = memalloc_noreclaim_save();
1374         /* ->sock will be released by fput() */
1375         fput(queue->sock->file);
1376         queue->sock = NULL;
1377         memalloc_noreclaim_restore(noreclaim_flag);
1378
1379         kfree(queue->pdu);
1380         mutex_destroy(&queue->send_mutex);
1381         mutex_destroy(&queue->queue_lock);
1382 }
1383
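/*
 * Initial connection handshake: send an ICReq (PFV 1.0, no PDU alignment,
 * a single in-flight R2T, optional header/data digests) and validate the
 * ICResp: the digest settings must match what was requested, CPDA must be
 * zero and MAXH2CDATA must be a multiple of 4 and at least
 * NVME_TCP_MIN_MAXH2CDATA.  On a TLS-enabled queue the reply must also
 * arrive as a TLS data record.
 */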
1384 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1385 {
1386         struct nvme_tcp_icreq_pdu *icreq;
1387         struct nvme_tcp_icresp_pdu *icresp;
1388         char cbuf[CMSG_LEN(sizeof(char))] = {};
1389         u8 ctype;
1390         struct msghdr msg = {};
1391         struct kvec iov;
1392         bool ctrl_hdgst, ctrl_ddgst;
1393         u32 maxh2cdata;
1394         int ret;
1395
1396         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1397         if (!icreq)
1398                 return -ENOMEM;
1399
1400         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1401         if (!icresp) {
1402                 ret = -ENOMEM;
1403                 goto free_icreq;
1404         }
1405
1406         icreq->hdr.type = nvme_tcp_icreq;
1407         icreq->hdr.hlen = sizeof(*icreq);
1408         icreq->hdr.pdo = 0;
1409         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1410         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1411         icreq->maxr2t = 0; /* single inflight r2t supported */
1412         icreq->hpda = 0; /* no alignment constraint */
1413         if (queue->hdr_digest)
1414                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1415         if (queue->data_digest)
1416                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1417
1418         iov.iov_base = icreq;
1419         iov.iov_len = sizeof(*icreq);
1420         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1421         if (ret < 0) {
1422                 pr_warn("queue %d: failed to send icreq, error %d\n",
1423                         nvme_tcp_queue_id(queue), ret);
1424                 goto free_icresp;
1425         }
1426
1427         memset(&msg, 0, sizeof(msg));
1428         iov.iov_base = icresp;
1429         iov.iov_len = sizeof(*icresp);
1430         if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
1431                 msg.msg_control = cbuf;
1432                 msg.msg_controllen = sizeof(cbuf);
1433         }
1434         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1435                         iov.iov_len, msg.msg_flags);
1436         if (ret < 0) {
1437                 pr_warn("queue %d: failed to receive icresp, error %d\n",
1438                         nvme_tcp_queue_id(queue), ret);
1439                 goto free_icresp;
1440         }
1441         ret = -ENOTCONN;
1442         if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
1443                 ctype = tls_get_record_type(queue->sock->sk,
1444                                             (struct cmsghdr *)cbuf);
1445                 if (ctype != TLS_RECORD_TYPE_DATA) {
1446                         pr_err("queue %d: unhandled TLS record %d\n",
1447                                nvme_tcp_queue_id(queue), ctype);
1448                         goto free_icresp;
1449                 }
1450         }
1451         ret = -EINVAL;
1452         if (icresp->hdr.type != nvme_tcp_icresp) {
1453                 pr_err("queue %d: bad type returned %d\n",
1454                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1455                 goto free_icresp;
1456         }
1457
1458         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1459                 pr_err("queue %d: bad pdu length returned %d\n",
1460                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1461                 goto free_icresp;
1462         }
1463
1464         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1465                 pr_err("queue %d: bad pfv returned %d\n",
1466                         nvme_tcp_queue_id(queue), icresp->pfv);
1467                 goto free_icresp;
1468         }
1469
1470         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1471         if ((queue->data_digest && !ctrl_ddgst) ||
1472             (!queue->data_digest && ctrl_ddgst)) {
1473                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1474                         nvme_tcp_queue_id(queue),
1475                         queue->data_digest ? "enabled" : "disabled",
1476                         ctrl_ddgst ? "enabled" : "disabled");
1477                 goto free_icresp;
1478         }
1479
1480         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1481         if ((queue->hdr_digest && !ctrl_hdgst) ||
1482             (!queue->hdr_digest && ctrl_hdgst)) {
1483                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1484                         nvme_tcp_queue_id(queue),
1485                         queue->hdr_digest ? "enabled" : "disabled",
1486                         ctrl_hdgst ? "enabled" : "disabled");
1487                 goto free_icresp;
1488         }
1489
1490         if (icresp->cpda != 0) {
1491                 pr_err("queue %d: unsupported cpda returned %d\n",
1492                         nvme_tcp_queue_id(queue), icresp->cpda);
1493                 goto free_icresp;
1494         }
1495
1496         maxh2cdata = le32_to_cpu(icresp->maxdata);
1497         if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1498                 pr_err("queue %d: invalid maxh2cdata returned %u\n",
1499                        nvme_tcp_queue_id(queue), maxh2cdata);
1500                 goto free_icresp;
1501         }
1502         queue->maxh2cdata = maxh2cdata;
1503
1504         ret = 0;
1505 free_icresp:
1506         kfree(icresp);
1507 free_icreq:
1508         kfree(icreq);
1509         return ret;
1510 }
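/*
 * Note on the ICResp validation above: the response must be an
 * nvme_tcp_icresp PDU of the expected length, advertise PFV 1.0, agree
 * with the host on header and data digest usage, report a CPDA of 0
 * (no extra controller PDU data alignment), and return a MAXH2CDATA
 * value that is a multiple of 4 and at least NVME_TCP_MIN_MAXH2CDATA.
 * Any other combination fails queue initialization with -EINVAL (or
 * -ENOTCONN for an unexpected TLS record type).
 */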
1511
1512 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1513 {
1514         return nvme_tcp_queue_id(queue) == 0;
1515 }
1516
1517 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1518 {
1519         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1520         int qid = nvme_tcp_queue_id(queue);
1521
1522         return !nvme_tcp_admin_queue(queue) &&
1523                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1524 }
1525
1526 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1527 {
1528         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1529         int qid = nvme_tcp_queue_id(queue);
1530
1531         return !nvme_tcp_admin_queue(queue) &&
1532                 !nvme_tcp_default_queue(queue) &&
1533                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1534                           ctrl->io_queues[HCTX_TYPE_READ];
1535 }
1536
1537 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1538 {
1539         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1540         int qid = nvme_tcp_queue_id(queue);
1541
1542         return !nvme_tcp_admin_queue(queue) &&
1543                 !nvme_tcp_default_queue(queue) &&
1544                 !nvme_tcp_read_queue(queue) &&
1545                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1546                           ctrl->io_queues[HCTX_TYPE_READ] +
1547                           ctrl->io_queues[HCTX_TYPE_POLL];
1548 }
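/*
 * The three helpers above partition the I/O queue ids into contiguous
 * ranges by queue type: qid 0 is the admin queue, followed by the
 * default queues, then the read queues, then the poll queues.  As a
 * worked example (values chosen purely for illustration), with
 * io_queues[HCTX_TYPE_DEFAULT] = 4, io_queues[HCTX_TYPE_READ] = 2 and
 * io_queues[HCTX_TYPE_POLL] = 1, qids 1-4 are default queues, qids 5-6
 * are read queues and qid 7 is the poll queue.
 */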
1549
1550 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1551 {
1552         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1553         int qid = nvme_tcp_queue_id(queue);
1554         int n = 0;
1555
1556         if (nvme_tcp_default_queue(queue))
1557                 n = qid - 1;
1558         else if (nvme_tcp_read_queue(queue))
1559                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1560         else if (nvme_tcp_poll_queue(queue))
1561                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1562                                 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1563         if (wq_unbound)
1564                 queue->io_cpu = WORK_CPU_UNBOUND;
1565         else
1566                 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1567 }
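/*
 * Above, 'n' is the queue's index within its own type (default, read
 * or poll).  Unless wq_unbound is set, the n-th queue of each type is
 * assigned an online CPU picked by walking cpu_online_mask, which
 * spreads io_work for the different queues across CPUs; queue->io_cpu
 * is later passed to queue_work_on() when scheduling io_work.  With
 * wq_unbound the choice is left to the workqueue via WORK_CPU_UNBOUND.
 */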
1568
1569 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
1570 {
1571         struct nvme_tcp_queue *queue = data;
1572         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1573         int qid = nvme_tcp_queue_id(queue);
1574         struct key *tls_key;
1575
1576         dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n",
1577                 qid, pskid, status);
1578
1579         if (status) {
1580                 queue->tls_err = -status;
1581                 goto out_complete;
1582         }
1583
1584         tls_key = key_lookup(pskid);
1585         if (IS_ERR(tls_key)) {
1586                 dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
1587                          qid, pskid);
1588                 queue->tls_err = -ENOKEY;
1589         } else {
1590                 ctrl->ctrl.tls_key = tls_key;
1591                 queue->tls_err = 0;
1592         }
1593
1594 out_complete:
1595         complete(&queue->tls_complete);
1596 }
1597
1598 static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
1599                               struct nvme_tcp_queue *queue,
1600                               key_serial_t pskid)
1601 {
1602         int qid = nvme_tcp_queue_id(queue);
1603         int ret;
1604         struct tls_handshake_args args;
1605         unsigned long tmo = tls_handshake_timeout * HZ;
1606         key_serial_t keyring = nvme_keyring_id();
1607
1608         dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n",
1609                 qid, pskid);
1610         memset(&args, 0, sizeof(args));
1611         args.ta_sock = queue->sock;
1612         args.ta_done = nvme_tcp_tls_done;
1613         args.ta_data = queue;
1614         args.ta_my_peerids[0] = pskid;
1615         args.ta_num_peerids = 1;
1616         if (nctrl->opts->keyring)
1617                 keyring = key_serial(nctrl->opts->keyring);
1618         args.ta_keyring = keyring;
1619         args.ta_timeout_ms = tls_handshake_timeout * 1000;
1620         queue->tls_err = -EOPNOTSUPP;
1621         init_completion(&queue->tls_complete);
1622         ret = tls_client_hello_psk(&args, GFP_KERNEL);
1623         if (ret) {
1624                 dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n",
1625                         qid, ret);
1626                 return ret;
1627         }
1628         ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo);
1629         if (ret <= 0) {
1630                 if (ret == 0)
1631                         ret = -ETIMEDOUT;
1632
1633                 dev_err(nctrl->device,
1634                         "queue %d: TLS handshake failed, error %d\n",
1635                         qid, ret);
1636                 tls_handshake_cancel(queue->sock->sk);
1637         } else {
1638                 dev_dbg(nctrl->device,
1639                         "queue %d: TLS handshake complete, error %d\n",
1640                         qid, queue->tls_err);
1641                 ret = queue->tls_err;
1642         }
1643         return ret;
1644 }
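/*
 * The TLS setup above delegates the actual handshake to the kernel
 * handshake upcall via tls_client_hello_psk() (typically serviced by a
 * userspace agent); the result comes back asynchronously through
 * nvme_tcp_tls_done(), which records the negotiated key and sets
 * queue->tls_err.  The caller waits for that completion for up to
 * tls_handshake_timeout seconds and cancels the handshake if the wait
 * times out or is interrupted.
 */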
1645
1646 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
1647                                 key_serial_t pskid)
1648 {
1649         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1650         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1651         int ret, rcv_pdu_size;
1652         struct file *sock_file;
1653
1654         mutex_init(&queue->queue_lock);
1655         queue->ctrl = ctrl;
1656         init_llist_head(&queue->req_list);
1657         INIT_LIST_HEAD(&queue->send_list);
1658         mutex_init(&queue->send_mutex);
1659         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1660
1661         if (qid > 0)
1662                 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1663         else
1664                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1665                                                 NVME_TCP_ADMIN_CCSZ;
1666
1667         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1668                         IPPROTO_TCP, &queue->sock);
1669         if (ret) {
1670                 dev_err(nctrl->device,
1671                         "failed to create socket: %d\n", ret);
1672                 goto err_destroy_mutex;
1673         }
1674
1675         sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL);
1676         if (IS_ERR(sock_file)) {
1677                 ret = PTR_ERR(sock_file);
1678                 goto err_destroy_mutex;
1679         }
1680         nvme_tcp_reclassify_socket(queue->sock);
1681
1682         /* Single SYN retry */
1683         tcp_sock_set_syncnt(queue->sock->sk, 1);
1684
1685         /* Set TCP no delay */
1686         tcp_sock_set_nodelay(queue->sock->sk);
1687
1688         /*
1689          * Clean up whatever is sitting in the TCP transmit queue on socket
1690          * close. This is done to prevent stale data from being sent should
1691          * the network connection be restored before TCP times out.
1692          */
1693         sock_no_linger(queue->sock->sk);
1694
1695         if (so_priority > 0)
1696                 sock_set_priority(queue->sock->sk, so_priority);
1697
1698         /* Set socket type of service */
1699         if (nctrl->opts->tos >= 0)
1700                 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1701
1702         /* Set a 10-second timeout for the icresp recvmsg */
1703         queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1704
1705         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1706         queue->sock->sk->sk_use_task_frag = false;
1707         nvme_tcp_set_queue_io_cpu(queue);
1708         queue->request = NULL;
1709         queue->data_remaining = 0;
1710         queue->ddgst_remaining = 0;
1711         queue->pdu_remaining = 0;
1712         queue->pdu_offset = 0;
1713         sk_set_memalloc(queue->sock->sk);
1714
1715         if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1716                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1717                         sizeof(ctrl->src_addr));
1718                 if (ret) {
1719                         dev_err(nctrl->device,
1720                                 "failed to bind queue %d socket %d\n",
1721                                 qid, ret);
1722                         goto err_sock;
1723                 }
1724         }
1725
1726         if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1727                 char *iface = nctrl->opts->host_iface;
1728                 sockptr_t optval = KERNEL_SOCKPTR(iface);
1729
1730                 ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1731                                       optval, strlen(iface));
1732                 if (ret) {
1733                         dev_err(nctrl->device,
1734                           "failed to bind to interface %s queue %d err %d\n",
1735                           iface, qid, ret);
1736                         goto err_sock;
1737                 }
1738         }
1739
1740         queue->hdr_digest = nctrl->opts->hdr_digest;
1741         queue->data_digest = nctrl->opts->data_digest;
1742         if (queue->hdr_digest || queue->data_digest) {
1743                 ret = nvme_tcp_alloc_crypto(queue);
1744                 if (ret) {
1745                         dev_err(nctrl->device,
1746                                 "failed to allocate queue %d crypto\n", qid);
1747                         goto err_sock;
1748                 }
1749         }
1750
1751         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1752                         nvme_tcp_hdgst_len(queue);
1753         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1754         if (!queue->pdu) {
1755                 ret = -ENOMEM;
1756                 goto err_crypto;
1757         }
1758
1759         dev_dbg(nctrl->device, "connecting queue %d\n",
1760                         nvme_tcp_queue_id(queue));
1761
1762         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1763                 sizeof(ctrl->addr), 0);
1764         if (ret) {
1765                 dev_err(nctrl->device,
1766                         "failed to connect socket: %d\n", ret);
1767                 goto err_rcv_pdu;
1768         }
1769
1770         /* If PSKs are configured, try to start TLS */
1771         if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
1772                 ret = nvme_tcp_start_tls(nctrl, queue, pskid);
1773                 if (ret)
1774                         goto err_init_connect;
1775         }
1776
1777         ret = nvme_tcp_init_connection(queue);
1778         if (ret)
1779                 goto err_init_connect;
1780
1781         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1782
1783         return 0;
1784
1785 err_init_connect:
1786         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1787 err_rcv_pdu:
1788         kfree(queue->pdu);
1789 err_crypto:
1790         if (queue->hdr_digest || queue->data_digest)
1791                 nvme_tcp_free_crypto(queue);
1792 err_sock:
1793         /* ->sock will be released by fput() */
1794         fput(queue->sock->file);
1795         queue->sock = NULL;
1796 err_destroy_mutex:
1797         mutex_destroy(&queue->send_mutex);
1798         mutex_destroy(&queue->queue_lock);
1799         return ret;
1800 }
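/*
 * The error labels above unwind in reverse order of setup: shut the
 * socket down if TLS or the ICReq/ICResp exchange failed after
 * connecting, free the receive PDU buffer, free the digest crypto
 * contexts if they were allocated, and drop the socket.  The socket
 * itself is released through fput() on the file attached by
 * sock_alloc_file() rather than by a direct sock_release().
 */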
1801
1802 static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
1803 {
1804         struct socket *sock = queue->sock;
1805
1806         write_lock_bh(&sock->sk->sk_callback_lock);
1807         sock->sk->sk_user_data  = NULL;
1808         sock->sk->sk_data_ready = queue->data_ready;
1809         sock->sk->sk_state_change = queue->state_change;
1810         sock->sk->sk_write_space  = queue->write_space;
1811         write_unlock_bh(&sock->sk->sk_callback_lock);
1812 }
1813
1814 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1815 {
1816         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1817         nvme_tcp_restore_sock_ops(queue);
1818         cancel_work_sync(&queue->io_work);
1819 }
1820
1821 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1822 {
1823         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1824         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1825
1826         if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1827                 return;
1828
1829         mutex_lock(&queue->queue_lock);
1830         if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1831                 __nvme_tcp_stop_queue(queue);
1832         mutex_unlock(&queue->queue_lock);
1833 }
1834
1835 static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
1836 {
1837         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1838         queue->sock->sk->sk_user_data = queue;
1839         queue->state_change = queue->sock->sk->sk_state_change;
1840         queue->data_ready = queue->sock->sk->sk_data_ready;
1841         queue->write_space = queue->sock->sk->sk_write_space;
1842         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1843         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1844         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1845 #ifdef CONFIG_NET_RX_BUSY_POLL
1846         queue->sock->sk->sk_ll_usec = 1;
1847 #endif
1848         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1849 }
1850
1851 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1852 {
1853         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1854         struct nvme_tcp_queue *queue = &ctrl->queues[idx];
1855         int ret;
1856
1857         queue->rd_enabled = true;
1858         nvme_tcp_init_recv_ctx(queue);
1859         nvme_tcp_setup_sock_ops(queue);
1860
1861         if (idx)
1862                 ret = nvmf_connect_io_queue(nctrl, idx);
1863         else
1864                 ret = nvmf_connect_admin_queue(nctrl);
1865
1866         if (!ret) {
1867                 set_bit(NVME_TCP_Q_LIVE, &queue->flags);
1868         } else {
1869                 if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1870                         __nvme_tcp_stop_queue(queue);
1871                 dev_err(nctrl->device,
1872                         "failed to connect queue: %d ret=%d\n", idx, ret);
1873         }
1874         return ret;
1875 }
1876
1877 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1878 {
1879         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1880                 cancel_work_sync(&ctrl->async_event_work);
1881                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1882                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1883         }
1884
1885         nvme_tcp_free_queue(ctrl, 0);
1886 }
1887
1888 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1889 {
1890         int i;
1891
1892         for (i = 1; i < ctrl->queue_count; i++)
1893                 nvme_tcp_free_queue(ctrl, i);
1894 }
1895
1896 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1897 {
1898         int i;
1899
1900         for (i = 1; i < ctrl->queue_count; i++)
1901                 nvme_tcp_stop_queue(ctrl, i);
1902 }
1903
1904 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
1905                                     int first, int last)
1906 {
1907         int i, ret;
1908
1909         for (i = first; i < last; i++) {
1910                 ret = nvme_tcp_start_queue(ctrl, i);
1911                 if (ret)
1912                         goto out_stop_queues;
1913         }
1914
1915         return 0;
1916
1917 out_stop_queues:
1918         for (i--; i >= first; i--)
1919                 nvme_tcp_stop_queue(ctrl, i);
1920         return ret;
1921 }
1922
1923 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1924 {
1925         int ret;
1926         key_serial_t pskid = 0;
1927
1928         if (nvme_tcp_tls(ctrl)) {
1929                 if (ctrl->opts->tls_key)
1930                         pskid = key_serial(ctrl->opts->tls_key);
1931                 else
1932                         pskid = nvme_tls_psk_default(ctrl->opts->keyring,
1933                                                       ctrl->opts->host->nqn,
1934                                                       ctrl->opts->subsysnqn);
1935                 if (!pskid) {
1936                         dev_err(ctrl->device, "no valid PSK found\n");
1937                         return -ENOKEY;
1938                 }
1939         }
1940
1941         ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
1942         if (ret)
1943                 return ret;
1944
1945         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1946         if (ret)
1947                 goto out_free_queue;
1948
1949         return 0;
1950
1951 out_free_queue:
1952         nvme_tcp_free_queue(ctrl, 0);
1953         return ret;
1954 }
1955
1956 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1957 {
1958         int i, ret;
1959
1960         if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
1961                 dev_err(ctrl->device, "no PSK negotiated\n");
1962                 return -ENOKEY;
1963         }
1964         for (i = 1; i < ctrl->queue_count; i++) {
1965                 ret = nvme_tcp_alloc_queue(ctrl, i,
1966                                 key_serial(ctrl->tls_key));
1967                 if (ret)
1968                         goto out_free_queues;
1969         }
1970
1971         return 0;
1972
1973 out_free_queues:
1974         for (i--; i >= 1; i--)
1975                 nvme_tcp_free_queue(ctrl, i);
1976
1977         return ret;
1978 }
1979
1980 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1981 {
1982         unsigned int nr_io_queues;
1983         int ret;
1984
1985         nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
1986         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1987         if (ret)
1988                 return ret;
1989
1990         if (nr_io_queues == 0) {
1991                 dev_err(ctrl->device,
1992                         "unable to set any I/O queues\n");
1993                 return -ENOMEM;
1994         }
1995
1996         ctrl->queue_count = nr_io_queues + 1;
1997         dev_info(ctrl->device,
1998                 "creating %d I/O queues.\n", nr_io_queues);
1999
2000         nvmf_set_io_queues(ctrl->opts, nr_io_queues,
2001                            to_tcp_ctrl(ctrl)->io_queues);
2002         return __nvme_tcp_alloc_io_queues(ctrl);
2003 }
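/*
 * queue_count includes the admin queue (nr_io_queues + 1), which is
 * why the I/O queue loops in this file start at index 1 and why
 * nr_queues in nvme_tcp_configure_io_queues() below is compared
 * against nr_hw_queues + 1.
 */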
2004
2005 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
2006 {
2007         nvme_tcp_stop_io_queues(ctrl);
2008         if (remove)
2009                 nvme_remove_io_tag_set(ctrl);
2010         nvme_tcp_free_io_queues(ctrl);
2011 }
2012
2013 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
2014 {
2015         int ret, nr_queues;
2016
2017         ret = nvme_tcp_alloc_io_queues(ctrl);
2018         if (ret)
2019                 return ret;
2020
2021         if (new) {
2022                 ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
2023                                 &nvme_tcp_mq_ops,
2024                                 ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
2025                                 sizeof(struct nvme_tcp_request));
2026                 if (ret)
2027                         goto out_free_io_queues;
2028         }
2029
2030         /*
2031          * Only start IO queues for which we have allocated the tagset
2032          * and limited it to the available queues. On reconnects, the
2033          * queue number might have changed.
2034          */
2035         nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
2036         ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
2037         if (ret)
2038                 goto out_cleanup_connect_q;
2039
2040         if (!new) {
2041                 nvme_start_freeze(ctrl);
2042                 nvme_unquiesce_io_queues(ctrl);
2043                 if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
2044                         /*
2045                          * If we timed out waiting for freeze, we are likely to be
2046                          * be stuck.  Fail the controller initialization just
2047                          * to be safe.
2048                          */
2049                         ret = -ENODEV;
2050                         nvme_unfreeze(ctrl);
2051                         goto out_wait_freeze_timed_out;
2052                 }
2053                 blk_mq_update_nr_hw_queues(ctrl->tagset,
2054                         ctrl->queue_count - 1);
2055                 nvme_unfreeze(ctrl);
2056         }
2057
2058         /*
2059          * If the number of queues has increased (reconnect case)
2060          * start all new queues now.
2061          */
2062         ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
2063                                        ctrl->tagset->nr_hw_queues + 1);
2064         if (ret)
2065                 goto out_wait_freeze_timed_out;
2066
2067         return 0;
2068
2069 out_wait_freeze_timed_out:
2070         nvme_quiesce_io_queues(ctrl);
2071         nvme_sync_io_queues(ctrl);
2072         nvme_tcp_stop_io_queues(ctrl);
2073 out_cleanup_connect_q:
2074         nvme_cancel_tagset(ctrl);
2075         if (new)
2076                 nvme_remove_io_tag_set(ctrl);
2077 out_free_io_queues:
2078         nvme_tcp_free_io_queues(ctrl);
2079         return ret;
2080 }
2081
2082 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
2083 {
2084         nvme_tcp_stop_queue(ctrl, 0);
2085         if (remove)
2086                 nvme_remove_admin_tag_set(ctrl);
2087         nvme_tcp_free_admin_queue(ctrl);
2088 }
2089
2090 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
2091 {
2092         int error;
2093
2094         error = nvme_tcp_alloc_admin_queue(ctrl);
2095         if (error)
2096                 return error;
2097
2098         if (new) {
2099                 error = nvme_alloc_admin_tag_set(ctrl,
2100                                 &to_tcp_ctrl(ctrl)->admin_tag_set,
2101                                 &nvme_tcp_admin_mq_ops,
2102                                 sizeof(struct nvme_tcp_request));
2103                 if (error)
2104                         goto out_free_queue;
2105         }
2106
2107         error = nvme_tcp_start_queue(ctrl, 0);
2108         if (error)
2109                 goto out_cleanup_tagset;
2110
2111         error = nvme_enable_ctrl(ctrl);
2112         if (error)
2113                 goto out_stop_queue;
2114
2115         nvme_unquiesce_admin_queue(ctrl);
2116
2117         error = nvme_init_ctrl_finish(ctrl, false);
2118         if (error)
2119                 goto out_quiesce_queue;
2120
2121         return 0;
2122
2123 out_quiesce_queue:
2124         nvme_quiesce_admin_queue(ctrl);
2125         blk_sync_queue(ctrl->admin_q);
2126 out_stop_queue:
2127         nvme_tcp_stop_queue(ctrl, 0);
2128         nvme_cancel_admin_tagset(ctrl);
2129 out_cleanup_tagset:
2130         if (new)
2131                 nvme_remove_admin_tag_set(ctrl);
2132 out_free_queue:
2133         nvme_tcp_free_admin_queue(ctrl);
2134         return error;
2135 }
2136
2137 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
2138                 bool remove)
2139 {
2140         nvme_quiesce_admin_queue(ctrl);
2141         blk_sync_queue(ctrl->admin_q);
2142         nvme_tcp_stop_queue(ctrl, 0);
2143         nvme_cancel_admin_tagset(ctrl);
2144         if (remove)
2145                 nvme_unquiesce_admin_queue(ctrl);
2146         nvme_tcp_destroy_admin_queue(ctrl, remove);
2147 }
2148
2149 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
2150                 bool remove)
2151 {
2152         if (ctrl->queue_count <= 1)
2153                 return;
2154         nvme_quiesce_admin_queue(ctrl);
2155         nvme_quiesce_io_queues(ctrl);
2156         nvme_sync_io_queues(ctrl);
2157         nvme_tcp_stop_io_queues(ctrl);
2158         nvme_cancel_tagset(ctrl);
2159         if (remove)
2160                 nvme_unquiesce_io_queues(ctrl);
2161         nvme_tcp_destroy_io_queues(ctrl, remove);
2162 }
2163
2164 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
2165                 int status)
2166 {
2167         enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2168
2169         /* If we are resetting/deleting then do nothing */
2170         if (state != NVME_CTRL_CONNECTING) {
2171                 WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
2172                 return;
2173         }
2174
2175         if (nvmf_should_reconnect(ctrl, status)) {
2176                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
2177                         ctrl->opts->reconnect_delay);
2178                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
2179                                 ctrl->opts->reconnect_delay * HZ);
2180         } else {
2181                 dev_info(ctrl->device, "Removing controller (%d)...\n",
2182                          status);
2183                 nvme_delete_ctrl(ctrl);
2184         }
2185 }
2186
2187 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
2188 {
2189         struct nvmf_ctrl_options *opts = ctrl->opts;
2190         int ret;
2191
2192         ret = nvme_tcp_configure_admin_queue(ctrl, new);
2193         if (ret)
2194                 return ret;
2195
2196         if (ctrl->icdoff) {
2197                 ret = -EOPNOTSUPP;
2198                 dev_err(ctrl->device, "icdoff is not supported!\n");
2199                 goto destroy_admin;
2200         }
2201
2202         if (!nvme_ctrl_sgl_supported(ctrl)) {
2203                 ret = -EOPNOTSUPP;
2204                 dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2205                 goto destroy_admin;
2206         }
2207
2208         if (opts->queue_size > ctrl->sqsize + 1)
2209                 dev_warn(ctrl->device,
2210                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
2211                         opts->queue_size, ctrl->sqsize + 1);
2212
2213         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2214                 dev_warn(ctrl->device,
2215                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
2216                         ctrl->sqsize + 1, ctrl->maxcmd);
2217                 ctrl->sqsize = ctrl->maxcmd - 1;
2218         }
2219
2220         if (ctrl->queue_count > 1) {
2221                 ret = nvme_tcp_configure_io_queues(ctrl, new);
2222                 if (ret)
2223                         goto destroy_admin;
2224         }
2225
2226         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2227                 /*
2228                  * State change failure is ok if we started ctrl delete,
2229                  * unless we're in the middle of creating a new controller,
2230                  * in order to avoid races with the teardown flow.
2231                  */
2232                 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2233
2234                 WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2235                              state != NVME_CTRL_DELETING_NOIO);
2236                 WARN_ON_ONCE(new);
2237                 ret = -EINVAL;
2238                 goto destroy_io;
2239         }
2240
2241         nvme_start_ctrl(ctrl);
2242         return 0;
2243
2244 destroy_io:
2245         if (ctrl->queue_count > 1) {
2246                 nvme_quiesce_io_queues(ctrl);
2247                 nvme_sync_io_queues(ctrl);
2248                 nvme_tcp_stop_io_queues(ctrl);
2249                 nvme_cancel_tagset(ctrl);
2250                 nvme_tcp_destroy_io_queues(ctrl, new);
2251         }
2252 destroy_admin:
2253         nvme_stop_keep_alive(ctrl);
2254         nvme_tcp_teardown_admin_queue(ctrl, false);
2255         return ret;
2256 }
2257
2258 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2259 {
2260         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2261                         struct nvme_tcp_ctrl, connect_work);
2262         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2263         int ret;
2264
2265         ++ctrl->nr_reconnects;
2266
2267         ret = nvme_tcp_setup_ctrl(ctrl, false);
2268         if (ret)
2269                 goto requeue;
2270
2271         dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n",
2272                  ctrl->nr_reconnects, ctrl->opts->max_reconnects);
2273
2274         ctrl->nr_reconnects = 0;
2275
2276         return;
2277
2278 requeue:
2279         dev_info(ctrl->device, "Failed reconnect attempt %d/%d\n",
2280                  ctrl->nr_reconnects, ctrl->opts->max_reconnects);
2281         nvme_tcp_reconnect_or_remove(ctrl, ret);
2282 }
2283
2284 static void nvme_tcp_error_recovery_work(struct work_struct *work)
2285 {
2286         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2287                                 struct nvme_tcp_ctrl, err_work);
2288         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2289
2290         nvme_stop_keep_alive(ctrl);
2291         flush_work(&ctrl->async_event_work);
2292         nvme_tcp_teardown_io_queues(ctrl, false);
2293         /* unquiesce to fast-fail pending requests */
2294         nvme_unquiesce_io_queues(ctrl);
2295         nvme_tcp_teardown_admin_queue(ctrl, false);
2296         nvme_unquiesce_admin_queue(ctrl);
2297         nvme_auth_stop(ctrl);
2298
2299         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2300                 /* state change failure is ok if we started ctrl delete */
2301                 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2302
2303                 WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2304                              state != NVME_CTRL_DELETING_NOIO);
2305                 return;
2306         }
2307
2308         nvme_tcp_reconnect_or_remove(ctrl, 0);
2309 }
2310
2311 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2312 {
2313         nvme_tcp_teardown_io_queues(ctrl, shutdown);
2314         nvme_quiesce_admin_queue(ctrl);
2315         nvme_disable_ctrl(ctrl, shutdown);
2316         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2317 }
2318
2319 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2320 {
2321         nvme_tcp_teardown_ctrl(ctrl, true);
2322 }
2323
2324 static void nvme_reset_ctrl_work(struct work_struct *work)
2325 {
2326         struct nvme_ctrl *ctrl =
2327                 container_of(work, struct nvme_ctrl, reset_work);
2328         int ret;
2329
2330         nvme_stop_ctrl(ctrl);
2331         nvme_tcp_teardown_ctrl(ctrl, false);
2332
2333         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2334                 /* state change failure is ok if we started ctrl delete */
2335                 enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
2336
2337                 WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
2338                              state != NVME_CTRL_DELETING_NOIO);
2339                 return;
2340         }
2341
2342         ret = nvme_tcp_setup_ctrl(ctrl, false);
2343         if (ret)
2344                 goto out_fail;
2345
2346         return;
2347
2348 out_fail:
2349         ++ctrl->nr_reconnects;
2350         nvme_tcp_reconnect_or_remove(ctrl, ret);
2351 }
2352
2353 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
2354 {
2355         flush_work(&to_tcp_ctrl(ctrl)->err_work);
2356         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2357 }
2358
2359 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2360 {
2361         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2362
2363         if (list_empty(&ctrl->list))
2364                 goto free_ctrl;
2365
2366         mutex_lock(&nvme_tcp_ctrl_mutex);
2367         list_del(&ctrl->list);
2368         mutex_unlock(&nvme_tcp_ctrl_mutex);
2369
2370         nvmf_free_options(nctrl->opts);
2371 free_ctrl:
2372         kfree(ctrl->queues);
2373         kfree(ctrl);
2374 }
2375
2376 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2377 {
2378         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2379
2380         sg->addr = 0;
2381         sg->length = 0;
2382         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2383                         NVME_SGL_FMT_TRANSPORT_A;
2384 }
2385
2386 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2387                 struct nvme_command *c, u32 data_len)
2388 {
2389         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2390
2391         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2392         sg->length = cpu_to_le32(data_len);
2393         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2394 }
2395
2396 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2397                 u32 data_len)
2398 {
2399         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2400
2401         sg->addr = 0;
2402         sg->length = cpu_to_le32(data_len);
2403         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2404                         NVME_SGL_FMT_TRANSPORT_A;
2405 }
2406
2407 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2408 {
2409         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2410         struct nvme_tcp_queue *queue = &ctrl->queues[0];
2411         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2412         struct nvme_command *cmd = &pdu->cmd;
2413         u8 hdgst = nvme_tcp_hdgst_len(queue);
2414
2415         memset(pdu, 0, sizeof(*pdu));
2416         pdu->hdr.type = nvme_tcp_cmd;
2417         if (queue->hdr_digest)
2418                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2419         pdu->hdr.hlen = sizeof(*pdu);
2420         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2421
2422         cmd->common.opcode = nvme_admin_async_event;
2423         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2424         cmd->common.flags |= NVME_CMD_SGL_METABUF;
2425         nvme_tcp_set_sg_null(cmd);
2426
2427         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2428         ctrl->async_req.offset = 0;
2429         ctrl->async_req.curr_bio = NULL;
2430         ctrl->async_req.data_len = 0;
2431
2432         nvme_tcp_queue_request(&ctrl->async_req, true, true);
2433 }
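/*
 * The async event (AER) command above is special-cased: it carries no
 * data (null transport SGL), uses the reserved command id
 * NVME_AQ_BLK_MQ_DEPTH, and is sent through the pre-allocated
 * ctrl->async_req rather than through a block layer request, so no tag
 * needs to be allocated to issue it on the admin queue.
 */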
2434
2435 static void nvme_tcp_complete_timed_out(struct request *rq)
2436 {
2437         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2438         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2439
2440         nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2441         nvmf_complete_timed_out_request(rq);
2442 }
2443
2444 static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
2445 {
2446         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2447         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2448         struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2449         struct nvme_command *cmd = &pdu->cmd;
2450         int qid = nvme_tcp_queue_id(req->queue);
2451
2452         dev_warn(ctrl->device,
2453                  "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
2454                  rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
2455                  nvme_fabrics_opcode_str(qid, cmd), qid);
2456
2457         if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
2458                 /*
2459                  * If we are resetting, connecting or deleting, we should
2460                  * complete immediately because we may otherwise block the
2461                  * controller teardown or setup sequence
2462                  * - ctrl disable/shutdown fabrics requests
2463                  * - connect requests
2464                  * - initialization admin requests
2465                  * - I/O requests that entered after unquiescing and
2466                  *   the controller stopped responding
2467                  *
2468                  * All other requests should be cancelled by the error
2469                  * recovery work, so it's fine that we fail it here.
2470                  */
2471                 nvme_tcp_complete_timed_out(rq);
2472                 return BLK_EH_DONE;
2473         }
2474
2475         /*
2476          * LIVE state should trigger the normal error recovery which will
2477          * handle completing this request.
2478          */
2479         nvme_tcp_error_recovery(ctrl);
2480         return BLK_EH_RESET_TIMER;
2481 }
2482
2483 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2484                         struct request *rq)
2485 {
2486         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2487         struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2488         struct nvme_command *c = &pdu->cmd;
2489
2490         c->common.flags |= NVME_CMD_SGL_METABUF;
2491
2492         if (!blk_rq_nr_phys_segments(rq))
2493                 nvme_tcp_set_sg_null(c);
2494         else if (rq_data_dir(rq) == WRITE &&
2495             req->data_len <= nvme_tcp_inline_data_size(req))
2496                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2497         else
2498                 nvme_tcp_set_sg_host_data(c, req->data_len);
2499
2500         return 0;
2501 }
2502
2503 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2504                 struct request *rq)
2505 {
2506         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2507         struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
2508         struct nvme_tcp_queue *queue = req->queue;
2509         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2510         blk_status_t ret;
2511
2512         ret = nvme_setup_cmd(ns, rq);
2513         if (ret)
2514                 return ret;
2515
2516         req->state = NVME_TCP_SEND_CMD_PDU;
2517         req->status = cpu_to_le16(NVME_SC_SUCCESS);
2518         req->offset = 0;
2519         req->data_sent = 0;
2520         req->pdu_len = 0;
2521         req->pdu_sent = 0;
2522         req->h2cdata_left = 0;
2523         req->data_len = blk_rq_nr_phys_segments(rq) ?
2524                                 blk_rq_payload_bytes(rq) : 0;
2525         req->curr_bio = rq->bio;
2526         if (req->curr_bio && req->data_len)
2527                 nvme_tcp_init_iter(req, rq_data_dir(rq));
2528
2529         if (rq_data_dir(rq) == WRITE &&
2530             req->data_len <= nvme_tcp_inline_data_size(req))
2531                 req->pdu_len = req->data_len;
2532
2533         pdu->hdr.type = nvme_tcp_cmd;
2534         pdu->hdr.flags = 0;
2535         if (queue->hdr_digest)
2536                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2537         if (queue->data_digest && req->pdu_len) {
2538                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2539                 ddgst = nvme_tcp_ddgst_len(queue);
2540         }
2541         pdu->hdr.hlen = sizeof(*pdu);
2542         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2543         pdu->hdr.plen =
2544                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2545
2546         ret = nvme_tcp_map_data(queue, rq);
2547         if (unlikely(ret)) {
2548                 nvme_cleanup_cmd(rq);
2549                 dev_err(queue->ctrl->ctrl.device,
2550                         "Failed to map data (%d)\n", ret);
2551                 return ret;
2552         }
2553
2554         return 0;
2555 }
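/*
 * PDU sizing in nvme_tcp_setup_cmd_pdu() above, for illustration: plen
 * covers the header, an optional header digest, any inline (in-capsule)
 * data and an optional data digest, while pdo points at the start of
 * the inline data.  For example, an inline 512-byte write with header
 * digest enabled and data digest disabled gives hlen = 72 (the command
 * capsule PDU size), hdgst = 4, pdo = 76 and plen = 72 + 4 + 512 = 588.
 * Requests without inline data (reads, or writes larger than the
 * in-capsule limit) keep pdu_len and pdo at 0, so plen covers only the
 * header plus header digest.
 */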
2556
2557 static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2558 {
2559         struct nvme_tcp_queue *queue = hctx->driver_data;
2560
2561         if (!llist_empty(&queue->req_list))
2562                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2563 }
2564
2565 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2566                 const struct blk_mq_queue_data *bd)
2567 {
2568         struct nvme_ns *ns = hctx->queue->queuedata;
2569         struct nvme_tcp_queue *queue = hctx->driver_data;
2570         struct request *rq = bd->rq;
2571         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2572         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2573         blk_status_t ret;
2574
2575         if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2576                 return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2577
2578         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2579         if (unlikely(ret))
2580                 return ret;
2581
2582         nvme_start_request(rq);
2583
2584         nvme_tcp_queue_request(req, true, bd->last);
2585
2586         return BLK_STS_OK;
2587 }
2588
2589 static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2590 {
2591         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
2592
2593         nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues);
2594 }
2595
2596 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
2597 {
2598         struct nvme_tcp_queue *queue = hctx->driver_data;
2599         struct sock *sk = queue->sock->sk;
2600
2601         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2602                 return 0;
2603
2604         set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2605         if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2606                 sk_busy_loop(sk, true);
2607         nvme_tcp_try_recv(queue);
2608         clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2609         return queue->nr_cqe;
2610 }
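/*
 * Polling path: if the socket supports busy polling and nothing has
 * been queued on it yet, busy-loop briefly, then drain any received
 * PDUs via nvme_tcp_try_recv().  The number of completions seen
 * (queue->nr_cqe) is returned so the block layer can tell whether the
 * poll made progress.  NVME_TCP_Q_POLLING is set for the duration so
 * other contexts can tell the queue is currently being polled.
 */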
2611
2612 static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
2613 {
2614         struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
2615         struct sockaddr_storage src_addr;
2616         int ret, len;
2617
2618         len = nvmf_get_address(ctrl, buf, size);
2619
2620         mutex_lock(&queue->queue_lock);
2621
2622         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2623                 goto done;
2624         ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
2625         if (ret > 0) {
2626                 if (len > 0)
2627                         len--; /* strip trailing newline */
2628                 len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
2629                                 (len) ? "," : "", &src_addr);
2630         }
2631 done:
2632         mutex_unlock(&queue->queue_lock);
2633
2634         return len;
2635 }
2636
2637 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2638         .queue_rq       = nvme_tcp_queue_rq,
2639         .commit_rqs     = nvme_tcp_commit_rqs,
2640         .complete       = nvme_complete_rq,
2641         .init_request   = nvme_tcp_init_request,
2642         .exit_request   = nvme_tcp_exit_request,
2643         .init_hctx      = nvme_tcp_init_hctx,
2644         .timeout        = nvme_tcp_timeout,
2645         .map_queues     = nvme_tcp_map_queues,
2646         .poll           = nvme_tcp_poll,
2647 };
2648
2649 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2650         .queue_rq       = nvme_tcp_queue_rq,
2651         .complete       = nvme_complete_rq,
2652         .init_request   = nvme_tcp_init_request,
2653         .exit_request   = nvme_tcp_exit_request,
2654         .init_hctx      = nvme_tcp_init_admin_hctx,
2655         .timeout        = nvme_tcp_timeout,
2656 };
2657
2658 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2659         .name                   = "tcp",
2660         .module                 = THIS_MODULE,
2661         .flags                  = NVME_F_FABRICS | NVME_F_BLOCKING,
2662         .reg_read32             = nvmf_reg_read32,
2663         .reg_read64             = nvmf_reg_read64,
2664         .reg_write32            = nvmf_reg_write32,
2665         .subsystem_reset        = nvmf_subsystem_reset,
2666         .free_ctrl              = nvme_tcp_free_ctrl,
2667         .submit_async_event     = nvme_tcp_submit_async_event,
2668         .delete_ctrl            = nvme_tcp_delete_ctrl,
2669         .get_address            = nvme_tcp_get_address,
2670         .stop_ctrl              = nvme_tcp_stop_ctrl,
2671 };
2672
2673 static bool
2674 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2675 {
2676         struct nvme_tcp_ctrl *ctrl;
2677         bool found = false;
2678
2679         mutex_lock(&nvme_tcp_ctrl_mutex);
2680         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2681                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2682                 if (found)
2683                         break;
2684         }
2685         mutex_unlock(&nvme_tcp_ctrl_mutex);
2686
2687         return found;
2688 }
2689
2690 static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
2691                 struct nvmf_ctrl_options *opts)
2692 {
2693         struct nvme_tcp_ctrl *ctrl;
2694         int ret;
2695
2696         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2697         if (!ctrl)
2698                 return ERR_PTR(-ENOMEM);
2699
2700         INIT_LIST_HEAD(&ctrl->list);
2701         ctrl->ctrl.opts = opts;
2702         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2703                                 opts->nr_poll_queues + 1;
2704         ctrl->ctrl.sqsize = opts->queue_size - 1;
2705         ctrl->ctrl.kato = opts->kato;
2706
2707         INIT_DELAYED_WORK(&ctrl->connect_work,
2708                         nvme_tcp_reconnect_ctrl_work);
2709         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2710         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2711
2712         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2713                 opts->trsvcid =
2714                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2715                 if (!opts->trsvcid) {
2716                         ret = -ENOMEM;
2717                         goto out_free_ctrl;
2718                 }
2719                 opts->mask |= NVMF_OPT_TRSVCID;
2720         }
2721
2722         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2723                         opts->traddr, opts->trsvcid, &ctrl->addr);
2724         if (ret) {
2725                 pr_err("malformed address passed: %s:%s\n",
2726                         opts->traddr, opts->trsvcid);
2727                 goto out_free_ctrl;
2728         }
2729
2730         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2731                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2732                         opts->host_traddr, NULL, &ctrl->src_addr);
2733                 if (ret) {
2734                         pr_err("malformed src address passed: %s\n",
2735                                opts->host_traddr);
2736                         goto out_free_ctrl;
2737                 }
2738         }
2739
2740         if (opts->mask & NVMF_OPT_HOST_IFACE) {
2741                 if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2742                         pr_err("invalid interface passed: %s\n",
2743                                opts->host_iface);
2744                         ret = -ENODEV;
2745                         goto out_free_ctrl;
2746                 }
2747         }
2748
2749         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2750                 ret = -EALREADY;
2751                 goto out_free_ctrl;
2752         }
2753
2754         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2755                                 GFP_KERNEL);
2756         if (!ctrl->queues) {
2757                 ret = -ENOMEM;
2758                 goto out_free_ctrl;
2759         }
2760
2761         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2762         if (ret)
2763                 goto out_kfree_queues;
2764
2765         return ctrl;
2766 out_kfree_queues:
2767         kfree(ctrl->queues);
2768 out_free_ctrl:
2769         kfree(ctrl);
2770         return ERR_PTR(ret);
2771 }
2772
2773 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2774                 struct nvmf_ctrl_options *opts)
2775 {
2776         struct nvme_tcp_ctrl *ctrl;
2777         int ret;
2778
2779         ctrl = nvme_tcp_alloc_ctrl(dev, opts);
2780         if (IS_ERR(ctrl))
2781                 return ERR_CAST(ctrl);
2782
2783         ret = nvme_add_ctrl(&ctrl->ctrl);
2784         if (ret)
2785                 goto out_put_ctrl;
2786
2787         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2788                 WARN_ON_ONCE(1);
2789                 ret = -EINTR;
2790                 goto out_uninit_ctrl;
2791         }
2792
2793         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2794         if (ret)
2795                 goto out_uninit_ctrl;
2796
2797         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n",
2798                 nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn);
2799
2800         mutex_lock(&nvme_tcp_ctrl_mutex);
2801         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2802         mutex_unlock(&nvme_tcp_ctrl_mutex);
2803
2804         return &ctrl->ctrl;
2805
2806 out_uninit_ctrl:
2807         nvme_uninit_ctrl(&ctrl->ctrl);
2808 out_put_ctrl:
2809         nvme_put_ctrl(&ctrl->ctrl);
2810         if (ret > 0)
2811                 ret = -EIO;
2812         return ERR_PTR(ret);
2813 }
2814
2815 static struct nvmf_transport_ops nvme_tcp_transport = {
2816         .name           = "tcp",
2817         .module         = THIS_MODULE,
2818         .required_opts  = NVMF_OPT_TRADDR,
2819         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2820                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2821                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2822                           NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2823                           NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
2824                           NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
2825         .create_ctrl    = nvme_tcp_create_ctrl,
2826 };
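/*
 * For illustration only (not part of the driver): the options above
 * are what userspace may request when creating a controller, typically
 * through nvme-cli, e.g. something along the lines of
 *
 *   nvme connect -t tcp -a 192.168.0.10 -s 4420 -n <subsys-nqn> \
 *           --hdr-digest --data-digest --nr-poll-queues=2
 *
 * The exact flag names and availability depend on the installed
 * nvme-cli version; the address, port and NQN here are placeholders.
 */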
2827
2828 static int __init nvme_tcp_init_module(void)
2829 {
2830         unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
2831
2832         BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
2833         BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
2834         BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
2835         BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
2836         BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
2837         BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
2838         BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
2839         BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
2840
2841         if (wq_unbound)
2842                 wq_flags |= WQ_UNBOUND;
2843
2844         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
2845         if (!nvme_tcp_wq)
2846                 return -ENOMEM;
2847
2848         nvmf_register_transport(&nvme_tcp_transport);
2849         return 0;
2850 }
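/*
 * The BUILD_BUG_ON()s above pin the PDU struct sizes to their
 * on-the-wire sizes (8-byte common header, 128-byte ICReq/ICResp,
 * etc.), so an accidental layout change is caught at compile time.
 * nvme_tcp_wq is created with WQ_MEM_RECLAIM because it services the
 * I/O path and must be able to make forward progress under memory
 * pressure; WQ_UNBOUND is added only when the wq_unbound module
 * parameter is set.
 */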
2851
2852 static void __exit nvme_tcp_cleanup_module(void)
2853 {
2854         struct nvme_tcp_ctrl *ctrl;
2855
2856         nvmf_unregister_transport(&nvme_tcp_transport);
2857
2858         mutex_lock(&nvme_tcp_ctrl_mutex);
2859         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2860                 nvme_delete_ctrl(&ctrl->ctrl);
2861         mutex_unlock(&nvme_tcp_ctrl_mutex);
2862         flush_workqueue(nvme_delete_wq);
2863
2864         destroy_workqueue(nvme_tcp_wq);
2865 }
2866
2867 module_init(nvme_tcp_init_module);
2868 module_exit(nvme_tcp_cleanup_module);
2869
2870 MODULE_DESCRIPTION("NVMe host TCP transport driver");
2871 MODULE_LICENSE("GPL v2");