drivers/nvme/target/tcp.c (J-linux.git)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP target.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/inet.h>
15 #include <linux/llist.h>
16 #include <crypto/hash.h>
17
18 #include "nvmet.h"
19
20 #define NVMET_TCP_DEF_INLINE_DATA_SIZE  (4 * PAGE_SIZE)
21
22 /* Define the socket priority to use for connections where it is desirable
23  * that the NIC consider performing optimized packet processing or filtering.
24  * A non-zero value is sufficient to indicate general consideration of any
25  * possible optimization.  Making it a module param allows for alternative
26  * values that may be unique for some NIC implementations.
27  */
28 static int so_priority;
29 module_param(so_priority, int, 0644);
30 MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
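/*
 * A typical way to adjust this at runtime, assuming the standard
 * module-parameter sysfs layout and the nvmet_tcp module name:
 *
 *   echo 4 > /sys/module/nvmet_tcp/parameters/so_priority
 */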
31
32 #define NVMET_TCP_RECV_BUDGET           8
33 #define NVMET_TCP_SEND_BUDGET           8
34 #define NVMET_TCP_IO_WORK_BUDGET        64
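/*
 * Budget semantics (see nvmet_tcp_io_work() below): each io_work invocation
 * alternates between receiving up to NVMET_TCP_RECV_BUDGET operations and
 * sending up to NVMET_TCP_SEND_BUDGET responses, looping while work is
 * pending until NVMET_TCP_IO_WORK_BUDGET operations have been processed, at
 * which point the work item requeues itself so it does not monopolize a CPU.
 */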
35
36 enum nvmet_tcp_send_state {
37         NVMET_TCP_SEND_DATA_PDU,
38         NVMET_TCP_SEND_DATA,
39         NVMET_TCP_SEND_R2T,
40         NVMET_TCP_SEND_DDGST,
41         NVMET_TCP_SEND_RESPONSE
42 };
43
44 enum nvmet_tcp_recv_state {
45         NVMET_TCP_RECV_PDU,
46         NVMET_TCP_RECV_DATA,
47         NVMET_TCP_RECV_DDGST,
48         NVMET_TCP_RECV_ERR,
49 };
50
51 enum {
52         NVMET_TCP_F_INIT_FAILED = (1 << 0),
53 };
54
55 struct nvmet_tcp_cmd {
56         struct nvmet_tcp_queue          *queue;
57         struct nvmet_req                req;
58
59         struct nvme_tcp_cmd_pdu         *cmd_pdu;
60         struct nvme_tcp_rsp_pdu         *rsp_pdu;
61         struct nvme_tcp_data_pdu        *data_pdu;
62         struct nvme_tcp_r2t_pdu         *r2t_pdu;
63
64         u32                             rbytes_done;
65         u32                             wbytes_done;
66
67         u32                             pdu_len;
68         u32                             pdu_recv;
69         int                             sg_idx;
70         int                             nr_mapped;
71         struct msghdr                   recv_msg;
72         struct kvec                     *iov;
73         u32                             flags;
74
75         struct list_head                entry;
76         struct llist_node               lentry;
77
78         /* send state */
79         u32                             offset;
80         struct scatterlist              *cur_sg;
81         enum nvmet_tcp_send_state       state;
82
83         __le32                          exp_ddgst;
84         __le32                          recv_ddgst;
85 };
86
87 enum nvmet_tcp_queue_state {
88         NVMET_TCP_Q_CONNECTING,
89         NVMET_TCP_Q_LIVE,
90         NVMET_TCP_Q_DISCONNECTING,
91 };
92
93 struct nvmet_tcp_queue {
94         struct socket           *sock;
95         struct nvmet_tcp_port   *port;
96         struct work_struct      io_work;
97         struct nvmet_cq         nvme_cq;
98         struct nvmet_sq         nvme_sq;
99
100         /* send state */
101         struct nvmet_tcp_cmd    *cmds;
102         unsigned int            nr_cmds;
103         struct list_head        free_list;
104         struct llist_head       resp_list;
105         struct list_head        resp_send_list;
106         int                     send_list_len;
107         struct nvmet_tcp_cmd    *snd_cmd;
108
109         /* recv state */
110         int                     offset;
111         int                     left;
112         enum nvmet_tcp_recv_state rcv_state;
113         struct nvmet_tcp_cmd    *cmd;
114         union nvme_tcp_pdu      pdu;
115
116         /* digest state */
117         bool                    hdr_digest;
118         bool                    data_digest;
119         struct ahash_request    *snd_hash;
120         struct ahash_request    *rcv_hash;
121
122         spinlock_t              state_lock;
123         enum nvmet_tcp_queue_state state;
124
125         struct sockaddr_storage sockaddr;
126         struct sockaddr_storage sockaddr_peer;
127         struct work_struct      release_work;
128
129         int                     idx;
130         struct list_head        queue_list;
131
132         struct nvmet_tcp_cmd    connect;
133
134         struct page_frag_cache  pf_cache;
135
136         void (*data_ready)(struct sock *);
137         void (*state_change)(struct sock *);
138         void (*write_space)(struct sock *);
139 };
140
141 struct nvmet_tcp_port {
142         struct socket           *sock;
143         struct work_struct      accept_work;
144         struct nvmet_port       *nport;
145         struct sockaddr_storage addr;
146         void (*data_ready)(struct sock *);
147 };
148
149 static DEFINE_IDA(nvmet_tcp_queue_ida);
150 static LIST_HEAD(nvmet_tcp_queue_list);
151 static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
152
153 static struct workqueue_struct *nvmet_tcp_wq;
154 static const struct nvmet_fabrics_ops nvmet_tcp_ops;
155 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
156 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
157
158 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
159                 struct nvmet_tcp_cmd *cmd)
160 {
161         if (unlikely(!queue->nr_cmds)) {
162                 /* We didn't allocate cmds yet, send 0xffff */
163                 return USHRT_MAX;
164         }
165
166         return cmd - queue->cmds;
167 }
168
169 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
170 {
171         return nvme_is_write(cmd->req.cmd) &&
172                 cmd->rbytes_done < cmd->req.transfer_len;
173 }
174
175 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
176 {
177         return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
178 }
179
180 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
181 {
182         return !nvme_is_write(cmd->req.cmd) &&
183                 cmd->req.transfer_len > 0 &&
184                 !cmd->req.cqe->status;
185 }
186
187 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
188 {
189         return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
190                 !cmd->rbytes_done;
191 }
192
193 static inline struct nvmet_tcp_cmd *
194 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
195 {
196         struct nvmet_tcp_cmd *cmd;
197
198         cmd = list_first_entry_or_null(&queue->free_list,
199                                 struct nvmet_tcp_cmd, entry);
200         if (!cmd)
201                 return NULL;
202         list_del_init(&cmd->entry);
203
204         cmd->rbytes_done = cmd->wbytes_done = 0;
205         cmd->pdu_len = 0;
206         cmd->pdu_recv = 0;
207         cmd->iov = NULL;
208         cmd->flags = 0;
209         return cmd;
210 }
211
212 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
213 {
214         if (unlikely(cmd == &cmd->queue->connect))
215                 return;
216
217         list_add_tail(&cmd->entry, &cmd->queue->free_list);
218 }
219
220 static inline int queue_cpu(struct nvmet_tcp_queue *queue)
221 {
222         return queue->sock->sk->sk_incoming_cpu;
223 }
224
225 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
226 {
227         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
228 }
229
230 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
231 {
232         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
233 }
234
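/*
 * Both digests defined by NVMe/TCP are 4-byte CRC32C values. The header
 * digest computed below is written directly after the PDU header (at
 * pdu + len), which is why every per-command PDU buffer is allocated with
 * an extra hdgst bytes of room.
 */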
235 static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
236                 void *pdu, size_t len)
237 {
238         struct scatterlist sg;
239
240         sg_init_one(&sg, pdu, len);
241         ahash_request_set_crypt(hash, &sg, pdu + len, len);
242         crypto_ahash_digest(hash);
243 }
244
245 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
246         void *pdu, size_t len)
247 {
248         struct nvme_tcp_hdr *hdr = pdu;
249         __le32 recv_digest;
250         __le32 exp_digest;
251
252         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
253                 pr_err("queue %d: header digest enabled but no header digest\n",
254                         queue->idx);
255                 return -EPROTO;
256         }
257
258         recv_digest = *(__le32 *)(pdu + hdr->hlen);
259         nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
260         exp_digest = *(__le32 *)(pdu + hdr->hlen);
261         if (recv_digest != exp_digest) {
262                 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
263                         queue->idx, le32_to_cpu(recv_digest),
264                         le32_to_cpu(exp_digest));
265                 return -EPROTO;
266         }
267
268         return 0;
269 }
270
271 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
272 {
273         struct nvme_tcp_hdr *hdr = pdu;
274         u8 digest_len = nvmet_tcp_hdgst_len(queue);
275         u32 len;
276
277         len = le32_to_cpu(hdr->plen) - hdr->hlen -
278                 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
279
280         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
281                 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
282                 return -EPROTO;
283         }
284
285         return 0;
286 }
287
288 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
289 {
290         struct scatterlist *sg;
291         int i;
292
293         sg = &cmd->req.sg[cmd->sg_idx];
294
295         for (i = 0; i < cmd->nr_mapped; i++)
296                 kunmap(sg_page(&sg[i]));
297 }
298
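/*
 * Map the part of the command's scatterlist covered by the current PDU into
 * a kvec iterator: each sg page is kmap()ed so that sock_recvmsg() can copy
 * H2C/inline data straight into the command's data buffer. The pages are
 * kunmap()ed again by nvmet_tcp_unmap_pdu_iovec().
 */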
299 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
300 {
301         struct kvec *iov = cmd->iov;
302         struct scatterlist *sg;
303         u32 length, offset, sg_offset;
304
305         length = cmd->pdu_len;
306         cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
307         offset = cmd->rbytes_done;
308         cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
309         sg_offset = offset % PAGE_SIZE;
310         sg = &cmd->req.sg[cmd->sg_idx];
311
312         while (length) {
313                 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
314
315                 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
316                 iov->iov_len = iov_len;
317
318                 length -= iov_len;
319                 sg = sg_next(sg);
320                 iov++;
321         }
322
323         iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
324                 cmd->nr_mapped, cmd->pdu_len);
325 }
326
327 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
328 {
329         queue->rcv_state = NVMET_TCP_RECV_ERR;
330         if (queue->nvme_sq.ctrl)
331                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
332         else
333                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
334 }
335
336 static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
337 {
338         if (status == -EPIPE || status == -ECONNRESET)
339                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
340         else
341                 nvmet_tcp_fatal_error(queue);
342 }
343
344 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
345 {
346         struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
347         u32 len = le32_to_cpu(sgl->length);
348
349         if (!len)
350                 return 0;
351
352         if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
353                           NVME_SGL_FMT_OFFSET)) {
354                 if (!nvme_is_write(cmd->req.cmd))
355                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
356
357                 if (len > cmd->req.port->inline_data_size)
358                         return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
359                 cmd->pdu_len = len;
360         }
361         cmd->req.transfer_len += len;
362
363         cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
364         if (!cmd->req.sg)
365                 return NVME_SC_INTERNAL;
366         cmd->cur_sg = cmd->req.sg;
367
368         if (nvmet_tcp_has_data_in(cmd)) {
369                 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
370                                 sizeof(*cmd->iov), GFP_KERNEL);
371                 if (!cmd->iov)
372                         goto err;
373         }
374
375         return 0;
376 err:
377         sgl_free(cmd->req.sg);
378         return NVME_SC_INTERNAL;
379 }
380
381 static void nvmet_tcp_send_ddgst(struct ahash_request *hash,
382                 struct nvmet_tcp_cmd *cmd)
383 {
384         ahash_request_set_crypt(hash, cmd->req.sg,
385                 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
386         crypto_ahash_digest(hash);
387 }
388
389 static void nvmet_tcp_recv_ddgst(struct ahash_request *hash,
390                 struct nvmet_tcp_cmd *cmd)
391 {
392         struct scatterlist sg;
393         struct kvec *iov;
394         int i;
395
396         crypto_ahash_init(hash);
397         for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) {
398                 sg_init_one(&sg, iov->iov_base, iov->iov_len);
399                 ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len);
400                 crypto_ahash_update(hash);
401         }
402         ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0);
403         crypto_ahash_final(hash);
404 }
405
406 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
407 {
408         struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
409         struct nvmet_tcp_queue *queue = cmd->queue;
410         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
411         u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
412
413         cmd->offset = 0;
414         cmd->state = NVMET_TCP_SEND_DATA_PDU;
415
416         pdu->hdr.type = nvme_tcp_c2h_data;
417         pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
418                                                 NVME_TCP_F_DATA_SUCCESS : 0);
419         pdu->hdr.hlen = sizeof(*pdu);
420         pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
421         pdu->hdr.plen =
422                 cpu_to_le32(pdu->hdr.hlen + hdgst +
423                                 cmd->req.transfer_len + ddgst);
424         pdu->command_id = cmd->req.cqe->command_id;
425         pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
426         pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
427
428         if (queue->data_digest) {
429                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
430                 nvmet_tcp_send_ddgst(queue->snd_hash, cmd);
431         }
432
433         if (cmd->queue->hdr_digest) {
434                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
435                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
436         }
437 }
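/*
 * Worked example of the plen arithmetic above, assuming the 24-byte
 * nvme_tcp_data_pdu header from <linux/nvme-tcp.h>: for a 4096-byte transfer
 * with both header and data digests enabled,
 *
 *   plen = 24 (hlen) + 4 (hdgst) + 4096 (data) + 4 (ddgst) = 4128
 *
 * With digests disabled the same PDU is 24 + 4096 = 4120 bytes on the wire.
 */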
438
439 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
440 {
441         struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
442         struct nvmet_tcp_queue *queue = cmd->queue;
443         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
444
445         cmd->offset = 0;
446         cmd->state = NVMET_TCP_SEND_R2T;
447
448         pdu->hdr.type = nvme_tcp_r2t;
449         pdu->hdr.flags = 0;
450         pdu->hdr.hlen = sizeof(*pdu);
451         pdu->hdr.pdo = 0;
452         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
453
454         pdu->command_id = cmd->req.cmd->common.command_id;
455         pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
456         pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
457         pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
458         if (cmd->queue->hdr_digest) {
459                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
460                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
461         }
462 }
463
464 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
465 {
466         struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
467         struct nvmet_tcp_queue *queue = cmd->queue;
468         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
469
470         cmd->offset = 0;
471         cmd->state = NVMET_TCP_SEND_RESPONSE;
472
473         pdu->hdr.type = nvme_tcp_rsp;
474         pdu->hdr.flags = 0;
475         pdu->hdr.hlen = sizeof(*pdu);
476         pdu->hdr.pdo = 0;
477         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
478         if (cmd->queue->hdr_digest) {
479                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
480                 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
481         }
482 }
483
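/*
 * Response path: nvmet_tcp_queue_response() adds completed commands to the
 * lockless resp_list from the core's completion context and kicks io_work.
 * nvmet_tcp_process_resp_list() then splices that list into resp_send_list,
 * and nvmet_tcp_fetch_cmd() picks the next command and prepares the
 * appropriate PDU: C2H data, R2T, or a plain response capsule.
 */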
484 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
485 {
486         struct llist_node *node;
487         struct nvmet_tcp_cmd *cmd;
488
489         for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
490                 cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
491                 list_add(&cmd->entry, &queue->resp_send_list);
492                 queue->send_list_len++;
493         }
494 }
495
496 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
497 {
498         queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
499                                 struct nvmet_tcp_cmd, entry);
500         if (!queue->snd_cmd) {
501                 nvmet_tcp_process_resp_list(queue);
502                 queue->snd_cmd =
503                         list_first_entry_or_null(&queue->resp_send_list,
504                                         struct nvmet_tcp_cmd, entry);
505                 if (unlikely(!queue->snd_cmd))
506                         return NULL;
507         }
508
509         list_del_init(&queue->snd_cmd->entry);
510         queue->send_list_len--;
511
512         if (nvmet_tcp_need_data_out(queue->snd_cmd))
513                 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
514         else if (nvmet_tcp_need_data_in(queue->snd_cmd))
515                 nvmet_setup_r2t_pdu(queue->snd_cmd);
516         else
517                 nvmet_setup_response_pdu(queue->snd_cmd);
518
519         return queue->snd_cmd;
520 }
521
522 static void nvmet_tcp_queue_response(struct nvmet_req *req)
523 {
524         struct nvmet_tcp_cmd *cmd =
525                 container_of(req, struct nvmet_tcp_cmd, req);
526         struct nvmet_tcp_queue  *queue = cmd->queue;
527
528         llist_add(&cmd->lentry, &queue->resp_list);
529         queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
530 }
531
532 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
533 {
534         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
535         int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
536         int ret;
537
538         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
539                         offset_in_page(cmd->data_pdu) + cmd->offset,
540                         left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
541         if (ret <= 0)
542                 return ret;
543
544         cmd->offset += ret;
545         left -= ret;
546
547         if (left)
548                 return -EAGAIN;
549
550         cmd->state = NVMET_TCP_SEND_DATA;
551         cmd->offset  = 0;
552         return 1;
553 }
554
555 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
556 {
557         struct nvmet_tcp_queue *queue = cmd->queue;
558         int ret;
559
560         while (cmd->cur_sg) {
561                 struct page *page = sg_page(cmd->cur_sg);
562                 u32 left = cmd->cur_sg->length - cmd->offset;
563                 int flags = MSG_DONTWAIT;
564
565                 if ((!last_in_batch && cmd->queue->send_list_len) ||
566                     cmd->wbytes_done + left < cmd->req.transfer_len ||
567                     queue->data_digest || !queue->nvme_sq.sqhd_disabled)
568                         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
569
570                 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
571                                         left, flags);
572                 if (ret <= 0)
573                         return ret;
574
575                 cmd->offset += ret;
576                 cmd->wbytes_done += ret;
577
578                 /* Done with sg? */
579                 if (cmd->offset == cmd->cur_sg->length) {
580                         cmd->cur_sg = sg_next(cmd->cur_sg);
581                         cmd->offset = 0;
582                 }
583         }
584
585         if (queue->data_digest) {
586                 cmd->state = NVMET_TCP_SEND_DDGST;
587                 cmd->offset = 0;
588         } else {
589                 if (queue->nvme_sq.sqhd_disabled) {
590                         cmd->queue->snd_cmd = NULL;
591                         nvmet_tcp_put_cmd(cmd);
592                 } else {
593                         nvmet_setup_response_pdu(cmd);
594                 }
595         }
596
597         if (queue->nvme_sq.sqhd_disabled) {
598                 kfree(cmd->iov);
599                 sgl_free(cmd->req.sg);
600         }
601
602         return 1;
603
604 }
605
606 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
607                 bool last_in_batch)
608 {
609         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
610         int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
611         int flags = MSG_DONTWAIT;
612         int ret;
613
614         if (!last_in_batch && cmd->queue->send_list_len)
615                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
616         else
617                 flags |= MSG_EOR;
618
619         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
620                 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
621         if (ret <= 0)
622                 return ret;
623         cmd->offset += ret;
624         left -= ret;
625
626         if (left)
627                 return -EAGAIN;
628
629         kfree(cmd->iov);
630         sgl_free(cmd->req.sg);
631         cmd->queue->snd_cmd = NULL;
632         nvmet_tcp_put_cmd(cmd);
633         return 1;
634 }
635
636 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
637 {
638         u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
639         int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
640         int flags = MSG_DONTWAIT;
641         int ret;
642
643         if (!last_in_batch && cmd->queue->send_list_len)
644                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
645         else
646                 flags |= MSG_EOR;
647
648         ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
649                 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
650         if (ret <= 0)
651                 return ret;
652         cmd->offset += ret;
653         left -= ret;
654
655         if (left)
656                 return -EAGAIN;
657
658         cmd->queue->snd_cmd = NULL;
659         return 1;
660 }
661
662 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
663 {
664         struct nvmet_tcp_queue *queue = cmd->queue;
665         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
666         struct kvec iov = {
667                 .iov_base = &cmd->exp_ddgst + cmd->offset,
668                 .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
669         };
670         int ret;
671
672         if (!last_in_batch && cmd->queue->send_list_len)
673                 msg.msg_flags |= MSG_MORE;
674         else
675                 msg.msg_flags |= MSG_EOR;
676
677         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
678         if (unlikely(ret <= 0))
679                 return ret;
680
681         cmd->offset += ret;
682
683         if (queue->nvme_sq.sqhd_disabled) {
684                 cmd->queue->snd_cmd = NULL;
685                 nvmet_tcp_put_cmd(cmd);
686         } else {
687                 nvmet_setup_response_pdu(cmd);
688         }
689         return 1;
690 }
691
692 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
693                 bool last_in_batch)
694 {
695         struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
696         int ret = 0;
697
698         if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
699                 cmd = nvmet_tcp_fetch_cmd(queue);
700                 if (unlikely(!cmd))
701                         return 0;
702         }
703
704         if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
705                 ret = nvmet_try_send_data_pdu(cmd);
706                 if (ret <= 0)
707                         goto done_send;
708         }
709
710         if (cmd->state == NVMET_TCP_SEND_DATA) {
711                 ret = nvmet_try_send_data(cmd, last_in_batch);
712                 if (ret <= 0)
713                         goto done_send;
714         }
715
716         if (cmd->state == NVMET_TCP_SEND_DDGST) {
717                 ret = nvmet_try_send_ddgst(cmd, last_in_batch);
718                 if (ret <= 0)
719                         goto done_send;
720         }
721
722         if (cmd->state == NVMET_TCP_SEND_R2T) {
723                 ret = nvmet_try_send_r2t(cmd, last_in_batch);
724                 if (ret <= 0)
725                         goto done_send;
726         }
727
728         if (cmd->state == NVMET_TCP_SEND_RESPONSE)
729                 ret = nvmet_try_send_response(cmd, last_in_batch);
730
731 done_send:
732         if (ret < 0) {
733                 if (ret == -EAGAIN)
734                         return 0;
735                 return ret;
736         }
737
738         return 1;
739 }
740
741 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
742                 int budget, int *sends)
743 {
744         int i, ret = 0;
745
746         for (i = 0; i < budget; i++) {
747                 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
748                 if (unlikely(ret < 0)) {
749                         nvmet_tcp_socket_error(queue, ret);
750                         goto done;
751                 } else if (ret == 0) {
752                         break;
753                 }
754                 (*sends)++;
755         }
756 done:
757         return ret;
758 }
759
760 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
761 {
762         queue->offset = 0;
763         queue->left = sizeof(struct nvme_tcp_hdr);
764         queue->cmd = NULL;
765         queue->rcv_state = NVMET_TCP_RECV_PDU;
766 }
767
768 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
769 {
770         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
771
772         ahash_request_free(queue->rcv_hash);
773         ahash_request_free(queue->snd_hash);
774         crypto_free_ahash(tfm);
775 }
776
777 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
778 {
779         struct crypto_ahash *tfm;
780
781         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
782         if (IS_ERR(tfm))
783                 return PTR_ERR(tfm);
784
785         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
786         if (!queue->snd_hash)
787                 goto free_tfm;
788         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
789
790         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
791         if (!queue->rcv_hash)
792                 goto free_snd_hash;
793         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
794
795         return 0;
796 free_snd_hash:
797         ahash_request_free(queue->snd_hash);
798 free_tfm:
799         crypto_free_ahash(tfm);
800         return -ENOMEM;
801 }
802
803
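/*
 * Connection establishment: the first PDU on a queue must be an ICReq. The
 * handler below validates the PDU length, protocol version (pfv) and host
 * PDU alignment (hpda), records which digests the host enabled, allocates
 * the crc32c ahash contexts if needed, and replies with an ICResp before
 * moving the queue to NVMET_TCP_Q_LIVE.
 */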
804 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
805 {
806         struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
807         struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
808         struct msghdr msg = {};
809         struct kvec iov;
810         int ret;
811
812         if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
813                 pr_err("bad nvme-tcp pdu length (%d)\n",
814                         le32_to_cpu(icreq->hdr.plen));
815                 nvmet_tcp_fatal_error(queue);
816         }
817
818         if (icreq->pfv != NVME_TCP_PFV_1_0) {
819                 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
820                 return -EPROTO;
821         }
822
823         if (icreq->hpda != 0) {
824                 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
825                         icreq->hpda);
826                 return -EPROTO;
827         }
828
829         queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
830         queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
831         if (queue->hdr_digest || queue->data_digest) {
832                 ret = nvmet_tcp_alloc_crypto(queue);
833                 if (ret)
834                         return ret;
835         }
836
837         memset(icresp, 0, sizeof(*icresp));
838         icresp->hdr.type = nvme_tcp_icresp;
839         icresp->hdr.hlen = sizeof(*icresp);
840         icresp->hdr.pdo = 0;
841         icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
842         icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
843         icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
844         icresp->cpda = 0;
845         if (queue->hdr_digest)
846                 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
847         if (queue->data_digest)
848                 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
849
850         iov.iov_base = icresp;
851         iov.iov_len = sizeof(*icresp);
852         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
853         if (ret < 0)
854                 goto free_crypto;
855
856         queue->state = NVMET_TCP_Q_LIVE;
857         nvmet_prepare_receive_pdu(queue);
858         return 0;
859 free_crypto:
860         if (queue->hdr_digest || queue->data_digest)
861                 nvmet_tcp_free_crypto(queue);
862         return ret;
863 }
864
865 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
866                 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
867 {
868         size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
869         int ret;
870
871         if (!nvme_is_write(cmd->req.cmd) ||
872             data_len > cmd->req.port->inline_data_size) {
873                 nvmet_prepare_receive_pdu(queue);
874                 return;
875         }
876
877         ret = nvmet_tcp_map_data(cmd);
878         if (unlikely(ret)) {
879                 pr_err("queue %d: failed to map data\n", queue->idx);
880                 nvmet_tcp_fatal_error(queue);
881                 return;
882         }
883
884         queue->rcv_state = NVMET_TCP_RECV_DATA;
885         nvmet_tcp_map_pdu_iovec(cmd);
886         cmd->flags |= NVMET_TCP_F_INIT_FAILED;
887 }
888
889 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
890 {
891         struct nvme_tcp_data_pdu *data = &queue->pdu.data;
892         struct nvmet_tcp_cmd *cmd;
893
894         if (likely(queue->nr_cmds))
895                 cmd = &queue->cmds[data->ttag];
896         else
897                 cmd = &queue->connect;
898
899         if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
900                 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
901                         data->ttag, le32_to_cpu(data->data_offset),
902                         cmd->rbytes_done);
903                 /* FIXME: use path and transport errors */
904                 nvmet_req_complete(&cmd->req,
905                         NVME_SC_INVALID_FIELD | NVME_SC_DNR);
906                 return -EPROTO;
907         }
908
909         cmd->pdu_len = le32_to_cpu(data->data_length);
910         cmd->pdu_recv = 0;
911         nvmet_tcp_map_pdu_iovec(cmd);
912         queue->cmd = cmd;
913         queue->rcv_state = NVMET_TCP_RECV_DATA;
914
915         return 0;
916 }
917
918 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
919 {
920         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
921         struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
922         struct nvmet_req *req;
923         int ret;
924
925         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
926                 if (hdr->type != nvme_tcp_icreq) {
927                         pr_err("unexpected pdu type (%d) before icreq\n",
928                                 hdr->type);
929                         nvmet_tcp_fatal_error(queue);
930                         return -EPROTO;
931                 }
932                 return nvmet_tcp_handle_icreq(queue);
933         }
934
935         if (hdr->type == nvme_tcp_h2c_data) {
936                 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
937                 if (unlikely(ret))
938                         return ret;
939                 return 0;
940         }
941
942         queue->cmd = nvmet_tcp_get_cmd(queue);
943         if (unlikely(!queue->cmd)) {
944                 /* This should never happen */
945                 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
946                         queue->idx, queue->nr_cmds, queue->send_list_len,
947                         nvme_cmd->common.opcode);
948                 nvmet_tcp_fatal_error(queue);
949                 return -ENOMEM;
950         }
951
952         req = &queue->cmd->req;
953         memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
954
955         if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
956                         &queue->nvme_sq, &nvmet_tcp_ops))) {
957                 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
958                         req->cmd, req->cmd->common.command_id,
959                         req->cmd->common.opcode,
960                         le32_to_cpu(req->cmd->common.dptr.sgl.length));
961
962                 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
963                 return -EAGAIN;
964         }
965
966         ret = nvmet_tcp_map_data(queue->cmd);
967         if (unlikely(ret)) {
968                 pr_err("queue %d: failed to map data\n", queue->idx);
969                 if (nvmet_tcp_has_inline_data(queue->cmd))
970                         nvmet_tcp_fatal_error(queue);
971                 else
972                         nvmet_req_complete(req, ret);
973                 ret = -EAGAIN;
974                 goto out;
975         }
976
977         if (nvmet_tcp_need_data_in(queue->cmd)) {
978                 if (nvmet_tcp_has_inline_data(queue->cmd)) {
979                         queue->rcv_state = NVMET_TCP_RECV_DATA;
980                         nvmet_tcp_map_pdu_iovec(queue->cmd);
981                         return 0;
982                 }
983                 /* send back R2T */
984                 nvmet_tcp_queue_response(&queue->cmd->req);
985                 goto out;
986         }
987
988         queue->cmd->req.execute(&queue->cmd->req);
989 out:
990         nvmet_prepare_receive_pdu(queue);
991         return ret;
992 }
993
994 static const u8 nvme_tcp_pdu_sizes[] = {
995         [nvme_tcp_icreq]        = sizeof(struct nvme_tcp_icreq_pdu),
996         [nvme_tcp_cmd]          = sizeof(struct nvme_tcp_cmd_pdu),
997         [nvme_tcp_h2c_data]     = sizeof(struct nvme_tcp_data_pdu),
998 };
999
1000 static inline u8 nvmet_tcp_pdu_size(u8 type)
1001 {
1002         size_t idx = type;
1003
1004         return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
1005                 nvme_tcp_pdu_sizes[idx]) ?
1006                         nvme_tcp_pdu_sizes[idx] : 0;
1007 }
1008
1009 static inline bool nvmet_tcp_pdu_valid(u8 type)
1010 {
1011         switch (type) {
1012         case nvme_tcp_icreq:
1013         case nvme_tcp_cmd:
1014         case nvme_tcp_h2c_data:
1015                 /* fallthru */
1016                 return true;
1017         }
1018
1019         return false;
1020 }
1021
1022 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
1023 {
1024         struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1025         int len;
1026         struct kvec iov;
1027         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1028
1029 recv:
1030         iov.iov_base = (void *)&queue->pdu + queue->offset;
1031         iov.iov_len = queue->left;
1032         len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1033                         iov.iov_len, msg.msg_flags);
1034         if (unlikely(len < 0))
1035                 return len;
1036
1037         queue->offset += len;
1038         queue->left -= len;
1039         if (queue->left)
1040                 return -EAGAIN;
1041
1042         if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1043                 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1044
1045                 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1046                         pr_err("unexpected pdu type %d\n", hdr->type);
1047                         nvmet_tcp_fatal_error(queue);
1048                         return -EIO;
1049                 }
1050
1051                 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1052                         pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1053                         return -EIO;
1054                 }
1055
1056                 queue->left = hdr->hlen - queue->offset + hdgst;
1057                 goto recv;
1058         }
1059
1060         if (queue->hdr_digest &&
1061             nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
1062                 nvmet_tcp_fatal_error(queue); /* fatal */
1063                 return -EPROTO;
1064         }
1065
1066         if (queue->data_digest &&
1067             nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1068                 nvmet_tcp_fatal_error(queue); /* fatal */
1069                 return -EPROTO;
1070         }
1071
1072         return nvmet_tcp_done_recv_pdu(queue);
1073 }
1074
1075 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1076 {
1077         struct nvmet_tcp_queue *queue = cmd->queue;
1078
1079         nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd);
1080         queue->offset = 0;
1081         queue->left = NVME_TCP_DIGEST_LENGTH;
1082         queue->rcv_state = NVMET_TCP_RECV_DDGST;
1083 }
1084
1085 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1086 {
1087         struct nvmet_tcp_cmd  *cmd = queue->cmd;
1088         int ret;
1089
1090         while (msg_data_left(&cmd->recv_msg)) {
1091                 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1092                         cmd->recv_msg.msg_flags);
1093                 if (ret <= 0)
1094                         return ret;
1095
1096                 cmd->pdu_recv += ret;
1097                 cmd->rbytes_done += ret;
1098         }
1099
1100         if (queue->data_digest) {
1101                 nvmet_tcp_prep_recv_ddgst(cmd);
1102                 return 0;
1103         }
1104         nvmet_tcp_unmap_pdu_iovec(cmd);
1105
1106         if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1107             cmd->rbytes_done == cmd->req.transfer_len) {
1108                 cmd->req.execute(&cmd->req);
1109         }
1110
1111         nvmet_prepare_receive_pdu(queue);
1112         return 0;
1113 }
1114
1115 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1116 {
1117         struct nvmet_tcp_cmd *cmd = queue->cmd;
1118         int ret;
1119         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1120         struct kvec iov = {
1121                 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1122                 .iov_len = queue->left
1123         };
1124
1125         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1126                         iov.iov_len, msg.msg_flags);
1127         if (unlikely(ret < 0))
1128                 return ret;
1129
1130         queue->offset += ret;
1131         queue->left -= ret;
1132         if (queue->left)
1133                 return -EAGAIN;
1134
1135         if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1136                 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1137                         queue->idx, cmd->req.cmd->common.command_id,
1138                         queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1139                         le32_to_cpu(cmd->exp_ddgst));
1140                 nvmet_tcp_finish_cmd(cmd);
1141                 nvmet_tcp_fatal_error(queue);
1142                 ret = -EPROTO;
1143                 goto out;
1144         }
1145
1146         if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1147             cmd->rbytes_done == cmd->req.transfer_len)
1148                 cmd->req.execute(&cmd->req);
1149         ret = 0;
1150 out:
1151         nvmet_prepare_receive_pdu(queue);
1152         return ret;
1153 }
1154
1155 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1156 {
1157         int result = 0;
1158
1159         if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1160                 return 0;
1161
1162         if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1163                 result = nvmet_tcp_try_recv_pdu(queue);
1164                 if (result != 0)
1165                         goto done_recv;
1166         }
1167
1168         if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1169                 result = nvmet_tcp_try_recv_data(queue);
1170                 if (result != 0)
1171                         goto done_recv;
1172         }
1173
1174         if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1175                 result = nvmet_tcp_try_recv_ddgst(queue);
1176                 if (result != 0)
1177                         goto done_recv;
1178         }
1179
1180 done_recv:
1181         if (result < 0) {
1182                 if (result == -EAGAIN)
1183                         return 0;
1184                 return result;
1185         }
1186         return 1;
1187 }
1188
1189 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1190                 int budget, int *recvs)
1191 {
1192         int i, ret = 0;
1193
1194         for (i = 0; i < budget; i++) {
1195                 ret = nvmet_tcp_try_recv_one(queue);
1196                 if (unlikely(ret < 0)) {
1197                         nvmet_tcp_socket_error(queue, ret);
1198                         goto done;
1199                 } else if (ret == 0) {
1200                         break;
1201                 }
1202                 (*recvs)++;
1203         }
1204 done:
1205         return ret;
1206 }
1207
1208 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1209 {
1210         spin_lock(&queue->state_lock);
1211         if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1212                 queue->state = NVMET_TCP_Q_DISCONNECTING;
1213                 schedule_work(&queue->release_work);
1214         }
1215         spin_unlock(&queue->state_lock);
1216 }
1217
1218 static void nvmet_tcp_io_work(struct work_struct *w)
1219 {
1220         struct nvmet_tcp_queue *queue =
1221                 container_of(w, struct nvmet_tcp_queue, io_work);
1222         bool pending;
1223         int ret, ops = 0;
1224
1225         do {
1226                 pending = false;
1227
1228                 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1229                 if (ret > 0)
1230                         pending = true;
1231                 else if (ret < 0)
1232                         return;
1233
1234                 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1235                 if (ret > 0)
1236                         pending = true;
1237                 else if (ret < 0)
1238                         return;
1239
1240         } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1241
1242         /*
1243          * We exhausted our budget, requeue ourselves
1244          */
1245         if (pending)
1246                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1247 }
1248
1249 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1250                 struct nvmet_tcp_cmd *c)
1251 {
1252         u8 hdgst = nvmet_tcp_hdgst_len(queue);
1253
1254         c->queue = queue;
1255         c->req.port = queue->port->nport;
1256
1257         c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1258                         sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1259         if (!c->cmd_pdu)
1260                 return -ENOMEM;
1261         c->req.cmd = &c->cmd_pdu->cmd;
1262
1263         c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1264                         sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1265         if (!c->rsp_pdu)
1266                 goto out_free_cmd;
1267         c->req.cqe = &c->rsp_pdu->cqe;
1268
1269         c->data_pdu = page_frag_alloc(&queue->pf_cache,
1270                         sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1271         if (!c->data_pdu)
1272                 goto out_free_rsp;
1273
1274         c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1275                         sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1276         if (!c->r2t_pdu)
1277                 goto out_free_data;
1278
1279         c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1280
1281         list_add_tail(&c->entry, &queue->free_list);
1282
1283         return 0;
1284 out_free_data:
1285         page_frag_free(c->data_pdu);
1286 out_free_rsp:
1287         page_frag_free(c->rsp_pdu);
1288 out_free_cmd:
1289         page_frag_free(c->cmd_pdu);
1290         return -ENOMEM;
1291 }
1292
1293 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1294 {
1295         page_frag_free(c->r2t_pdu);
1296         page_frag_free(c->data_pdu);
1297         page_frag_free(c->rsp_pdu);
1298         page_frag_free(c->cmd_pdu);
1299 }
1300
1301 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1302 {
1303         struct nvmet_tcp_cmd *cmds;
1304         int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1305
1306         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1307         if (!cmds)
1308                 goto out;
1309
1310         for (i = 0; i < nr_cmds; i++) {
1311                 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1312                 if (ret)
1313                         goto out_free;
1314         }
1315
1316         queue->cmds = cmds;
1317
1318         return 0;
1319 out_free:
1320         while (--i >= 0)
1321                 nvmet_tcp_free_cmd(cmds + i);
1322         kfree(cmds);
1323 out:
1324         return ret;
1325 }
1326
1327 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1328 {
1329         struct nvmet_tcp_cmd *cmds = queue->cmds;
1330         int i;
1331
1332         for (i = 0; i < queue->nr_cmds; i++)
1333                 nvmet_tcp_free_cmd(cmds + i);
1334
1335         nvmet_tcp_free_cmd(&queue->connect);
1336         kfree(cmds);
1337 }
1338
1339 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1340 {
1341         struct socket *sock = queue->sock;
1342
1343         write_lock_bh(&sock->sk->sk_callback_lock);
1344         sock->sk->sk_data_ready =  queue->data_ready;
1345         sock->sk->sk_state_change = queue->state_change;
1346         sock->sk->sk_write_space = queue->write_space;
1347         sock->sk->sk_user_data = NULL;
1348         write_unlock_bh(&sock->sk->sk_callback_lock);
1349 }
1350
1351 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1352 {
1353         nvmet_req_uninit(&cmd->req);
1354         nvmet_tcp_unmap_pdu_iovec(cmd);
1355         kfree(cmd->iov);
1356         sgl_free(cmd->req.sg);
1357 }
1358
1359 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1360 {
1361         struct nvmet_tcp_cmd *cmd = queue->cmds;
1362         int i;
1363
1364         for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1365                 if (nvmet_tcp_need_data_in(cmd))
1366                         nvmet_tcp_finish_cmd(cmd);
1367         }
1368
1369         if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1370                 /* failed in connect */
1371                 nvmet_tcp_finish_cmd(&queue->connect);
1372         }
1373 }
1374
1375 static void nvmet_tcp_release_queue_work(struct work_struct *w)
1376 {
1377         struct nvmet_tcp_queue *queue =
1378                 container_of(w, struct nvmet_tcp_queue, release_work);
1379
1380         mutex_lock(&nvmet_tcp_queue_mutex);
1381         list_del_init(&queue->queue_list);
1382         mutex_unlock(&nvmet_tcp_queue_mutex);
1383
1384         nvmet_tcp_restore_socket_callbacks(queue);
1385         flush_work(&queue->io_work);
1386
1387         nvmet_tcp_uninit_data_in_cmds(queue);
1388         nvmet_sq_destroy(&queue->nvme_sq);
1389         cancel_work_sync(&queue->io_work);
1390         sock_release(queue->sock);
1391         nvmet_tcp_free_cmds(queue);
1392         if (queue->hdr_digest || queue->data_digest)
1393                 nvmet_tcp_free_crypto(queue);
1394         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1395
1396         kfree(queue);
1397 }
1398
1399 static void nvmet_tcp_data_ready(struct sock *sk)
1400 {
1401         struct nvmet_tcp_queue *queue;
1402
1403         read_lock_bh(&sk->sk_callback_lock);
1404         queue = sk->sk_user_data;
1405         if (likely(queue))
1406                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1407         read_unlock_bh(&sk->sk_callback_lock);
1408 }
1409
1410 static void nvmet_tcp_write_space(struct sock *sk)
1411 {
1412         struct nvmet_tcp_queue *queue;
1413
1414         read_lock_bh(&sk->sk_callback_lock);
1415         queue = sk->sk_user_data;
1416         if (unlikely(!queue))
1417                 goto out;
1418
1419         if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1420                 queue->write_space(sk);
1421                 goto out;
1422         }
1423
1424         if (sk_stream_is_writeable(sk)) {
1425                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1426                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1427         }
1428 out:
1429         read_unlock_bh(&sk->sk_callback_lock);
1430 }
1431
1432 static void nvmet_tcp_state_change(struct sock *sk)
1433 {
1434         struct nvmet_tcp_queue *queue;
1435
1436         write_lock_bh(&sk->sk_callback_lock);
1437         queue = sk->sk_user_data;
1438         if (!queue)
1439                 goto done;
1440
1441         switch (sk->sk_state) {
1442         case TCP_FIN_WAIT1:
1443         case TCP_CLOSE_WAIT:
1444         case TCP_CLOSE:
1445                 /* FALLTHRU */
1446                 sk->sk_user_data = NULL;
1447                 nvmet_tcp_schedule_release_queue(queue);
1448                 break;
1449         default:
1450                 pr_warn("queue %d unhandled state %d\n",
1451                         queue->idx, sk->sk_state);
1452         }
1453 done:
1454         write_unlock_bh(&sk->sk_callback_lock);
1455 }
1456
1457 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1458 {
1459         struct socket *sock = queue->sock;
1460         struct inet_sock *inet = inet_sk(sock->sk);
1461         int ret;
1462
1463         ret = kernel_getsockname(sock,
1464                 (struct sockaddr *)&queue->sockaddr);
1465         if (ret < 0)
1466                 return ret;
1467
1468         ret = kernel_getpeername(sock,
1469                 (struct sockaddr *)&queue->sockaddr_peer);
1470         if (ret < 0)
1471                 return ret;
1472
1473         /*
1474          * Cleanup whatever is sitting in the TCP transmit queue on socket
1475          * close. This is done to prevent stale data from being sent should
1476          * the network connection be restored before TCP times out.
1477          */
1478         sock_no_linger(sock->sk);
1479
1480         if (so_priority > 0)
1481                 sock_set_priority(sock->sk, so_priority);
1482
1483         /* Set socket type of service */
1484         if (inet->rcv_tos > 0)
1485                 ip_sock_set_tos(sock->sk, inet->rcv_tos);
1486
1487         ret = 0;
1488         write_lock_bh(&sock->sk->sk_callback_lock);
1489         if (sock->sk->sk_state != TCP_ESTABLISHED) {
1490                 /*
1491                  * If the socket is already closing, don't even start
1492                  * consuming it
1493                  */
1494                 ret = -ENOTCONN;
1495         } else {
1496                 sock->sk->sk_user_data = queue;
1497                 queue->data_ready = sock->sk->sk_data_ready;
1498                 sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1499                 queue->state_change = sock->sk->sk_state_change;
1500                 sock->sk->sk_state_change = nvmet_tcp_state_change;
1501                 queue->write_space = sock->sk->sk_write_space;
1502                 sock->sk->sk_write_space = nvmet_tcp_write_space;
1503                 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
1504         }
1505         write_unlock_bh(&sock->sk->sk_callback_lock);
1506
1507         return ret;
1508 }
1509
1510 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1511                 struct socket *newsock)
1512 {
1513         struct nvmet_tcp_queue *queue;
1514         int ret;
1515
1516         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1517         if (!queue)
1518                 return -ENOMEM;
1519
1520         INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1521         INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1522         queue->sock = newsock;
1523         queue->port = port;
1524         queue->nr_cmds = 0;
1525         spin_lock_init(&queue->state_lock);
1526         queue->state = NVMET_TCP_Q_CONNECTING;
1527         INIT_LIST_HEAD(&queue->free_list);
1528         init_llist_head(&queue->resp_list);
1529         INIT_LIST_HEAD(&queue->resp_send_list);
1530
1531         queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1532         if (queue->idx < 0) {
1533                 ret = queue->idx;
1534                 goto out_free_queue;
1535         }
1536
1537         ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1538         if (ret)
1539                 goto out_ida_remove;
1540
1541         ret = nvmet_sq_init(&queue->nvme_sq);
1542         if (ret)
1543                 goto out_free_connect;
1544
1545         nvmet_prepare_receive_pdu(queue);
1546
1547         mutex_lock(&nvmet_tcp_queue_mutex);
1548         list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1549         mutex_unlock(&nvmet_tcp_queue_mutex);
1550
1551         ret = nvmet_tcp_set_queue_sock(queue);
1552         if (ret)
1553                 goto out_destroy_sq;
1554
1555         return 0;
1556 out_destroy_sq:
1557         mutex_lock(&nvmet_tcp_queue_mutex);
1558         list_del_init(&queue->queue_list);
1559         mutex_unlock(&nvmet_tcp_queue_mutex);
1560         nvmet_sq_destroy(&queue->nvme_sq);
1561 out_free_connect:
1562         nvmet_tcp_free_cmd(&queue->connect);
1563 out_ida_remove:
1564         ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1565 out_free_queue:
1566         kfree(queue);
1567         return ret;
1568 }
1569
1570 static void nvmet_tcp_accept_work(struct work_struct *w)
1571 {
1572         struct nvmet_tcp_port *port =
1573                 container_of(w, struct nvmet_tcp_port, accept_work);
1574         struct socket *newsock;
1575         int ret;
1576
1577         while (true) {
1578                 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1579                 if (ret < 0) {
1580                         if (ret != -EAGAIN)
1581                                 pr_warn("failed to accept err=%d\n", ret);
1582                         return;
1583                 }
1584                 ret = nvmet_tcp_alloc_queue(port, newsock);
1585                 if (ret) {
1586                         pr_err("failed to allocate queue\n");
1587                         sock_release(newsock);
1588                 }
1589         }
1590 }
1591
1592 static void nvmet_tcp_listen_data_ready(struct sock *sk)
1593 {
1594         struct nvmet_tcp_port *port;
1595
1596         read_lock_bh(&sk->sk_callback_lock);
1597         port = sk->sk_user_data;
1598         if (!port)
1599                 goto out;
1600
1601         if (sk->sk_state == TCP_LISTEN)
1602                 schedule_work(&port->accept_work);
1603 out:
1604         read_unlock_bh(&sk->sk_callback_lock);
1605 }
1606
1607 static int nvmet_tcp_add_port(struct nvmet_port *nport)
1608 {
1609         struct nvmet_tcp_port *port;
1610         __kernel_sa_family_t af;
1611         int ret;
1612
1613         port = kzalloc(sizeof(*port), GFP_KERNEL);
1614         if (!port)
1615                 return -ENOMEM;
1616
1617         switch (nport->disc_addr.adrfam) {
1618         case NVMF_ADDR_FAMILY_IP4:
1619                 af = AF_INET;
1620                 break;
1621         case NVMF_ADDR_FAMILY_IP6:
1622                 af = AF_INET6;
1623                 break;
1624         default:
1625                 pr_err("address family %d not supported\n",
1626                                 nport->disc_addr.adrfam);
1627                 ret = -EINVAL;
1628                 goto err_port;
1629         }
1630
1631         ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1632                         nport->disc_addr.trsvcid, &port->addr);
1633         if (ret) {
1634                 pr_err("malformed ip/port passed: %s:%s\n",
1635                         nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1636                 goto err_port;
1637         }
1638
1639         port->nport = nport;
1640         INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1641         if (port->nport->inline_data_size < 0)
1642                 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1643
1644         ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1645                                 IPPROTO_TCP, &port->sock);
1646         if (ret) {
1647                 pr_err("failed to create a socket\n");
1648                 goto err_port;
1649         }
1650
1651         port->sock->sk->sk_user_data = port;
1652         port->data_ready = port->sock->sk->sk_data_ready;
1653         port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1654         sock_set_reuseaddr(port->sock->sk);
1655         tcp_sock_set_nodelay(port->sock->sk);
1656         if (so_priority > 0)
1657                 sock_set_priority(port->sock->sk, so_priority);
1658
1659         ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1660                         sizeof(port->addr));
1661         if (ret) {
1662                 pr_err("failed to bind port socket %d\n", ret);
1663                 goto err_sock;
1664         }
1665
1666         ret = kernel_listen(port->sock, 128);
1667         if (ret) {
1668                 pr_err("failed to listen %d on port sock\n", ret);
1669                 goto err_sock;
1670         }
1671
1672         nport->priv = port;
1673         pr_info("enabling port %d (%pISpc)\n",
1674                 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1675
1676         return 0;
1677
1678 err_sock:
1679         sock_release(port->sock);
1680 err_port:
1681         kfree(port);
1682         return ret;
1683 }
1684
1685 static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1686 {
1687         struct nvmet_tcp_port *port = nport->priv;
1688
1689         write_lock_bh(&port->sock->sk->sk_callback_lock);
1690         port->sock->sk->sk_data_ready = port->data_ready;
1691         port->sock->sk->sk_user_data = NULL;
1692         write_unlock_bh(&port->sock->sk->sk_callback_lock);
1693         cancel_work_sync(&port->accept_work);
1694
1695         sock_release(port->sock);
1696         kfree(port);
1697 }
1698
1699 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1700 {
1701         struct nvmet_tcp_queue *queue;
1702
1703         mutex_lock(&nvmet_tcp_queue_mutex);
1704         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1705                 if (queue->nvme_sq.ctrl == ctrl)
1706                         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1707         mutex_unlock(&nvmet_tcp_queue_mutex);
1708 }
1709
1710 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1711 {
1712         struct nvmet_tcp_queue *queue =
1713                 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1714
1715         if (sq->qid == 0) {
1716                 /* Let inflight controller teardown complete */
1717                 flush_scheduled_work();
1718         }
1719
1720         queue->nr_cmds = sq->size * 2;
1721         if (nvmet_tcp_alloc_cmds(queue))
1722                 return NVME_SC_INTERNAL;
1723         return 0;
1724 }
1725
1726 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1727                 struct nvmet_port *nport, char *traddr)
1728 {
1729         struct nvmet_tcp_port *port = nport->priv;
1730
1731         if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1732                 struct nvmet_tcp_cmd *cmd =
1733                         container_of(req, struct nvmet_tcp_cmd, req);
1734                 struct nvmet_tcp_queue *queue = cmd->queue;
1735
1736                 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1737         } else {
1738                 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1739         }
1740 }
1741
1742 static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
1743         .owner                  = THIS_MODULE,
1744         .type                   = NVMF_TRTYPE_TCP,
1745         .msdbd                  = 1,
1746         .add_port               = nvmet_tcp_add_port,
1747         .remove_port            = nvmet_tcp_remove_port,
1748         .queue_response         = nvmet_tcp_queue_response,
1749         .delete_ctrl            = nvmet_tcp_delete_ctrl,
1750         .install_queue          = nvmet_tcp_install_queue,
1751         .disc_traddr            = nvmet_tcp_disc_port_addr,
1752 };
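/*
 * A sketch of how a TCP port is typically created from user space through
 * the nvmet configfs interface (the mount point, port number "1" and the
 * 10.0.0.1/4420 address are illustrative):
 *
 *   mkdir /sys/kernel/config/nvmet/ports/1
 *   echo tcp      > /sys/kernel/config/nvmet/ports/1/addr_trtype
 *   echo ipv4     > /sys/kernel/config/nvmet/ports/1/addr_adrfam
 *   echo 10.0.0.1 > /sys/kernel/config/nvmet/ports/1/addr_traddr
 *   echo 4420     > /sys/kernel/config/nvmet/ports/1/addr_trsvcid
 */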
1753
1754 static int __init nvmet_tcp_init(void)
1755 {
1756         int ret;
1757
1758         nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1759         if (!nvmet_tcp_wq)
1760                 return -ENOMEM;
1761
1762         ret = nvmet_register_transport(&nvmet_tcp_ops);
1763         if (ret)
1764                 goto err;
1765
1766         return 0;
1767 err:
1768         destroy_workqueue(nvmet_tcp_wq);
1769         return ret;
1770 }
1771
1772 static void __exit nvmet_tcp_exit(void)
1773 {
1774         struct nvmet_tcp_queue *queue;
1775
1776         nvmet_unregister_transport(&nvmet_tcp_ops);
1777
1778         flush_scheduled_work();
1779         mutex_lock(&nvmet_tcp_queue_mutex);
1780         list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1781                 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1782         mutex_unlock(&nvmet_tcp_queue_mutex);
1783         flush_scheduled_work();
1784
1785         destroy_workqueue(nvmet_tcp_wq);
1786 }
1787
1788 module_init(nvmet_tcp_init);
1789 module_exit(nvmet_tcp_exit);
1790
1791 MODULE_LICENSE("GPL v2");
1792 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */