1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *   Copyright (C) 2017, Microsoft Corporation.
4  *
5  *   Author(s): Long Li <[email protected]>
6  */
7 #include <linux/module.h>
8 #include <linux/highmem.h>
9 #include <linux/folio_queue.h>
10 #include "smbdirect.h"
11 #include "cifs_debug.h"
12 #include "cifsproto.h"
13 #include "smb2proto.h"
14
15 static struct smbd_response *get_empty_queue_buffer(
16                 struct smbd_connection *info);
17 static struct smbd_response *get_receive_buffer(
18                 struct smbd_connection *info);
19 static void put_receive_buffer(
20                 struct smbd_connection *info,
21                 struct smbd_response *response);
22 static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
23 static void destroy_receive_buffers(struct smbd_connection *info);
24
25 static void put_empty_packet(
26                 struct smbd_connection *info, struct smbd_response *response);
27 static void enqueue_reassembly(
28                 struct smbd_connection *info,
29                 struct smbd_response *response, int data_length);
30 static struct smbd_response *_get_first_reassembly(
31                 struct smbd_connection *info);
32
33 static int smbd_post_recv(
34                 struct smbd_connection *info,
35                 struct smbd_response *response);
36
37 static int smbd_post_send_empty(struct smbd_connection *info);
38
39 static void destroy_mr_list(struct smbd_connection *info);
40 static int allocate_mr_list(struct smbd_connection *info);
41
42 struct smb_extract_to_rdma {
43         struct ib_sge           *sge;
44         unsigned int            nr_sge;
45         unsigned int            max_sge;
46         struct ib_device        *device;
47         u32                     local_dma_lkey;
48         enum dma_data_direction direction;
49 };
50 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
51                                         struct smb_extract_to_rdma *rdma);
52
53 /* SMBD version number */
54 #define SMBD_V1 0x0100
55
56 /* Port numbers for SMBD transport */
57 #define SMB_PORT        445
58 #define SMBD_PORT       5445
59
60 /* Address lookup and resolve timeout in ms */
61 #define RDMA_RESOLVE_TIMEOUT    5000
62
63 /* SMBD negotiation timeout in seconds */
64 #define SMBD_NEGOTIATE_TIMEOUT  120
65
66 /* SMBD minimum receive size and fragmented size defined in [MS-SMBD] */
67 #define SMBD_MIN_RECEIVE_SIZE           128
68 #define SMBD_MIN_FRAGMENTED_SIZE        131072
69
70 /*
71  * Default maximum number of RDMA read/write operations outstanding on this connection
72  * This value may be decreased during QP creation, based on hardware limits
73  */
74 #define SMBD_CM_RESPONDER_RESOURCES     32
75
76 /* Maximum number of retries on data transfer operations */
77 #define SMBD_CM_RETRY                   6
78 /* No need to retry on Receiver Not Ready since SMBD manages credits */
79 #define SMBD_CM_RNR_RETRY               0
80
81 /*
82  * User configurable initial values per SMBD transport connection
83  * as defined in [MS-SMBD] 3.1.1.1
84  * These may change after SMBD negotiation
85  */
86 /* The maximum number of credits the local peer will grant to the remote peer */
87 int smbd_receive_credit_max = 255;
88
89 /* The number of send credits to request from the remote peer */
90 int smbd_send_credit_target = 255;
91
92 /* The maximum single-message size that can be sent to the remote peer */
93 int smbd_max_send_size = 1364;
94
95 /*  The maximum fragmented upper-layer payload receive size supported */
96 int smbd_max_fragmented_recv_size = 1024 * 1024;
97
98 /*  The maximum single-message size which can be received */
99 int smbd_max_receive_size = 1364;
100
101 /* The idle timeout, in seconds, after which a keepalive message is sent */
102 int smbd_keep_alive_interval = 120;
103
104 /*
105  * User configurable initial values for RDMA transport
106  * The actual values used may be lower and are limited to hardware capabilities
107  */
108 /* Default maximum number of pages in a single RDMA write/read */
109 int smbd_max_frmr_depth = 2048;
110
111 /* If the payload is smaller than this many bytes, use RDMA send/recv, not read/write */
112 int rdma_readwrite_threshold = 4096;
113
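/*
 * Illustrative example (assuming the default tunables above): with
 * rdma_readwrite_threshold at 4096, a 2048-byte upper-layer payload is
 * expected to go out inline via RDMA send/recv, while a 32768-byte payload
 * would be registered and transferred with RDMA read/write. The effective
 * per-connection threshold is clamped to max_fragmented_send_size during
 * negotiation (see process_negotiation_response() below).
 */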
114 /* Transport logging functions
115  * Logging is defined as classes. They can be OR'ed to define the actual
116  * logging level via module parameter smbd_logging_class
117  * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and
118  * log_rdma_event()
119  */
120 #define LOG_OUTGOING                    0x1
121 #define LOG_INCOMING                    0x2
122 #define LOG_READ                        0x4
123 #define LOG_WRITE                       0x8
124 #define LOG_RDMA_SEND                   0x10
125 #define LOG_RDMA_RECV                   0x20
126 #define LOG_KEEP_ALIVE                  0x40
127 #define LOG_RDMA_EVENT                  0x80
128 #define LOG_RDMA_MR                     0x100
129 static unsigned int smbd_logging_class;
130 module_param(smbd_logging_class, uint, 0644);
131 MODULE_PARM_DESC(smbd_logging_class,
132         "Logging class for SMBD transport 0x0 to 0x100");
133
134 #define ERR             0x0
135 #define INFO            0x1
136 static unsigned int smbd_logging_level = ERR;
137 module_param(smbd_logging_level, uint, 0644);
138 MODULE_PARM_DESC(smbd_logging_level,
139         "Logging level for SMBD transport, 0 (default): error, 1: info");
140
141 #define log_rdma(level, class, fmt, args...)                            \
142 do {                                                                    \
143         if (level <= smbd_logging_level || class & smbd_logging_class)  \
144                 cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\
145 } while (0)
146
147 #define log_outgoing(level, fmt, args...) \
148                 log_rdma(level, LOG_OUTGOING, fmt, ##args)
149 #define log_incoming(level, fmt, args...) \
150                 log_rdma(level, LOG_INCOMING, fmt, ##args)
151 #define log_read(level, fmt, args...)   log_rdma(level, LOG_READ, fmt, ##args)
152 #define log_write(level, fmt, args...)  log_rdma(level, LOG_WRITE, fmt, ##args)
153 #define log_rdma_send(level, fmt, args...) \
154                 log_rdma(level, LOG_RDMA_SEND, fmt, ##args)
155 #define log_rdma_recv(level, fmt, args...) \
156                 log_rdma(level, LOG_RDMA_RECV, fmt, ##args)
157 #define log_keep_alive(level, fmt, args...) \
158                 log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args)
159 #define log_rdma_event(level, fmt, args...) \
160                 log_rdma(level, LOG_RDMA_EVENT, fmt, ##args)
161 #define log_rdma_mr(level, fmt, args...) \
162                 log_rdma(level, LOG_RDMA_MR, fmt, ##args)
163
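/*
 * Usage example (illustrative): class bits may be OR'ed together, e.g.
 * LOG_RDMA_SEND | LOG_RDMA_RECV == 0x30 enables both send and receive
 * completion logging. With smbd_logging_level left at ERR, a call such as
 * log_rdma_send(INFO, ...) is still emitted when 0x10 & smbd_logging_class
 * is non-zero, because the class test is OR'ed with the level test above.
 * Since the parameter is registered with mode 0644, it can typically also
 * be changed at runtime (path assumes cifs is built as a module):
 *
 *   echo 0x30 > /sys/module/cifs/parameters/smbd_logging_class
 */
#if 0	/* illustrative sketch only; not part of the build */
static void smbd_example_enable_rdma_completion_logging(void)
{
	/* hypothetical helper: turn on send + recv completion classes */
	smbd_logging_class |= LOG_RDMA_SEND | LOG_RDMA_RECV;	/* 0x30 */
}
#endif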
164 static void smbd_disconnect_rdma_work(struct work_struct *work)
165 {
166         struct smbd_connection *info =
167                 container_of(work, struct smbd_connection, disconnect_work);
168
169         if (info->transport_status == SMBD_CONNECTED) {
170                 info->transport_status = SMBD_DISCONNECTING;
171                 rdma_disconnect(info->id);
172         }
173 }
174
175 static void smbd_disconnect_rdma_connection(struct smbd_connection *info)
176 {
177         queue_work(info->workqueue, &info->disconnect_work);
178 }
179
180 /* Upcall from RDMA CM */
181 static int smbd_conn_upcall(
182                 struct rdma_cm_id *id, struct rdma_cm_event *event)
183 {
184         struct smbd_connection *info = id->context;
185
186         log_rdma_event(INFO, "event=%d status=%d\n",
187                 event->event, event->status);
188
189         switch (event->event) {
190         case RDMA_CM_EVENT_ADDR_RESOLVED:
191         case RDMA_CM_EVENT_ROUTE_RESOLVED:
192                 info->ri_rc = 0;
193                 complete(&info->ri_done);
194                 break;
195
196         case RDMA_CM_EVENT_ADDR_ERROR:
197                 info->ri_rc = -EHOSTUNREACH;
198                 complete(&info->ri_done);
199                 break;
200
201         case RDMA_CM_EVENT_ROUTE_ERROR:
202                 info->ri_rc = -ENETUNREACH;
203                 complete(&info->ri_done);
204                 break;
205
206         case RDMA_CM_EVENT_ESTABLISHED:
207                 log_rdma_event(INFO, "connected event=%d\n", event->event);
208                 info->transport_status = SMBD_CONNECTED;
209                 wake_up_interruptible(&info->conn_wait);
210                 break;
211
212         case RDMA_CM_EVENT_CONNECT_ERROR:
213         case RDMA_CM_EVENT_UNREACHABLE:
214         case RDMA_CM_EVENT_REJECTED:
215                 log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
216                 info->transport_status = SMBD_DISCONNECTED;
217                 wake_up_interruptible(&info->conn_wait);
218                 break;
219
220         case RDMA_CM_EVENT_DEVICE_REMOVAL:
221         case RDMA_CM_EVENT_DISCONNECTED:
222                 /* This happens when we fail the negotiation */
223                 if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
224                         info->transport_status = SMBD_DISCONNECTED;
225                         wake_up(&info->conn_wait);
226                         break;
227                 }
228
229                 info->transport_status = SMBD_DISCONNECTED;
230                 wake_up_interruptible(&info->disconn_wait);
231                 wake_up_interruptible(&info->wait_reassembly_queue);
232                 wake_up_interruptible_all(&info->wait_send_queue);
233                 break;
234
235         default:
236                 break;
237         }
238
239         return 0;
240 }
241
242 /* Upcall from RDMA QP */
243 static void
244 smbd_qp_async_error_upcall(struct ib_event *event, void *context)
245 {
246         struct smbd_connection *info = context;
247
248         log_rdma_event(ERR, "%s on device %s info %p\n",
249                 ib_event_msg(event->event), event->device->name, info);
250
251         switch (event->event) {
252         case IB_EVENT_CQ_ERR:
253         case IB_EVENT_QP_FATAL:
254                 smbd_disconnect_rdma_connection(info);
255                 break;
256
257         default:
258                 break;
259         }
260 }
261
262 static inline void *smbd_request_payload(struct smbd_request *request)
263 {
264         return (void *)request->packet;
265 }
266
267 static inline void *smbd_response_payload(struct smbd_response *response)
268 {
269         return (void *)response->packet;
270 }
271
272 /* Called when an RDMA send is done */
273 static void send_done(struct ib_cq *cq, struct ib_wc *wc)
274 {
275         int i;
276         struct smbd_request *request =
277                 container_of(wc->wr_cqe, struct smbd_request, cqe);
278
279         log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n",
280                 request, wc->status);
281
282         if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
283                 log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
284                         wc->status, wc->opcode);
285                 smbd_disconnect_rdma_connection(request->info);
286         }
287
288         for (i = 0; i < request->num_sge; i++)
289                 ib_dma_unmap_single(request->info->id->device,
290                         request->sge[i].addr,
291                         request->sge[i].length,
292                         DMA_TO_DEVICE);
293
294         if (atomic_dec_and_test(&request->info->send_pending))
295                 wake_up(&request->info->wait_send_pending);
296
297         wake_up(&request->info->wait_post_send);
298
299         mempool_free(request, request->info->request_mempool);
300 }
301
302 static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
303 {
304         log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
305                        resp->min_version, resp->max_version,
306                        resp->negotiated_version, resp->credits_requested,
307                        resp->credits_granted, resp->status,
308                        resp->max_readwrite_size, resp->preferred_send_size,
309                        resp->max_receive_size, resp->max_fragmented_size);
310 }
311
312 /*
313  * Process a negotiation response message, according to [MS-SMBD] 3.1.5.7
314  * response, packet_length: the negotiation response message
315  * return value: true if negotiation is a success, false if failed
316  */
317 static bool process_negotiation_response(
318                 struct smbd_response *response, int packet_length)
319 {
320         struct smbd_connection *info = response->info;
321         struct smbd_negotiate_resp *packet = smbd_response_payload(response);
322
323         if (packet_length < sizeof(struct smbd_negotiate_resp)) {
324                 log_rdma_event(ERR,
325                         "error: packet_length=%d\n", packet_length);
326                 return false;
327         }
328
329         if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
330                 log_rdma_event(ERR, "error: negotiated_version=%x\n",
331                         le16_to_cpu(packet->negotiated_version));
332                 return false;
333         }
334         info->protocol = le16_to_cpu(packet->negotiated_version);
335
336         if (packet->credits_requested == 0) {
337                 log_rdma_event(ERR, "error: credits_requested==0\n");
338                 return false;
339         }
340         info->receive_credit_target = le16_to_cpu(packet->credits_requested);
341
342         if (packet->credits_granted == 0) {
343                 log_rdma_event(ERR, "error: credits_granted==0\n");
344                 return false;
345         }
346         atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted));
347
348         atomic_set(&info->receive_credits, 0);
349
350         if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
351                 log_rdma_event(ERR, "error: preferred_send_size=%d\n",
352                         le32_to_cpu(packet->preferred_send_size));
353                 return false;
354         }
355         info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
356
357         if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
358                 log_rdma_event(ERR, "error: max_receive_size=%d\n",
359                         le32_to_cpu(packet->max_receive_size));
360                 return false;
361         }
362         info->max_send_size = min_t(int, info->max_send_size,
363                                         le32_to_cpu(packet->max_receive_size));
364
365         if (le32_to_cpu(packet->max_fragmented_size) <
366                         SMBD_MIN_FRAGMENTED_SIZE) {
367                 log_rdma_event(ERR, "error: max_fragmented_size=%d\n",
368                         le32_to_cpu(packet->max_fragmented_size));
369                 return false;
370         }
371         info->max_fragmented_send_size =
372                 le32_to_cpu(packet->max_fragmented_size);
373         info->rdma_readwrite_threshold =
374                 rdma_readwrite_threshold > info->max_fragmented_send_size ?
375                 info->max_fragmented_send_size :
376                 rdma_readwrite_threshold;
377
378
379         info->max_readwrite_size = min_t(u32,
380                         le32_to_cpu(packet->max_readwrite_size),
381                         info->max_frmr_depth * PAGE_SIZE);
382         info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
383
384         return true;
385 }
386
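/*
 * Worked example of the clamping above (illustrative, assuming a 4K
 * PAGE_SIZE and the default smbd_max_frmr_depth of 2048): the local cap on
 * a single RDMA read/write is 2048 * 4096 = 8MB. If the peer advertises
 * max_readwrite_size of 1MB, info->max_readwrite_size becomes 1MB and
 * max_frmr_depth is reduced to 1MB / 4096 = 256 pages.
 */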
387 static void smbd_post_send_credits(struct work_struct *work)
388 {
389         int ret = 0;
390         int use_receive_queue = 1;
391         int rc;
392         struct smbd_response *response;
393         struct smbd_connection *info =
394                 container_of(work, struct smbd_connection,
395                         post_send_credits_work);
396
397         if (info->transport_status != SMBD_CONNECTED) {
398                 wake_up(&info->wait_receive_queues);
399                 return;
400         }
401
402         if (info->receive_credit_target >
403                 atomic_read(&info->receive_credits)) {
404                 while (true) {
405                         if (use_receive_queue)
406                                 response = get_receive_buffer(info);
407                         else
408                                 response = get_empty_queue_buffer(info);
409                         if (!response) {
410                                 /* now switch to empty packet queue */
411                                 if (use_receive_queue) {
412                                         use_receive_queue = 0;
413                                         continue;
414                                 } else
415                                         break;
416                         }
417
418                         response->type = SMBD_TRANSFER_DATA;
419                         response->first_segment = false;
420                         rc = smbd_post_recv(info, response);
421                         if (rc) {
422                                 log_rdma_recv(ERR,
423                                         "post_recv failed rc=%d\n", rc);
424                                 put_receive_buffer(info, response);
425                                 break;
426                         }
427
428                         ret++;
429                 }
430         }
431
432         spin_lock(&info->lock_new_credits_offered);
433         info->new_credits_offered += ret;
434         spin_unlock(&info->lock_new_credits_offered);
435
436         /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */
437         info->send_immediate = true;
438         if (atomic_read(&info->receive_credits) <
439                 info->receive_credit_target - 1) {
440                 if (info->keep_alive_requested == KEEP_ALIVE_PENDING ||
441                     info->send_immediate) {
442                         log_keep_alive(INFO, "send an empty message\n");
443                         smbd_post_send_empty(info);
444                 }
445         }
446 }
447
448 /* Called from softirq, when recv is done */
449 static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
450 {
451         struct smbd_data_transfer *data_transfer;
452         struct smbd_response *response =
453                 container_of(wc->wr_cqe, struct smbd_response, cqe);
454         struct smbd_connection *info = response->info;
455         int data_length = 0;
456
457         log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
458                       response, response->type, wc->status, wc->opcode,
459                       wc->byte_len, wc->pkey_index);
460
461         if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
462                 log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
463                         wc->status, wc->opcode);
464                 smbd_disconnect_rdma_connection(info);
465                 goto error;
466         }
467
468         ib_dma_sync_single_for_cpu(
469                 wc->qp->device,
470                 response->sge.addr,
471                 response->sge.length,
472                 DMA_FROM_DEVICE);
473
474         switch (response->type) {
475         /* SMBD negotiation response */
476         case SMBD_NEGOTIATE_RESP:
477                 dump_smbd_negotiate_resp(smbd_response_payload(response));
478                 info->full_packet_received = true;
479                 info->negotiate_done =
480                         process_negotiation_response(response, wc->byte_len);
481                 complete(&info->negotiate_completion);
482                 break;
483
484         /* SMBD data transfer packet */
485         case SMBD_TRANSFER_DATA:
486                 data_transfer = smbd_response_payload(response);
487                 data_length = le32_to_cpu(data_transfer->data_length);
488
489                 /*
490                  * If this is a packet with a data payload, place the data in the
491                  * reassembly queue and wake up the reading thread
492                  */
493                 if (data_length) {
494                         if (info->full_packet_received)
495                                 response->first_segment = true;
496
497                         if (le32_to_cpu(data_transfer->remaining_data_length))
498                                 info->full_packet_received = false;
499                         else
500                                 info->full_packet_received = true;
501
502                         enqueue_reassembly(
503                                 info,
504                                 response,
505                                 data_length);
506                 } else
507                         put_empty_packet(info, response);
508
509                 if (data_length)
510                         wake_up_interruptible(&info->wait_reassembly_queue);
511
512                 atomic_dec(&info->receive_credits);
513                 info->receive_credit_target =
514                         le16_to_cpu(data_transfer->credits_requested);
515                 if (le16_to_cpu(data_transfer->credits_granted)) {
516                         atomic_add(le16_to_cpu(data_transfer->credits_granted),
517                                 &info->send_credits);
518                         /*
519                          * We have new send credits granted from remote peer
520                          * If any sender is waiting for credits, unblock it
521                          */
522                         wake_up_interruptible(&info->wait_send_queue);
523                 }
524
525                 log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n",
526                              le16_to_cpu(data_transfer->flags),
527                              le32_to_cpu(data_transfer->data_offset),
528                              le32_to_cpu(data_transfer->data_length),
529                              le32_to_cpu(data_transfer->remaining_data_length));
530
531                 /* Send a KEEP_ALIVE response right away if requested */
532                 info->keep_alive_requested = KEEP_ALIVE_NONE;
533                 if (le16_to_cpu(data_transfer->flags) &
534                                 SMB_DIRECT_RESPONSE_REQUESTED) {
535                         info->keep_alive_requested = KEEP_ALIVE_PENDING;
536                 }
537
538                 return;
539
540         default:
541                 log_rdma_recv(ERR,
542                         "unexpected response type=%d\n", response->type);
543         }
544
545 error:
546         put_receive_buffer(info, response);
547 }
548
549 static struct rdma_cm_id *smbd_create_id(
550                 struct smbd_connection *info,
551                 struct sockaddr *dstaddr, int port)
552 {
553         struct rdma_cm_id *id;
554         int rc;
555         __be16 *sport;
556
557         id = rdma_create_id(&init_net, smbd_conn_upcall, info,
558                 RDMA_PS_TCP, IB_QPT_RC);
559         if (IS_ERR(id)) {
560                 rc = PTR_ERR(id);
561                 log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc);
562                 return id;
563         }
564
565         if (dstaddr->sa_family == AF_INET6)
566                 sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port;
567         else
568                 sport = &((struct sockaddr_in *)dstaddr)->sin_port;
569
570         *sport = htons(port);
571
572         init_completion(&info->ri_done);
573         info->ri_rc = -ETIMEDOUT;
574
575         rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr,
576                 RDMA_RESOLVE_TIMEOUT);
577         if (rc) {
578                 log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
579                 goto out;
580         }
581         rc = wait_for_completion_interruptible_timeout(
582                 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
583         /* e.g. if interrupted returns -ERESTARTSYS */
584         if (rc < 0) {
585                 log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
586                 goto out;
587         }
588         rc = info->ri_rc;
589         if (rc) {
590                 log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
591                 goto out;
592         }
593
594         info->ri_rc = -ETIMEDOUT;
595         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
596         if (rc) {
597                 log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
598                 goto out;
599         }
600         rc = wait_for_completion_interruptible_timeout(
601                 &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
602         /* e.g. if interrupted returns -ERESTARTSYS */
603         if (rc < 0)  {
604                 log_rdma_event(ERR, "rdma_resolve_route timeout rc: %i\n", rc);
605                 goto out;
606         }
607         rc = info->ri_rc;
608         if (rc) {
609                 log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
610                 goto out;
611         }
612
613         return id;
614
615 out:
616         rdma_destroy_id(id);
617         return ERR_PTR(rc);
618 }
619
620 /*
621  * Test if FRWR (Fast Registration Work Requests) is supported on the device
622  * This implementation requires FRWR on RDMA read/write
623  * return value: true if it is supported
624  */
625 static bool frwr_is_supported(struct ib_device_attr *attrs)
626 {
627         if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
628                 return false;
629         if (attrs->max_fast_reg_page_list_len == 0)
630                 return false;
631         return true;
632 }
633
634 static int smbd_ia_open(
635                 struct smbd_connection *info,
636                 struct sockaddr *dstaddr, int port)
637 {
638         int rc;
639
640         info->id = smbd_create_id(info, dstaddr, port);
641         if (IS_ERR(info->id)) {
642                 rc = PTR_ERR(info->id);
643                 goto out1;
644         }
645
646         if (!frwr_is_supported(&info->id->device->attrs)) {
647                 log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
648                 log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
649                                info->id->device->attrs.device_cap_flags,
650                                info->id->device->attrs.max_fast_reg_page_list_len);
651                 rc = -EPROTONOSUPPORT;
652                 goto out2;
653         }
654         info->max_frmr_depth = min_t(int,
655                 smbd_max_frmr_depth,
656                 info->id->device->attrs.max_fast_reg_page_list_len);
657         info->mr_type = IB_MR_TYPE_MEM_REG;
658         if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
659                 info->mr_type = IB_MR_TYPE_SG_GAPS;
660
661         info->pd = ib_alloc_pd(info->id->device, 0);
662         if (IS_ERR(info->pd)) {
663                 rc = PTR_ERR(info->pd);
664                 log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
665                 goto out2;
666         }
667
668         return 0;
669
670 out2:
671         rdma_destroy_id(info->id);
672         info->id = NULL;
673
674 out1:
675         return rc;
676 }
677
678 /*
679  * Send a negotiation request message to the peer
680  * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3
681  * After negotiation, the transport is connected and ready for
682  * carrying upper layer SMB payload
683  */
684 static int smbd_post_send_negotiate_req(struct smbd_connection *info)
685 {
686         struct ib_send_wr send_wr;
687         int rc = -ENOMEM;
688         struct smbd_request *request;
689         struct smbd_negotiate_req *packet;
690
691         request = mempool_alloc(info->request_mempool, GFP_KERNEL);
692         if (!request)
693                 return rc;
694
695         request->info = info;
696
697         packet = smbd_request_payload(request);
698         packet->min_version = cpu_to_le16(SMBD_V1);
699         packet->max_version = cpu_to_le16(SMBD_V1);
700         packet->reserved = 0;
701         packet->credits_requested = cpu_to_le16(info->send_credit_target);
702         packet->preferred_send_size = cpu_to_le32(info->max_send_size);
703         packet->max_receive_size = cpu_to_le32(info->max_receive_size);
704         packet->max_fragmented_size =
705                 cpu_to_le32(info->max_fragmented_recv_size);
706
707         request->num_sge = 1;
708         request->sge[0].addr = ib_dma_map_single(
709                                 info->id->device, (void *)packet,
710                                 sizeof(*packet), DMA_TO_DEVICE);
711         if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
712                 rc = -EIO;
713                 goto dma_mapping_failed;
714         }
715
716         request->sge[0].length = sizeof(*packet);
717         request->sge[0].lkey = info->pd->local_dma_lkey;
718
719         ib_dma_sync_single_for_device(
720                 info->id->device, request->sge[0].addr,
721                 request->sge[0].length, DMA_TO_DEVICE);
722
723         request->cqe.done = send_done;
724
725         send_wr.next = NULL;
726         send_wr.wr_cqe = &request->cqe;
727         send_wr.sg_list = request->sge;
728         send_wr.num_sge = request->num_sge;
729         send_wr.opcode = IB_WR_SEND;
730         send_wr.send_flags = IB_SEND_SIGNALED;
731
732         log_rdma_send(INFO, "sge addr=0x%llx length=%u lkey=0x%x\n",
733                 request->sge[0].addr,
734                 request->sge[0].length, request->sge[0].lkey);
735
736         atomic_inc(&info->send_pending);
737         rc = ib_post_send(info->id->qp, &send_wr, NULL);
738         if (!rc)
739                 return 0;
740
741         /* if we reach here, post send failed */
742         log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
743         atomic_dec(&info->send_pending);
744         ib_dma_unmap_single(info->id->device, request->sge[0].addr,
745                 request->sge[0].length, DMA_TO_DEVICE);
746
747         smbd_disconnect_rdma_connection(info);
748
749 dma_mapping_failed:
750         mempool_free(request, info->request_mempool);
751         return rc;
752 }
753
754 /*
755  * Extend the credits to remote peer
756  * This implements [MS-SMBD] 3.1.5.9
757  * The idea is that we should extend credits to remote peer as quickly as
758  * it's allowed, to maintain data flow. We allocate as many receive
759  * buffers as possible, and extend the receive credits to the remote peer.
760  * return value: the new credits being granted.
761  */
762 static int manage_credits_prior_sending(struct smbd_connection *info)
763 {
764         int new_credits;
765
766         spin_lock(&info->lock_new_credits_offered);
767         new_credits = info->new_credits_offered;
768         info->new_credits_offered = 0;
769         spin_unlock(&info->lock_new_credits_offered);
770
771         return new_credits;
772 }
773
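/*
 * Credit accounting summary (as implemented above and in
 * smbd_post_send_iter()): receive buffers reposted from
 * smbd_post_send_credits() accumulate in new_credits_offered; this function
 * drains that counter under the lock, and the caller advertises the value
 * in the credits_granted field of the next outgoing packet while also
 * adding it to receive_credits.
 */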
774 /*
775  * Check if we need to send a KEEP_ALIVE message
776  * The idle connection timer triggers a KEEP_ALIVE message when it expires.
777  * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flags to have the peer
778  * send back a response.
779  * return value:
780  * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
781  * 0: otherwise
782  */
783 static int manage_keep_alive_before_sending(struct smbd_connection *info)
784 {
785         if (info->keep_alive_requested == KEEP_ALIVE_PENDING) {
786                 info->keep_alive_requested = KEEP_ALIVE_SENT;
787                 return 1;
788         }
789         return 0;
790 }
791
792 /* Post the send request */
793 static int smbd_post_send(struct smbd_connection *info,
794                 struct smbd_request *request)
795 {
796         struct ib_send_wr send_wr;
797         int rc, i;
798
799         for (i = 0; i < request->num_sge; i++) {
800                 log_rdma_send(INFO,
801                         "rdma_request sge[%d] addr=0x%llx length=%u\n",
802                         i, request->sge[i].addr, request->sge[i].length);
803                 ib_dma_sync_single_for_device(
804                         info->id->device,
805                         request->sge[i].addr,
806                         request->sge[i].length,
807                         DMA_TO_DEVICE);
808         }
809
810         request->cqe.done = send_done;
811
812         send_wr.next = NULL;
813         send_wr.wr_cqe = &request->cqe;
814         send_wr.sg_list = request->sge;
815         send_wr.num_sge = request->num_sge;
816         send_wr.opcode = IB_WR_SEND;
817         send_wr.send_flags = IB_SEND_SIGNALED;
818
819         rc = ib_post_send(info->id->qp, &send_wr, NULL);
820         if (rc) {
821                 log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
822                 smbd_disconnect_rdma_connection(info);
823                 rc = -EAGAIN;
824         } else
825                 /* Reset timer for idle connection after packet is sent */
826                 mod_delayed_work(info->workqueue, &info->idle_timer_work,
827                         info->keep_alive_interval*HZ);
828
829         return rc;
830 }
831
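/*
 * Note: every send is posted with IB_SEND_SIGNALED, so send_done() runs for
 * each work request to unmap the SGEs and drop send_pending; a successful
 * post also pushes the idle keepalive timer out by keep_alive_interval
 * seconds.
 */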
832 static int smbd_post_send_iter(struct smbd_connection *info,
833                                struct iov_iter *iter,
834                                int *_remaining_data_length)
835 {
836         int i, rc;
837         int header_length;
838         int data_length;
839         struct smbd_request *request;
840         struct smbd_data_transfer *packet;
841         int new_credits = 0;
842
843 wait_credit:
844         /* Wait for send credits. A SMBD packet needs one credit */
845         rc = wait_event_interruptible(info->wait_send_queue,
846                 atomic_read(&info->send_credits) > 0 ||
847                 info->transport_status != SMBD_CONNECTED);
848         if (rc)
849                 goto err_wait_credit;
850
851         if (info->transport_status != SMBD_CONNECTED) {
852                 log_outgoing(ERR, "disconnected not sending on wait_credit\n");
853                 rc = -EAGAIN;
854                 goto err_wait_credit;
855         }
856         if (unlikely(atomic_dec_return(&info->send_credits) < 0)) {
857                 atomic_inc(&info->send_credits);
858                 goto wait_credit;
859         }
860
861 wait_send_queue:
862         wait_event(info->wait_post_send,
863                 atomic_read(&info->send_pending) < info->send_credit_target ||
864                 info->transport_status != SMBD_CONNECTED);
865
866         if (info->transport_status != SMBD_CONNECTED) {
867                 log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
868                 rc = -EAGAIN;
869                 goto err_wait_send_queue;
870         }
871
872         if (unlikely(atomic_inc_return(&info->send_pending) >
873                                 info->send_credit_target)) {
874                 atomic_dec(&info->send_pending);
875                 goto wait_send_queue;
876         }
877
878         request = mempool_alloc(info->request_mempool, GFP_KERNEL);
879         if (!request) {
880                 rc = -ENOMEM;
881                 goto err_alloc;
882         }
883
884         request->info = info;
885         memset(request->sge, 0, sizeof(request->sge));
886
887         /* Fill in the data payload to find out how much data we can add */
888         if (iter) {
889                 struct smb_extract_to_rdma extract = {
890                         .nr_sge         = 1,
891                         .max_sge        = SMBDIRECT_MAX_SEND_SGE,
892                         .sge            = request->sge,
893                         .device         = info->id->device,
894                         .local_dma_lkey = info->pd->local_dma_lkey,
895                         .direction      = DMA_TO_DEVICE,
896                 };
897
898                 rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length,
899                                               &extract);
900                 if (rc < 0)
901                         goto err_dma;
902                 data_length = rc;
903                 request->num_sge = extract.nr_sge;
904                 *_remaining_data_length -= data_length;
905         } else {
906                 data_length = 0;
907                 request->num_sge = 1;
908         }
909
910         /* Fill in the packet header */
911         packet = smbd_request_payload(request);
912         packet->credits_requested = cpu_to_le16(info->send_credit_target);
913
914         new_credits = manage_credits_prior_sending(info);
915         atomic_add(new_credits, &info->receive_credits);
916         packet->credits_granted = cpu_to_le16(new_credits);
917
918         info->send_immediate = false;
919
920         packet->flags = 0;
921         if (manage_keep_alive_before_sending(info))
922                 packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
923
924         packet->reserved = 0;
925         if (!data_length)
926                 packet->data_offset = 0;
927         else
928                 packet->data_offset = cpu_to_le32(24);
929         packet->data_length = cpu_to_le32(data_length);
930         packet->remaining_data_length = cpu_to_le32(*_remaining_data_length);
931         packet->padding = 0;
932
933         log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
934                      le16_to_cpu(packet->credits_requested),
935                      le16_to_cpu(packet->credits_granted),
936                      le32_to_cpu(packet->data_offset),
937                      le32_to_cpu(packet->data_length),
938                      le32_to_cpu(packet->remaining_data_length));
939
940         /* Map the packet to DMA */
941         header_length = sizeof(struct smbd_data_transfer);
942         /* If this is a packet without payload, don't send padding */
943         if (!data_length)
944                 header_length = offsetof(struct smbd_data_transfer, padding);
945
946         request->sge[0].addr = ib_dma_map_single(info->id->device,
947                                                  (void *)packet,
948                                                  header_length,
949                                                  DMA_TO_DEVICE);
950         if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
951                 rc = -EIO;
952                 request->sge[0].addr = 0;
953                 goto err_dma;
954         }
955
956         request->sge[0].length = header_length;
957         request->sge[0].lkey = info->pd->local_dma_lkey;
958
959         rc = smbd_post_send(info, request);
960         if (!rc)
961                 return 0;
962
963 err_dma:
964         for (i = 0; i < request->num_sge; i++)
965                 if (request->sge[i].addr)
966                         ib_dma_unmap_single(info->id->device,
967                                             request->sge[i].addr,
968                                             request->sge[i].length,
969                                             DMA_TO_DEVICE);
970         mempool_free(request, info->request_mempool);
971
972         /* roll back receive credits and credits to be offered */
973         spin_lock(&info->lock_new_credits_offered);
974         info->new_credits_offered += new_credits;
975         spin_unlock(&info->lock_new_credits_offered);
976         atomic_sub(new_credits, &info->receive_credits);
977
978 err_alloc:
979         if (atomic_dec_and_test(&info->send_pending))
980                 wake_up(&info->wait_send_pending);
981
982 err_wait_send_queue:
983         /* roll back send credits and pending */
984         atomic_inc(&info->send_credits);
985
986 err_wait_credit:
987         return rc;
988 }
989
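/*
 * Note on the error paths above: the labels fall through in reverse order of
 * acquisition: err_dma unmaps any mapped SGEs, frees the request, and rolls
 * back the offered receive credits; err_alloc drops the send_pending count;
 * err_wait_send_queue returns the send credit taken at wait_credit.
 */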
990 /*
991  * Send an empty message
992  * An empty message is used to extend credits to the peer and for keep-alive
993  * while there is no upper layer payload to send at the time
994  */
995 static int smbd_post_send_empty(struct smbd_connection *info)
996 {
997         int remaining_data_length = 0;
998
999         info->count_send_empty++;
1000         return smbd_post_send_iter(info, NULL, &remaining_data_length);
1001 }
1002
1003 /*
1004  * Post a receive request to the transport
1005  * The remote peer can only send data when a receive request is posted
1006  * The interaction is controlled by the send/receive credit system
1007  */
1008 static int smbd_post_recv(
1009                 struct smbd_connection *info, struct smbd_response *response)
1010 {
1011         struct ib_recv_wr recv_wr;
1012         int rc = -EIO;
1013
1014         response->sge.addr = ib_dma_map_single(
1015                                 info->id->device, response->packet,
1016                                 info->max_receive_size, DMA_FROM_DEVICE);
1017         if (ib_dma_mapping_error(info->id->device, response->sge.addr))
1018                 return rc;
1019
1020         response->sge.length = info->max_receive_size;
1021         response->sge.lkey = info->pd->local_dma_lkey;
1022
1023         response->cqe.done = recv_done;
1024
1025         recv_wr.wr_cqe = &response->cqe;
1026         recv_wr.next = NULL;
1027         recv_wr.sg_list = &response->sge;
1028         recv_wr.num_sge = 1;
1029
1030         rc = ib_post_recv(info->id->qp, &recv_wr, NULL);
1031         if (rc) {
1032                 ib_dma_unmap_single(info->id->device, response->sge.addr,
1033                                     response->sge.length, DMA_FROM_DEVICE);
1034                 smbd_disconnect_rdma_connection(info);
1035                 log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
1036         }
1037
1038         return rc;
1039 }
1040
1041 /* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
1042 static int smbd_negotiate(struct smbd_connection *info)
1043 {
1044         int rc;
1045         struct smbd_response *response = get_receive_buffer(info);
1046
1047         response->type = SMBD_NEGOTIATE_RESP;
1048         rc = smbd_post_recv(info, response);
1049         log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
1050                        rc, response->sge.addr,
1051                        response->sge.length, response->sge.lkey);
1052         if (rc)
1053                 return rc;
1054
1055         init_completion(&info->negotiate_completion);
1056         info->negotiate_done = false;
1057         rc = smbd_post_send_negotiate_req(info);
1058         if (rc)
1059                 return rc;
1060
1061         rc = wait_for_completion_interruptible_timeout(
1062                 &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ);
1063         log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc);
1064
1065         if (info->negotiate_done)
1066                 return 0;
1067
1068         if (rc == 0)
1069                 rc = -ETIMEDOUT;
1070         else if (rc == -ERESTARTSYS)
1071                 rc = -EINTR;
1072         else
1073                 rc = -ENOTCONN;
1074
1075         return rc;
1076 }
1077
1078 static void put_empty_packet(
1079                 struct smbd_connection *info, struct smbd_response *response)
1080 {
1081         spin_lock(&info->empty_packet_queue_lock);
1082         list_add_tail(&response->list, &info->empty_packet_queue);
1083         info->count_empty_packet_queue++;
1084         spin_unlock(&info->empty_packet_queue_lock);
1085
1086         queue_work(info->workqueue, &info->post_send_credits_work);
1087 }
1088
1089 /*
1090  * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
1091  * This is a queue for reassembling upper layer payload and presenting it to
1092  * the upper layer. All incoming payloads go to the reassembly queue, regardless
1093  * of whether reassembly is required. The upper layer code reads from the queue
1094  * for all incoming payloads.
1095  * Put a received packet to the reassembly queue
1096  * response: the packet received
1097  * data_length: the size of payload in this packet
1098  */
1099 static void enqueue_reassembly(
1100         struct smbd_connection *info,
1101         struct smbd_response *response,
1102         int data_length)
1103 {
1104         spin_lock(&info->reassembly_queue_lock);
1105         list_add_tail(&response->list, &info->reassembly_queue);
1106         info->reassembly_queue_length++;
1107         /*
1108          * Make sure reassembly_data_length is updated after list and
1109          * reassembly_queue_length are updated. On the dequeue side
1110          * reassembly_data_length is checked without a lock to determine
1111          * if reassembly_queue_length and the list are up to date
1112          */
1113         virt_wmb();
1114         info->reassembly_data_length += data_length;
1115         spin_unlock(&info->reassembly_queue_lock);
1116         info->count_reassembly_queue++;
1117         info->count_enqueue_reassembly_queue++;
1118 }
1119
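/*
 * Note (hedged): the virt_wmb() above orders the list and queue-length
 * updates before the reassembly_data_length update; the dequeue side, which
 * reads reassembly_data_length locklessly, is expected to pair this with a
 * matching read barrier before walking the queue.
 */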
1120 /*
1121  * Get the first entry at the front of reassembly queue
1122  * Caller is responsible for locking
1123  * return value: the first entry if any, NULL if queue is empty
1124  */
1125 static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
1126 {
1127         struct smbd_response *ret = NULL;
1128
1129         if (!list_empty(&info->reassembly_queue)) {
1130                 ret = list_first_entry(
1131                         &info->reassembly_queue,
1132                         struct smbd_response, list);
1133         }
1134         return ret;
1135 }
1136
1137 static struct smbd_response *get_empty_queue_buffer(
1138                 struct smbd_connection *info)
1139 {
1140         struct smbd_response *ret = NULL;
1141         unsigned long flags;
1142
1143         spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
1144         if (!list_empty(&info->empty_packet_queue)) {
1145                 ret = list_first_entry(
1146                         &info->empty_packet_queue,
1147                         struct smbd_response, list);
1148                 list_del(&ret->list);
1149                 info->count_empty_packet_queue--;
1150         }
1151         spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);
1152
1153         return ret;
1154 }
1155
1156 /*
1157  * Get a receive buffer
1158  * For each remote send, we need to post a receive. The receive buffers are
1159  * pre-allocated when the transport is established.
1160  * return value: the receive buffer, NULL if none is available
1161  */
1162 static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
1163 {
1164         struct smbd_response *ret = NULL;
1165         unsigned long flags;
1166
1167         spin_lock_irqsave(&info->receive_queue_lock, flags);
1168         if (!list_empty(&info->receive_queue)) {
1169                 ret = list_first_entry(
1170                         &info->receive_queue,
1171                         struct smbd_response, list);
1172                 list_del(&ret->list);
1173                 info->count_receive_queue--;
1174                 info->count_get_receive_buffer++;
1175         }
1176         spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1177
1178         return ret;
1179 }
1180
1181 /*
1182  * Return a receive buffer
1183  * When a receive buffer is returned, we can post a new receive and extend
1184  * more receive credits to the remote peer. This is done immediately after a
1185  * receive buffer is returned.
1186  */
1187 static void put_receive_buffer(
1188         struct smbd_connection *info, struct smbd_response *response)
1189 {
1190         unsigned long flags;
1191
1192         ib_dma_unmap_single(info->id->device, response->sge.addr,
1193                 response->sge.length, DMA_FROM_DEVICE);
1194
1195         spin_lock_irqsave(&info->receive_queue_lock, flags);
1196         list_add_tail(&response->list, &info->receive_queue);
1197         info->count_receive_queue++;
1198         info->count_put_receive_buffer++;
1199         spin_unlock_irqrestore(&info->receive_queue_lock, flags);
1200
1201         queue_work(info->workqueue, &info->post_send_credits_work);
1202 }
1203
1204 /* Preallocate all receive buffers on transport establishment */
1205 static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
1206 {
1207         int i;
1208         struct smbd_response *response;
1209
1210         INIT_LIST_HEAD(&info->reassembly_queue);
1211         spin_lock_init(&info->reassembly_queue_lock);
1212         info->reassembly_data_length = 0;
1213         info->reassembly_queue_length = 0;
1214
1215         INIT_LIST_HEAD(&info->receive_queue);
1216         spin_lock_init(&info->receive_queue_lock);
1217         info->count_receive_queue = 0;
1218
1219         INIT_LIST_HEAD(&info->empty_packet_queue);
1220         spin_lock_init(&info->empty_packet_queue_lock);
1221         info->count_empty_packet_queue = 0;
1222
1223         init_waitqueue_head(&info->wait_receive_queues);
1224
1225         for (i = 0; i < num_buf; i++) {
1226                 response = mempool_alloc(info->response_mempool, GFP_KERNEL);
1227                 if (!response)
1228                         goto allocate_failed;
1229
1230                 response->info = info;
1231                 list_add_tail(&response->list, &info->receive_queue);
1232                 info->count_receive_queue++;
1233         }
1234
1235         return 0;
1236
1237 allocate_failed:
1238         while (!list_empty(&info->receive_queue)) {
1239                 response = list_first_entry(
1240                                 &info->receive_queue,
1241                                 struct smbd_response, list);
1242                 list_del(&response->list);
1243                 info->count_receive_queue--;
1244
1245                 mempool_free(response, info->response_mempool);
1246         }
1247         return -ENOMEM;
1248 }
1249
1250 static void destroy_receive_buffers(struct smbd_connection *info)
1251 {
1252         struct smbd_response *response;
1253
1254         while ((response = get_receive_buffer(info)))
1255                 mempool_free(response, info->response_mempool);
1256
1257         while ((response = get_empty_queue_buffer(info)))
1258                 mempool_free(response, info->response_mempool);
1259 }
1260
1261 /* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
1262 static void idle_connection_timer(struct work_struct *work)
1263 {
1264         struct smbd_connection *info = container_of(
1265                                         work, struct smbd_connection,
1266                                         idle_timer_work.work);
1267
1268         if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
1269                 log_keep_alive(ERR,
1270                         "error status info->keep_alive_requested=%d\n",
1271                         info->keep_alive_requested);
1272                 smbd_disconnect_rdma_connection(info);
1273                 return;
1274         }
1275
1276         log_keep_alive(INFO, "about to send an empty idle message\n");
1277         smbd_post_send_empty(info);
1278
1279         /* Setup the next idle timeout work */
1280         queue_delayed_work(info->workqueue, &info->idle_timer_work,
1281                         info->keep_alive_interval*HZ);
1282 }
1283
1284 /*
1285  * Destroy the transport and related RDMA and memory resources
1286  * Need to go through all the pending counters and make sure no one is using
1287  * the transport while it is being destroyed
1288  */
1289 void smbd_destroy(struct TCP_Server_Info *server)
1290 {
1291         struct smbd_connection *info = server->smbd_conn;
1292         struct smbd_response *response;
1293         unsigned long flags;
1294
1295         if (!info) {
1296                 log_rdma_event(INFO, "rdma session already destroyed\n");
1297                 return;
1298         }
1299
1300         log_rdma_event(INFO, "destroying rdma session\n");
1301         if (info->transport_status != SMBD_DISCONNECTED) {
1302                 rdma_disconnect(server->smbd_conn->id);
1303                 log_rdma_event(INFO, "wait for transport being disconnected\n");
1304                 wait_event_interruptible(
1305                         info->disconn_wait,
1306                         info->transport_status == SMBD_DISCONNECTED);
1307         }
1308
1309         log_rdma_event(INFO, "destroying qp\n");
1310         ib_drain_qp(info->id->qp);
1311         rdma_destroy_qp(info->id);
1312
1313         log_rdma_event(INFO, "cancelling idle timer\n");
1314         cancel_delayed_work_sync(&info->idle_timer_work);
1315
1316         log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
1317         wait_event(info->wait_send_pending,
1318                 atomic_read(&info->send_pending) == 0);
1319
1320         /* It's not possible for upper layer to get to reassembly */
1321         log_rdma_event(INFO, "drain the reassembly queue\n");
1322         do {
1323                 spin_lock_irqsave(&info->reassembly_queue_lock, flags);
1324                 response = _get_first_reassembly(info);
1325                 if (response) {
1326                         list_del(&response->list);
1327                         spin_unlock_irqrestore(
1328                                 &info->reassembly_queue_lock, flags);
1329                         put_receive_buffer(info, response);
1330                 } else
1331                         spin_unlock_irqrestore(
1332                                 &info->reassembly_queue_lock, flags);
1333         } while (response);
1334         info->reassembly_data_length = 0;
1335
1336         log_rdma_event(INFO, "free receive buffers\n");
1337         wait_event(info->wait_receive_queues,
1338                 info->count_receive_queue + info->count_empty_packet_queue
1339                         == info->receive_credit_max);
1340         destroy_receive_buffers(info);
1341
1342         /*
1343          * For performance reasons, memory registration and deregistration
1344          * are not locked by srv_mutex. It is possible some processes are
1345          * blocked on transport srv_mutex while holding memory registration.
1346          * Release the transport srv_mutex to allow them to hit the failure
1347          * path when sending data, and then release memory registrations.
1348          */
1349         log_rdma_event(INFO, "freeing mr list\n");
1350         wake_up_interruptible_all(&info->wait_mr);
1351         while (atomic_read(&info->mr_used_count)) {
1352                 cifs_server_unlock(server);
1353                 msleep(1000);
1354                 cifs_server_lock(server);
1355         }
1356         destroy_mr_list(info);
1357
1358         ib_free_cq(info->send_cq);
1359         ib_free_cq(info->recv_cq);
1360         ib_dealloc_pd(info->pd);
1361         rdma_destroy_id(info->id);
1362
1363         /* free mempools */
1364         mempool_destroy(info->request_mempool);
1365         kmem_cache_destroy(info->request_cache);
1366
1367         mempool_destroy(info->response_mempool);
1368         kmem_cache_destroy(info->response_cache);
1369
1370         info->transport_status = SMBD_DESTROYED;
1371
1372         destroy_workqueue(info->workqueue);
1373         log_rdma_event(INFO,  "rdma session destroyed\n");
1374         kfree(info);
1375         server->smbd_conn = NULL;
1376 }
1377
1378 /*
1379  * Reconnect this SMBD connection, called from upper layer
1380  * return value: 0 on success, or actual error code
1381  */
1382 int smbd_reconnect(struct TCP_Server_Info *server)
1383 {
1384         log_rdma_event(INFO, "reconnecting rdma session\n");
1385
1386         if (!server->smbd_conn) {
1387                 log_rdma_event(INFO, "rdma session already destroyed\n");
1388                 goto create_conn;
1389         }
1390
1391         /*
1392  * This is possible if the transport is disconnected and we haven't received
1393  * a notification from RDMA, but the upper layer has detected a timeout
1394          */
1395         if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
1396                 log_rdma_event(INFO, "disconnecting transport\n");
1397                 smbd_destroy(server);
1398         }
1399
1400 create_conn:
1401         log_rdma_event(INFO, "creating rdma session\n");
1402         server->smbd_conn = smbd_get_connection(
1403                 server, (struct sockaddr *) &server->dstaddr);
1404
1405         if (server->smbd_conn) {
1406                 cifs_dbg(VFS, "RDMA transport re-established\n");
1407                 trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
1408                 return 0;
1409         }
1410         trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
1411         return -ENOENT;
1412 }
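/*
 * A rough summary of the transport lifecycle as used in this file: a
 * connection starts out SMBD_CONNECTING, moves to SMBD_CONNECTED once the
 * RDMA connection is established, drops to SMBD_DISCONNECTED when the
 * connection is torn down (a failed negotiation passes through
 * SMBD_NEGOTIATE_FAILED first), and ends in SMBD_DESTROYED after
 * smbd_destroy() has released all resources. smbd_reconnect() simply
 * destroys any surviving connection and builds a fresh one with
 * smbd_get_connection().
 */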
1413
1414 static void destroy_caches_and_workqueue(struct smbd_connection *info)
1415 {
1416         destroy_receive_buffers(info);
1417         destroy_workqueue(info->workqueue);
1418         mempool_destroy(info->response_mempool);
1419         kmem_cache_destroy(info->response_cache);
1420         mempool_destroy(info->request_mempool);
1421         kmem_cache_destroy(info->request_cache);
1422 }
1423
1424 #define MAX_NAME_LEN    80
1425 static int allocate_caches_and_workqueue(struct smbd_connection *info)
1426 {
1427         char name[MAX_NAME_LEN];
1428         int rc;
1429
1430         scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
1431         info->request_cache =
1432                 kmem_cache_create(
1433                         name,
1434                         sizeof(struct smbd_request) +
1435                                 sizeof(struct smbd_data_transfer),
1436                         0, SLAB_HWCACHE_ALIGN, NULL);
1437         if (!info->request_cache)
1438                 return -ENOMEM;
1439
1440         info->request_mempool =
1441                 mempool_create(info->send_credit_target, mempool_alloc_slab,
1442                         mempool_free_slab, info->request_cache);
1443         if (!info->request_mempool)
1444                 goto out1;
1445
1446         scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
1447         info->response_cache =
1448                 kmem_cache_create(
1449                         name,
1450                         sizeof(struct smbd_response) +
1451                                 info->max_receive_size,
1452                         0, SLAB_HWCACHE_ALIGN, NULL);
1453         if (!info->response_cache)
1454                 goto out2;
1455
1456         info->response_mempool =
1457                 mempool_create(info->receive_credit_max, mempool_alloc_slab,
1458                        mempool_free_slab, info->response_cache);
1459         if (!info->response_mempool)
1460                 goto out3;
1461
1462         scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
1463         info->workqueue = create_workqueue(name);
1464         if (!info->workqueue)
1465                 goto out4;
1466
1467         rc = allocate_receive_buffers(info, info->receive_credit_max);
1468         if (rc) {
1469                 log_rdma_event(ERR, "failed to allocate receive buffers\n");
1470                 goto out5;
1471         }
1472
1473         return 0;
1474
1475 out5:
1476         destroy_workqueue(info->workqueue);
1477 out4:
1478         mempool_destroy(info->response_mempool);
1479 out3:
1480         kmem_cache_destroy(info->response_cache);
1481 out2:
1482         mempool_destroy(info->request_mempool);
1483 out1:
1484         kmem_cache_destroy(info->request_cache);
1485         return -ENOMEM;
1486 }
1487
1488 /* Create a SMBD connection, called by upper layer */
1489 static struct smbd_connection *_smbd_get_connection(
1490         struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port)
1491 {
1492         int rc;
1493         struct smbd_connection *info;
1494         struct rdma_conn_param conn_param;
1495         struct ib_qp_init_attr qp_attr;
1496         struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
1497         struct ib_port_immutable port_immutable;
1498         u32 ird_ord_hdr[2];
1499
1500         info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
1501         if (!info)
1502                 return NULL;
1503
1504         info->transport_status = SMBD_CONNECTING;
1505         rc = smbd_ia_open(info, dstaddr, port);
1506         if (rc) {
1507                 log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
1508                 goto create_id_failed;
1509         }
1510
1511         if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
1512             smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
1513                 log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1514                                smbd_send_credit_target,
1515                                info->id->device->attrs.max_cqe,
1516                                info->id->device->attrs.max_qp_wr);
1517                 goto config_failed;
1518         }
1519
1520         if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
1521             smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
1522                 log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
1523                                smbd_receive_credit_max,
1524                                info->id->device->attrs.max_cqe,
1525                                info->id->device->attrs.max_qp_wr);
1526                 goto config_failed;
1527         }
1528
1529         info->receive_credit_max = smbd_receive_credit_max;
1530         info->send_credit_target = smbd_send_credit_target;
1531         info->max_send_size = smbd_max_send_size;
1532         info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
1533         info->max_receive_size = smbd_max_receive_size;
1534         info->keep_alive_interval = smbd_keep_alive_interval;
1535
1536         if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
1537             info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
1538                 log_rdma_event(ERR,
1539                         "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
1540                         IB_DEVICE_NAME_MAX,
1541                         info->id->device->name,
1542                         info->id->device->attrs.max_send_sge,
1543                         info->id->device->attrs.max_recv_sge);
1544                 goto config_failed;
1545         }
1546
1547         info->send_cq = NULL;
1548         info->recv_cq = NULL;
1549         info->send_cq =
1550                 ib_alloc_cq_any(info->id->device, info,
1551                                 info->send_credit_target, IB_POLL_SOFTIRQ);
1552         if (IS_ERR(info->send_cq)) {
1553                 info->send_cq = NULL;
1554                 goto alloc_cq_failed;
1555         }
1556
1557         info->recv_cq =
1558                 ib_alloc_cq_any(info->id->device, info,
1559                                 info->receive_credit_max, IB_POLL_SOFTIRQ);
1560         if (IS_ERR(info->recv_cq)) {
1561                 info->recv_cq = NULL;
1562                 goto alloc_cq_failed;
1563         }
1564
1565         memset(&qp_attr, 0, sizeof(qp_attr));
1566         qp_attr.event_handler = smbd_qp_async_error_upcall;
1567         qp_attr.qp_context = info;
1568         qp_attr.cap.max_send_wr = info->send_credit_target;
1569         qp_attr.cap.max_recv_wr = info->receive_credit_max;
1570         qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE;
1571         qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE;
1572         qp_attr.cap.max_inline_data = 0;
1573         qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1574         qp_attr.qp_type = IB_QPT_RC;
1575         qp_attr.send_cq = info->send_cq;
1576         qp_attr.recv_cq = info->recv_cq;
1577         qp_attr.port_num = ~0;
1578
1579         rc = rdma_create_qp(info->id, info->pd, &qp_attr);
1580         if (rc) {
1581                 log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
1582                 goto create_qp_failed;
1583         }
1584
1585         memset(&conn_param, 0, sizeof(conn_param));
1586         conn_param.initiator_depth = 0;
1587
1588         conn_param.responder_resources =
1589                 min(info->id->device->attrs.max_qp_rd_atom,
1590                     SMBD_CM_RESPONDER_RESOURCES);
1591         info->responder_resources = conn_param.responder_resources;
1592         log_rdma_mr(INFO, "responder_resources=%d\n",
1593                 info->responder_resources);
1594
1595         /* Need to send IRD/ORD in private data for iWARP */
1596         info->id->device->ops.get_port_immutable(
1597                 info->id->device, info->id->port_num, &port_immutable);
1598         if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
1599                 ird_ord_hdr[0] = info->responder_resources;
1600                 ird_ord_hdr[1] = 1;
1601                 conn_param.private_data = ird_ord_hdr;
1602                 conn_param.private_data_len = sizeof(ird_ord_hdr);
1603         } else {
1604                 conn_param.private_data = NULL;
1605                 conn_param.private_data_len = 0;
1606         }
1607
1608         conn_param.retry_count = SMBD_CM_RETRY;
1609         conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY;
1610         conn_param.flow_control = 0;
1611
1612         log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
1613                 &addr_in->sin_addr, port);
1614
1615         init_waitqueue_head(&info->conn_wait);
1616         init_waitqueue_head(&info->disconn_wait);
1617         init_waitqueue_head(&info->wait_reassembly_queue);
1618         rc = rdma_connect(info->id, &conn_param);
1619         if (rc) {
1620                 log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
1621                 goto rdma_connect_failed;
1622         }
1623
1624         wait_event_interruptible(
1625                 info->conn_wait, info->transport_status != SMBD_CONNECTING);
1626
1627         if (info->transport_status != SMBD_CONNECTED) {
1628                 log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
1629                 goto rdma_connect_failed;
1630         }
1631
1632         log_rdma_event(INFO, "rdma_connect connected\n");
1633
1634         rc = allocate_caches_and_workqueue(info);
1635         if (rc) {
1636                 log_rdma_event(ERR, "cache allocation failed\n");
1637                 goto allocate_cache_failed;
1638         }
1639
1640         init_waitqueue_head(&info->wait_send_queue);
1641         INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
1642         queue_delayed_work(info->workqueue, &info->idle_timer_work,
1643                 info->keep_alive_interval*HZ);
1644
1645         init_waitqueue_head(&info->wait_send_pending);
1646         atomic_set(&info->send_pending, 0);
1647
1648         init_waitqueue_head(&info->wait_post_send);
1649
1650         INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work);
1651         INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits);
1652         info->new_credits_offered = 0;
1653         spin_lock_init(&info->lock_new_credits_offered);
1654
1655         rc = smbd_negotiate(info);
1656         if (rc) {
1657                 log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc);
1658                 goto negotiation_failed;
1659         }
1660
1661         rc = allocate_mr_list(info);
1662         if (rc) {
1663                 log_rdma_mr(ERR, "memory registration allocation failed\n");
1664                 goto allocate_mr_failed;
1665         }
1666
1667         return info;
1668
1669 allocate_mr_failed:
1670         /* At this point, a full transport shutdown is needed */
1671         server->smbd_conn = info;
1672         smbd_destroy(server);
1673         return NULL;
1674
1675 negotiation_failed:
1676         cancel_delayed_work_sync(&info->idle_timer_work);
1677         destroy_caches_and_workqueue(info);
1678         info->transport_status = SMBD_NEGOTIATE_FAILED;
1679         init_waitqueue_head(&info->conn_wait);
1680         rdma_disconnect(info->id);
1681         wait_event(info->conn_wait,
1682                 info->transport_status == SMBD_DISCONNECTED);
1683
1684 allocate_cache_failed:
1685 rdma_connect_failed:
1686         rdma_destroy_qp(info->id);
1687
1688 create_qp_failed:
1689 alloc_cq_failed:
1690         if (info->send_cq)
1691                 ib_free_cq(info->send_cq);
1692         if (info->recv_cq)
1693                 ib_free_cq(info->recv_cq);
1694
1695 config_failed:
1696         ib_dealloc_pd(info->pd);
1697         rdma_destroy_id(info->id);
1698
1699 create_id_failed:
1700         kfree(info);
1701         return NULL;
1702 }
1703
1704 struct smbd_connection *smbd_get_connection(
1705         struct TCP_Server_Info *server, struct sockaddr *dstaddr)
1706 {
1707         struct smbd_connection *ret;
1708         int port = SMBD_PORT;
1709
1710 try_again:
1711         ret = _smbd_get_connection(server, dstaddr, port);
1712
1713         /* Try SMB_PORT if SMBD_PORT doesn't work */
1714         if (!ret && port == SMBD_PORT) {
1715                 port = SMB_PORT;
1716                 goto try_again;
1717         }
1718         return ret;
1719 }
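/*
 * Note on the fallback above: SMBD_PORT (5445) is tried first and SMB_PORT
 * (445) is used as the fallback, matching the port definitions near the top
 * of this file.
 */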
1720
1721 /*
1722  * Receive data from receive reassembly queue
1723  * All the incoming data packets are placed in reassembly queue
1724  * buf: the buffer to read data into
1725  * size: the length of data to read
1726  * return value: actual data read
1727  * Note: this implementation copies the data from the reassembly queue to the
1728  * receive buffers used by the upper layer. This is not the optimal code path.
1729  * A better way is to not have the upper layer allocate its own receive buffers
1730  * but rather borrow the buffer from the reassembly queue, and return it after
1731  * the data is consumed. But this requires more changes to upper layer code, and
1732  * also needs to consider packet boundaries while they are still being reassembled.
1733  */
1734 static int smbd_recv_buf(struct smbd_connection *info, char *buf,
1735                 unsigned int size)
1736 {
1737         struct smbd_response *response;
1738         struct smbd_data_transfer *data_transfer;
1739         int to_copy, to_read, data_read, offset;
1740         u32 data_length, remaining_data_length, data_offset;
1741         int rc;
1742
1743 again:
1744         /*
1745          * No need to hold the reassembly queue lock all the time as we are
1746          * the only one reading from the front of the queue. The transport
1747          * may add more entries to the back of the queue at the same time
1748          */
1749         log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
1750                 info->reassembly_data_length);
1751         if (info->reassembly_data_length >= size) {
1752                 int queue_length;
1753                 int queue_removed = 0;
1754
1755                 /*
1756                  * Need to make sure reassembly_data_length is read before
1757                  * reading reassembly_queue_length and calling
1758                  * _get_first_reassembly. This call is lock free
1759                  * as we never read the end of the queue, which is being
1760                  * updated in SOFTIRQ context as more data is received
1761                  */
1762                 virt_rmb();
1763                 queue_length = info->reassembly_queue_length;
1764                 data_read = 0;
1765                 to_read = size;
1766                 offset = info->first_entry_offset;
1767                 while (data_read < size) {
1768                         response = _get_first_reassembly(info);
1769                         data_transfer = smbd_response_payload(response);
1770                         data_length = le32_to_cpu(data_transfer->data_length);
1771                         remaining_data_length =
1772                                 le32_to_cpu(
1773                                         data_transfer->remaining_data_length);
1774                         data_offset = le32_to_cpu(data_transfer->data_offset);
1775
1776                         /*
1777                          * The upper layer expects RFC1002 length at the
1778                          * beginning of the payload. Return it to indicate
1779                          * the total length of the packet. This minimizes the
1780                          * change to upper layer packet processing logic. This
1781                          * will eventually be removed when an intermediate
1782                          * transport layer is added
1783                          */
1784                         if (response->first_segment && size == 4) {
1785                                 unsigned int rfc1002_len =
1786                                         data_length + remaining_data_length;
1787                                 *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
1788                                 data_read = 4;
1789                                 response->first_segment = false;
1790                                 log_read(INFO, "returning rfc1002 length %d\n",
1791                                         rfc1002_len);
1792                                 goto read_rfc1002_done;
1793                         }
1794
1795                         to_copy = min_t(int, data_length - offset, to_read);
1796                         memcpy(
1797                                 buf + data_read,
1798                                 (char *)data_transfer + data_offset + offset,
1799                                 to_copy);
1800
1801                         /* move on to the next buffer? */
1802                         if (to_copy == data_length - offset) {
1803                                 queue_length--;
1804                                 /*
1805                                  * No need to lock if we are not at the
1806                                  * end of the queue
1807                                  */
1808                                 if (queue_length)
1809                                         list_del(&response->list);
1810                                 else {
1811                                         spin_lock_irq(
1812                                                 &info->reassembly_queue_lock);
1813                                         list_del(&response->list);
1814                                         spin_unlock_irq(
1815                                                 &info->reassembly_queue_lock);
1816                                 }
1817                                 queue_removed++;
1818                                 info->count_reassembly_queue--;
1819                                 info->count_dequeue_reassembly_queue++;
1820                                 put_receive_buffer(info, response);
1821                                 offset = 0;
1822                                 log_read(INFO, "put_receive_buffer offset=0\n");
1823                         } else
1824                                 offset += to_copy;
1825
1826                         to_read -= to_copy;
1827                         data_read += to_copy;
1828
1829                         log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n",
1830                                  to_copy, data_length - offset,
1831                                  to_read, data_read, offset);
1832                 }
1833
1834                 spin_lock_irq(&info->reassembly_queue_lock);
1835                 info->reassembly_data_length -= data_read;
1836                 info->reassembly_queue_length -= queue_removed;
1837                 spin_unlock_irq(&info->reassembly_queue_lock);
1838
1839                 info->first_entry_offset = offset;
1840                 log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
1841                          data_read, info->reassembly_data_length,
1842                          info->first_entry_offset);
1843 read_rfc1002_done:
1844                 return data_read;
1845         }
1846
1847         log_read(INFO, "wait_event on more data\n");
1848         rc = wait_event_interruptible(
1849                 info->wait_reassembly_queue,
1850                 info->reassembly_data_length >= size ||
1851                         info->transport_status != SMBD_CONNECTED);
1852         /* Don't return any data if interrupted */
1853         if (rc)
1854                 return rc;
1855
1856         if (info->transport_status != SMBD_CONNECTED) {
1857                 log_read(ERR, "disconnected\n");
1858                 return -ECONNABORTED;
1859         }
1860
1861         goto again;
1862 }
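/*
 * A minimal sketch of the two-step read pattern implied by the RFC1002
 * handling above (hypothetical caller; the variable names are illustrative):
 * the first 4-byte read of a packet returns a synthesized RFC1002 length,
 * and a follow-up read pulls that many bytes of SMB2 payload.
 *
 *	__be32 rfc1002_hdr;
 *	int len;
 *
 *	if (smbd_recv_buf(info, (char *)&rfc1002_hdr, 4) == 4) {
 *		len = be32_to_cpu(rfc1002_hdr);
 *		// issue another smbd_recv_buf() for 'len' bytes of payload
 *	}
 */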
1863
1864 /*
1865  * Receive a page from receive reassembly queue
1866  * page: the page to read data into
1867  * to_read: the length of data to read
1868  * return value: actual data read
1869  */
1870 static int smbd_recv_page(struct smbd_connection *info,
1871                 struct page *page, unsigned int page_offset,
1872                 unsigned int to_read)
1873 {
1874         int ret;
1875         char *to_address;
1876         void *page_address;
1877
1878         /* make sure we have the page ready for read */
1879         ret = wait_event_interruptible(
1880                 info->wait_reassembly_queue,
1881                 info->reassembly_data_length >= to_read ||
1882                         info->transport_status != SMBD_CONNECTED);
1883         if (ret)
1884                 return ret;
1885
1886         /* now we can read from reassembly queue and not sleep */
1887         page_address = kmap_atomic(page);
1888         to_address = (char *) page_address + page_offset;
1889
1890         log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
1891                 page, to_address, to_read);
1892
1893         ret = smbd_recv_buf(info, to_address, to_read);
1894         kunmap_atomic(page_address);
1895
1896         return ret;
1897 }
1898
1899 /*
1900  * Receive data from transport
1901  * msg: a msghdr pointing to the buffer, can be ITER_KVEC or ITER_BVEC
1902  * return: total bytes read, or a negative error code. SMB Direct will not do a partial read.
1903  */
1904 int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
1905 {
1906         char *buf;
1907         struct page *page;
1908         unsigned int to_read, page_offset;
1909         int rc;
1910
1911         if (iov_iter_rw(&msg->msg_iter) == WRITE) {
1912                 /* It's a bug in the upper layer to get here */
1913                 cifs_dbg(VFS, "Invalid msg iter dir %u\n",
1914                          iov_iter_rw(&msg->msg_iter));
1915                 rc = -EINVAL;
1916                 goto out;
1917         }
1918
1919         switch (iov_iter_type(&msg->msg_iter)) {
1920         case ITER_KVEC:
1921                 buf = msg->msg_iter.kvec->iov_base;
1922                 to_read = msg->msg_iter.kvec->iov_len;
1923                 rc = smbd_recv_buf(info, buf, to_read);
1924                 break;
1925
1926         case ITER_BVEC:
1927                 page = msg->msg_iter.bvec->bv_page;
1928                 page_offset = msg->msg_iter.bvec->bv_offset;
1929                 to_read = msg->msg_iter.bvec->bv_len;
1930                 rc = smbd_recv_page(info, page, page_offset, to_read);
1931                 break;
1932
1933         default:
1934                 /* It's a bug in the upper layer to get here */
1935                 cifs_dbg(VFS, "Invalid msg type %d\n",
1936                          iov_iter_type(&msg->msg_iter));
1937                 rc = -EINVAL;
1938         }
1939
1940 out:
1941         /* SMBDirect will read it all or nothing */
1942         if (rc > 0)
1943                 msg->msg_iter.count = 0;
1944         return rc;
1945 }
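/*
 * A minimal sketch of how a caller might hand a kernel buffer to smbd_recv()
 * (hypothetical caller; 'buf' and 'len' are assumed to exist): the buffer is
 * wrapped in a kvec-backed msghdr, mirroring the ITER_KVEC case above.
 *
 *	struct kvec iov = { .iov_base = buf, .iov_len = len };
 *	struct msghdr msg = {};
 *	int rc;
 *
 *	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
 *	rc = smbd_recv(info, &msg);
 */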
1946
1947 /*
1948  * Send data to transport
1949  * Each rqst is transported as a SMBDirect payload
1950  * rqst: the data to write
1951  * return value: 0 on successful write, otherwise error code
1952  */
1953 int smbd_send(struct TCP_Server_Info *server,
1954         int num_rqst, struct smb_rqst *rqst_array)
1955 {
1956         struct smbd_connection *info = server->smbd_conn;
1957         struct smb_rqst *rqst;
1958         struct iov_iter iter;
1959         unsigned int remaining_data_length, klen;
1960         int rc, i, rqst_idx;
1961
1962         if (info->transport_status != SMBD_CONNECTED)
1963                 return -EAGAIN;
1964
1965         /*
1966          * Compute the total payload length across all requests up front so
1967          * that remaining_data_length can be filled in for each SMBDirect
1968          * data transfer packet as the requests are posted
1969          */
1970         remaining_data_length = 0;
1971         for (i = 0; i < num_rqst; i++)
1972                 remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
1973
1974         if (unlikely(remaining_data_length > info->max_fragmented_send_size)) {
1975                 /* assertion: payload never exceeds negotiated maximum */
1976                 log_write(ERR, "payload size %d > max size %d\n",
1977                         remaining_data_length, info->max_fragmented_send_size);
1978                 return -EINVAL;
1979         }
1980
1981         log_write(INFO, "num_rqst=%d total length=%u\n",
1982                         num_rqst, remaining_data_length);
1983
1984         rqst_idx = 0;
1985         do {
1986                 rqst = &rqst_array[rqst_idx];
1987
1988                 cifs_dbg(FYI, "Sending smb (RDMA): idx=%d smb_len=%lu\n",
1989                          rqst_idx, smb_rqst_len(server, rqst));
1990                 for (i = 0; i < rqst->rq_nvec; i++)
1991                         dump_smb(rqst->rq_iov[i].iov_base, rqst->rq_iov[i].iov_len);
1992
1993                 log_write(INFO, "RDMA-WR[%u] nvec=%d len=%u iter=%zu rqlen=%lu\n",
1994                           rqst_idx, rqst->rq_nvec, remaining_data_length,
1995                           iov_iter_count(&rqst->rq_iter), smb_rqst_len(server, rqst));
1996
1997                 /* Send the metadata pages. */
1998                 klen = 0;
1999                 for (i = 0; i < rqst->rq_nvec; i++)
2000                         klen += rqst->rq_iov[i].iov_len;
2001                 iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
2002
2003                 rc = smbd_post_send_iter(info, &iter, &remaining_data_length);
2004                 if (rc < 0)
2005                         break;
2006
2007                 if (iov_iter_count(&rqst->rq_iter) > 0) {
2008                         /* And then the data pages if there are any */
2009                         rc = smbd_post_send_iter(info, &rqst->rq_iter,
2010                                                  &remaining_data_length);
2011                         if (rc < 0)
2012                                 break;
2013                 }
2014
2015         } while (++rqst_idx < num_rqst);
2016
2017         /*
2018          * As an optimization, we don't wait for individual I/O to finish
2019          * before sending the next one.
2020          * Send them all and wait for the pending send count to reach 0,
2021          * which means all the I/Os have been sent out and we are good to return
2022          */
2023
2024         wait_event(info->wait_send_pending,
2025                 atomic_read(&info->send_pending) == 0);
2026
2027         return rc;
2028 }
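/*
 * A minimal send-side sketch (hypothetical caller; 'buf' and 'len' are
 * assumed, and the request carries no separate data iterator): each
 * request's header/payload kvecs go in rq_iov, and any additional data
 * pages would be described by rq_iter.
 *
 *	struct kvec iov = { .iov_base = buf, .iov_len = len };
 *	struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
 *	int rc;
 *
 *	rc = smbd_send(server, 1, &rqst);
 */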
2029
2030 static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc)
2031 {
2032         struct smbd_mr *mr;
2033         struct ib_cqe *cqe;
2034
2035         if (wc->status) {
2036                 log_rdma_mr(ERR, "status=%d\n", wc->status);
2037                 cqe = wc->wr_cqe;
2038                 mr = container_of(cqe, struct smbd_mr, cqe);
2039                 smbd_disconnect_rdma_connection(mr->conn);
2040         }
2041 }
2042
2043 /*
2044  * The work queue function that recovers MRs
2045  * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used
2046  * again. Both calls are slow, so finish them in a workqueue. This will not
2047  * block the I/O path.
2048  * There is one workqueue that recovers MRs, so there is no need to lock, as the
2049  * I/O requests calling smbd_register_mr will never update the links in the
2050  * mr_list.
2051  */
2052 static void smbd_mr_recovery_work(struct work_struct *work)
2053 {
2054         struct smbd_connection *info =
2055                 container_of(work, struct smbd_connection, mr_recovery_work);
2056         struct smbd_mr *smbdirect_mr;
2057         int rc;
2058
2059         list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
2060                 if (smbdirect_mr->state == MR_ERROR) {
2061
2062                         /* recover this MR entry */
2063                         rc = ib_dereg_mr(smbdirect_mr->mr);
2064                         if (rc) {
2065                                 log_rdma_mr(ERR,
2066                                         "ib_dereg_mr failed rc=%x\n",
2067                                         rc);
2068                                 smbd_disconnect_rdma_connection(info);
2069                                 continue;
2070                         }
2071
2072                         smbdirect_mr->mr = ib_alloc_mr(
2073                                 info->pd, info->mr_type,
2074                                 info->max_frmr_depth);
2075                         if (IS_ERR(smbdirect_mr->mr)) {
2076                                 log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2077                                             info->mr_type,
2078                                             info->max_frmr_depth);
2079                                 smbd_disconnect_rdma_connection(info);
2080                                 continue;
2081                         }
2082                 } else
2083                         /* This MR is being used, don't recover it */
2084                         continue;
2085
2086                 smbdirect_mr->state = MR_READY;
2087
2088                 /* smbdirect_mr->state is updated by this function
2089                  * and is read and updated by I/O issuing CPUs trying
2090                  * to get an MR. The call to atomic_inc_return
2091                  * implies a memory barrier and guarantees this
2092                  * value is updated before waking up any calls to
2093                  * get_mr() from the I/O issuing CPUs
2094                  */
2095                 if (atomic_inc_return(&info->mr_ready_count) == 1)
2096                         wake_up_interruptible(&info->wait_mr);
2097         }
2098 }
2099
2100 static void destroy_mr_list(struct smbd_connection *info)
2101 {
2102         struct smbd_mr *mr, *tmp;
2103
2104         cancel_work_sync(&info->mr_recovery_work);
2105         list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
2106                 if (mr->state == MR_INVALIDATED)
2107                         ib_dma_unmap_sg(info->id->device, mr->sgt.sgl,
2108                                 mr->sgt.nents, mr->dir);
2109                 ib_dereg_mr(mr->mr);
2110                 kfree(mr->sgt.sgl);
2111                 kfree(mr);
2112         }
2113 }
2114
2115 /*
2116  * Allocate MRs used for RDMA read/write
2117  * The number of MRs will not exceed hardware capability in responder_resources
2118  * All MRs are kept in mr_list. The MR can be recovered after it's used
2119  * Recovery is done in smbd_mr_recovery_work. The content of a list entry changes
2120  * as MRs are used and recovered for I/O, but the list links will not change
2121  */
2122 static int allocate_mr_list(struct smbd_connection *info)
2123 {
2124         int i;
2125         struct smbd_mr *smbdirect_mr, *tmp;
2126
2127         INIT_LIST_HEAD(&info->mr_list);
2128         init_waitqueue_head(&info->wait_mr);
2129         spin_lock_init(&info->mr_list_lock);
2130         atomic_set(&info->mr_ready_count, 0);
2131         atomic_set(&info->mr_used_count, 0);
2132         init_waitqueue_head(&info->wait_for_mr_cleanup);
2133         INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
2134         /* Allocate more MRs (2x) than hardware responder_resources */
2135         for (i = 0; i < info->responder_resources * 2; i++) {
2136                 smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
2137                 if (!smbdirect_mr)
2138                         goto cleanup_entries;
2139                 smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
2140                                         info->max_frmr_depth);
2141                 if (IS_ERR(smbdirect_mr->mr)) {
2142                         log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
2143                                     info->mr_type, info->max_frmr_depth);
2144                         goto out;
2145                 }
2146                 smbdirect_mr->sgt.sgl = kcalloc(info->max_frmr_depth,
2147                                                 sizeof(struct scatterlist),
2148                                                 GFP_KERNEL);
2149                 if (!smbdirect_mr->sgt.sgl) {
2150                         log_rdma_mr(ERR, "failed to allocate sgl\n");
2151                         ib_dereg_mr(smbdirect_mr->mr);
2152                         goto out;
2153                 }
2154                 smbdirect_mr->state = MR_READY;
2155                 smbdirect_mr->conn = info;
2156
2157                 list_add_tail(&smbdirect_mr->list, &info->mr_list);
2158                 atomic_inc(&info->mr_ready_count);
2159         }
2160         return 0;
2161
2162 out:
2163         kfree(smbdirect_mr);
2164 cleanup_entries:
2165         list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
2166                 list_del(&smbdirect_mr->list);
2167                 ib_dereg_mr(smbdirect_mr->mr);
2168                 kfree(smbdirect_mr->sgt.sgl);
2169                 kfree(smbdirect_mr);
2170         }
2171         return -ENOMEM;
2172 }
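/*
 * MR state transitions as used by the MR handling code in this file: an MR
 * starts out MR_READY, becomes MR_REGISTERED when it is handed to an I/O,
 * and after the I/O it is either invalidated (MR_INVALIDATED) and returned
 * straight to MR_READY, or marked MR_ERROR so that smbd_mr_recovery_work()
 * reallocates it. The mr_ready_count and mr_used_count atomics track how
 * many MRs are ready and how many are currently in use.
 */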
2173
2174 /*
2175  * Get an MR from mr_list. This function waits until there is at least one
2176  * MR available in the list. It may access the list while
2177  * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock
2178  * as they never modify the same places. However, there may be several CPUs
2179  * issuing I/O and trying to get an MR at the same time, so mr_list_lock is
2180  * used to protect against this situation.
2181  */
2182 static struct smbd_mr *get_mr(struct smbd_connection *info)
2183 {
2184         struct smbd_mr *ret;
2185         int rc;
2186 again:
2187         rc = wait_event_interruptible(info->wait_mr,
2188                 atomic_read(&info->mr_ready_count) ||
2189                 info->transport_status != SMBD_CONNECTED);
2190         if (rc) {
2191                 log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
2192                 return NULL;
2193         }
2194
2195         if (info->transport_status != SMBD_CONNECTED) {
2196                 log_rdma_mr(ERR, "info->transport_status=%x\n",
2197                         info->transport_status);
2198                 return NULL;
2199         }
2200
2201         spin_lock(&info->mr_list_lock);
2202         list_for_each_entry(ret, &info->mr_list, list) {
2203                 if (ret->state == MR_READY) {
2204                         ret->state = MR_REGISTERED;
2205                         spin_unlock(&info->mr_list_lock);
2206                         atomic_dec(&info->mr_ready_count);
2207                         atomic_inc(&info->mr_used_count);
2208                         return ret;
2209                 }
2210         }
2211
2212         spin_unlock(&info->mr_list_lock);
2213         /*
2214          * It is possible that we could fail to get an MR because other processes
2215          * may try to acquire an MR at the same time. If this is the case, retry.
2216          */
2217         goto again;
2218 }
2219
2220 /*
2221  * Transcribe the pages from an iterator into an MR scatterlist.
2222  */
2223 static int smbd_iter_to_mr(struct smbd_connection *info,
2224                            struct iov_iter *iter,
2225                            struct sg_table *sgt,
2226                            unsigned int max_sg)
2227 {
2228         int ret;
2229
2230         memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
2231
2232         ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
2233         WARN_ON(ret < 0);
2234         if (sgt->nents > 0)
2235                 sg_mark_end(&sgt->sgl[sgt->nents - 1]);
2236         return ret;
2237 }
2238
2239 /*
2240  * Register memory for RDMA read/write
2241  * iter: the buffer to register memory with
2242  * writing: true if this is an RDMA write (SMB read), false for an RDMA read
2243  * need_invalidate: true if this MR needs to be locally invalidated after I/O
2244  * return value: the MR registered, NULL if failed.
2245  */
2246 struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
2247                                  struct iov_iter *iter,
2248                                  bool writing, bool need_invalidate)
2249 {
2250         struct smbd_mr *smbdirect_mr;
2251         int rc, num_pages;
2252         enum dma_data_direction dir;
2253         struct ib_reg_wr *reg_wr;
2254
2255         num_pages = iov_iter_npages(iter, info->max_frmr_depth + 1);
2256         if (num_pages > info->max_frmr_depth) {
2257                 log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n",
2258                         num_pages, info->max_frmr_depth);
2259                 WARN_ON_ONCE(1);
2260                 return NULL;
2261         }
2262
2263         smbdirect_mr = get_mr(info);
2264         if (!smbdirect_mr) {
2265                 log_rdma_mr(ERR, "get_mr returning NULL\n");
2266                 return NULL;
2267         }
2268
2269         dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
2270         smbdirect_mr->dir = dir;
2271         smbdirect_mr->need_invalidate = need_invalidate;
2272         smbdirect_mr->sgt.nents = 0;
2273         smbdirect_mr->sgt.orig_nents = 0;
2274
2275         log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n",
2276                     num_pages, iov_iter_count(iter), info->max_frmr_depth);
2277         smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
2278
2279         rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl,
2280                            smbdirect_mr->sgt.nents, dir);
2281         if (!rc) {
2282                 log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
2283                         num_pages, dir, rc);
2284                 goto dma_map_error;
2285         }
2286
2287         rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl,
2288                           smbdirect_mr->sgt.nents, NULL, PAGE_SIZE);
2289         if (rc != smbdirect_mr->sgt.nents) {
2290                 log_rdma_mr(ERR,
2291                         "ib_map_mr_sg failed rc = %d nents = %x\n",
2292                         rc, smbdirect_mr->sgt.nents);
2293                 goto map_mr_error;
2294         }
2295
2296         ib_update_fast_reg_key(smbdirect_mr->mr,
2297                 ib_inc_rkey(smbdirect_mr->mr->rkey));
2298         reg_wr = &smbdirect_mr->wr;
2299         reg_wr->wr.opcode = IB_WR_REG_MR;
2300         smbdirect_mr->cqe.done = register_mr_done;
2301         reg_wr->wr.wr_cqe = &smbdirect_mr->cqe;
2302         reg_wr->wr.num_sge = 0;
2303         reg_wr->wr.send_flags = IB_SEND_SIGNALED;
2304         reg_wr->mr = smbdirect_mr->mr;
2305         reg_wr->key = smbdirect_mr->mr->rkey;
2306         reg_wr->access = writing ?
2307                         IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
2308                         IB_ACCESS_REMOTE_READ;
2309
2310         /*
2311          * There is no need to wait for completion of ib_post_send
2312          * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
2313          * on the next ib_post_send when we actually send I/O to the remote peer
2314          */
2315         rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL);
2316         if (!rc)
2317                 return smbdirect_mr;
2318
2319         log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n",
2320                 rc, reg_wr->key);
2321
2322         /* If all failed, attempt to recover this MR by setting it to MR_ERROR */
2323 map_mr_error:
2324         ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl,
2325                         smbdirect_mr->sgt.nents, smbdirect_mr->dir);
2326
2327 dma_map_error:
2328         smbdirect_mr->state = MR_ERROR;
2329         if (atomic_dec_and_test(&info->mr_used_count))
2330                 wake_up(&info->wait_for_mr_cleanup);
2331
2332         smbd_disconnect_rdma_connection(info);
2333
2334         return NULL;
2335 }
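/*
 * A minimal sketch of the expected register/use/deregister flow
 * (hypothetical caller; 'data_iter' and 'need_invalidate' are illustrative
 * and the surrounding I/O code is omitted): the rkey of the registered
 * region is what gets advertised to the peer for the RDMA transfer, and
 * the MR is released once the I/O completes.
 *
 *	struct smbd_mr *mr;
 *
 *	mr = smbd_register_mr(info, &data_iter, true, need_invalidate);
 *	if (!mr)
 *		return -EAGAIN;
 *	// ... advertise mr->mr->rkey to the peer and wait for the I/O ...
 *	smbd_deregister_mr(mr);
 */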
2336
2337 static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
2338 {
2339         struct smbd_mr *smbdirect_mr;
2340         struct ib_cqe *cqe;
2341
2342         cqe = wc->wr_cqe;
2343         smbdirect_mr = container_of(cqe, struct smbd_mr, cqe);
2344         smbdirect_mr->state = MR_INVALIDATED;
2345         if (wc->status != IB_WC_SUCCESS) {
2346                 log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status);
2347                 smbdirect_mr->state = MR_ERROR;
2348         }
2349         complete(&smbdirect_mr->invalidate_done);
2350 }
2351
2352 /*
2353  * Deregister an MR after I/O is done
2354  * This function may wait if remote invalidation is not used
2355  * and we have to locally invalidate the buffer to prevent the data from being
2356  * modified by the remote peer after the upper layer consumes it
2357  */
2358 int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
2359 {
2360         struct ib_send_wr *wr;
2361         struct smbd_connection *info = smbdirect_mr->conn;
2362         int rc = 0;
2363
2364         if (smbdirect_mr->need_invalidate) {
2365                 /* Need to finish local invalidation before returning */
2366                 wr = &smbdirect_mr->inv_wr;
2367                 wr->opcode = IB_WR_LOCAL_INV;
2368                 smbdirect_mr->cqe.done = local_inv_done;
2369                 wr->wr_cqe = &smbdirect_mr->cqe;
2370                 wr->num_sge = 0;
2371                 wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey;
2372                 wr->send_flags = IB_SEND_SIGNALED;
2373
2374                 init_completion(&smbdirect_mr->invalidate_done);
2375                 rc = ib_post_send(info->id->qp, wr, NULL);
2376                 if (rc) {
2377                         log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
2378                         smbd_disconnect_rdma_connection(info);
2379                         goto done;
2380                 }
2381                 wait_for_completion(&smbdirect_mr->invalidate_done);
2382                 smbdirect_mr->need_invalidate = false;
2383         } else
2384                 /*
2385                  * For remote invalidation, just set it to MR_INVALIDATED
2386                  * and defer to mr_recovery_work to recover the MR for next use
2387                  */
2388                 smbdirect_mr->state = MR_INVALIDATED;
2389
2390         if (smbdirect_mr->state == MR_INVALIDATED) {
2391                 ib_dma_unmap_sg(
2392                         info->id->device, smbdirect_mr->sgt.sgl,
2393                         smbdirect_mr->sgt.nents,
2394                         smbdirect_mr->dir);
2395                 smbdirect_mr->state = MR_READY;
2396                 if (atomic_inc_return(&info->mr_ready_count) == 1)
2397                         wake_up_interruptible(&info->wait_mr);
2398         } else
2399                 /*
2400                  * Schedule the work to do MR recovery for future I/Os. MR
2401                  * recovery is slow and we don't want it to block the current I/O
2402                  */
2403                 queue_work(info->workqueue, &info->mr_recovery_work);
2404
2405 done:
2406         if (atomic_dec_and_test(&info->mr_used_count))
2407                 wake_up(&info->wait_for_mr_cleanup);
2408
2409         return rc;
2410 }
2411
2412 static bool smb_set_sge(struct smb_extract_to_rdma *rdma,
2413                         struct page *lowest_page, size_t off, size_t len)
2414 {
2415         struct ib_sge *sge = &rdma->sge[rdma->nr_sge];
2416         u64 addr;
2417
2418         addr = ib_dma_map_page(rdma->device, lowest_page,
2419                                off, len, rdma->direction);
2420         if (ib_dma_mapping_error(rdma->device, addr))
2421                 return false;
2422
2423         sge->addr   = addr;
2424         sge->length = len;
2425         sge->lkey   = rdma->local_dma_lkey;
2426         rdma->nr_sge++;
2427         return true;
2428 }
2429
2430 /*
2431  * Extract page fragments from a BVEC-class iterator and add them to an RDMA
2432  * element list.  The pages are not pinned.
2433  */
2434 static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter,
2435                                         struct smb_extract_to_rdma *rdma,
2436                                         ssize_t maxsize)
2437 {
2438         const struct bio_vec *bv = iter->bvec;
2439         unsigned long start = iter->iov_offset;
2440         unsigned int i;
2441         ssize_t ret = 0;
2442
2443         for (i = 0; i < iter->nr_segs; i++) {
2444                 size_t off, len;
2445
2446                 len = bv[i].bv_len;
2447                 if (start >= len) {
2448                         start -= len;
2449                         continue;
2450                 }
2451
2452                 len = min_t(size_t, maxsize, len - start);
2453                 off = bv[i].bv_offset + start;
2454
2455                 if (!smb_set_sge(rdma, bv[i].bv_page, off, len))
2456                         return -EIO;
2457
2458                 ret += len;
2459                 maxsize -= len;
2460                 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2461                         break;
2462                 start = 0;
2463         }
2464
2465         if (ret > 0)
2466                 iov_iter_advance(iter, ret);
2467         return ret;
2468 }
2469
2470 /*
2471  * Extract fragments from a KVEC-class iterator and add them to an RDMA list.
2472  * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers.
2473  * The pages are not pinned.
2474  */
2475 static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter,
2476                                         struct smb_extract_to_rdma *rdma,
2477                                         ssize_t maxsize)
2478 {
2479         const struct kvec *kv = iter->kvec;
2480         unsigned long start = iter->iov_offset;
2481         unsigned int i;
2482         ssize_t ret = 0;
2483
2484         for (i = 0; i < iter->nr_segs; i++) {
2485                 struct page *page;
2486                 unsigned long kaddr;
2487                 size_t off, len, seg;
2488
2489                 len = kv[i].iov_len;
2490                 if (start >= len) {
2491                         start -= len;
2492                         continue;
2493                 }
2494
2495                 kaddr = (unsigned long)kv[i].iov_base + start;
2496                 off = kaddr & ~PAGE_MASK;
2497                 len = min_t(size_t, maxsize, len - start);
2498                 kaddr &= PAGE_MASK;
2499
2500                 maxsize -= len;
2501                 do {
2502                         seg = min_t(size_t, len, PAGE_SIZE - off);
2503
2504                         if (is_vmalloc_or_module_addr((void *)kaddr))
2505                                 page = vmalloc_to_page((void *)kaddr);
2506                         else
2507                                 page = virt_to_page((void *)kaddr);
2508
2509                         if (!smb_set_sge(rdma, page, off, seg))
2510                                 return -EIO;
2511
2512                         ret += seg;
2513                         len -= seg;
2514                         kaddr += PAGE_SIZE;
2515                         off = 0;
2516                 } while (len > 0 && rdma->nr_sge < rdma->max_sge);
2517
2518                 if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0)
2519                         break;
2520                 start = 0;
2521         }
2522
2523         if (ret > 0)
2524                 iov_iter_advance(iter, ret);
2525         return ret;
2526 }
2527
2528 /*
2529  * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA
2530  * list.  The folios are not pinned.
2531  */
2532 static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
2533                                           struct smb_extract_to_rdma *rdma,
2534                                           ssize_t maxsize)
2535 {
2536         const struct folio_queue *folioq = iter->folioq;
2537         unsigned int slot = iter->folioq_slot;
2538         ssize_t ret = 0;
2539         size_t offset = iter->iov_offset;
2540
2541         BUG_ON(!folioq);
2542
2543         if (slot >= folioq_nr_slots(folioq)) {
2544                 folioq = folioq->next;
2545                 if (WARN_ON_ONCE(!folioq))
2546                         return -EIO;
2547                 slot = 0;
2548         }
2549
2550         do {
2551                 struct folio *folio = folioq_folio(folioq, slot);
2552                 size_t fsize = folioq_folio_size(folioq, slot);
2553
2554                 if (offset < fsize) {
2555                         size_t part = umin(maxsize - ret, fsize - offset);
2556
2557                         if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
2558                                 return -EIO;
2559
2560                         offset += part;
2561                         ret += part;
2562                 }
2563
2564                 if (offset >= fsize) {
2565                         offset = 0;
2566                         slot++;
2567                         if (slot >= folioq_nr_slots(folioq)) {
2568                                 if (!folioq->next) {
2569                                         WARN_ON_ONCE(ret < iter->count);
2570                                         break;
2571                                 }
2572                                 folioq = folioq->next;
2573                                 slot = 0;
2574                         }
2575                 }
2576         } while (rdma->nr_sge < rdma->max_sge && ret < maxsize);
2577
2578         iter->folioq = folioq;
2579         iter->folioq_slot = slot;
2580         iter->iov_offset = offset;
2581         iter->count -= ret;
2582         return ret;
2583 }
2584
2585 /*
2586  * Extract page fragments from up to the given amount of the source iterator
2587  * and build up an RDMA list that refers to all of those bits.  The RDMA list
2588  * is appended to, up to the maximum number of elements set in the parameter
2589  * block.
2590  *
2591  * The extracted page fragments are not pinned or ref'd in any way; if an
2592  * IOVEC/UBUF-type iterator is to be used, it should be converted to a
2593  * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some
2594  * way.
2595  */
2596 static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
2597                                         struct smb_extract_to_rdma *rdma)
2598 {
2599         ssize_t ret;
2600         int before = rdma->nr_sge;
2601
2602         switch (iov_iter_type(iter)) {
2603         case ITER_BVEC:
2604                 ret = smb_extract_bvec_to_rdma(iter, rdma, len);
2605                 break;
2606         case ITER_KVEC:
2607                 ret = smb_extract_kvec_to_rdma(iter, rdma, len);
2608                 break;
2609         case ITER_FOLIOQ:
2610                 ret = smb_extract_folioq_to_rdma(iter, rdma, len);
2611                 break;
2612         default:
2613                 WARN_ON_ONCE(1);
2614                 return -EIO;
2615         }
2616
2617         if (ret < 0) {
2618                 while (rdma->nr_sge > before) {
2619                         struct ib_sge *sge = &rdma->sge[--rdma->nr_sge];
2620
2621                         ib_dma_unmap_single(rdma->device, sge->addr, sge->length,
2622                                             rdma->direction);
2623                         sge->addr = 0;
2624                 }
2625         }
2626
2627         return ret;
2628 }
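/*
 * A minimal setup sketch for the extraction helper above (hypothetical
 * caller; the send path, presumably smbd_post_send_iter(), wires this up
 * in a similar way): the caller provides the SGE array, the IB device and
 * lkey, and the DMA direction, then hands the iterator to
 * smb_extract_iter_to_rdma().
 *
 *	struct ib_sge sges[SMBDIRECT_MAX_SEND_SGE];
 *	struct smb_extract_to_rdma extract = {
 *		.sge		= sges,
 *		.nr_sge		= 0,
 *		.max_sge	= ARRAY_SIZE(sges),
 *		.device		= info->id->device,
 *		.local_dma_lkey	= info->pd->local_dma_lkey,
 *		.direction	= DMA_TO_DEVICE,
 *	};
 *	ssize_t n;
 *
 *	n = smb_extract_iter_to_rdma(iter, iov_iter_count(iter), &extract);
 *	// on success, extract.nr_sge DMA-mapped SGEs describe the data and
 *	// can be attached to the send work request
 */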