linux.git: drivers/infiniband/sw/siw/siw_verbs.c (Linux 6.14-rc3)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* Authors: Bernard Metzler <[email protected]> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/uaccess.h>
9 #include <linux/vmalloc.h>
10 #include <linux/xarray.h>
11 #include <net/addrconf.h>
12
13 #include <rdma/iw_cm.h>
14 #include <rdma/ib_verbs.h>
15 #include <rdma/ib_user_verbs.h>
16 #include <rdma/uverbs_ioctl.h>
17
18 #include "siw.h"
19 #include "siw_verbs.h"
20 #include "siw_mem.h"
21
22 static int siw_qp_state_to_ib_qp_state[SIW_QP_STATE_COUNT] = {
23         [SIW_QP_STATE_IDLE] = IB_QPS_INIT,
24         [SIW_QP_STATE_RTR] = IB_QPS_RTR,
25         [SIW_QP_STATE_RTS] = IB_QPS_RTS,
26         [SIW_QP_STATE_CLOSING] = IB_QPS_SQD,
27         [SIW_QP_STATE_TERMINATE] = IB_QPS_SQE,
28         [SIW_QP_STATE_ERROR] = IB_QPS_ERR
29 };
30
31 static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
32         [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
33         [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
34         [IB_QPS_RTR] = SIW_QP_STATE_RTR,
35         [IB_QPS_RTS] = SIW_QP_STATE_RTS,
36         [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
37         [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
38         [IB_QPS_ERR] = SIW_QP_STATE_ERROR
39 };
40
41 static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
42         [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
43         [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
44         [IB_QPS_ERR] = "ERR"
45 };
46
47 void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
48 {
49         struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);
50
51         kfree(entry);
52 }
53
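/*
 * siw_mmap()
 *
 * Map a queue (SQ, RQ or CQ) previously announced to user space via an
 * rdma_user_mmap entry. The entry is looked up by the mmap offset and the
 * vmalloc'ed queue memory is remapped into the caller's VMA.
 */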
54 int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
55 {
56         struct siw_ucontext *uctx = to_siw_ctx(ctx);
57         size_t size = vma->vm_end - vma->vm_start;
58         struct rdma_user_mmap_entry *rdma_entry;
59         struct siw_user_mmap_entry *entry;
60         int rv = -EINVAL;
61
62         /*
63          * Must be page aligned
64          */
65         if (vma->vm_start & (PAGE_SIZE - 1)) {
66                 pr_warn("siw: mmap not page aligned\n");
67                 return -EINVAL;
68         }
69         rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
70         if (!rdma_entry) {
71                 siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
72                         vma->vm_pgoff, size);
73                 return -EINVAL;
74         }
75         entry = to_siw_mmap_entry(rdma_entry);
76
77         rv = remap_vmalloc_range(vma, entry->address, 0);
78         if (rv)
79                 pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
80                         size);
81         rdma_user_mmap_entry_put(rdma_entry);
82
83         return rv;
84 }
85
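/*
 * siw_alloc_ucontext()
 *
 * Set up a new user context: enforce the SIW_MAX_CONTEXT limit and return
 * the device id to user space via udata.
 */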
86 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
87 {
88         struct siw_device *sdev = to_siw_dev(base_ctx->device);
89         struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
90         struct siw_uresp_alloc_ctx uresp = {};
91         int rv;
92
93         if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
94                 rv = -ENOMEM;
95                 goto err_out;
96         }
97         ctx->sdev = sdev;
98
99         uresp.dev_id = sdev->vendor_part_id;
100
101         if (udata->outlen < sizeof(uresp)) {
102                 rv = -EINVAL;
103                 goto err_out;
104         }
105         rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
106         if (rv)
107                 goto err_out;
108
109         siw_dbg(base_ctx->device, "success. now %d context(s)\n",
110                 atomic_read(&sdev->num_ctx));
111
112         return 0;
113
114 err_out:
115         atomic_dec(&sdev->num_ctx);
116         siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
117                 atomic_read(&sdev->num_ctx));
118
119         return rv;
120 }
121
122 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
123 {
124         struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
125
126         atomic_dec(&uctx->sdev->num_ctx);
127 }
128
129 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
130                      struct ib_udata *udata)
131 {
132         struct siw_device *sdev = to_siw_dev(base_dev);
133
134         if (udata->inlen || udata->outlen)
135                 return -EINVAL;
136
137         memset(attr, 0, sizeof(*attr));
138
139         /* Revisit atomic caps if RFC 7306 gets supported */
140         attr->atomic_cap = 0;
141         attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
142         attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG;
143         attr->max_cq = sdev->attrs.max_cq;
144         attr->max_cqe = sdev->attrs.max_cqe;
145         attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
146         attr->max_mr = sdev->attrs.max_mr;
147         attr->max_mw = sdev->attrs.max_mw;
148         attr->max_mr_size = ~0ull;
149         attr->max_pd = sdev->attrs.max_pd;
150         attr->max_qp = sdev->attrs.max_qp;
151         attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
152         attr->max_qp_rd_atom = sdev->attrs.max_ord;
153         attr->max_qp_wr = sdev->attrs.max_qp_wr;
154         attr->max_recv_sge = sdev->attrs.max_sge;
155         attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
156         attr->max_send_sge = sdev->attrs.max_sge;
157         attr->max_sge_rd = sdev->attrs.max_sge_rd;
158         attr->max_srq = sdev->attrs.max_srq;
159         attr->max_srq_sge = sdev->attrs.max_srq_sge;
160         attr->max_srq_wr = sdev->attrs.max_srq_wr;
161         attr->page_size_cap = PAGE_SIZE;
162         attr->vendor_id = SIW_VENDOR_ID;
163         attr->vendor_part_id = sdev->vendor_part_id;
164
165         addrconf_addr_eui48((u8 *)&attr->sys_image_guid,
166                             sdev->raw_gid);
167
168         return 0;
169 }
170
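/*
 * siw_query_port()
 *
 * Report port attributes. Link speed, MTU and port state are derived from
 * the attached net_device.
 */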
171 int siw_query_port(struct ib_device *base_dev, u32 port,
172                    struct ib_port_attr *attr)
173 {
174         struct net_device *ndev;
175         int rv;
176
177         memset(attr, 0, sizeof(*attr));
178
179         rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
180                          &attr->active_width);
181         if (rv)
182                 return rv;
183
184         ndev = ib_device_get_netdev(base_dev, SIW_PORT);
185         if (!ndev)
186                 return -ENODEV;
187
188         attr->gid_tbl_len = 1;
189         attr->max_msg_sz = -1;
190         attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu);
191         attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu));
192         attr->state = ib_get_curr_port_state(ndev);
193         attr->phys_state = attr->state == IB_PORT_ACTIVE ?
194                 IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
195         attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
196         /*
197          * All zero
198          *
199          * attr->lid = 0;
200          * attr->bad_pkey_cntr = 0;
201          * attr->qkey_viol_cntr = 0;
202          * attr->sm_lid = 0;
203          * attr->lmc = 0;
204          * attr->max_vl_num = 0;
205          * attr->sm_sl = 0;
206          * attr->subnet_timeout = 0;
207  * attr->init_type_reply = 0;
208          */
209         dev_put(ndev);
210         return rv;
211 }
212
213 int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
214                            struct ib_port_immutable *port_immutable)
215 {
216         struct ib_port_attr attr;
217         int rv = siw_query_port(base_dev, port, &attr);
218
219         if (rv)
220                 return rv;
221
222         port_immutable->gid_tbl_len = attr.gid_tbl_len;
223         port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
224
225         return 0;
226 }
227
228 int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
229                   union ib_gid *gid)
230 {
231         struct siw_device *sdev = to_siw_dev(base_dev);
232
233         /* subnet_prefix == interface_id == 0; */
234         memset(gid, 0, sizeof(*gid));
235         memcpy(gid->raw, sdev->raw_gid, ETH_ALEN);
236
237         return 0;
238 }
239
240 int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
241 {
242         struct siw_device *sdev = to_siw_dev(pd->device);
243
244         if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
245                 atomic_dec(&sdev->num_pd);
246                 return -ENOMEM;
247         }
248         siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));
249
250         return 0;
251 }
252
253 int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
254 {
255         struct siw_device *sdev = to_siw_dev(pd->device);
256
257         siw_dbg_pd(pd, "free PD\n");
258         atomic_dec(&sdev->num_pd);
259         return 0;
260 }
261
262 void siw_qp_get_ref(struct ib_qp *base_qp)
263 {
264         siw_qp_get(to_siw_qp(base_qp));
265 }
266
267 void siw_qp_put_ref(struct ib_qp *base_qp)
268 {
269         siw_qp_put(to_siw_qp(base_qp));
270 }
271
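/*
 * siw_mmap_entry_insert()
 *
 * Create an rdma_user_mmap entry covering @address/@length and return the
 * mmap offset user space must pass to mmap(). On failure, *offset is set
 * to SIW_INVAL_UOBJ_KEY and NULL is returned.
 */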
272 static struct rdma_user_mmap_entry *
273 siw_mmap_entry_insert(struct siw_ucontext *uctx,
274                       void *address, size_t length,
275                       u64 *offset)
276 {
277         struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
278         int rv;
279
280         *offset = SIW_INVAL_UOBJ_KEY;
281         if (!entry)
282                 return NULL;
283
284         entry->address = address;
285
286         rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
287                                          &entry->rdma_entry,
288                                          length);
289         if (rv) {
290                 kfree(entry);
291                 return NULL;
292         }
293
294         *offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
295
296         return &entry->rdma_entry;
297 }
298
299 /*
300  * siw_create_qp()
301  *
302  * Create QP of requested size on given device.
303  *
304  * @qp:         Queue pair
305  * @attrs:      Initial QP attributes.
306  * @udata:      used to provide QP ID, SQ and RQ size back to user.
307  */
308
309 int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
310                   struct ib_udata *udata)
311 {
312         struct ib_pd *pd = ibqp->pd;
313         struct siw_qp *qp = to_siw_qp(ibqp);
314         struct ib_device *base_dev = pd->device;
315         struct siw_device *sdev = to_siw_dev(base_dev);
316         struct siw_ucontext *uctx =
317                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
318                                           base_ucontext);
319         unsigned long flags;
320         int num_sqe, num_rqe, rv = 0;
321         size_t length;
322
323         siw_dbg(base_dev, "create new QP\n");
324
325         if (attrs->create_flags)
326                 return -EOPNOTSUPP;
327
328         if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
329                 siw_dbg(base_dev, "too many QP's\n");
330                 rv = -ENOMEM;
331                 goto err_atomic;
332         }
333         if (attrs->qp_type != IB_QPT_RC) {
334                 siw_dbg(base_dev, "only RC QP's supported\n");
335                 rv = -EOPNOTSUPP;
336                 goto err_atomic;
337         }
338         if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
339             (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
340             (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
341             (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
342                 siw_dbg(base_dev, "QP size error\n");
343                 rv = -EINVAL;
344                 goto err_atomic;
345         }
346         if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
347                 siw_dbg(base_dev, "max inline send: %d > %d\n",
348                         attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
349                 rv = -EINVAL;
350                 goto err_atomic;
351         }
352         /*
353          * NOTE: we don't allow for a QP unable to hold any SQ WQE
354          */
355         if (attrs->cap.max_send_wr == 0) {
356                 siw_dbg(base_dev, "QP must have send queue\n");
357                 rv = -EINVAL;
358                 goto err_atomic;
359         }
360
361         if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
362                 siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
363                 rv = -EINVAL;
364                 goto err_atomic;
365         }
366
367         init_rwsem(&qp->state_lock);
368         spin_lock_init(&qp->sq_lock);
369         spin_lock_init(&qp->rq_lock);
370         spin_lock_init(&qp->orq_lock);
371
372         rv = siw_qp_add(sdev, qp);
373         if (rv)
374                 goto err_atomic;
375
376
377         /* All queue indices are derived from modulo operations
378          * on a free running 'get' (consumer) and 'put' (producer)
379          * unsigned counter. Having queue sizes at power of two
380          * avoids handling counter wrap around.
381          */
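        /*
         * Example: with sq_size rounded up to 8, slot = sq_put % 8 stays
         * consistent across 'put' counter wrap around, since 8 evenly
         * divides 2^32.
         */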
382         num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
383         num_rqe = attrs->cap.max_recv_wr;
384         if (num_rqe)
385                 num_rqe = roundup_pow_of_two(num_rqe);
386
387         if (udata)
388                 qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
389         else
390                 qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe));
391
392         if (qp->sendq == NULL) {
393                 rv = -ENOMEM;
394                 goto err_out_xa;
395         }
396         if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
397                 if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
398                         qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
399                 else {
400                         rv = -EINVAL;
401                         goto err_out_xa;
402                 }
403         }
404         qp->pd = pd;
405         qp->scq = to_siw_cq(attrs->send_cq);
406         qp->rcq = to_siw_cq(attrs->recv_cq);
407
408         if (attrs->srq) {
409                 /*
410                  * SRQ support.
411                  * Verbs 6.3.7: ignore RQ size, if SRQ present
412                  * Verbs 6.3.5: do not check PD of SRQ against PD of QP
413                  */
414                 qp->srq = to_siw_srq(attrs->srq);
415                 qp->attrs.rq_size = 0;
416                 siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
417                         qp->base_qp.qp_num);
418         } else if (num_rqe) {
419                 if (udata)
420                         qp->recvq =
421                                 vmalloc_user(num_rqe * sizeof(struct siw_rqe));
422                 else
423                         qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe));
424
425                 if (qp->recvq == NULL) {
426                         rv = -ENOMEM;
427                         goto err_out_xa;
428                 }
429                 qp->attrs.rq_size = num_rqe;
430         }
431         qp->attrs.sq_size = num_sqe;
432         qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
433         qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
434
435         /* Make those two tunables fixed for now. */
436         qp->tx_ctx.gso_seg_limit = 1;
437         qp->tx_ctx.zcopy_tx = zcopy_tx;
438
439         qp->attrs.state = SIW_QP_STATE_IDLE;
440
441         if (udata) {
442                 struct siw_uresp_create_qp uresp = {};
443
444                 uresp.num_sqe = num_sqe;
445                 uresp.num_rqe = num_rqe;
446                 uresp.qp_id = qp_id(qp);
447
448                 if (qp->sendq) {
449                         length = num_sqe * sizeof(struct siw_sqe);
450                         qp->sq_entry =
451                                 siw_mmap_entry_insert(uctx, qp->sendq,
452                                                       length, &uresp.sq_key);
453                         if (!qp->sq_entry) {
454                                 rv = -ENOMEM;
455                                 goto err_out_xa;
456                         }
457                 }
458
459                 if (qp->recvq) {
460                         length = num_rqe * sizeof(struct siw_rqe);
461                         qp->rq_entry =
462                                 siw_mmap_entry_insert(uctx, qp->recvq,
463                                                       length, &uresp.rq_key);
464                         if (!qp->rq_entry) {
465                                 uresp.sq_key = SIW_INVAL_UOBJ_KEY;
466                                 rv = -ENOMEM;
467                                 goto err_out_xa;
468                         }
469                 }
470
471                 if (udata->outlen < sizeof(uresp)) {
472                         rv = -EINVAL;
473                         goto err_out_xa;
474                 }
475                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
476                 if (rv)
477                         goto err_out_xa;
478         }
479         qp->tx_cpu = siw_get_tx_cpu(sdev);
480         if (qp->tx_cpu < 0) {
481                 rv = -EINVAL;
482                 goto err_out_xa;
483         }
484         INIT_LIST_HEAD(&qp->devq);
485         spin_lock_irqsave(&sdev->lock, flags);
486         list_add_tail(&qp->devq, &sdev->qp_list);
487         spin_unlock_irqrestore(&sdev->lock, flags);
488
489         init_completion(&qp->qp_free);
490
491         return 0;
492
493 err_out_xa:
494         xa_erase(&sdev->qp_xa, qp_id(qp));
495         if (uctx) {
496                 rdma_user_mmap_entry_remove(qp->sq_entry);
497                 rdma_user_mmap_entry_remove(qp->rq_entry);
498         }
499         vfree(qp->sendq);
500         vfree(qp->recvq);
501
502 err_atomic:
503         atomic_dec(&sdev->num_qp);
504         return rv;
505 }
506
507 /*
508  * Minimum siw_query_qp() verb interface.
509  *
510  * @qp_attr_mask is not used but all available information is provided
511  */
512 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
513                  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
514 {
515         struct siw_qp *qp;
516         struct net_device *ndev;
517
518         if (base_qp && qp_attr && qp_init_attr)
519                 qp = to_siw_qp(base_qp);
520         else
521                 return -EINVAL;
522
523         ndev = ib_device_get_netdev(base_qp->device, SIW_PORT);
524         if (!ndev)
525                 return -ENODEV;
526
527         qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state];
528         qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
529         qp_attr->cap.max_send_wr = qp->attrs.sq_size;
530         qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
531         qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
532         qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
533         qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu));
534         qp_attr->max_rd_atomic = qp->attrs.irq_size;
535         qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
536
537         qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
538                                    IB_ACCESS_REMOTE_WRITE |
539                                    IB_ACCESS_REMOTE_READ;
540
541         qp_init_attr->qp_type = base_qp->qp_type;
542         qp_init_attr->send_cq = base_qp->send_cq;
543         qp_init_attr->recv_cq = base_qp->recv_cq;
544         qp_init_attr->srq = base_qp->srq;
545
546         qp_init_attr->cap = qp_attr->cap;
547
548         dev_put(ndev);
549         return 0;
550 }
551
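/*
 * siw_verbs_modify_qp()
 *
 * Only QP state and access flags are handled; any attribute outside
 * IB_QP_ATTR_STANDARD_BITS is rejected with -EOPNOTSUPP.
 */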
552 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
553                         int attr_mask, struct ib_udata *udata)
554 {
555         struct siw_qp_attrs new_attrs;
556         enum siw_qp_attr_mask siw_attr_mask = 0;
557         struct siw_qp *qp = to_siw_qp(base_qp);
558         int rv = 0;
559
560         if (!attr_mask)
561                 return 0;
562
563         if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
564                 return -EOPNOTSUPP;
565
566         memset(&new_attrs, 0, sizeof(new_attrs));
567
568         if (attr_mask & IB_QP_ACCESS_FLAGS) {
569                 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
570
571                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
572                         new_attrs.flags |= SIW_RDMA_READ_ENABLED;
573                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
574                         new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
575                 if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
576                         new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
577         }
578         if (attr_mask & IB_QP_STATE) {
579                 siw_dbg_qp(qp, "desired IB QP state: %s\n",
580                            ib_qp_state_to_string[attr->qp_state]);
581
582                 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
583
584                 if (new_attrs.state > SIW_QP_STATE_RTS)
585                         qp->tx_ctx.tx_suspend = 1;
586
587                 siw_attr_mask |= SIW_QP_ATTR_STATE;
588         }
589         if (!siw_attr_mask)
590                 goto out;
591
592         down_write(&qp->state_lock);
593
594         rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
595
596         up_write(&qp->state_lock);
597 out:
598         return rv;
599 }
600
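/*
 * siw_destroy_qp()
 *
 * Move the QP to ERROR state, drop its connection endpoint and user mmap
 * entries, and wait until the last QP reference is gone before returning.
 */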
601 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
602 {
603         struct siw_qp *qp = to_siw_qp(base_qp);
604         struct siw_ucontext *uctx =
605                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
606                                           base_ucontext);
607         struct siw_qp_attrs qp_attrs;
608
609         siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
610
611         /*
612          * Mark QP as being in the process of destruction to prevent
613          * any further async callbacks to the RDMA core
614          */
615         qp->attrs.flags |= SIW_QP_IN_DESTROY;
616         qp->rx_stream.rx_suspend = 1;
617
618         if (uctx) {
619                 rdma_user_mmap_entry_remove(qp->sq_entry);
620                 rdma_user_mmap_entry_remove(qp->rq_entry);
621         }
622
623         down_write(&qp->state_lock);
624
625         qp_attrs.state = SIW_QP_STATE_ERROR;
626         siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
627
628         if (qp->cep) {
629                 siw_cep_put(qp->cep);
630                 qp->cep = NULL;
631         }
632         up_write(&qp->state_lock);
633
634         kfree(qp->tx_ctx.mpa_crc_hd);
635         kfree(qp->rx_stream.mpa_crc_hd);
636
637         qp->scq = qp->rcq = NULL;
638
639         siw_qp_put(qp);
640         wait_for_completion(&qp->qp_free);
641
642         return 0;
643 }
644
645 /*
646  * siw_copy_inline_sgl()
647  *
648  * Prepare sgl of inlined data for sending. For userland callers, the
649  * function checks whether the given buffer addresses and lengths are
650  * within process context bounds.
651  * Data from all provided sge's are copied together into the wqe,
652  * referenced by a single sge.
653  */
654 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
655                                struct siw_sqe *sqe)
656 {
657         struct ib_sge *core_sge = core_wr->sg_list;
658         void *kbuf = &sqe->sge[1];
659         int num_sge = core_wr->num_sge, bytes = 0;
660
661         sqe->sge[0].laddr = (uintptr_t)kbuf;
662         sqe->sge[0].lkey = 0;
663
664         while (num_sge--) {
665                 if (!core_sge->length) {
666                         core_sge++;
667                         continue;
668                 }
669                 bytes += core_sge->length;
670                 if (bytes > SIW_MAX_INLINE) {
671                         bytes = -EINVAL;
672                         break;
673                 }
674                 memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr),
675                        core_sge->length);
676
677                 kbuf += core_sge->length;
678                 core_sge++;
679         }
680         sqe->sge[0].length = max(bytes, 0);
681         sqe->num_sge = bytes > 0 ? 1 : 0;
682
683         return bytes;
684 }
685
686 /* Complete SQ WR's without processing */
687 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
688                            const struct ib_send_wr **bad_wr)
689 {
690         int rv = 0;
691
692         while (wr) {
693                 struct siw_sqe sqe = {};
694
695                 switch (wr->opcode) {
696                 case IB_WR_RDMA_WRITE:
697                         sqe.opcode = SIW_OP_WRITE;
698                         break;
699                 case IB_WR_RDMA_READ:
700                         sqe.opcode = SIW_OP_READ;
701                         break;
702                 case IB_WR_RDMA_READ_WITH_INV:
703                         sqe.opcode = SIW_OP_READ_LOCAL_INV;
704                         break;
705                 case IB_WR_SEND:
706                         sqe.opcode = SIW_OP_SEND;
707                         break;
708                 case IB_WR_SEND_WITH_IMM:
709                         sqe.opcode = SIW_OP_SEND_WITH_IMM;
710                         break;
711                 case IB_WR_SEND_WITH_INV:
712                         sqe.opcode = SIW_OP_SEND_REMOTE_INV;
713                         break;
714                 case IB_WR_LOCAL_INV:
715                         sqe.opcode = SIW_OP_INVAL_STAG;
716                         break;
717                 case IB_WR_REG_MR:
718                         sqe.opcode = SIW_OP_REG_MR;
719                         break;
720                 default:
721                         rv = -EINVAL;
722                         break;
723                 }
724                 if (!rv) {
725                         sqe.id = wr->wr_id;
726                         rv = siw_sqe_complete(qp, &sqe, 0,
727                                               SIW_WC_WR_FLUSH_ERR);
728                 }
729                 if (rv) {
730                         if (bad_wr)
731                                 *bad_wr = wr;
732                         break;
733                 }
734                 wr = wr->next;
735         }
736         return rv;
737 }
738
739 /* Complete RQ WR's without processing */
740 static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
741                            const struct ib_recv_wr **bad_wr)
742 {
743         struct siw_rqe rqe = {};
744         int rv = 0;
745
746         while (wr) {
747                 rqe.id = wr->wr_id;
748                 rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
749                 if (rv) {
750                         if (bad_wr)
751                                 *bad_wr = wr;
752                         break;
753                 }
754                 wr = wr->next;
755         }
756         return rv;
757 }
758
759 /*
760  * siw_post_send()
761  *
762  * Post a list of S-WR's to an SQ.
763  *
764  * @base_qp:    Base QP contained in siw QP
765  * @wr:         Null terminated list of user WR's
766  * @bad_wr:     Points to failing WR in case of synchronous failure.
767  */
768 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
769                   const struct ib_send_wr **bad_wr)
770 {
771         struct siw_qp *qp = to_siw_qp(base_qp);
772         struct siw_wqe *wqe = tx_wqe(qp);
773
774         unsigned long flags;
775         int rv = 0;
776
777         if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
778                 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
779                 *bad_wr = wr;
780                 return -EINVAL;
781         }
782
783         /*
784          * Try to acquire QP state lock. Must be non-blocking
785          * to accommodate kernel clients' needs.
786          */
787         if (!down_read_trylock(&qp->state_lock)) {
788                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
789                         /*
790                          * ERROR state is final, so we can be sure
791                          * this state will not change as long as the QP
792                          * exists.
793                          *
794                          * This handles an ib_drain_sq() call with
795                          * a concurrent request to set the QP state
796                          * to ERROR.
797                          */
798                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
799                 } else {
800                         siw_dbg_qp(qp, "QP locked, state %d\n",
801                                    qp->attrs.state);
802                         *bad_wr = wr;
803                         rv = -ENOTCONN;
804                 }
805                 return rv;
806         }
807         if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
808                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
809                         /*
810                          * Immediately flush this WR to CQ, if QP
811                          * is in ERROR state. SQ is guaranteed to
812                          * be empty, so WR completes in-order.
813                          *
814                          * Typically triggered by ib_drain_sq().
815                          */
816                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
817                 } else {
818                         siw_dbg_qp(qp, "QP out of state %d\n",
819                                    qp->attrs.state);
820                         *bad_wr = wr;
821                         rv = -ENOTCONN;
822                 }
823                 up_read(&qp->state_lock);
824                 return rv;
825         }
826         spin_lock_irqsave(&qp->sq_lock, flags);
827
828         while (wr) {
829                 u32 idx = qp->sq_put % qp->attrs.sq_size;
830                 struct siw_sqe *sqe = &qp->sendq[idx];
831
832                 if (sqe->flags) {
833                         siw_dbg_qp(qp, "sq full\n");
834                         rv = -ENOMEM;
835                         break;
836                 }
837                 if (wr->num_sge > qp->attrs.sq_max_sges) {
838                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
839                         rv = -EINVAL;
840                         break;
841                 }
842                 sqe->id = wr->wr_id;
843
844                 if ((wr->send_flags & IB_SEND_SIGNALED) ||
845                     (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
846                         sqe->flags |= SIW_WQE_SIGNALLED;
847
848                 if (wr->send_flags & IB_SEND_FENCE)
849                         sqe->flags |= SIW_WQE_READ_FENCE;
850
851                 switch (wr->opcode) {
852                 case IB_WR_SEND:
853                 case IB_WR_SEND_WITH_INV:
854                         if (wr->send_flags & IB_SEND_SOLICITED)
855                                 sqe->flags |= SIW_WQE_SOLICITED;
856
857                         if (!(wr->send_flags & IB_SEND_INLINE)) {
858                                 siw_copy_sgl(wr->sg_list, sqe->sge,
859                                              wr->num_sge);
860                                 sqe->num_sge = wr->num_sge;
861                         } else {
862                                 rv = siw_copy_inline_sgl(wr, sqe);
863                                 if (rv <= 0) {
864                                         rv = -EINVAL;
865                                         break;
866                                 }
867                                 sqe->flags |= SIW_WQE_INLINE;
868                                 sqe->num_sge = 1;
869                         }
870                         if (wr->opcode == IB_WR_SEND)
871                                 sqe->opcode = SIW_OP_SEND;
872                         else {
873                                 sqe->opcode = SIW_OP_SEND_REMOTE_INV;
874                                 sqe->rkey = wr->ex.invalidate_rkey;
875                         }
876                         break;
877
878                 case IB_WR_RDMA_READ_WITH_INV:
879                 case IB_WR_RDMA_READ:
880                         /*
881                          * iWarp restricts the RREAD sink to an SGL containing
882                          * 1 SGE only. We could relax this to an SGL with
883                          * multiple elements referring to the SAME ltag or even
884                          * send a private per-rreq tag referring to a checked
885                          * local sgl with MULTIPLE ltags.
886                          */
887                         if (unlikely(wr->num_sge != 1)) {
888                                 rv = -EINVAL;
889                                 break;
890                         }
891                         siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
892                         /*
893                          * NOTE: zero length RREAD is allowed!
894                          */
895                         sqe->raddr = rdma_wr(wr)->remote_addr;
896                         sqe->rkey = rdma_wr(wr)->rkey;
897                         sqe->num_sge = 1;
898
899                         if (wr->opcode == IB_WR_RDMA_READ)
900                                 sqe->opcode = SIW_OP_READ;
901                         else
902                                 sqe->opcode = SIW_OP_READ_LOCAL_INV;
903                         break;
904
905                 case IB_WR_RDMA_WRITE:
906                         if (!(wr->send_flags & IB_SEND_INLINE)) {
907                                 siw_copy_sgl(wr->sg_list, &sqe->sge[0],
908                                              wr->num_sge);
909                                 sqe->num_sge = wr->num_sge;
910                         } else {
911                                 rv = siw_copy_inline_sgl(wr, sqe);
912                                 if (unlikely(rv < 0)) {
913                                         rv = -EINVAL;
914                                         break;
915                                 }
916                                 sqe->flags |= SIW_WQE_INLINE;
917                                 sqe->num_sge = 1;
918                         }
919                         sqe->raddr = rdma_wr(wr)->remote_addr;
920                         sqe->rkey = rdma_wr(wr)->rkey;
921                         sqe->opcode = SIW_OP_WRITE;
922                         break;
923
924                 case IB_WR_REG_MR:
925                         sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
926                         sqe->rkey = reg_wr(wr)->key;
927                         sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
928                         sqe->opcode = SIW_OP_REG_MR;
929                         break;
930
931                 case IB_WR_LOCAL_INV:
932                         sqe->rkey = wr->ex.invalidate_rkey;
933                         sqe->opcode = SIW_OP_INVAL_STAG;
934                         break;
935
936                 default:
937                         siw_dbg_qp(qp, "ib wr type %d unsupported\n",
938                                    wr->opcode);
939                         rv = -EINVAL;
940                         break;
941                 }
942                 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
943                            sqe->opcode, sqe->flags,
944                            (void *)(uintptr_t)sqe->id);
945
946                 if (unlikely(rv < 0))
947                         break;
948
949                 /* make SQE only valid after completely written */
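                /*
                 * The barrier ensures all SQE fields are visible before
                 * SIW_WQE_VALID is set, so a consumer testing the flag
                 * never reads a partially written SQE.
                 */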
950                 smp_wmb();
951                 sqe->flags |= SIW_WQE_VALID;
952
953                 qp->sq_put++;
954                 wr = wr->next;
955         }
956
957         /*
958          * Send directly if SQ processing is not in progress.
959          * Possible immediate errors (rv < 0) do not affect the involved
960          * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
961          * processing, if new work is already pending. But rv must be
962          * passed back to the caller.
963          */
964         if (wqe->wr_status != SIW_WR_IDLE) {
965                 spin_unlock_irqrestore(&qp->sq_lock, flags);
966                 goto skip_direct_sending;
967         }
968         rv = siw_activate_tx(qp);
969         spin_unlock_irqrestore(&qp->sq_lock, flags);
970
971         if (rv <= 0)
972                 goto skip_direct_sending;
973
974         if (rdma_is_kernel_res(&qp->base_qp.res)) {
975                 rv = siw_sq_start(qp);
976         } else {
977                 qp->tx_ctx.in_syscall = 1;
978
979                 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
980                         siw_qp_cm_drop(qp, 0);
981
982                 qp->tx_ctx.in_syscall = 0;
983         }
984 skip_direct_sending:
985
986         up_read(&qp->state_lock);
987
988         if (rv >= 0)
989                 return 0;
990         /*
991          * Immediate error
992          */
993         siw_dbg_qp(qp, "error %d\n", rv);
994
995         *bad_wr = wr;
996         return rv;
997 }
998
999 /*
1000  * siw_post_receive()
1001  *
1002  * Post a list of R-WR's to an RQ.
1003  *
1004  * @base_qp:    Base QP contained in siw QP
1005  * @wr:         Null terminated list of user WR's
1006  * @bad_wr:     Points to failing WR in case of synchronous failure.
1007  */
1008 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
1009                      const struct ib_recv_wr **bad_wr)
1010 {
1011         struct siw_qp *qp = to_siw_qp(base_qp);
1012         unsigned long flags;
1013         int rv = 0;
1014
1015         if (qp->srq || qp->attrs.rq_size == 0) {
1016                 *bad_wr = wr;
1017                 return -EINVAL;
1018         }
1019         if (!rdma_is_kernel_res(&qp->base_qp.res)) {
1020                 siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
1021                 *bad_wr = wr;
1022                 return -EINVAL;
1023         }
1024
1025         /*
1026          * Try to acquire QP state lock. Must be non-blocking
1027          * to accommodate kernel clients' needs.
1028          */
1029         if (!down_read_trylock(&qp->state_lock)) {
1030                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1031                         /*
1032                          * ERROR state is final, so we can be sure
1033                          * this state will not change as long as the QP
1034                          * exists.
1035                          *
1036                          * This handles an ib_drain_rq() call with
1037                          * a concurrent request to set the QP state
1038                          * to ERROR.
1039                          */
1040                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1041                 } else {
1042                         siw_dbg_qp(qp, "QP locked, state %d\n",
1043                                    qp->attrs.state);
1044                         *bad_wr = wr;
1045                         rv = -ENOTCONN;
1046                 }
1047                 return rv;
1048         }
1049         if (qp->attrs.state > SIW_QP_STATE_RTS) {
1050                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1051                         /*
1052                          * Immediately flush this WR to CQ, if QP
1053                          * is in ERROR state. RQ is guaranteed to
1054                          * be empty, so WR completes in-order.
1055                          *
1056                          * Typically triggered by ib_drain_rq().
1057                          */
1058                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1059                 } else {
1060                         siw_dbg_qp(qp, "QP out of state %d\n",
1061                                    qp->attrs.state);
1062                         *bad_wr = wr;
1063                         rv = -ENOTCONN;
1064                 }
1065                 up_read(&qp->state_lock);
1066                 return rv;
1067         }
1068         /*
1069          * Serialize potentially multiple producers.
1070          * Not needed for single threaded consumer side.
1071          */
1072         spin_lock_irqsave(&qp->rq_lock, flags);
1073
1074         while (wr) {
1075                 u32 idx = qp->rq_put % qp->attrs.rq_size;
1076                 struct siw_rqe *rqe = &qp->recvq[idx];
1077
1078                 if (rqe->flags) {
1079                         siw_dbg_qp(qp, "RQ full\n");
1080                         rv = -ENOMEM;
1081                         break;
1082                 }
1083                 if (wr->num_sge > qp->attrs.rq_max_sges) {
1084                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
1085                         rv = -EINVAL;
1086                         break;
1087                 }
1088                 rqe->id = wr->wr_id;
1089                 rqe->num_sge = wr->num_sge;
1090                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1091
1092                 /* make sure RQE is completely written before valid */
1093                 smp_wmb();
1094
1095                 rqe->flags = SIW_WQE_VALID;
1096
1097                 qp->rq_put++;
1098                 wr = wr->next;
1099         }
1100         spin_unlock_irqrestore(&qp->rq_lock, flags);
1101
1102         up_read(&qp->state_lock);
1103
1104         if (rv < 0) {
1105                 siw_dbg_qp(qp, "error %d\n", rv);
1106                 *bad_wr = wr;
1107         }
1108         return rv > 0 ? 0 : rv;
1109 }
1110
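/*
 * siw_destroy_cq()
 *
 * Flush remaining CQEs, remove the user mmap entry if present and free
 * the CQ queue memory.
 */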
1111 int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
1112 {
1113         struct siw_cq *cq = to_siw_cq(base_cq);
1114         struct siw_device *sdev = to_siw_dev(base_cq->device);
1115         struct siw_ucontext *ctx =
1116                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1117                                           base_ucontext);
1118
1119         siw_dbg_cq(cq, "free CQ resources\n");
1120
1121         siw_cq_flush(cq);
1122
1123         if (ctx)
1124                 rdma_user_mmap_entry_remove(cq->cq_entry);
1125
1126         atomic_dec(&sdev->num_cq);
1127
1128         vfree(cq->queue);
1129         return 0;
1130 }
1131
1132 /*
1133  * siw_create_cq()
1134  *
1135  * Populate CQ of requested size
1136  *
1137  * @base_cq: CQ as allocated by RDMA midlayer
1138  * @attr: Initial CQ attributes
1139  * @attrs: uverbs bundle
1140  */
1141
1142 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1143                   struct uverbs_attr_bundle *attrs)
1144 {
1145         struct ib_udata *udata = &attrs->driver_udata;
1146         struct siw_device *sdev = to_siw_dev(base_cq->device);
1147         struct siw_cq *cq = to_siw_cq(base_cq);
1148         int rv, size = attr->cqe;
1149
1150         if (attr->flags)
1151                 return -EOPNOTSUPP;
1152
1153         if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1154                 siw_dbg(base_cq->device, "too many CQ's\n");
1155                 rv = -ENOMEM;
1156                 goto err_out;
1157         }
1158         if (size < 1 || size > sdev->attrs.max_cqe) {
1159                 siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1160                 rv = -EINVAL;
1161                 goto err_out;
1162         }
1163         size = roundup_pow_of_two(size);
1164         cq->base_cq.cqe = size;
1165         cq->num_cqe = size;
1166
1167         if (udata)
1168                 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1169                                          sizeof(struct siw_cq_ctrl));
1170         else
1171                 cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1172                                     sizeof(struct siw_cq_ctrl));
1173
1174         if (cq->queue == NULL) {
1175                 rv = -ENOMEM;
1176                 goto err_out;
1177         }
1178         get_random_bytes(&cq->id, 4);
1179         siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1180
1181         spin_lock_init(&cq->lock);
1182
1183         cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1184
1185         if (udata) {
1186                 struct siw_uresp_create_cq uresp = {};
1187                 struct siw_ucontext *ctx =
1188                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1189                                                   base_ucontext);
1190                 size_t length = size * sizeof(struct siw_cqe) +
1191                         sizeof(struct siw_cq_ctrl);
1192
1193                 cq->cq_entry =
1194                         siw_mmap_entry_insert(ctx, cq->queue,
1195                                               length, &uresp.cq_key);
1196                 if (!cq->cq_entry) {
1197                         rv = -ENOMEM;
1198                         goto err_out;
1199                 }
1200
1201                 uresp.cq_id = cq->id;
1202                 uresp.num_cqe = size;
1203
1204                 if (udata->outlen < sizeof(uresp)) {
1205                         rv = -EINVAL;
1206                         goto err_out;
1207                 }
1208                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1209                 if (rv)
1210                         goto err_out;
1211         }
1212         return 0;
1213
1214 err_out:
1215         siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
1216
1217         if (cq->queue) {
1218                 struct siw_ucontext *ctx =
1219                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1220                                                   base_ucontext);
1221                 if (ctx)
1222                         rdma_user_mmap_entry_remove(cq->cq_entry);
1223                 vfree(cq->queue);
1224         }
1225         atomic_dec(&sdev->num_cq);
1226
1227         return rv;
1228 }
1229
1230 /*
1231  * siw_poll_cq()
1232  *
1233  * Reap CQ entries if available and copy work completion status into
1234  * array of WC's provided by caller. Returns number of reaped CQE's.
1235  *
1236  * @base_cq:    Base CQ contained in siw CQ.
1237  * @num_cqe:    Maximum number of CQE's to reap.
1238  * @wc:         Array of work completions to be filled by siw.
1239  */
1240 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1241 {
1242         struct siw_cq *cq = to_siw_cq(base_cq);
1243         int i;
1244
1245         for (i = 0; i < num_cqe; i++) {
1246                 if (!siw_reap_cqe(cq, wc))
1247                         break;
1248                 wc++;
1249         }
1250         return i;
1251 }
1252
1253 /*
1254  * siw_req_notify_cq()
1255  *
1256  * Request notification for new CQE's added to that CQ.
1257  * Defined flags:
1258  * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1259  *   event if a WQE with notification flag set enters the CQ
1260  * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1261  *   event if a WQE enters the CQ.
1262  * o IB_CQ_REPORT_MISSED_EVENTS: the return value provides the
1263  *   number of CQE's not yet reaped, regardless of their notification
1264  *   type and the current or new CQ notification settings.
1265  *
1266  * @base_cq:    Base CQ contained in siw CQ.
1267  * @flags:      Requested notification flags.
1268  */
1269 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1270 {
1271         struct siw_cq *cq = to_siw_cq(base_cq);
1272
1273         siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1274
1275         if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1276                 /*
1277                  * Enable CQ event for next solicited completion
1278                  * and make it visible to all associated producers.
1279                  */
1280                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1281         else
1282                 /*
1283                  * Enable CQ event for any signalled completion
1284                  * and make it visible to all associated producers.
1285                  */
1286                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1287
1288         if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1289                 return cq->cq_put - cq->cq_get;
1290
1291         return 0;
1292 }
1293
1294 /*
1295  * siw_dereg_mr()
1296  *
1297  * Release Memory Region.
1298  *
1299  * @base_mr: Base MR contained in siw MR.
1300  * @udata: points to user context, unused.
1301  */
1302 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1303 {
1304         struct siw_mr *mr = to_siw_mr(base_mr);
1305         struct siw_device *sdev = to_siw_dev(base_mr->device);
1306
1307         siw_dbg_mem(mr->mem, "deregister MR\n");
1308
1309         atomic_dec(&sdev->num_mr);
1310
1311         siw_mr_drop_mem(mr);
1312         kfree_rcu(mr, rcu);
1313
1314         return 0;
1315 }
1316
1317 /*
1318  * siw_reg_user_mr()
1319  *
1320  * Register Memory Region.
1321  *
1322  * @pd:         Protection Domain
1323  * @start:      starting address of MR (virtual address)
1324  * @len:        length of MR
1325  * @rnic_va:    not used by siw
1326  * @rights:     MR access rights
1327  * @udata:      user buffer to communicate STag and Key.
1328  */
1329 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1330                               u64 rnic_va, int rights, struct ib_udata *udata)
1331 {
1332         struct siw_mr *mr = NULL;
1333         struct siw_umem *umem = NULL;
1334         struct siw_ureq_reg_mr ureq;
1335         struct siw_device *sdev = to_siw_dev(pd->device);
1336         int rv;
1337
1338         siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1339                    (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1340                    (unsigned long long)len);
1341
1342         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1343                 siw_dbg_pd(pd, "too many mr's\n");
1344                 rv = -ENOMEM;
1345                 goto err_out;
1346         }
1347         if (!len) {
1348                 rv = -EINVAL;
1349                 goto err_out;
1350         }
1351         umem = siw_umem_get(pd->device, start, len, rights);
1352         if (IS_ERR(umem)) {
1353                 rv = PTR_ERR(umem);
1354                 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1355                 umem = NULL;
1356                 goto err_out;
1357         }
1358         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1359         if (!mr) {
1360                 rv = -ENOMEM;
1361                 goto err_out;
1362         }
1363         rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1364         if (rv)
1365                 goto err_out;
1366
1367         if (udata) {
1368                 struct siw_uresp_reg_mr uresp = {};
1369                 struct siw_mem *mem = mr->mem;
1370
1371                 if (udata->inlen < sizeof(ureq)) {
1372                         rv = -EINVAL;
1373                         goto err_out;
1374                 }
1375                 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1376                 if (rv)
1377                         goto err_out;
1378
1379                 mr->base_mr.lkey |= ureq.stag_key;
1380                 mr->base_mr.rkey |= ureq.stag_key;
1381                 mem->stag |= ureq.stag_key;
1382                 uresp.stag = mem->stag;
1383
1384                 if (udata->outlen < sizeof(uresp)) {
1385                         rv = -EINVAL;
1386                         goto err_out;
1387                 }
1388                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1389                 if (rv)
1390                         goto err_out;
1391         }
1392         mr->mem->stag_valid = 1;
1393
1394         return &mr->base_mr;
1395
1396 err_out:
1397         atomic_dec(&sdev->num_mr);
1398         if (mr) {
1399                 if (mr->mem)
1400                         siw_mr_drop_mem(mr);
1401                 kfree_rcu(mr, rcu);
1402         } else {
1403                 if (umem)
1404                         siw_umem_release(umem);
1405         }
1406         return ERR_PTR(rv);
1407 }
1408
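/*
 * siw_alloc_mr()
 *
 * Allocate a memory region for fast registration (IB_MR_TYPE_MEM_REG
 * only), backed by a physical buffer list of up to @max_sge entries.
 */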
1409 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1410                            u32 max_sge)
1411 {
1412         struct siw_device *sdev = to_siw_dev(pd->device);
1413         struct siw_mr *mr = NULL;
1414         struct siw_pbl *pbl = NULL;
1415         int rv;
1416
1417         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1418                 siw_dbg_pd(pd, "too many mr's\n");
1419                 rv = -ENOMEM;
1420                 goto err_out;
1421         }
1422         if (mr_type != IB_MR_TYPE_MEM_REG) {
1423                 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1424                 rv = -EOPNOTSUPP;
1425                 goto err_out;
1426         }
1427         if (max_sge > SIW_MAX_SGE_PBL) {
1428                 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1429                 rv = -ENOMEM;
1430                 goto err_out;
1431         }
1432         pbl = siw_pbl_alloc(max_sge);
1433         if (IS_ERR(pbl)) {
1434                 rv = PTR_ERR(pbl);
1435                 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1436                 pbl = NULL;
1437                 goto err_out;
1438         }
1439         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1440         if (!mr) {
1441                 rv = -ENOMEM;
1442                 goto err_out;
1443         }
1444         rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1445         if (rv)
1446                 goto err_out;
1447
1448         mr->mem->is_pbl = 1;
1449
1450         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1451
1452         return &mr->base_mr;
1453
1454 err_out:
1455         atomic_dec(&sdev->num_mr);
1456
1457         if (!mr) {
1458                 kfree(pbl);
1459         } else {
1460                 if (mr->mem)
1461                         siw_mr_drop_mem(mr);
1462                 kfree_rcu(mr, rcu);
1463         }
1464         siw_dbg_pd(pd, "failed: %d\n", rv);
1465
1466         return ERR_PTR(rv);
1467 }
1468
1469 /* Just used to count number of pages being mapped */
1470 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1471 {
1472         return 0;
1473 }
1474
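/*
 * siw_map_mr_sg()
 *
 * Populate the MR's physical buffer list from a scatterlist. Physically
 * adjacent scatterlist elements are merged into a single PBL entry.
 */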
1475 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1476                   unsigned int *sg_off)
1477 {
1478         struct scatterlist *slp;
1479         struct siw_mr *mr = to_siw_mr(base_mr);
1480         struct siw_mem *mem = mr->mem;
1481         struct siw_pbl *pbl = mem->pbl;
1482         struct siw_pble *pble;
1483         unsigned long pbl_size;
1484         int i, rv;
1485
1486         if (!pbl) {
1487                 siw_dbg_mem(mem, "no PBL allocated\n");
1488                 return -EINVAL;
1489         }
1490         pble = pbl->pbe;
1491
1492         if (pbl->max_buf < num_sle) {
1493                 siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
1494                             num_sle, pbl->max_buf);
1495                 return -ENOMEM;
1496         }
1497         for_each_sg(sl, slp, num_sle, i) {
1498                 if (sg_dma_len(slp) == 0) {
1499                         siw_dbg_mem(mem, "empty SGE\n");
1500                         return -EINVAL;
1501                 }
1502                 if (i == 0) {
1503                         pble->addr = sg_dma_address(slp);
1504                         pble->size = sg_dma_len(slp);
1505                         pble->pbl_off = 0;
1506                         pbl_size = pble->size;
1507                         pbl->num_buf = 1;
1508                 } else {
1509                         /* Merge PBL entries if adjacent */
1510                         if (pble->addr + pble->size == sg_dma_address(slp)) {
1511                                 pble->size += sg_dma_len(slp);
1512                         } else {
1513                                 pble++;
1514                                 pbl->num_buf++;
1515                                 pble->addr = sg_dma_address(slp);
1516                                 pble->size = sg_dma_len(slp);
1517                                 pble->pbl_off = pbl_size;
1518                         }
1519                         pbl_size += sg_dma_len(slp);
1520                 }
1521                 siw_dbg_mem(mem,
1522                         "sge[%d], size %u, addr 0x%p, total %lu\n",
1523                         i, pble->size, ib_virt_dma_to_ptr(pble->addr),
1524                         pbl_size);
1525         }
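             /*
              * ib_sg_to_pages() derives base_mr->length and iova from the
              * SG list; siw's per-page callback siw_set_pbl_page() exists
              * only so the mapped pages get counted.
              */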
1526         rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1527         if (rv > 0) {
1528                 mem->len = base_mr->length;
1529                 mem->va = base_mr->iova;
1530                 siw_dbg_mem(mem,
1531                         "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1532                         mem->len, (void *)(uintptr_t)mem->va, num_sle,
1533                         pbl->num_buf);
1534         }
1535         return rv;
1536 }
1537
1538 /*
1539  * siw_get_dma_mr()
1540  *
1541  * Create an empty DMA memory region; no umem is attached.
1542  */
1543 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1544 {
1545         struct siw_device *sdev = to_siw_dev(pd->device);
1546         struct siw_mr *mr = NULL;
1547         int rv;
1548
1549         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1550                 siw_dbg_pd(pd, "too many mr's\n");
1551                 rv = -ENOMEM;
1552                 goto err_out;
1553         }
1554         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1555         if (!mr) {
1556                 rv = -ENOMEM;
1557                 goto err_out;
1558         }
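             /*
              * A DMA MR has no umem or pbl attached; it starts at
              * address 0 and spans the maximum possible length.
              */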
1559         rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1560         if (rv)
1561                 goto err_out;
1562
1563         mr->mem->stag_valid = 1;
1564
1565         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1566
1567         return &mr->base_mr;
1568
1569 err_out:
1570         if (rv)
1571                 kfree(mr);
1572
1573         atomic_dec(&sdev->num_mr);
1574
1575         return ERR_PTR(rv);
1576 }
1577
1578 /*
1579  * siw_create_srq()
1580  *
1581  * Create a Shared Receive Queue with attributes @init_attrs
1582  * within the protection domain of @base_srq.
1583  *
1584  * @base_srq:   Base SRQ contained in siw SRQ.
1585  * @init_attrs: SRQ init attributes.
1586  * @udata:      points to user context
1587  */
1588 int siw_create_srq(struct ib_srq *base_srq,
1589                    struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1590 {
1591         struct siw_srq *srq = to_siw_srq(base_srq);
1592         struct ib_srq_attr *attrs = &init_attrs->attr;
1593         struct siw_device *sdev = to_siw_dev(base_srq->device);
1594         struct siw_ucontext *ctx =
1595                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1596                                           base_ucontext);
1597         int rv;
1598
1599         if (init_attrs->srq_type != IB_SRQT_BASIC)
1600                 return -EOPNOTSUPP;
1601
1602         if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1603                 siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1604                 rv = -ENOMEM;
1605                 goto err_out;
1606         }
1607         if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1608             attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1609                 rv = -EINVAL;
1610                 goto err_out;
1611         }
1612         srq->max_sge = attrs->max_sge;
1613         srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1614         srq->limit = attrs->srq_limit;
1615         if (srq->limit)
1616                 srq->armed = true;
1617
1618         srq->is_kernel_res = !udata;
1619
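             /*
              * The receive queue must be user mappable if a user context
              * is present; plain zeroed kernel memory suffices otherwise.
              */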
1620         if (udata)
1621                 srq->recvq =
1622                         vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1623         else
1624                 srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe));
1625
1626         if (srq->recvq == NULL) {
1627                 rv = -ENOMEM;
1628                 goto err_out;
1629         }
1630         if (udata) {
1631                 struct siw_uresp_create_srq uresp = {};
1632                 size_t length = srq->num_rqe * sizeof(struct siw_rqe);
1633
1634                 srq->srq_entry =
1635                         siw_mmap_entry_insert(ctx, srq->recvq,
1636                                               length, &uresp.srq_key);
1637                 if (!srq->srq_entry) {
1638                         rv = -ENOMEM;
1639                         goto err_out;
1640                 }
1641
1642                 uresp.num_rqe = srq->num_rqe;
1643
1644                 if (udata->outlen < sizeof(uresp)) {
1645                         rv = -EINVAL;
1646                         goto err_out;
1647                 }
1648                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1649                 if (rv)
1650                         goto err_out;
1651         }
1652         spin_lock_init(&srq->lock);
1653
1654         siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1655
1656         return 0;
1657
1658 err_out:
1659         if (srq->recvq) {
1660                 if (ctx)
1661                         rdma_user_mmap_entry_remove(srq->srq_entry);
1662                 vfree(srq->recvq);
1663         }
1664         atomic_dec(&sdev->num_srq);
1665
1666         return rv;
1667 }
1668
1669 /*
1670  * siw_modify_srq()
1671  *
1672  * Modify SRQ. The caller may request an SRQ resize (not yet supported)
1673  * and/or set/reset the limit, (re)arming IB_EVENT_SRQ_LIMIT_REACHED.
1674  *
1675  * NOTE: It is unclear whether the RDMA core allows changing the MAX_SGE
1676  * parameter; siw_modify_srq() does not check attrs->max_sge.
1677  */
1678 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1679                    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1680 {
1681         struct siw_srq *srq = to_siw_srq(base_srq);
1682         unsigned long flags;
1683         int rv = 0;
1684
1685         spin_lock_irqsave(&srq->lock, flags);
1686
1687         if (attr_mask & IB_SRQ_MAX_WR) {
1688                 /* resize request not yet supported */
1689                 rv = -EOPNOTSUPP;
1690                 goto out;
1691         }
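             /*
              * A non-zero srq_limit (re)arms the
              * IB_EVENT_SRQ_LIMIT_REACHED notification.
              */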
1692         if (attr_mask & IB_SRQ_LIMIT) {
1693                 if (attrs->srq_limit) {
1694                         if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1695                                 rv = -EINVAL;
1696                                 goto out;
1697                         }
1698                         srq->armed = true;
1699                 } else {
1700                         srq->armed = false;
1701                 }
1702                 srq->limit = attrs->srq_limit;
1703         }
1704 out:
1705         spin_unlock_irqrestore(&srq->lock, flags);
1706
1707         return rv;
1708 }
1709
1710 /*
1711  * siw_query_srq()
1712  *
1713  * Query SRQ attributes.
1714  */
1715 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1716 {
1717         struct siw_srq *srq = to_siw_srq(base_srq);
1718         unsigned long flags;
1719
1720         spin_lock_irqsave(&srq->lock, flags);
1721
1722         attrs->max_wr = srq->num_rqe;
1723         attrs->max_sge = srq->max_sge;
1724         attrs->srq_limit = srq->limit;
1725
1726         spin_unlock_irqrestore(&srq->lock, flags);
1727
1728         return 0;
1729 }
1730
1731 /*
1732  * siw_destroy_srq()
1733  *
1734  * Destroy SRQ.
1735  * It is assumed that the SRQ is no longer referenced by any QP;
1736  * the code trusts the RDMA core environment to keep track of
1737  * QP references.
1738  */
1739 int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1740 {
1741         struct siw_srq *srq = to_siw_srq(base_srq);
1742         struct siw_device *sdev = to_siw_dev(base_srq->device);
1743         struct siw_ucontext *ctx =
1744                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1745                                           base_ucontext);
1746
1747         if (ctx)
1748                 rdma_user_mmap_entry_remove(srq->srq_entry);
1749         vfree(srq->recvq);
1750         atomic_dec(&sdev->num_srq);
1751         return 0;
1752 }
1753
1754 /*
1755  * siw_post_srq_recv()
1756  *
1757  * Post a list of receive queue elements to SRQ.
1758  * NOTE: The function does not check or lock the SRQ state during
1759  *       the post operation. The code simply trusts the RDMA core
1760  *       environment.
1761  *
1762  * @base_srq:   Base SRQ contained in siw SRQ
1763  * @wr:         List of R-WR's
1764  * @bad_wr:     Updated to failing WR if posting fails.
1765  */
1766 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1767                       const struct ib_recv_wr **bad_wr)
1768 {
1769         struct siw_srq *srq = to_siw_srq(base_srq);
1770         unsigned long flags;
1771         int rv = 0;
1772
1773         if (unlikely(!srq->is_kernel_res)) {
1774                 siw_dbg_pd(base_srq->pd,
1775                            "[SRQ]: no kernel post_recv for mapped srq\n");
1776                 rv = -EINVAL;
1777                 goto out;
1778         }
1779         /*
1780          * Serialize potentially multiple producers.
1781          * Also needed to serialize potentially multiple
1782          * consumers.
1783          */
1784         spin_lock_irqsave(&srq->lock, flags);
1785
1786         while (wr) {
1787                 u32 idx = srq->rq_put % srq->num_rqe;
1788                 struct siw_rqe *rqe = &srq->recvq[idx];
1789
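                     /*
                      * A still-set flags field means this RQE has not been
                      * consumed yet: the SRQ ring is full.
                      */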
1790                 if (rqe->flags) {
1791                         siw_dbg_pd(base_srq->pd, "SRQ full\n");
1792                         rv = -ENOMEM;
1793                         break;
1794                 }
1795                 if (unlikely(wr->num_sge > srq->max_sge)) {
1796                         siw_dbg_pd(base_srq->pd,
1797                                    "[SRQ]: too many sge's: %d\n", wr->num_sge);
1798                         rv = -EINVAL;
1799                         break;
1800                 }
1801                 rqe->id = wr->wr_id;
1802                 rqe->num_sge = wr->num_sge;
1803                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1804
1805                 /* Make sure S-RQE is completely written before valid */
1806                 smp_wmb();
1807
1808                 rqe->flags = SIW_WQE_VALID;
1809
1810                 srq->rq_put++;
1811                 wr = wr->next;
1812         }
1813         spin_unlock_irqrestore(&srq->lock, flags);
1814 out:
1815         if (unlikely(rv < 0)) {
1816                 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1817                 *bad_wr = wr;
1818         }
1819         return rv;
1820 }
1821
1822 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1823 {
1824         struct ib_event event;
1825         struct ib_qp *base_qp = &qp->base_qp;
1826
1827         /*
1828          * Do not report asynchronous errors on a QP which is being
1829          * destroyed via the verbs interface (siw_destroy_qp()).
1830          */
1831         if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1832                 return;
1833
1834         event.event = etype;
1835         event.device = base_qp->device;
1836         event.element.qp = base_qp;
1837
1838         if (base_qp->event_handler) {
1839                 siw_dbg_qp(qp, "reporting event %d\n", etype);
1840                 base_qp->event_handler(&event, base_qp->qp_context);
1841         }
1842 }
1843
1844 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1845 {
1846         struct ib_event event;
1847         struct ib_cq *base_cq = &cq->base_cq;
1848
1849         event.event = etype;
1850         event.device = base_cq->device;
1851         event.element.cq = base_cq;
1852
1853         if (base_cq->event_handler) {
1854                 siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1855                 base_cq->event_handler(&event, base_cq->cq_context);
1856         }
1857 }
1858
1859 void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1860 {
1861         struct ib_event event;
1862         struct ib_srq *base_srq = &srq->base_srq;
1863
1864         event.event = etype;
1865         event.device = base_srq->device;
1866         event.element.srq = base_srq;
1867
1868         if (base_srq->event_handler) {
1869                 siw_dbg_pd(srq->base_srq.pd,
1870                            "reporting SRQ event %d\n", etype);
1871                 base_srq->event_handler(&event, base_srq->srq_context);
1872         }
1873 }
1874
1875 void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
1876 {
1877         struct ib_event event;
1878
1879         event.event = etype;
1880         event.device = &sdev->base_dev;
1881         event.element.port_num = port;
1882
1883         siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1884
1885         ib_dispatch_event(&event);
1886 }