]> Git Repo - J-linux.git/blob - drivers/infiniband/sw/siw/siw_verbs.c
Merge tag 'vfs-6.13-rc7.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
[J-linux.git] / drivers / infiniband / sw / siw / siw_verbs.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* Authors: Bernard Metzler <[email protected]> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/uaccess.h>
9 #include <linux/vmalloc.h>
10 #include <linux/xarray.h>
11 #include <net/addrconf.h>
12
13 #include <rdma/iw_cm.h>
14 #include <rdma/ib_verbs.h>
15 #include <rdma/ib_user_verbs.h>
16 #include <rdma/uverbs_ioctl.h>
17
18 #include "siw.h"
19 #include "siw_verbs.h"
20 #include "siw_mem.h"
21
22 static int siw_qp_state_to_ib_qp_state[SIW_QP_STATE_COUNT] = {
23         [SIW_QP_STATE_IDLE] = IB_QPS_INIT,
24         [SIW_QP_STATE_RTR] = IB_QPS_RTR,
25         [SIW_QP_STATE_RTS] = IB_QPS_RTS,
26         [SIW_QP_STATE_CLOSING] = IB_QPS_SQD,
27         [SIW_QP_STATE_TERMINATE] = IB_QPS_SQE,
28         [SIW_QP_STATE_ERROR] = IB_QPS_ERR
29 };
30
31 static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
32         [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
33         [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
34         [IB_QPS_RTR] = SIW_QP_STATE_RTR,
35         [IB_QPS_RTS] = SIW_QP_STATE_RTS,
36         [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
37         [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
38         [IB_QPS_ERR] = SIW_QP_STATE_ERROR
39 };
40
41 static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
42         [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
43         [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
44         [IB_QPS_ERR] = "ERR"
45 };
46
/*
 * siw_mmap_free()
 *
 * Free the siw mmap entry obtained from @rdma_entry via
 * to_siw_mmap_entry().
 */
void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
{
        kfree(to_siw_mmap_entry(rdma_entry));
}
53
54 int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
55 {
56         struct siw_ucontext *uctx = to_siw_ctx(ctx);
57         size_t size = vma->vm_end - vma->vm_start;
58         struct rdma_user_mmap_entry *rdma_entry;
59         struct siw_user_mmap_entry *entry;
60         int rv = -EINVAL;
61
62         /*
63          * Must be page aligned
64          */
65         if (vma->vm_start & (PAGE_SIZE - 1)) {
66                 pr_warn("siw: mmap not page aligned\n");
67                 return -EINVAL;
68         }
69         rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
70         if (!rdma_entry) {
71                 siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
72                         vma->vm_pgoff, size);
73                 return -EINVAL;
74         }
75         entry = to_siw_mmap_entry(rdma_entry);
76
77         rv = remap_vmalloc_range(vma, entry->address, 0);
78         if (rv)
79                 pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
80                         size);
81         rdma_user_mmap_entry_put(rdma_entry);
82
83         return rv;
84 }
85
86 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
87 {
88         struct siw_device *sdev = to_siw_dev(base_ctx->device);
89         struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
90         struct siw_uresp_alloc_ctx uresp = {};
91         int rv;
92
93         if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
94                 rv = -ENOMEM;
95                 goto err_out;
96         }
97         ctx->sdev = sdev;
98
99         uresp.dev_id = sdev->vendor_part_id;
100
101         if (udata->outlen < sizeof(uresp)) {
102                 rv = -EINVAL;
103                 goto err_out;
104         }
105         rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
106         if (rv)
107                 goto err_out;
108
109         siw_dbg(base_ctx->device, "success. now %d context(s)\n",
110                 atomic_read(&sdev->num_ctx));
111
112         return 0;
113
114 err_out:
115         atomic_dec(&sdev->num_ctx);
116         siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
117                 atomic_read(&sdev->num_ctx));
118
119         return rv;
120 }
121
122 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
123 {
124         struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
125
126         atomic_dec(&uctx->sdev->num_ctx);
127 }
128
129 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
130                      struct ib_udata *udata)
131 {
132         struct siw_device *sdev = to_siw_dev(base_dev);
133
134         if (udata->inlen || udata->outlen)
135                 return -EINVAL;
136
137         memset(attr, 0, sizeof(*attr));
138
139         /* Revisit atomic caps if RFC 7306 gets supported */
140         attr->atomic_cap = 0;
141         attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
142         attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG;
143         attr->max_cq = sdev->attrs.max_cq;
144         attr->max_cqe = sdev->attrs.max_cqe;
145         attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
146         attr->max_mr = sdev->attrs.max_mr;
147         attr->max_mw = sdev->attrs.max_mw;
148         attr->max_mr_size = ~0ull;
149         attr->max_pd = sdev->attrs.max_pd;
150         attr->max_qp = sdev->attrs.max_qp;
151         attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
152         attr->max_qp_rd_atom = sdev->attrs.max_ord;
153         attr->max_qp_wr = sdev->attrs.max_qp_wr;
154         attr->max_recv_sge = sdev->attrs.max_sge;
155         attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
156         attr->max_send_sge = sdev->attrs.max_sge;
157         attr->max_sge_rd = sdev->attrs.max_sge_rd;
158         attr->max_srq = sdev->attrs.max_srq;
159         attr->max_srq_sge = sdev->attrs.max_srq_sge;
160         attr->max_srq_wr = sdev->attrs.max_srq_wr;
161         attr->page_size_cap = PAGE_SIZE;
162         attr->vendor_id = SIW_VENDOR_ID;
163         attr->vendor_part_id = sdev->vendor_part_id;
164
165         addrconf_addr_eui48((u8 *)&attr->sys_image_guid,
166                             sdev->raw_gid);
167
168         return 0;
169 }
170
171 int siw_query_port(struct ib_device *base_dev, u32 port,
172                    struct ib_port_attr *attr)
173 {
174         struct net_device *ndev;
175         int rv;
176
177         memset(attr, 0, sizeof(*attr));
178
179         rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
180                          &attr->active_width);
181         if (rv)
182                 return rv;
183
184         ndev = ib_device_get_netdev(base_dev, SIW_PORT);
185         if (!ndev)
186                 return -ENODEV;
187
188         attr->gid_tbl_len = 1;
189         attr->max_msg_sz = -1;
190         attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu);
191         attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu));
192         attr->phys_state = (netif_running(ndev) && netif_carrier_ok(ndev)) ?
193                 IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
194         attr->state = attr->phys_state == IB_PORT_PHYS_STATE_LINK_UP ?
195                 IB_PORT_ACTIVE : IB_PORT_DOWN;
196         attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
197         /*
198          * All zero
199          *
200          * attr->lid = 0;
201          * attr->bad_pkey_cntr = 0;
202          * attr->qkey_viol_cntr = 0;
203          * attr->sm_lid = 0;
204          * attr->lmc = 0;
205          * attr->max_vl_num = 0;
206          * attr->sm_sl = 0;
207          * attr->subnet_timeout = 0;
208          * attr->init_type_repy = 0;
209          */
210         dev_put(ndev);
211         return rv;
212 }
213
214 int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
215                            struct ib_port_immutable *port_immutable)
216 {
217         struct ib_port_attr attr;
218         int rv = siw_query_port(base_dev, port, &attr);
219
220         if (rv)
221                 return rv;
222
223         port_immutable->gid_tbl_len = attr.gid_tbl_len;
224         port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
225
226         return 0;
227 }
228
229 int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
230                   union ib_gid *gid)
231 {
232         struct siw_device *sdev = to_siw_dev(base_dev);
233
234         /* subnet_prefix == interface_id == 0; */
235         memset(gid, 0, sizeof(*gid));
236         memcpy(gid->raw, sdev->raw_gid, ETH_ALEN);
237
238         return 0;
239 }
240
241 int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
242 {
243         struct siw_device *sdev = to_siw_dev(pd->device);
244
245         if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
246                 atomic_dec(&sdev->num_pd);
247                 return -ENOMEM;
248         }
249         siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd));
250
251         return 0;
252 }
253
254 int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
255 {
256         struct siw_device *sdev = to_siw_dev(pd->device);
257
258         siw_dbg_pd(pd, "free PD\n");
259         atomic_dec(&sdev->num_pd);
260         return 0;
261 }
262
/* Apply siw_qp_get() to the siw QP containing @base_qp */
void siw_qp_get_ref(struct ib_qp *base_qp)
{
        struct siw_qp *qp = to_siw_qp(base_qp);

        siw_qp_get(qp);
}
267
/* Apply siw_qp_put() to the siw QP containing @base_qp */
void siw_qp_put_ref(struct ib_qp *base_qp)
{
        struct siw_qp *qp = to_siw_qp(base_qp);

        siw_qp_put(qp);
}
272
273 static struct rdma_user_mmap_entry *
274 siw_mmap_entry_insert(struct siw_ucontext *uctx,
275                       void *address, size_t length,
276                       u64 *offset)
277 {
278         struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
279         int rv;
280
281         *offset = SIW_INVAL_UOBJ_KEY;
282         if (!entry)
283                 return NULL;
284
285         entry->address = address;
286
287         rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
288                                          &entry->rdma_entry,
289                                          length);
290         if (rv) {
291                 kfree(entry);
292                 return NULL;
293         }
294
295         *offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
296
297         return &entry->rdma_entry;
298 }
299
300 /*
301  * siw_create_qp()
302  *
303  * Create QP of requested size on given device.
304  *
305  * @qp:         Queue pait
306  * @attrs:      Initial QP attributes.
307  * @udata:      used to provide QP ID, SQ and RQ size back to user.
308  */
309
310 int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
311                   struct ib_udata *udata)
312 {
313         struct ib_pd *pd = ibqp->pd;
314         struct siw_qp *qp = to_siw_qp(ibqp);
315         struct ib_device *base_dev = pd->device;
316         struct siw_device *sdev = to_siw_dev(base_dev);
317         struct siw_ucontext *uctx =
318                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
319                                           base_ucontext);
320         unsigned long flags;
321         int num_sqe, num_rqe, rv = 0;
322         size_t length;
323
324         siw_dbg(base_dev, "create new QP\n");
325
326         if (attrs->create_flags)
327                 return -EOPNOTSUPP;
328
329         if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
330                 siw_dbg(base_dev, "too many QP's\n");
331                 rv = -ENOMEM;
332                 goto err_atomic;
333         }
334         if (attrs->qp_type != IB_QPT_RC) {
335                 siw_dbg(base_dev, "only RC QP's supported\n");
336                 rv = -EOPNOTSUPP;
337                 goto err_atomic;
338         }
339         if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
340             (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
341             (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
342             (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
343                 siw_dbg(base_dev, "QP size error\n");
344                 rv = -EINVAL;
345                 goto err_atomic;
346         }
347         if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
348                 siw_dbg(base_dev, "max inline send: %d > %d\n",
349                         attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
350                 rv = -EINVAL;
351                 goto err_atomic;
352         }
353         /*
354          * NOTE: we don't allow for a QP unable to hold any SQ WQE
355          */
356         if (attrs->cap.max_send_wr == 0) {
357                 siw_dbg(base_dev, "QP must have send queue\n");
358                 rv = -EINVAL;
359                 goto err_atomic;
360         }
361
362         if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
363                 siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
364                 rv = -EINVAL;
365                 goto err_atomic;
366         }
367
368         init_rwsem(&qp->state_lock);
369         spin_lock_init(&qp->sq_lock);
370         spin_lock_init(&qp->rq_lock);
371         spin_lock_init(&qp->orq_lock);
372
373         rv = siw_qp_add(sdev, qp);
374         if (rv)
375                 goto err_atomic;
376
377
378         /* All queue indices are derived from modulo operations
379          * on a free running 'get' (consumer) and 'put' (producer)
380          * unsigned counter. Having queue sizes at power of two
381          * avoids handling counter wrap around.
382          */
383         num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
384         num_rqe = attrs->cap.max_recv_wr;
385         if (num_rqe)
386                 num_rqe = roundup_pow_of_two(num_rqe);
387
388         if (udata)
389                 qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
390         else
391                 qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe));
392
393         if (qp->sendq == NULL) {
394                 rv = -ENOMEM;
395                 goto err_out_xa;
396         }
397         if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
398                 if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
399                         qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
400                 else {
401                         rv = -EINVAL;
402                         goto err_out_xa;
403                 }
404         }
405         qp->pd = pd;
406         qp->scq = to_siw_cq(attrs->send_cq);
407         qp->rcq = to_siw_cq(attrs->recv_cq);
408
409         if (attrs->srq) {
410                 /*
411                  * SRQ support.
412                  * Verbs 6.3.7: ignore RQ size, if SRQ present
413                  * Verbs 6.3.5: do not check PD of SRQ against PD of QP
414                  */
415                 qp->srq = to_siw_srq(attrs->srq);
416                 qp->attrs.rq_size = 0;
417                 siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
418                         qp->base_qp.qp_num);
419         } else if (num_rqe) {
420                 if (udata)
421                         qp->recvq =
422                                 vmalloc_user(num_rqe * sizeof(struct siw_rqe));
423                 else
424                         qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe));
425
426                 if (qp->recvq == NULL) {
427                         rv = -ENOMEM;
428                         goto err_out_xa;
429                 }
430                 qp->attrs.rq_size = num_rqe;
431         }
432         qp->attrs.sq_size = num_sqe;
433         qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
434         qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
435
436         /* Make those two tunables fixed for now. */
437         qp->tx_ctx.gso_seg_limit = 1;
438         qp->tx_ctx.zcopy_tx = zcopy_tx;
439
440         qp->attrs.state = SIW_QP_STATE_IDLE;
441
442         if (udata) {
443                 struct siw_uresp_create_qp uresp = {};
444
445                 uresp.num_sqe = num_sqe;
446                 uresp.num_rqe = num_rqe;
447                 uresp.qp_id = qp_id(qp);
448
449                 if (qp->sendq) {
450                         length = num_sqe * sizeof(struct siw_sqe);
451                         qp->sq_entry =
452                                 siw_mmap_entry_insert(uctx, qp->sendq,
453                                                       length, &uresp.sq_key);
454                         if (!qp->sq_entry) {
455                                 rv = -ENOMEM;
456                                 goto err_out_xa;
457                         }
458                 }
459
460                 if (qp->recvq) {
461                         length = num_rqe * sizeof(struct siw_rqe);
462                         qp->rq_entry =
463                                 siw_mmap_entry_insert(uctx, qp->recvq,
464                                                       length, &uresp.rq_key);
465                         if (!qp->rq_entry) {
466                                 uresp.sq_key = SIW_INVAL_UOBJ_KEY;
467                                 rv = -ENOMEM;
468                                 goto err_out_xa;
469                         }
470                 }
471
472                 if (udata->outlen < sizeof(uresp)) {
473                         rv = -EINVAL;
474                         goto err_out_xa;
475                 }
476                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
477                 if (rv)
478                         goto err_out_xa;
479         }
480         qp->tx_cpu = siw_get_tx_cpu(sdev);
481         if (qp->tx_cpu < 0) {
482                 rv = -EINVAL;
483                 goto err_out_xa;
484         }
485         INIT_LIST_HEAD(&qp->devq);
486         spin_lock_irqsave(&sdev->lock, flags);
487         list_add_tail(&qp->devq, &sdev->qp_list);
488         spin_unlock_irqrestore(&sdev->lock, flags);
489
490         init_completion(&qp->qp_free);
491
492         return 0;
493
494 err_out_xa:
495         xa_erase(&sdev->qp_xa, qp_id(qp));
496         if (uctx) {
497                 rdma_user_mmap_entry_remove(qp->sq_entry);
498                 rdma_user_mmap_entry_remove(qp->rq_entry);
499         }
500         vfree(qp->sendq);
501         vfree(qp->recvq);
502
503 err_atomic:
504         atomic_dec(&sdev->num_qp);
505         return rv;
506 }
507
508 /*
509  * Minimum siw_query_qp() verb interface.
510  *
511  * @qp_attr_mask is not used but all available information is provided
512  */
513 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
514                  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
515 {
516         struct siw_qp *qp;
517         struct net_device *ndev;
518
519         if (base_qp && qp_attr && qp_init_attr)
520                 qp = to_siw_qp(base_qp);
521         else
522                 return -EINVAL;
523
524         ndev = ib_device_get_netdev(base_qp->device, SIW_PORT);
525         if (!ndev)
526                 return -ENODEV;
527
528         qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state];
529         qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
530         qp_attr->cap.max_send_wr = qp->attrs.sq_size;
531         qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
532         qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
533         qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
534         qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu));
535         qp_attr->max_rd_atomic = qp->attrs.irq_size;
536         qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
537
538         qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
539                                    IB_ACCESS_REMOTE_WRITE |
540                                    IB_ACCESS_REMOTE_READ;
541
542         qp_init_attr->qp_type = base_qp->qp_type;
543         qp_init_attr->send_cq = base_qp->send_cq;
544         qp_init_attr->recv_cq = base_qp->recv_cq;
545         qp_init_attr->srq = base_qp->srq;
546
547         qp_init_attr->cap = qp_attr->cap;
548
549         dev_put(ndev);
550         return 0;
551 }
552
553 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
554                         int attr_mask, struct ib_udata *udata)
555 {
556         struct siw_qp_attrs new_attrs;
557         enum siw_qp_attr_mask siw_attr_mask = 0;
558         struct siw_qp *qp = to_siw_qp(base_qp);
559         int rv = 0;
560
561         if (!attr_mask)
562                 return 0;
563
564         if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
565                 return -EOPNOTSUPP;
566
567         memset(&new_attrs, 0, sizeof(new_attrs));
568
569         if (attr_mask & IB_QP_ACCESS_FLAGS) {
570                 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
571
572                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
573                         new_attrs.flags |= SIW_RDMA_READ_ENABLED;
574                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
575                         new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
576                 if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
577                         new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
578         }
579         if (attr_mask & IB_QP_STATE) {
580                 siw_dbg_qp(qp, "desired IB QP state: %s\n",
581                            ib_qp_state_to_string[attr->qp_state]);
582
583                 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
584
585                 if (new_attrs.state > SIW_QP_STATE_RTS)
586                         qp->tx_ctx.tx_suspend = 1;
587
588                 siw_attr_mask |= SIW_QP_ATTR_STATE;
589         }
590         if (!siw_attr_mask)
591                 goto out;
592
593         down_write(&qp->state_lock);
594
595         rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
596
597         up_write(&qp->state_lock);
598 out:
599         return rv;
600 }
601
602 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
603 {
604         struct siw_qp *qp = to_siw_qp(base_qp);
605         struct siw_ucontext *uctx =
606                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
607                                           base_ucontext);
608         struct siw_qp_attrs qp_attrs;
609
610         siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
611
612         /*
613          * Mark QP as in process of destruction to prevent from
614          * any async callbacks to RDMA core
615          */
616         qp->attrs.flags |= SIW_QP_IN_DESTROY;
617         qp->rx_stream.rx_suspend = 1;
618
619         if (uctx) {
620                 rdma_user_mmap_entry_remove(qp->sq_entry);
621                 rdma_user_mmap_entry_remove(qp->rq_entry);
622         }
623
624         down_write(&qp->state_lock);
625
626         qp_attrs.state = SIW_QP_STATE_ERROR;
627         siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
628
629         if (qp->cep) {
630                 siw_cep_put(qp->cep);
631                 qp->cep = NULL;
632         }
633         up_write(&qp->state_lock);
634
635         kfree(qp->tx_ctx.mpa_crc_hd);
636         kfree(qp->rx_stream.mpa_crc_hd);
637
638         qp->scq = qp->rcq = NULL;
639
640         siw_qp_put(qp);
641         wait_for_completion(&qp->qp_free);
642
643         return 0;
644 }
645
646 /*
647  * siw_copy_inline_sgl()
648  *
649  * Prepare sgl of inlined data for sending. For userland callers
650  * function checks if given buffer addresses and len's are within
651  * process context bounds.
652  * Data from all provided sge's are copied together into the wqe,
653  * referenced by a single sge.
654  */
655 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
656                                struct siw_sqe *sqe)
657 {
658         struct ib_sge *core_sge = core_wr->sg_list;
659         void *kbuf = &sqe->sge[1];
660         int num_sge = core_wr->num_sge, bytes = 0;
661
662         sqe->sge[0].laddr = (uintptr_t)kbuf;
663         sqe->sge[0].lkey = 0;
664
665         while (num_sge--) {
666                 if (!core_sge->length) {
667                         core_sge++;
668                         continue;
669                 }
670                 bytes += core_sge->length;
671                 if (bytes > SIW_MAX_INLINE) {
672                         bytes = -EINVAL;
673                         break;
674                 }
675                 memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr),
676                        core_sge->length);
677
678                 kbuf += core_sge->length;
679                 core_sge++;
680         }
681         sqe->sge[0].length = max(bytes, 0);
682         sqe->num_sge = bytes > 0 ? 1 : 0;
683
684         return bytes;
685 }
686
687 /* Complete SQ WR's without processing */
688 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
689                            const struct ib_send_wr **bad_wr)
690 {
691         int rv = 0;
692
693         while (wr) {
694                 struct siw_sqe sqe = {};
695
696                 switch (wr->opcode) {
697                 case IB_WR_RDMA_WRITE:
698                         sqe.opcode = SIW_OP_WRITE;
699                         break;
700                 case IB_WR_RDMA_READ:
701                         sqe.opcode = SIW_OP_READ;
702                         break;
703                 case IB_WR_RDMA_READ_WITH_INV:
704                         sqe.opcode = SIW_OP_READ_LOCAL_INV;
705                         break;
706                 case IB_WR_SEND:
707                         sqe.opcode = SIW_OP_SEND;
708                         break;
709                 case IB_WR_SEND_WITH_IMM:
710                         sqe.opcode = SIW_OP_SEND_WITH_IMM;
711                         break;
712                 case IB_WR_SEND_WITH_INV:
713                         sqe.opcode = SIW_OP_SEND_REMOTE_INV;
714                         break;
715                 case IB_WR_LOCAL_INV:
716                         sqe.opcode = SIW_OP_INVAL_STAG;
717                         break;
718                 case IB_WR_REG_MR:
719                         sqe.opcode = SIW_OP_REG_MR;
720                         break;
721                 default:
722                         rv = -EINVAL;
723                         break;
724                 }
725                 if (!rv) {
726                         sqe.id = wr->wr_id;
727                         rv = siw_sqe_complete(qp, &sqe, 0,
728                                               SIW_WC_WR_FLUSH_ERR);
729                 }
730                 if (rv) {
731                         if (bad_wr)
732                                 *bad_wr = wr;
733                         break;
734                 }
735                 wr = wr->next;
736         }
737         return rv;
738 }
739
740 /* Complete RQ WR's without processing */
741 static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
742                            const struct ib_recv_wr **bad_wr)
743 {
744         struct siw_rqe rqe = {};
745         int rv = 0;
746
747         while (wr) {
748                 rqe.id = wr->wr_id;
749                 rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
750                 if (rv) {
751                         if (bad_wr)
752                                 *bad_wr = wr;
753                         break;
754                 }
755                 wr = wr->next;
756         }
757         return rv;
758 }
759
760 /*
761  * siw_post_send()
762  *
763  * Post a list of S-WR's to a SQ.
764  *
765  * @base_qp:    Base QP contained in siw QP
766  * @wr:         Null terminated list of user WR's
767  * @bad_wr:     Points to failing WR in case of synchronous failure.
768  */
769 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
770                   const struct ib_send_wr **bad_wr)
771 {
772         struct siw_qp *qp = to_siw_qp(base_qp);
773         struct siw_wqe *wqe = tx_wqe(qp);
774
775         unsigned long flags;
776         int rv = 0;
777
778         if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
779                 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
780                 *bad_wr = wr;
781                 return -EINVAL;
782         }
783
784         /*
785          * Try to acquire QP state lock. Must be non-blocking
786          * to accommodate kernel clients needs.
787          */
788         if (!down_read_trylock(&qp->state_lock)) {
789                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
790                         /*
791                          * ERROR state is final, so we can be sure
792                          * this state will not change as long as the QP
793                          * exists.
794                          *
795                          * This handles an ib_drain_sq() call with
796                          * a concurrent request to set the QP state
797                          * to ERROR.
798                          */
799                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
800                 } else {
801                         siw_dbg_qp(qp, "QP locked, state %d\n",
802                                    qp->attrs.state);
803                         *bad_wr = wr;
804                         rv = -ENOTCONN;
805                 }
806                 return rv;
807         }
808         if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
809                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
810                         /*
811                          * Immediately flush this WR to CQ, if QP
812                          * is in ERROR state. SQ is guaranteed to
813                          * be empty, so WR completes in-order.
814                          *
815                          * Typically triggered by ib_drain_sq().
816                          */
817                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
818                 } else {
819                         siw_dbg_qp(qp, "QP out of state %d\n",
820                                    qp->attrs.state);
821                         *bad_wr = wr;
822                         rv = -ENOTCONN;
823                 }
824                 up_read(&qp->state_lock);
825                 return rv;
826         }
827         spin_lock_irqsave(&qp->sq_lock, flags);
828
829         while (wr) {
830                 u32 idx = qp->sq_put % qp->attrs.sq_size;
831                 struct siw_sqe *sqe = &qp->sendq[idx];
832
833                 if (sqe->flags) {
834                         siw_dbg_qp(qp, "sq full\n");
835                         rv = -ENOMEM;
836                         break;
837                 }
838                 if (wr->num_sge > qp->attrs.sq_max_sges) {
839                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
840                         rv = -EINVAL;
841                         break;
842                 }
843                 sqe->id = wr->wr_id;
844
845                 if ((wr->send_flags & IB_SEND_SIGNALED) ||
846                     (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
847                         sqe->flags |= SIW_WQE_SIGNALLED;
848
849                 if (wr->send_flags & IB_SEND_FENCE)
850                         sqe->flags |= SIW_WQE_READ_FENCE;
851
852                 switch (wr->opcode) {
853                 case IB_WR_SEND:
854                 case IB_WR_SEND_WITH_INV:
855                         if (wr->send_flags & IB_SEND_SOLICITED)
856                                 sqe->flags |= SIW_WQE_SOLICITED;
857
858                         if (!(wr->send_flags & IB_SEND_INLINE)) {
859                                 siw_copy_sgl(wr->sg_list, sqe->sge,
860                                              wr->num_sge);
861                                 sqe->num_sge = wr->num_sge;
862                         } else {
863                                 rv = siw_copy_inline_sgl(wr, sqe);
864                                 if (rv <= 0) {
865                                         rv = -EINVAL;
866                                         break;
867                                 }
868                                 sqe->flags |= SIW_WQE_INLINE;
869                                 sqe->num_sge = 1;
870                         }
871                         if (wr->opcode == IB_WR_SEND)
872                                 sqe->opcode = SIW_OP_SEND;
873                         else {
874                                 sqe->opcode = SIW_OP_SEND_REMOTE_INV;
875                                 sqe->rkey = wr->ex.invalidate_rkey;
876                         }
877                         break;
878
879                 case IB_WR_RDMA_READ_WITH_INV:
880                 case IB_WR_RDMA_READ:
881                         /*
882                          * iWarp restricts RREAD sink to SGL containing
883                          * 1 SGE only. we could relax to SGL with multiple
884                          * elements referring the SAME ltag or even sending
885                          * a private per-rreq tag referring to a checked
886                          * local sgl with MULTIPLE ltag's.
887                          */
888                         if (unlikely(wr->num_sge != 1)) {
889                                 rv = -EINVAL;
890                                 break;
891                         }
892                         siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
893                         /*
894                          * NOTE: zero length RREAD is allowed!
895                          */
896                         sqe->raddr = rdma_wr(wr)->remote_addr;
897                         sqe->rkey = rdma_wr(wr)->rkey;
898                         sqe->num_sge = 1;
899
900                         if (wr->opcode == IB_WR_RDMA_READ)
901                                 sqe->opcode = SIW_OP_READ;
902                         else
903                                 sqe->opcode = SIW_OP_READ_LOCAL_INV;
904                         break;
905
906                 case IB_WR_RDMA_WRITE:
907                         if (!(wr->send_flags & IB_SEND_INLINE)) {
908                                 siw_copy_sgl(wr->sg_list, &sqe->sge[0],
909                                              wr->num_sge);
910                                 sqe->num_sge = wr->num_sge;
911                         } else {
912                                 rv = siw_copy_inline_sgl(wr, sqe);
913                                 if (unlikely(rv < 0)) {
914                                         rv = -EINVAL;
915                                         break;
916                                 }
917                                 sqe->flags |= SIW_WQE_INLINE;
918                                 sqe->num_sge = 1;
919                         }
920                         sqe->raddr = rdma_wr(wr)->remote_addr;
921                         sqe->rkey = rdma_wr(wr)->rkey;
922                         sqe->opcode = SIW_OP_WRITE;
923                         break;
924
925                 case IB_WR_REG_MR:
926                         sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
927                         sqe->rkey = reg_wr(wr)->key;
928                         sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
929                         sqe->opcode = SIW_OP_REG_MR;
930                         break;
931
932                 case IB_WR_LOCAL_INV:
933                         sqe->rkey = wr->ex.invalidate_rkey;
934                         sqe->opcode = SIW_OP_INVAL_STAG;
935                         break;
936
937                 default:
938                         siw_dbg_qp(qp, "ib wr type %d unsupported\n",
939                                    wr->opcode);
940                         rv = -EINVAL;
941                         break;
942                 }
943                 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
944                            sqe->opcode, sqe->flags,
945                            (void *)(uintptr_t)sqe->id);
946
947                 if (unlikely(rv < 0))
948                         break;
949
950                 /* make SQE only valid after completely written */
951                 smp_wmb();
952                 sqe->flags |= SIW_WQE_VALID;
953
954                 qp->sq_put++;
955                 wr = wr->next;
956         }
957
958         /*
959          * Send directly if SQ processing is not in progress.
960          * Eventual immediate errors (rv < 0) do not affect the involved
961          * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
962          * processing, if new work is already pending. But rv must be passed
963          * to caller.
964          */
965         if (wqe->wr_status != SIW_WR_IDLE) {
966                 spin_unlock_irqrestore(&qp->sq_lock, flags);
967                 goto skip_direct_sending;
968         }
969         rv = siw_activate_tx(qp);
970         spin_unlock_irqrestore(&qp->sq_lock, flags);
971
972         if (rv <= 0)
973                 goto skip_direct_sending;
974
975         if (rdma_is_kernel_res(&qp->base_qp.res)) {
976                 rv = siw_sq_start(qp);
977         } else {
978                 qp->tx_ctx.in_syscall = 1;
979
980                 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
981                         siw_qp_cm_drop(qp, 0);
982
983                 qp->tx_ctx.in_syscall = 0;
984         }
985 skip_direct_sending:
986
987         up_read(&qp->state_lock);
988
989         if (rv >= 0)
990                 return 0;
991         /*
992          * Immediate error
993          */
994         siw_dbg_qp(qp, "error %d\n", rv);
995
996         *bad_wr = wr;
997         return rv;
998 }
999
1000 /*
1001  * siw_post_receive()
1002  *
1003  * Post a list of R-WR's to a RQ.
1004  *
1005  * @base_qp:    Base QP contained in siw QP
1006  * @wr:         Null terminated list of user WR's
1007  * @bad_wr:     Points to failing WR in case of synchronous failure.
1008  */
1009 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
1010                      const struct ib_recv_wr **bad_wr)
1011 {
1012         struct siw_qp *qp = to_siw_qp(base_qp);
1013         unsigned long flags;
1014         int rv = 0;
1015
1016         if (qp->srq || qp->attrs.rq_size == 0) {
1017                 *bad_wr = wr;
1018                 return -EINVAL;
1019         }
1020         if (!rdma_is_kernel_res(&qp->base_qp.res)) {
1021                 siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
1022                 *bad_wr = wr;
1023                 return -EINVAL;
1024         }
1025
1026         /*
1027          * Try to acquire QP state lock. Must be non-blocking
1028          * to accommodate kernel clients needs.
1029          */
1030         if (!down_read_trylock(&qp->state_lock)) {
1031                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1032                         /*
1033                          * ERROR state is final, so we can be sure
1034                          * this state will not change as long as the QP
1035                          * exists.
1036                          *
1037                          * This handles an ib_drain_rq() call with
1038                          * a concurrent request to set the QP state
1039                          * to ERROR.
1040                          */
1041                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1042                 } else {
1043                         siw_dbg_qp(qp, "QP locked, state %d\n",
1044                                    qp->attrs.state);
1045                         *bad_wr = wr;
1046                         rv = -ENOTCONN;
1047                 }
1048                 return rv;
1049         }
1050         if (qp->attrs.state > SIW_QP_STATE_RTS) {
1051                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1052                         /*
1053                          * Immediately flush this WR to CQ, if QP
1054                          * is in ERROR state. RQ is guaranteed to
1055                          * be empty, so WR complets in-order.
1056                          *
1057                          * Typically triggered by ib_drain_rq().
1058                          */
1059                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1060                 } else {
1061                         siw_dbg_qp(qp, "QP out of state %d\n",
1062                                    qp->attrs.state);
1063                         *bad_wr = wr;
1064                         rv = -ENOTCONN;
1065                 }
1066                 up_read(&qp->state_lock);
1067                 return rv;
1068         }
1069         /*
1070          * Serialize potentially multiple producers.
1071          * Not needed for single threaded consumer side.
1072          */
1073         spin_lock_irqsave(&qp->rq_lock, flags);
1074
1075         while (wr) {
1076                 u32 idx = qp->rq_put % qp->attrs.rq_size;
1077                 struct siw_rqe *rqe = &qp->recvq[idx];
1078
1079                 if (rqe->flags) {
1080                         siw_dbg_qp(qp, "RQ full\n");
1081                         rv = -ENOMEM;
1082                         break;
1083                 }
1084                 if (wr->num_sge > qp->attrs.rq_max_sges) {
1085                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
1086                         rv = -EINVAL;
1087                         break;
1088                 }
1089                 rqe->id = wr->wr_id;
1090                 rqe->num_sge = wr->num_sge;
1091                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1092
1093                 /* make sure RQE is completely written before valid */
1094                 smp_wmb();
1095
1096                 rqe->flags = SIW_WQE_VALID;
1097
1098                 qp->rq_put++;
1099                 wr = wr->next;
1100         }
1101         spin_unlock_irqrestore(&qp->rq_lock, flags);
1102
1103         up_read(&qp->state_lock);
1104
1105         if (rv < 0) {
1106                 siw_dbg_qp(qp, "error %d\n", rv);
1107                 *bad_wr = wr;
1108         }
1109         return rv > 0 ? 0 : rv;
1110 }
1111
1112 int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
1113 {
1114         struct siw_cq *cq = to_siw_cq(base_cq);
1115         struct siw_device *sdev = to_siw_dev(base_cq->device);
1116         struct siw_ucontext *ctx =
1117                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1118                                           base_ucontext);
1119
1120         siw_dbg_cq(cq, "free CQ resources\n");
1121
1122         siw_cq_flush(cq);
1123
1124         if (ctx)
1125                 rdma_user_mmap_entry_remove(cq->cq_entry);
1126
1127         atomic_dec(&sdev->num_cq);
1128
1129         vfree(cq->queue);
1130         return 0;
1131 }
1132
1133 /*
1134  * siw_create_cq()
1135  *
1136  * Populate CQ of requested size
1137  *
1138  * @base_cq: CQ as allocated by RDMA midlayer
1139  * @attr: Initial CQ attributes
1140  * @attrs: uverbs bundle
1141  */
1142
1143 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1144                   struct uverbs_attr_bundle *attrs)
1145 {
1146         struct ib_udata *udata = &attrs->driver_udata;
1147         struct siw_device *sdev = to_siw_dev(base_cq->device);
1148         struct siw_cq *cq = to_siw_cq(base_cq);
1149         int rv, size = attr->cqe;
1150
1151         if (attr->flags)
1152                 return -EOPNOTSUPP;
1153
1154         if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1155                 siw_dbg(base_cq->device, "too many CQ's\n");
1156                 rv = -ENOMEM;
1157                 goto err_out;
1158         }
1159         if (size < 1 || size > sdev->attrs.max_cqe) {
1160                 siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1161                 rv = -EINVAL;
1162                 goto err_out;
1163         }
1164         size = roundup_pow_of_two(size);
1165         cq->base_cq.cqe = size;
1166         cq->num_cqe = size;
1167
1168         if (udata)
1169                 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1170                                          sizeof(struct siw_cq_ctrl));
1171         else
1172                 cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1173                                     sizeof(struct siw_cq_ctrl));
1174
1175         if (cq->queue == NULL) {
1176                 rv = -ENOMEM;
1177                 goto err_out;
1178         }
1179         get_random_bytes(&cq->id, 4);
1180         siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1181
1182         spin_lock_init(&cq->lock);
1183
1184         cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1185
1186         if (udata) {
1187                 struct siw_uresp_create_cq uresp = {};
1188                 struct siw_ucontext *ctx =
1189                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1190                                                   base_ucontext);
1191                 size_t length = size * sizeof(struct siw_cqe) +
1192                         sizeof(struct siw_cq_ctrl);
1193
1194                 cq->cq_entry =
1195                         siw_mmap_entry_insert(ctx, cq->queue,
1196                                               length, &uresp.cq_key);
1197                 if (!cq->cq_entry) {
1198                         rv = -ENOMEM;
1199                         goto err_out;
1200                 }
1201
1202                 uresp.cq_id = cq->id;
1203                 uresp.num_cqe = size;
1204
1205                 if (udata->outlen < sizeof(uresp)) {
1206                         rv = -EINVAL;
1207                         goto err_out;
1208                 }
1209                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1210                 if (rv)
1211                         goto err_out;
1212         }
1213         return 0;
1214
1215 err_out:
1216         siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
1217
1218         if (cq->queue) {
1219                 struct siw_ucontext *ctx =
1220                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1221                                                   base_ucontext);
1222                 if (ctx)
1223                         rdma_user_mmap_entry_remove(cq->cq_entry);
1224                 vfree(cq->queue);
1225         }
1226         atomic_dec(&sdev->num_cq);
1227
1228         return rv;
1229 }
1230
1231 /*
1232  * siw_poll_cq()
1233  *
1234  * Reap CQ entries if available and copy work completion status into
1235  * array of WC's provided by caller. Returns number of reaped CQE's.
1236  *
1237  * @base_cq:    Base CQ contained in siw CQ.
1238  * @num_cqe:    Maximum number of CQE's to reap.
1239  * @wc:         Array of work completions to be filled by siw.
1240  */
1241 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1242 {
1243         struct siw_cq *cq = to_siw_cq(base_cq);
1244         int i;
1245
1246         for (i = 0; i < num_cqe; i++) {
1247                 if (!siw_reap_cqe(cq, wc))
1248                         break;
1249                 wc++;
1250         }
1251         return i;
1252 }
1253
1254 /*
1255  * siw_req_notify_cq()
1256  *
1257  * Request notification for new CQE's added to that CQ.
1258  * Defined flags:
1259  * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1260  *   event if a WQE with notification flag set enters the CQ
1261  * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1262  *   event if a WQE enters the CQ.
1263  * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
1264  *   number of not reaped CQE's regardless of its notification
1265  *   type and current or new CQ notification settings.
1266  *
1267  * @base_cq:    Base CQ contained in siw CQ.
1268  * @flags:      Requested notification flags.
1269  */
1270 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1271 {
1272         struct siw_cq *cq = to_siw_cq(base_cq);
1273
1274         siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1275
1276         if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1277                 /*
1278                  * Enable CQ event for next solicited completion.
1279                  * and make it visible to all associated producers.
1280                  */
1281                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1282         else
1283                 /*
1284                  * Enable CQ event for any signalled completion.
1285                  * and make it visible to all associated producers.
1286                  */
1287                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1288
1289         if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1290                 return cq->cq_put - cq->cq_get;
1291
1292         return 0;
1293 }
1294
1295 /*
1296  * siw_dereg_mr()
1297  *
1298  * Release Memory Region.
1299  *
1300  * @base_mr: Base MR contained in siw MR.
1301  * @udata: points to user context, unused.
1302  */
1303 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1304 {
1305         struct siw_mr *mr = to_siw_mr(base_mr);
1306         struct siw_device *sdev = to_siw_dev(base_mr->device);
1307
1308         siw_dbg_mem(mr->mem, "deregister MR\n");
1309
1310         atomic_dec(&sdev->num_mr);
1311
1312         siw_mr_drop_mem(mr);
1313         kfree_rcu(mr, rcu);
1314
1315         return 0;
1316 }
1317
1318 /*
1319  * siw_reg_user_mr()
1320  *
1321  * Register Memory Region.
1322  *
1323  * @pd:         Protection Domain
1324  * @start:      starting address of MR (virtual address)
1325  * @len:        len of MR
1326  * @rnic_va:    not used by siw
1327  * @rights:     MR access rights
1328  * @udata:      user buffer to communicate STag and Key.
1329  */
1330 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1331                               u64 rnic_va, int rights, struct ib_udata *udata)
1332 {
1333         struct siw_mr *mr = NULL;
1334         struct siw_umem *umem = NULL;
1335         struct siw_ureq_reg_mr ureq;
1336         struct siw_device *sdev = to_siw_dev(pd->device);
1337         int rv;
1338
1339         siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1340                    (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1341                    (unsigned long long)len);
1342
1343         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1344                 siw_dbg_pd(pd, "too many mr's\n");
1345                 rv = -ENOMEM;
1346                 goto err_out;
1347         }
1348         if (!len) {
1349                 rv = -EINVAL;
1350                 goto err_out;
1351         }
1352         umem = siw_umem_get(pd->device, start, len, rights);
1353         if (IS_ERR(umem)) {
1354                 rv = PTR_ERR(umem);
1355                 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1356                 umem = NULL;
1357                 goto err_out;
1358         }
1359         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1360         if (!mr) {
1361                 rv = -ENOMEM;
1362                 goto err_out;
1363         }
1364         rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1365         if (rv)
1366                 goto err_out;
1367
1368         if (udata) {
1369                 struct siw_uresp_reg_mr uresp = {};
1370                 struct siw_mem *mem = mr->mem;
1371
1372                 if (udata->inlen < sizeof(ureq)) {
1373                         rv = -EINVAL;
1374                         goto err_out;
1375                 }
1376                 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1377                 if (rv)
1378                         goto err_out;
1379
1380                 mr->base_mr.lkey |= ureq.stag_key;
1381                 mr->base_mr.rkey |= ureq.stag_key;
1382                 mem->stag |= ureq.stag_key;
1383                 uresp.stag = mem->stag;
1384
1385                 if (udata->outlen < sizeof(uresp)) {
1386                         rv = -EINVAL;
1387                         goto err_out;
1388                 }
1389                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1390                 if (rv)
1391                         goto err_out;
1392         }
1393         mr->mem->stag_valid = 1;
1394
1395         return &mr->base_mr;
1396
1397 err_out:
1398         atomic_dec(&sdev->num_mr);
1399         if (mr) {
1400                 if (mr->mem)
1401                         siw_mr_drop_mem(mr);
1402                 kfree_rcu(mr, rcu);
1403         } else {
1404                 if (umem)
1405                         siw_umem_release(umem);
1406         }
1407         return ERR_PTR(rv);
1408 }
1409
1410 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1411                            u32 max_sge)
1412 {
1413         struct siw_device *sdev = to_siw_dev(pd->device);
1414         struct siw_mr *mr = NULL;
1415         struct siw_pbl *pbl = NULL;
1416         int rv;
1417
1418         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1419                 siw_dbg_pd(pd, "too many mr's\n");
1420                 rv = -ENOMEM;
1421                 goto err_out;
1422         }
1423         if (mr_type != IB_MR_TYPE_MEM_REG) {
1424                 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1425                 rv = -EOPNOTSUPP;
1426                 goto err_out;
1427         }
1428         if (max_sge > SIW_MAX_SGE_PBL) {
1429                 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1430                 rv = -ENOMEM;
1431                 goto err_out;
1432         }
1433         pbl = siw_pbl_alloc(max_sge);
1434         if (IS_ERR(pbl)) {
1435                 rv = PTR_ERR(pbl);
1436                 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1437                 pbl = NULL;
1438                 goto err_out;
1439         }
1440         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1441         if (!mr) {
1442                 rv = -ENOMEM;
1443                 goto err_out;
1444         }
1445         rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1446         if (rv)
1447                 goto err_out;
1448
1449         mr->mem->is_pbl = 1;
1450
1451         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1452
1453         return &mr->base_mr;
1454
1455 err_out:
1456         atomic_dec(&sdev->num_mr);
1457
1458         if (!mr) {
1459                 kfree(pbl);
1460         } else {
1461                 if (mr->mem)
1462                         siw_mr_drop_mem(mr);
1463                 kfree_rcu(mr, rcu);
1464         }
1465         siw_dbg_pd(pd, "failed: %d\n", rv);
1466
1467         return ERR_PTR(rv);
1468 }
1469
1470 /* Just used to count number of pages being mapped */
1471 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1472 {
1473         return 0;
1474 }
1475
1476 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1477                   unsigned int *sg_off)
1478 {
1479         struct scatterlist *slp;
1480         struct siw_mr *mr = to_siw_mr(base_mr);
1481         struct siw_mem *mem = mr->mem;
1482         struct siw_pbl *pbl = mem->pbl;
1483         struct siw_pble *pble;
1484         unsigned long pbl_size;
1485         int i, rv;
1486
1487         if (!pbl) {
1488                 siw_dbg_mem(mem, "no PBL allocated\n");
1489                 return -EINVAL;
1490         }
1491         pble = pbl->pbe;
1492
1493         if (pbl->max_buf < num_sle) {
1494                 siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
1495                             num_sle, pbl->max_buf);
1496                 return -ENOMEM;
1497         }
1498         for_each_sg(sl, slp, num_sle, i) {
1499                 if (sg_dma_len(slp) == 0) {
1500                         siw_dbg_mem(mem, "empty SGE\n");
1501                         return -EINVAL;
1502                 }
1503                 if (i == 0) {
1504                         pble->addr = sg_dma_address(slp);
1505                         pble->size = sg_dma_len(slp);
1506                         pble->pbl_off = 0;
1507                         pbl_size = pble->size;
1508                         pbl->num_buf = 1;
1509                 } else {
1510                         /* Merge PBL entries if adjacent */
1511                         if (pble->addr + pble->size == sg_dma_address(slp)) {
1512                                 pble->size += sg_dma_len(slp);
1513                         } else {
1514                                 pble++;
1515                                 pbl->num_buf++;
1516                                 pble->addr = sg_dma_address(slp);
1517                                 pble->size = sg_dma_len(slp);
1518                                 pble->pbl_off = pbl_size;
1519                         }
1520                         pbl_size += sg_dma_len(slp);
1521                 }
1522                 siw_dbg_mem(mem,
1523                         "sge[%d], size %u, addr 0x%p, total %lu\n",
1524                         i, pble->size, ib_virt_dma_to_ptr(pble->addr),
1525                         pbl_size);
1526         }
1527         rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1528         if (rv > 0) {
1529                 mem->len = base_mr->length;
1530                 mem->va = base_mr->iova;
1531                 siw_dbg_mem(mem,
1532                         "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1533                         mem->len, (void *)(uintptr_t)mem->va, num_sle,
1534                         pbl->num_buf);
1535         }
1536         return rv;
1537 }
1538
1539 /*
1540  * siw_get_dma_mr()
1541  *
1542  * Create a (empty) DMA memory region, where no umem is attached.
1543  */
1544 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1545 {
1546         struct siw_device *sdev = to_siw_dev(pd->device);
1547         struct siw_mr *mr = NULL;
1548         int rv;
1549
1550         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1551                 siw_dbg_pd(pd, "too many mr's\n");
1552                 rv = -ENOMEM;
1553                 goto err_out;
1554         }
1555         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1556         if (!mr) {
1557                 rv = -ENOMEM;
1558                 goto err_out;
1559         }
1560         rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1561         if (rv)
1562                 goto err_out;
1563
1564         mr->mem->stag_valid = 1;
1565
1566         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1567
1568         return &mr->base_mr;
1569
1570 err_out:
1571         if (rv)
1572                 kfree(mr);
1573
1574         atomic_dec(&sdev->num_mr);
1575
1576         return ERR_PTR(rv);
1577 }
1578
1579 /*
1580  * siw_create_srq()
1581  *
1582  * Create Shared Receive Queue of attributes @init_attrs
1583  * within protection domain given by @pd.
1584  *
1585  * @base_srq:   Base SRQ contained in siw SRQ.
1586  * @init_attrs: SRQ init attributes.
1587  * @udata:      points to user context
1588  */
1589 int siw_create_srq(struct ib_srq *base_srq,
1590                    struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1591 {
1592         struct siw_srq *srq = to_siw_srq(base_srq);
1593         struct ib_srq_attr *attrs = &init_attrs->attr;
1594         struct siw_device *sdev = to_siw_dev(base_srq->device);
1595         struct siw_ucontext *ctx =
1596                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1597                                           base_ucontext);
1598         int rv;
1599
1600         if (init_attrs->srq_type != IB_SRQT_BASIC)
1601                 return -EOPNOTSUPP;
1602
1603         if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1604                 siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1605                 rv = -ENOMEM;
1606                 goto err_out;
1607         }
1608         if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1609             attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1610                 rv = -EINVAL;
1611                 goto err_out;
1612         }
1613         srq->max_sge = attrs->max_sge;
1614         srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1615         srq->limit = attrs->srq_limit;
1616         if (srq->limit)
1617                 srq->armed = true;
1618
1619         srq->is_kernel_res = !udata;
1620
1621         if (udata)
1622                 srq->recvq =
1623                         vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1624         else
1625                 srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe));
1626
1627         if (srq->recvq == NULL) {
1628                 rv = -ENOMEM;
1629                 goto err_out;
1630         }
1631         if (udata) {
1632                 struct siw_uresp_create_srq uresp = {};
1633                 size_t length = srq->num_rqe * sizeof(struct siw_rqe);
1634
1635                 srq->srq_entry =
1636                         siw_mmap_entry_insert(ctx, srq->recvq,
1637                                               length, &uresp.srq_key);
1638                 if (!srq->srq_entry) {
1639                         rv = -ENOMEM;
1640                         goto err_out;
1641                 }
1642
1643                 uresp.num_rqe = srq->num_rqe;
1644
1645                 if (udata->outlen < sizeof(uresp)) {
1646                         rv = -EINVAL;
1647                         goto err_out;
1648                 }
1649                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1650                 if (rv)
1651                         goto err_out;
1652         }
1653         spin_lock_init(&srq->lock);
1654
1655         siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1656
1657         return 0;
1658
1659 err_out:
1660         if (srq->recvq) {
1661                 if (ctx)
1662                         rdma_user_mmap_entry_remove(srq->srq_entry);
1663                 vfree(srq->recvq);
1664         }
1665         atomic_dec(&sdev->num_srq);
1666
1667         return rv;
1668 }
1669
1670 /*
1671  * siw_modify_srq()
1672  *
1673  * Modify SRQ. The caller may resize SRQ and/or set/reset notification
1674  * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
1675  *
1676  * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
1677  * parameter. siw_modify_srq() does not check the attrs->max_sge param.
1678  */
1679 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1680                    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1681 {
1682         struct siw_srq *srq = to_siw_srq(base_srq);
1683         unsigned long flags;
1684         int rv = 0;
1685
1686         spin_lock_irqsave(&srq->lock, flags);
1687
1688         if (attr_mask & IB_SRQ_MAX_WR) {
1689                 /* resize request not yet supported */
1690                 rv = -EOPNOTSUPP;
1691                 goto out;
1692         }
1693         if (attr_mask & IB_SRQ_LIMIT) {
1694                 if (attrs->srq_limit) {
1695                         if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1696                                 rv = -EINVAL;
1697                                 goto out;
1698                         }
1699                         srq->armed = true;
1700                 } else {
1701                         srq->armed = false;
1702                 }
1703                 srq->limit = attrs->srq_limit;
1704         }
1705 out:
1706         spin_unlock_irqrestore(&srq->lock, flags);
1707
1708         return rv;
1709 }
1710
1711 /*
1712  * siw_query_srq()
1713  *
1714  * Query SRQ attributes.
1715  */
1716 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1717 {
1718         struct siw_srq *srq = to_siw_srq(base_srq);
1719         unsigned long flags;
1720
1721         spin_lock_irqsave(&srq->lock, flags);
1722
1723         attrs->max_wr = srq->num_rqe;
1724         attrs->max_sge = srq->max_sge;
1725         attrs->srq_limit = srq->limit;
1726
1727         spin_unlock_irqrestore(&srq->lock, flags);
1728
1729         return 0;
1730 }
1731
1732 /*
1733  * siw_destroy_srq()
1734  *
1735  * Destroy SRQ.
1736  * It is assumed that the SRQ is not referenced by any
1737  * QP anymore - the code trusts the RDMA core environment to keep track
1738  * of QP references.
1739  */
1740 int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1741 {
1742         struct siw_srq *srq = to_siw_srq(base_srq);
1743         struct siw_device *sdev = to_siw_dev(base_srq->device);
1744         struct siw_ucontext *ctx =
1745                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1746                                           base_ucontext);
1747
1748         if (ctx)
1749                 rdma_user_mmap_entry_remove(srq->srq_entry);
1750         vfree(srq->recvq);
1751         atomic_dec(&sdev->num_srq);
1752         return 0;
1753 }
1754
1755 /*
1756  * siw_post_srq_recv()
1757  *
1758  * Post a list of receive queue elements to SRQ.
1759  * NOTE: The function does not check or lock a certain SRQ state
1760  *       during the post operation. The code simply trusts the
1761  *       RDMA core environment.
1762  *
1763  * @base_srq:   Base SRQ contained in siw SRQ
1764  * @wr:         List of R-WR's
1765  * @bad_wr:     Updated to failing WR if posting fails.
1766  */
1767 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1768                       const struct ib_recv_wr **bad_wr)
1769 {
1770         struct siw_srq *srq = to_siw_srq(base_srq);
1771         unsigned long flags;
1772         int rv = 0;
1773
1774         if (unlikely(!srq->is_kernel_res)) {
1775                 siw_dbg_pd(base_srq->pd,
1776                            "[SRQ]: no kernel post_recv for mapped srq\n");
1777                 rv = -EINVAL;
1778                 goto out;
1779         }
1780         /*
1781          * Serialize potentially multiple producers.
1782          * Also needed to serialize potentially multiple
1783          * consumers.
1784          */
1785         spin_lock_irqsave(&srq->lock, flags);
1786
1787         while (wr) {
1788                 u32 idx = srq->rq_put % srq->num_rqe;
1789                 struct siw_rqe *rqe = &srq->recvq[idx];
1790
1791                 if (rqe->flags) {
1792                         siw_dbg_pd(base_srq->pd, "SRQ full\n");
1793                         rv = -ENOMEM;
1794                         break;
1795                 }
1796                 if (unlikely(wr->num_sge > srq->max_sge)) {
1797                         siw_dbg_pd(base_srq->pd,
1798                                    "[SRQ]: too many sge's: %d\n", wr->num_sge);
1799                         rv = -EINVAL;
1800                         break;
1801                 }
1802                 rqe->id = wr->wr_id;
1803                 rqe->num_sge = wr->num_sge;
1804                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1805
1806                 /* Make sure S-RQE is completely written before valid */
1807                 smp_wmb();
1808
1809                 rqe->flags = SIW_WQE_VALID;
1810
1811                 srq->rq_put++;
1812                 wr = wr->next;
1813         }
1814         spin_unlock_irqrestore(&srq->lock, flags);
1815 out:
1816         if (unlikely(rv < 0)) {
1817                 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1818                 *bad_wr = wr;
1819         }
1820         return rv;
1821 }
1822
1823 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1824 {
1825         struct ib_event event;
1826         struct ib_qp *base_qp = &qp->base_qp;
1827
1828         /*
1829          * Do not report asynchronous errors on QP which gets
1830          * destroyed via verbs interface (siw_destroy_qp())
1831          */
1832         if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1833                 return;
1834
1835         event.event = etype;
1836         event.device = base_qp->device;
1837         event.element.qp = base_qp;
1838
1839         if (base_qp->event_handler) {
1840                 siw_dbg_qp(qp, "reporting event %d\n", etype);
1841                 base_qp->event_handler(&event, base_qp->qp_context);
1842         }
1843 }
1844
1845 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1846 {
1847         struct ib_event event;
1848         struct ib_cq *base_cq = &cq->base_cq;
1849
1850         event.event = etype;
1851         event.device = base_cq->device;
1852         event.element.cq = base_cq;
1853
1854         if (base_cq->event_handler) {
1855                 siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1856                 base_cq->event_handler(&event, base_cq->cq_context);
1857         }
1858 }
1859
1860 void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1861 {
1862         struct ib_event event;
1863         struct ib_srq *base_srq = &srq->base_srq;
1864
1865         event.event = etype;
1866         event.device = base_srq->device;
1867         event.element.srq = base_srq;
1868
1869         if (base_srq->event_handler) {
1870                 siw_dbg_pd(srq->base_srq.pd,
1871                            "reporting SRQ event %d\n", etype);
1872                 base_srq->event_handler(&event, base_srq->srq_context);
1873         }
1874 }
1875
1876 void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
1877 {
1878         struct ib_event event;
1879
1880         event.event = etype;
1881         event.device = &sdev->base_dev;
1882         event.element.port_num = port;
1883
1884         siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1885
1886         ib_dispatch_event(&event);
1887 }
This page took 0.137254 seconds and 4 git commands to generate.