qemu.git / hw/rdma/rdma_backend.c
/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <[email protected]>
 *     Marcel Apfelbaum <[email protected]>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qapi/error.h"

#include <infiniband/verbs.h>

#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
#include "rdma_backend.h"

/* Vendor Errors */
#define VENDOR_ERR_FAIL_BACKEND     0x201
#define VENDOR_ERR_TOO_MANY_SGES    0x202
#define VENDOR_ERR_NOMEM            0x203
#define VENDOR_ERR_QP0              0x204
#define VENDOR_ERR_NO_SGE           0x205
#define VENDOR_ERR_MAD_SEND         0x206
#define VENDOR_ERR_INVLKEY          0x207
#define VENDOR_ERR_MR_SMALL         0x208

#define THR_NAME_LEN 16
#define THR_POLL_TO  5000

typedef struct BackendCtx {
    uint64_t req_id;
    void *up_ctx;
    bool is_tx_req;
} BackendCtx;

static void (*comp_handler)(int status, unsigned int vendor_err, void *ctx);

static void dummy_comp_handler(int status, unsigned int vendor_err, void *ctx)
{
    pr_err("No completion handler is registered\n");
}

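/*
 * Drain all available completions from the given host CQ. For each work
 * completion, look up the BackendCtx that was stashed in wr_id, hand the
 * status and vendor error to the registered completion handler together with
 * the guest's context, then release the cqe context and the BackendCtx.
 */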
static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne;
    BackendCtx *bctx;
    struct ibv_wc wc[2];

    pr_dbg("Entering poll_cq loop on cq %p\n", ibcq);
    do {
        ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

        pr_dbg("Got %d completion(s) from cq %p\n", ne, ibcq);

        for (i = 0; i < ne; i++) {
            pr_dbg("wr_id=0x%" PRIx64 "\n", wc[i].wr_id);
            pr_dbg("status=%d\n", wc[i].status);

            bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            if (unlikely(!bctx)) {
                pr_dbg("Error: Failed to find ctx for req %" PRId64 "\n",
                       wc[i].wr_id);
                continue;
            }
            pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");

            comp_handler(wc[i].status, wc[i].vendor_err, bctx->up_ctx);

            rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            g_free(bctx);
        }
    } while (ne > 0);

    if (ne < 0) {
        pr_dbg("Got error %d from ibv_poll_cq\n", ne);
    }
}

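/*
 * Completion-channel service thread: waits (with a timeout, so it can notice
 * a shutdown request) for events on the device's completion channel, pulls
 * the CQ event, re-arms notification, drains the CQ via poll_cq() and then
 * acknowledges the event.
 */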
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        pr_dbg("Fail to change to non-blocking mode\n");
        return NULL;
    }

    pr_dbg("Starting\n");

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            pr_dbg("Waiting for completion on channel %p\n",
                   backend_dev->channel);
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            pr_dbg("ibv_get_cq_event=%d\n", rc);
            if (unlikely(rc)) {
                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
            }

            poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    pr_dbg("Going down\n");

    /* TODO: Post cqe for all remaining buffs that were posted */

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}

static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;
    while (thread->is_running) {
        pr_dbg("Waiting for thread to complete\n");
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}

static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    char thread_name[THR_NAME_LEN] = {0};

    stop_backend_thread(&backend_dev->comp_thread);

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}

void rdma_backend_register_comp_handler(void (*handler)(int status,
                                        unsigned int vendor_err, void *ctx))
{
    comp_handler = handler;
}

void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}

int rdma_backend_query_port(RdmaBackendDev *backend_dev,
                            struct ibv_port_attr *port_attr)
{
    int rc;

    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    if (rc) {
        pr_dbg("Error %d from ibv_query_port\n", rc);
        return -EIO;
    }

    return 0;
}

void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
{
    poll_cq(rdma_dev_res, cq->ibcq);
}

static GHashTable *ah_hash;

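/*
 * Address handles are cached in ah_hash, keyed by the destination GID, so
 * that repeated UD sends to the same destination reuse one ibv_ah instead of
 * creating a new one per work request.
 */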
static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
                                uint8_t sgid_idx, union ibv_gid *dgid)
{
    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);

    if (ah) {
        trace_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
                                  be64_to_cpu(dgid->global.interface_id));
        g_bytes_unref(ah_key);
    } else {
        struct ibv_ah_attr ah_attr = {
            .is_global     = 1,
            .port_num      = backend_dev->port_num,
            .grh.hop_limit = 1,
        };

        ah_attr.grh.dgid = *dgid;
        ah_attr.grh.sgid_index = sgid_idx;

        ah = ibv_create_ah(pd, &ah_attr);
        if (ah) {
            g_hash_table_insert(ah_hash, ah_key, ah);
        } else {
            g_bytes_unref(ah_key);
            pr_dbg("Fail to create AH for gid <0x%" PRIx64 ", 0x%" PRIx64 ">\n",
                    be64_to_cpu(dgid->global.subnet_prefix),
                    be64_to_cpu(dgid->global.interface_id));
        }

        trace_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
                                   be64_to_cpu(dgid->global.interface_id));
    }

    return ah;
}

static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}

static void destroy_ah_hash_data(gpointer data)
{
    struct ibv_ah *ah = data;

    ibv_destroy_ah(ah);
}

static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hash_data);
}

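/*
 * Translate the guest-supplied SGE list into one the host HCA can use: each
 * guest lkey is resolved to its MR, the guest address is rebased onto the
 * host virtual mapping of that MR, and the host MR's lkey is substituted.
 */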
static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                struct ibv_sge *dsge, struct ibv_sge *ssge,
                                uint8_t num_sge)
{
    RdmaRmMR *mr;
    int ssge_idx;

    pr_dbg("num_sge=%d\n", num_sge);

    for (ssge_idx = 0; ssge_idx < num_sge; ssge_idx++) {
        mr = rdma_rm_get_mr(rdma_dev_res, ssge[ssge_idx].lkey);
        if (unlikely(!mr)) {
            pr_dbg("Invalid lkey 0x%x\n", ssge[ssge_idx].lkey);
            return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey;
        }

        dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start;
        dsge->length = ssge[ssge_idx].length;
        dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr);

        pr_dbg("ssge->addr=0x%" PRIx64 "\n", ssge[ssge_idx].addr);
        pr_dbg("dsge->addr=0x%" PRIx64 "\n", dsge->addr);
        pr_dbg("dsge->length=%d\n", dsge->length);
        pr_dbg("dsge->lkey=0x%x\n", dsge->lkey);

        dsge++;
    }

    return 0;
}

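/*
 * Post a send work request on behalf of the guest. Completions are reported
 * asynchronously through the registered comp_handler; errors detected before
 * the WQE reaches the host HCA are reported the same way, with a vendor
 * error code.
 */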
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
                            union ibv_gid *dgid, uint32_t dqpn,
                            uint32_t dqkey, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_send_wr wr = {0}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            pr_dbg("QP0 unsupported\n");
            comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            pr_dbg("QP1\n");
            comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
        }
        pr_dbg("qp->ibqp is NULL for qp_type %d!!!\n", qp_type);
        return;
    }

    pr_dbg("num_sge=%d\n", num_sge);
    if (!num_sge) {
        pr_dbg("num_sge=0\n");
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->is_tx_req = 1;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        pr_dbg("Failed to allocate cqe_ctx\n");
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto out_free_bctx;
    }

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge);
    if (rc) {
        pr_dbg("Error: Failed to build host SGE array\n");
        comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
        goto out_dealloc_cqe_ctx;
    }

    if (qp_type == IBV_QPT_UD) {
        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd,
                                backend_dev->backend_gid_idx, dgid);
        wr.wr.ud.remote_qpn = dqpn;
        wr.wr.ud.remote_qkey = dqkey;
    }

    wr.num_sge = num_sge;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    pr_dbg("ibv_post_send=%d\n", rc);
    if (rc) {
        pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
                qp->ibqp->qp_num);
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto out_dealloc_cqe_ctx;
    }

    return;

out_dealloc_cqe_ctx:
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

out_free_bctx:
    g_free(bctx);
}

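/*
 * Post a receive work request; mirrors rdma_backend_post_send() except that
 * no address handle is needed and the request is marked as a receive so the
 * completion path can tell the two apart.
 */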
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaDeviceResources *rdma_dev_res,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {0}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            pr_dbg("QP0 unsupported\n");
            comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        }
        if (qp_type == IBV_QPT_GSI) {
            pr_dbg("QP1\n");
            comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
        }
        return;
    }

    pr_dbg("num_sge=%d\n", num_sge);
    if (!num_sge) {
        pr_dbg("num_sge=0\n");
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NO_SGE, ctx);
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->is_tx_req = 0;

    rc = rdma_rm_alloc_cqe_ctx(rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        pr_dbg("Failed to allocate cqe_ctx\n");
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto out_free_bctx;
    }

    rc = build_host_sge_array(rdma_dev_res, new_sge, sge, num_sge);
    if (rc) {
        pr_dbg("Error: Failed to build host SGE array\n");
        comp_handler(IBV_WC_GENERAL_ERR, rc, ctx);
        goto out_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;
    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    pr_dbg("ibv_post_recv=%d\n", rc);
    if (rc) {
        pr_dbg("Fail (%d, %d) to post recv WQE to qpn %d\n", rc, errno,
                qp->ibqp->qp_num);
        comp_handler(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto out_dealloc_cqe_ctx;
    }

    return;

out_dealloc_cqe_ctx:
    rdma_rm_dealloc_cqe_ctx(rdma_dev_res, bctx_id);

out_free_bctx:
    g_free(bctx);
}

int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
{
    pd->ibpd = ibv_alloc_pd(backend_dev->context);

    return pd->ibpd ? 0 : -EIO;
}

void rdma_backend_destroy_pd(RdmaBackendPD *pd)
{
    if (pd->ibpd) {
        ibv_dealloc_pd(pd->ibpd);
    }
}

int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, int access)
{
    pr_dbg("addr=0x%p\n", addr);
    pr_dbg("len=%zu\n", length);
    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
    if (mr->ibmr) {
        pr_dbg("lkey=0x%x\n", mr->ibmr->lkey);
        pr_dbg("rkey=0x%x\n", mr->ibmr->rkey);
        mr->ibpd = pd->ibpd;
    }

    return mr->ibmr ? 0 : -EIO;
}

void rdma_backend_destroy_mr(RdmaBackendMR *mr)
{
    if (mr->ibmr) {
        ibv_dereg_mr(mr->ibmr);
    }
}

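/*
 * The CQ is created one entry larger than requested and tied to the device's
 * completion channel so that comp_handler_thread is woken when a completion
 * event arrives; notification is armed immediately after creation.
 */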
int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                           int cqe)
{
    int rc;

    pr_dbg("cqe=%d\n", cqe);

    pr_dbg("dev->channel=%p\n", backend_dev->channel);
    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
                             backend_dev->channel, 0);

    if (cq->ibcq) {
        rc = ibv_req_notify_cq(cq->ibcq, 0);
        if (rc) {
            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
        }
        cq->backend_dev = backend_dev;
    }

    return cq->ibcq ? 0 : -EIO;
}

void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
{
    if (cq->ibcq) {
        ibv_destroy_cq(cq->ibcq);
    }
}

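/*
 * Only RC and UD QPs are backed by a real host QP. For GSI (QP1) creation is
 * skipped and qp->ibqp stays NULL; the post_send/post_recv paths detect this
 * and complete such requests with an error.
 */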
int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
                           RdmaBackendCQ *rcq, uint32_t max_send_wr,
                           uint32_t max_recv_wr, uint32_t max_send_sge,
                           uint32_t max_recv_sge)
{
    struct ibv_qp_init_attr attr = {0};

    qp->ibqp = NULL;
    pr_dbg("qp_type=%d\n", qp_type);

    switch (qp_type) {
    case IBV_QPT_GSI:
        pr_dbg("QP1 unsupported\n");
        return 0;

    case IBV_QPT_RC:
        /* fall through */
    case IBV_QPT_UD:
        /* do nothing */
        break;

    default:
        pr_dbg("Unsupported QP type %d\n", qp_type);
        return -EIO;
    }

    attr.qp_type = qp_type;
    attr.send_cq = scq->ibcq;
    attr.recv_cq = rcq->ibcq;
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_recv_wr = max_recv_wr;
    attr.cap.max_send_sge = max_send_sge;
    attr.cap.max_recv_sge = max_recv_sge;

    pr_dbg("max_send_wr=%d\n", max_send_wr);
    pr_dbg("max_recv_wr=%d\n", max_recv_wr);
    pr_dbg("max_send_sge=%d\n", max_send_sge);
    pr_dbg("max_recv_sge=%d\n", max_recv_sge);

    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    if (unlikely(!qp->ibqp)) {
        pr_dbg("Error from ibv_create_qp\n");
        return -EIO;
    }

    qp->ibpd = pd->ibpd;

    /* TODO: Query QP to get max_inline_data and save it to be used in send */

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);

    return 0;
}

int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                               uint8_t qp_type, uint32_t qkey)
{
    struct ibv_qp_attr attr = {0};
    int rc, attr_mask;

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);
    pr_dbg("sport_num=%d\n", backend_dev->port_num);

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    attr.qp_state        = IBV_QPS_INIT;
    attr.pkey_index      = 0;
    attr.port_num        = backend_dev->port_num;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr_mask |= IBV_QP_ACCESS_FLAGS;
        break;

    case IBV_QPT_UD:
        attr.qkey = qkey;
        attr_mask |= IBV_QP_QKEY;
        break;

    default:
        pr_dbg("Unsupported QP type %d\n", qp_type);
        return -EIO;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}

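/*
 * Move the QP to RTR. For RC the path attributes (destination GID, QPN and
 * PSN) come from the guest, while the path MTU, hop limit, min RNR timer and
 * RD-atomic depth are fixed here; for UD only the qkey may need to be set.
 */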
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                              uint8_t qp_type, union ibv_gid *dgid,
                              uint32_t dqpn, uint32_t rq_psn, uint32_t qkey,
                              bool use_qkey)
{
    struct ibv_qp_attr attr = {0};
    union ibv_gid ibv_gid = {
        .global.interface_id = dgid->global.interface_id,
        .global.subnet_prefix = dgid->global.subnet_prefix
    };
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTR;
    attr_mask = IBV_QP_STATE;

    switch (qp_type) {
    case IBV_QPT_RC:
        pr_dbg("dgid=0x%" PRIx64 ",%" PRIx64 "\n",
               be64_to_cpu(ibv_gid.global.subnet_prefix),
               be64_to_cpu(ibv_gid.global.interface_id));
        pr_dbg("dqpn=0x%x\n", dqpn);
        pr_dbg("sgid_idx=%d\n", backend_dev->backend_gid_idx);
        pr_dbg("sport_num=%d\n", backend_dev->port_num);
        pr_dbg("rq_psn=0x%x\n", rq_psn);

        attr.path_mtu               = IBV_MTU_1024;
        attr.dest_qp_num            = dqpn;
        attr.max_dest_rd_atomic     = 1;
        attr.min_rnr_timer          = 12;
        attr.ah_attr.port_num       = backend_dev->port_num;
        attr.ah_attr.is_global      = 1;
        attr.ah_attr.grh.hop_limit  = 1;
        attr.ah_attr.grh.dgid       = ibv_gid;
        attr.ah_attr.grh.sgid_index = backend_dev->backend_gid_idx;
        attr.rq_psn                 = rq_psn;

        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER;
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            pr_dbg("qkey=0x%x\n", qkey);
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}

int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {0};
    int rc, attr_mask;

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);
    pr_dbg("sq_psn=0x%x\n", sq_psn);

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout       = 14;
        attr.retry_cnt     = 7;
        attr.rnr_retry     = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            pr_dbg("qkey=0x%x\n", qkey);
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}

int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) {
        pr_dbg("QP1\n");
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
}

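/*
 * Clamp a capability requested by the front end to what the host device
 * actually supports, warning when the requested value had to be lowered.
 */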
#define CHK_ATTR(req, dev, member, fmt) ({ \
    pr_dbg("%s="fmt","fmt"\n", #member, dev.member, req->member); \
    if (req->member > dev.member) { \
        warn_report("%s = "fmt" is higher than host device capability "fmt, \
                    #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
    pr_dbg("%s="fmt"\n", #member, req->member); })

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    if (ibv_query_device(backend_dev->context, &backend_dev->dev_attr)) {
        return -EIO;
    }

    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_wr, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_cqe, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_ah, "%d");

    return 0;
}

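/*
 * Open the backend IB device (by name, or the first one found), create the
 * completion channel, query the port and check the GID index against its GID
 * table, cache the device capabilities and the local GID, and set up the AH
 * cache. The completion thread is started later, from rdma_backend_start().
 */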
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      uint8_t backend_gid_idx, struct ibv_device_attr *dev_attr,
                      Error **errp)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;
    struct ibv_port_attr port_attr;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;

    backend_dev->backend_gid_idx = backend_gid_idx;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        error_setg(errp, "Failed to get IB devices list");
        return -EIO;
    }

    if (num_ibv_devices == 0) {
        error_setg(errp, "No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            error_setg(errp, "Failed to find IB device %s",
                       backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    pr_dbg("Using backend device %s, port %d, gid_idx %d\n",
           ibv_get_device_name(backend_dev->ib_dev),
           backend_dev->port_num, backend_dev->backend_gid_idx);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        error_setg(errp, "Failed to open IB device");
        ret = -EIO;
        goto out_free_dev_list;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        error_setg(errp, "Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }
    pr_dbg("dev->backend_dev.channel=%p\n", backend_dev->channel);

    ret = ibv_query_port(backend_dev->context, backend_dev->port_num,
                         &port_attr);
    if (ret) {
        error_setg(errp, "Error %d from ibv_query_port", ret);
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    if (backend_dev->backend_gid_idx >= port_attr.gid_tbl_len) {
        error_setg(errp, "Invalid backend_gid_idx, should be less than %d",
                   port_attr.gid_tbl_len);
        ret = -EINVAL;
        goto out_destroy_comm_channel;
    }

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        error_setg(errp, "Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = ibv_query_gid(backend_dev->context, backend_dev->port_num,
                        backend_dev->backend_gid_idx, &backend_dev->gid);
    if (ret) {
        error_setg(errp, "Failed to query gid %d",
                   backend_dev->backend_gid_idx);
        ret = -EIO;
        goto out_destroy_comm_channel;
    }
    pr_dbg("subnet_prefix=0x%" PRIx64 "\n",
           be64_to_cpu(backend_dev->gid.global.subnet_prefix));
    pr_dbg("interface_id=0x%" PRIx64 "\n",
           be64_to_cpu(backend_dev->gid.global.interface_id));

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

    return ret;
}

void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    pr_dbg("Starting rdma_backend\n");
    start_comp_thread(backend_dev);
}

void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    pr_dbg("Stopping rdma_backend\n");
    stop_backend_thread(&backend_dev->comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    rdma_backend_stop(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
}