drivers/infiniband/hw/mlx5/main.c (linux.git)
IB/mlx5: Fix configuration of port capabilities
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/highmem.h>
34 #include <linux/module.h>
35 #include <linux/init.h>
36 #include <linux/errno.h>
37 #include <linux/pci.h>
38 #include <linux/dma-mapping.h>
39 #include <linux/slab.h>
40 #if defined(CONFIG_X86)
41 #include <asm/pat.h>
42 #endif
43 #include <linux/sched.h>
44 #include <linux/delay.h>
45 #include <rdma/ib_user_verbs.h>
46 #include <rdma/ib_addr.h>
47 #include <rdma/ib_cache.h>
48 #include <linux/mlx5/port.h>
49 #include <linux/mlx5/vport.h>
50 #include <linux/list.h>
51 #include <rdma/ib_smi.h>
52 #include <rdma/ib_umem.h>
53 #include <linux/in.h>
54 #include <linux/etherdevice.h>
55 #include <linux/mlx5/fs.h>
56 #include "mlx5_ib.h"
57
58 #define DRIVER_NAME "mlx5_ib"
59 #define DRIVER_VERSION "2.2-1"
60 #define DRIVER_RELDATE  "Feb 2014"
61
62 MODULE_AUTHOR("Eli Cohen <[email protected]>");
63 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
64 MODULE_LICENSE("Dual BSD/GPL");
65 MODULE_VERSION(DRIVER_VERSION);
66
67 static char mlx5_version[] =
68         DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
69         DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
70
71 enum {
72         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
73 };
74
75 static enum rdma_link_layer
76 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
77 {
78         switch (port_type_cap) {
79         case MLX5_CAP_PORT_TYPE_IB:
80                 return IB_LINK_LAYER_INFINIBAND;
81         case MLX5_CAP_PORT_TYPE_ETH:
82                 return IB_LINK_LAYER_ETHERNET;
83         default:
84                 return IB_LINK_LAYER_UNSPECIFIED;
85         }
86 }
87
88 static enum rdma_link_layer
89 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
90 {
91         struct mlx5_ib_dev *dev = to_mdev(device);
92         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
93
94         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
95 }
96
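/*
 * Netdevice notifier: track the net_device that belongs to this IB device
 * (NETDEV_REGISTER/UNREGISTER) and translate NETDEV_UP/NETDEV_DOWN on that
 * netdev (or on its bonding master when LAG is active) into IB port events
 * (IB_EVENT_PORT_ACTIVE / IB_EVENT_PORT_ERR) on port 1.
 */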
97 static int mlx5_netdev_event(struct notifier_block *this,
98                              unsigned long event, void *ptr)
99 {
100         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
101         struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
102                                                  roce.nb);
103
104         switch (event) {
105         case NETDEV_REGISTER:
106         case NETDEV_UNREGISTER:
107                 write_lock(&ibdev->roce.netdev_lock);
108                 if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
109                         ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
110                                              NULL : ndev;
111                 write_unlock(&ibdev->roce.netdev_lock);
112                 break;
113
114         case NETDEV_UP:
115         case NETDEV_DOWN: {
116                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
117                 struct net_device *upper = NULL;
118
119                 if (lag_ndev) {
120                         upper = netdev_master_upper_dev_get(lag_ndev);
121                         dev_put(lag_ndev);
122                 }
123
124                 if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
125                     && ibdev->ib_active) {
126                         struct ib_event ibev = { };
127
128                         ibev.device = &ibdev->ib_dev;
129                         ibev.event = (event == NETDEV_UP) ?
130                                      IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
131                         ibev.element.port_num = 1;
132                         ib_dispatch_event(&ibev);
133                 }
134                 break;
135         }
136
137         default:
138                 break;
139         }
140
141         return NOTIFY_DONE;
142 }
143
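/*
 * Return the net_device backing @port_num with a reference held (the caller
 * must dev_put() it): the LAG RoCE netdev when LAG is configured, otherwise
 * the netdev cached by the notifier above.
 */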
144 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
145                                              u8 port_num)
146 {
147         struct mlx5_ib_dev *ibdev = to_mdev(device);
148         struct net_device *ndev;
149
150         ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
151         if (ndev)
152                 return ndev;
153
154         /* Ensure ndev does not disappear before we invoke dev_hold()
155          */
156         read_lock(&ibdev->roce.netdev_lock);
157         ndev = ibdev->roce.netdev;
158         if (ndev)
159                 dev_hold(ndev);
160         read_unlock(&ibdev->roce.netdev_lock);
161
162         return ndev;
163 }
164
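/*
 * Fill ib_port_attr for a RoCE (Ethernet) port.  State and phys_state are
 * derived from the netdev carrier (the bond master's when LAG is active);
 * active width and speed are still hard-coded, see the TODOs below.
 */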
165 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
166                                 struct ib_port_attr *props)
167 {
168         struct mlx5_ib_dev *dev = to_mdev(device);
169         struct net_device *ndev, *upper;
170         enum ib_mtu ndev_ib_mtu;
171         u16 qkey_viol_cntr;
172
173         /* props is zeroed by the caller; avoid zeroing it here */
174
175         props->port_cap_flags  |= IB_PORT_CM_SUP;
176         props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
177
178         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
179                                                 roce_address_table_size);
180         props->max_mtu          = IB_MTU_4096;
181         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
182         props->pkey_tbl_len     = 1;
183         props->state            = IB_PORT_DOWN;
184         props->phys_state       = 3;
185
186         mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
187         props->qkey_viol_cntr = qkey_viol_cntr;
188
189         ndev = mlx5_ib_get_netdev(device, port_num);
190         if (!ndev)
191                 return 0;
192
193         if (mlx5_lag_is_active(dev->mdev)) {
194                 rcu_read_lock();
195                 upper = netdev_master_upper_dev_get_rcu(ndev);
196                 if (upper) {
197                         dev_put(ndev);
198                         ndev = upper;
199                         dev_hold(ndev);
200                 }
201                 rcu_read_unlock();
202         }
203
204         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
205                 props->state      = IB_PORT_ACTIVE;
206                 props->phys_state = 5;
207         }
208
209         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
210
211         dev_put(ndev);
212
213         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
214
215         props->active_width     = IB_WIDTH_4X;  /* TODO */
216         props->active_speed     = IB_SPEED_QDR; /* TODO */
217
218         return 0;
219 }
220
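/*
 * Translate an IB GID and its attributes into the firmware roce_addr_layout:
 * source MAC (plus VLAN if the GID's netdev is a VLAN device), RoCE version
 * (v1 for IB_GID_TYPE_IB, v2 for ROCE_UDP_ENCAP) and the L3 address, which is
 * the full GID or, for IPv4-mapped RoCEv2 GIDs, only the IPv4 part.  A NULL
 * gid leaves the layout zeroed, which is how a GID entry is deleted.
 */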
221 static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
222                                      const struct ib_gid_attr *attr,
223                                      void *mlx5_addr)
224 {
225 #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
226         char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
227                                                source_l3_address);
228         void *mlx5_addr_mac     = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
229                                                source_mac_47_32);
230
231         if (!gid)
232                 return;
233
234         ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
235
236         if (is_vlan_dev(attr->ndev)) {
237                 MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
238                 MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
239         }
240
241         switch (attr->gid_type) {
242         case IB_GID_TYPE_IB:
243                 MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
244                 break;
245         case IB_GID_TYPE_ROCE_UDP_ENCAP:
246                 MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
247                 break;
248
249         default:
250                 WARN_ON(true);
251         }
252
253         if (attr->gid_type != IB_GID_TYPE_IB) {
254                 if (ipv6_addr_v4mapped((void *)gid))
255                         MLX5_SET_RA(mlx5_addr, roce_l3_type,
256                                     MLX5_ROCE_L3_TYPE_IPV4);
257                 else
258                         MLX5_SET_RA(mlx5_addr, roce_l3_type,
259                                     MLX5_ROCE_L3_TYPE_IPV6);
260         }
261
262         if ((attr->gid_type == IB_GID_TYPE_IB) ||
263             !ipv6_addr_v4mapped((void *)gid))
264                 memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
265         else
266                 memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
267 }
268
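/* Program one entry of the HCA's RoCE address (GID) table through the
 * SET_ROCE_ADDRESS command; only valid on Ethernet link-layer ports.
 */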
269 static int set_roce_addr(struct ib_device *device, u8 port_num,
270                          unsigned int index,
271                          const union ib_gid *gid,
272                          const struct ib_gid_attr *attr)
273 {
274         struct mlx5_ib_dev *dev = to_mdev(device);
275         u32  in[MLX5_ST_SZ_DW(set_roce_address_in)]  = {0};
276         u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
277         void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
278         enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
279
280         if (ll != IB_LINK_LAYER_ETHERNET)
281                 return -EINVAL;
282
283         ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
284
285         MLX5_SET(set_roce_address_in, in, roce_address_index, index);
286         MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
287         return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
288 }
289
290 static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
291                            unsigned int index, const union ib_gid *gid,
292                            const struct ib_gid_attr *attr,
293                            __always_unused void **context)
294 {
295         return set_roce_addr(device, port_num, index, gid, attr);
296 }
297
298 static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
299                            unsigned int index, __always_unused void **context)
300 {
301         return set_roce_addr(device, port_num, index, NULL, NULL);
302 }
303
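/*
 * For RoCEv2 (UDP-encapsulated) GIDs, return the UDP source port to use (the
 * device's minimum RoCE source UDP port, in network byte order); return 0 for
 * RoCEv1 GIDs or on lookup failure.
 */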
304 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
305                                int index)
306 {
307         struct ib_gid_attr attr;
308         union ib_gid gid;
309
310         if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
311                 return 0;
312
313         if (!attr.ndev)
314                 return 0;
315
316         dev_put(attr.ndev);
317
318         if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
319                 return 0;
320
321         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
322 }
323
324 int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
325                            int index, enum ib_gid_type *gid_type)
326 {
327         struct ib_gid_attr attr;
328         union ib_gid gid;
329         int ret;
330
331         ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr);
332         if (ret)
333                 return ret;
334
335         if (!attr.ndev)
336                 return -ENODEV;
337
338         dev_put(attr.ndev);
339
340         *gid_type = attr.gid_type;
341
342         return 0;
343 }
344
345 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
346 {
347         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
348                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
349         return 0;
350 }
351
352 enum {
353         MLX5_VPORT_ACCESS_METHOD_MAD,
354         MLX5_VPORT_ACCESS_METHOD_HCA,
355         MLX5_VPORT_ACCESS_METHOD_NIC,
356 };
357
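/*
 * Select how vport attributes are queried: MAD interface when the IB port
 * does not support IB virtualization, NIC vport commands when port 1 uses the
 * Ethernet link layer, and HCA vport commands otherwise.
 */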
358 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
359 {
360         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
361                 return MLX5_VPORT_ACCESS_METHOD_MAD;
362
363         if (mlx5_ib_port_link_layer(ibdev, 1) ==
364             IB_LINK_LAYER_ETHERNET)
365                 return MLX5_VPORT_ACCESS_METHOD_NIC;
366
367         return MLX5_VPORT_ACCESS_METHOD_HCA;
368 }
369
370 static void get_atomic_caps(struct mlx5_ib_dev *dev,
371                             struct ib_device_attr *props)
372 {
373         u8 tmp;
374         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
375         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
376         u8 atomic_req_8B_endianness_mode =
377                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
378
379         /* Check if HW supports 8-byte standard atomic operations and is
380          * capable of responding in host endianness.
381          */
382         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
383         if (((atomic_operations & tmp) == tmp) &&
384             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
385             (atomic_req_8B_endianness_mode)) {
386                 props->atomic_cap = IB_ATOMIC_HCA;
387         } else {
388                 props->atomic_cap = IB_ATOMIC_NONE;
389         }
390 }
391
392 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
393                                         __be64 *sys_image_guid)
394 {
395         struct mlx5_ib_dev *dev = to_mdev(ibdev);
396         struct mlx5_core_dev *mdev = dev->mdev;
397         u64 tmp;
398         int err;
399
400         switch (mlx5_get_vport_access_method(ibdev)) {
401         case MLX5_VPORT_ACCESS_METHOD_MAD:
402                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
403                                                             sys_image_guid);
404
405         case MLX5_VPORT_ACCESS_METHOD_HCA:
406                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
407                 break;
408
409         case MLX5_VPORT_ACCESS_METHOD_NIC:
410                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
411                 break;
412
413         default:
414                 return -EINVAL;
415         }
416
417         if (!err)
418                 *sys_image_guid = cpu_to_be64(tmp);
419
420         return err;
421
422 }
423
424 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
425                                 u16 *max_pkeys)
426 {
427         struct mlx5_ib_dev *dev = to_mdev(ibdev);
428         struct mlx5_core_dev *mdev = dev->mdev;
429
430         switch (mlx5_get_vport_access_method(ibdev)) {
431         case MLX5_VPORT_ACCESS_METHOD_MAD:
432                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
433
434         case MLX5_VPORT_ACCESS_METHOD_HCA:
435         case MLX5_VPORT_ACCESS_METHOD_NIC:
436                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
437                                                 pkey_table_size));
438                 return 0;
439
440         default:
441                 return -EINVAL;
442         }
443 }
444
445 static int mlx5_query_vendor_id(struct ib_device *ibdev,
446                                 u32 *vendor_id)
447 {
448         struct mlx5_ib_dev *dev = to_mdev(ibdev);
449
450         switch (mlx5_get_vport_access_method(ibdev)) {
451         case MLX5_VPORT_ACCESS_METHOD_MAD:
452                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
453
454         case MLX5_VPORT_ACCESS_METHOD_HCA:
455         case MLX5_VPORT_ACCESS_METHOD_NIC:
456                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
457
458         default:
459                 return -EINVAL;
460         }
461 }
462
463 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
464                                 __be64 *node_guid)
465 {
466         u64 tmp;
467         int err;
468
469         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
470         case MLX5_VPORT_ACCESS_METHOD_MAD:
471                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
472
473         case MLX5_VPORT_ACCESS_METHOD_HCA:
474                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
475                 break;
476
477         case MLX5_VPORT_ACCESS_METHOD_NIC:
478                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
479                 break;
480
481         default:
482                 return -EINVAL;
483         }
484
485         if (!err)
486                 *node_guid = cpu_to_be64(tmp);
487
488         return err;
489 }
490
491 struct mlx5_reg_node_desc {
492         u8      desc[IB_DEVICE_NODE_DESC_MAX];
493 };
494
495 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
496 {
497         struct mlx5_reg_node_desc in;
498
499         if (mlx5_use_mad_ifc(dev))
500                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
501
502         memset(&in, 0, sizeof(in));
503
504         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
505                                     sizeof(struct mlx5_reg_node_desc),
506                                     MLX5_REG_NODE_DESC, 0, 0);
507 }
508
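/*
 * Fill ib_device_attr from firmware capability bits and, for user libraries
 * that pass a large enough response buffer in @uhw, the extended mlx5 fields
 * (TSO, RSS, CQE compression, packet pacing, multi-packet send WQEs).
 */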
509 static int mlx5_ib_query_device(struct ib_device *ibdev,
510                                 struct ib_device_attr *props,
511                                 struct ib_udata *uhw)
512 {
513         struct mlx5_ib_dev *dev = to_mdev(ibdev);
514         struct mlx5_core_dev *mdev = dev->mdev;
515         int err = -ENOMEM;
516         int max_sq_desc;
517         int max_rq_sg;
518         int max_sq_sg;
519         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
520         struct mlx5_ib_query_device_resp resp = {};
521         size_t resp_len;
522         u64 max_tso;
523
524         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
525         if (uhw->outlen && uhw->outlen < resp_len)
526                 return -EINVAL;
527         else
528                 resp.response_length = resp_len;
529
530         if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
531                 return -EINVAL;
532
533         memset(props, 0, sizeof(*props));
534         err = mlx5_query_system_image_guid(ibdev,
535                                            &props->sys_image_guid);
536         if (err)
537                 return err;
538
539         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
540         if (err)
541                 return err;
542
543         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
544         if (err)
545                 return err;
546
547         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
548                 (fw_rev_min(dev->mdev) << 16) |
549                 fw_rev_sub(dev->mdev);
550         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
551                 IB_DEVICE_PORT_ACTIVE_EVENT             |
552                 IB_DEVICE_SYS_IMAGE_GUID                |
553                 IB_DEVICE_RC_RNR_NAK_GEN;
554
555         if (MLX5_CAP_GEN(mdev, pkv))
556                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
557         if (MLX5_CAP_GEN(mdev, qkv))
558                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
559         if (MLX5_CAP_GEN(mdev, apm))
560                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
561         if (MLX5_CAP_GEN(mdev, xrc))
562                 props->device_cap_flags |= IB_DEVICE_XRC;
563         if (MLX5_CAP_GEN(mdev, imaicl)) {
564                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
565                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
566                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
567                 /* We support 'Gappy' memory registration too */
568                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
569         }
570         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
571         if (MLX5_CAP_GEN(mdev, sho)) {
572                 props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
573                 /* At this stage no support for signature handover */
574                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
575                                       IB_PROT_T10DIF_TYPE_2 |
576                                       IB_PROT_T10DIF_TYPE_3;
577                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
578                                        IB_GUARD_T10DIF_CSUM;
579         }
580         if (MLX5_CAP_GEN(mdev, block_lb_mc))
581                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
582
583         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
584                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
585                         /* Legacy bit to support old userspace libraries */
586                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
587                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
588                 }
589
590                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
591                         props->raw_packet_caps |=
592                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
593
594                 if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
595                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
596                         if (max_tso) {
597                                 resp.tso_caps.max_tso = 1 << max_tso;
598                                 resp.tso_caps.supported_qpts |=
599                                         1 << IB_QPT_RAW_PACKET;
600                                 resp.response_length += sizeof(resp.tso_caps);
601                         }
602                 }
603
604                 if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
605                         resp.rss_caps.rx_hash_function =
606                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
607                         resp.rss_caps.rx_hash_fields_mask =
608                                                 MLX5_RX_HASH_SRC_IPV4 |
609                                                 MLX5_RX_HASH_DST_IPV4 |
610                                                 MLX5_RX_HASH_SRC_IPV6 |
611                                                 MLX5_RX_HASH_DST_IPV6 |
612                                                 MLX5_RX_HASH_SRC_PORT_TCP |
613                                                 MLX5_RX_HASH_DST_PORT_TCP |
614                                                 MLX5_RX_HASH_SRC_PORT_UDP |
615                                                 MLX5_RX_HASH_DST_PORT_UDP;
616                         resp.response_length += sizeof(resp.rss_caps);
617                 }
618         } else {
619                 if (field_avail(typeof(resp), tso_caps, uhw->outlen))
620                         resp.response_length += sizeof(resp.tso_caps);
621                 if (field_avail(typeof(resp), rss_caps, uhw->outlen))
622                         resp.response_length += sizeof(resp.rss_caps);
623         }
624
625         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
626                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
627                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
628         }
629
630         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
631             MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
632                 /* Legacy bit to support old userspace libraries */
633                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
634                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
635         }
636
637         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
638                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
639
640         props->vendor_part_id      = mdev->pdev->device;
641         props->hw_ver              = mdev->pdev->revision;
642
643         props->max_mr_size         = ~0ull;
644         props->page_size_cap       = ~(min_page_size - 1);
645         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
646         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
647         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
648                      sizeof(struct mlx5_wqe_data_seg);
649         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
650         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
651                      sizeof(struct mlx5_wqe_raddr_seg)) /
652                 sizeof(struct mlx5_wqe_data_seg);
653         props->max_sge = min(max_rq_sg, max_sq_sg);
654         props->max_sge_rd          = MLX5_MAX_SGE_RD;
655         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
656         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
657         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
658         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
659         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
660         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
661         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
662         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
663         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
664         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
665         props->max_srq_sge         = max_rq_sg - 1;
666         props->max_fast_reg_page_list_len =
667                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
668         get_atomic_caps(dev, props);
669         props->masked_atomic_cap   = IB_ATOMIC_NONE;
670         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
671         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
672         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
673                                            props->max_mcast_grp;
674         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
675         props->max_ah = INT_MAX;
676         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
677         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
678
679 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
680         if (MLX5_CAP_GEN(mdev, pg))
681                 props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
682         props->odp_caps = dev->odp_caps;
683 #endif
684
685         if (MLX5_CAP_GEN(mdev, cd))
686                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
687
688         if (!mlx5_core_is_pf(mdev))
689                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
690
691         if (mlx5_ib_port_link_layer(ibdev, 1) ==
692             IB_LINK_LAYER_ETHERNET) {
693                 props->rss_caps.max_rwq_indirection_tables =
694                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
695                 props->rss_caps.max_rwq_indirection_table_size =
696                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
697                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
698                 props->max_wq_type_rq =
699                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
700         }
701
702         if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
703                 resp.cqe_comp_caps.max_num =
704                         MLX5_CAP_GEN(dev->mdev, cqe_compression) ?
705                         MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0;
706                 resp.cqe_comp_caps.supported_format =
707                         MLX5_IB_CQE_RES_FORMAT_HASH |
708                         MLX5_IB_CQE_RES_FORMAT_CSUM;
709                 resp.response_length += sizeof(resp.cqe_comp_caps);
710         }
711
712         if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen)) {
713                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
714                     MLX5_CAP_GEN(mdev, qos)) {
715                         resp.packet_pacing_caps.qp_rate_limit_max =
716                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
717                         resp.packet_pacing_caps.qp_rate_limit_min =
718                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
719                         resp.packet_pacing_caps.supported_qpts |=
720                                 1 << IB_QPT_RAW_PACKET;
721                 }
722                 resp.response_length += sizeof(resp.packet_pacing_caps);
723         }
724
725         if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
726                         uhw->outlen)) {
727                 resp.mlx5_ib_support_multi_pkt_send_wqes =
728                         MLX5_CAP_ETH(mdev, multi_pkt_send_wqe);
729                 resp.response_length +=
730                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
731         }
732
733         if (field_avail(typeof(resp), reserved, uhw->outlen))
734                 resp.response_length += sizeof(resp.reserved);
735
736         if (uhw->outlen) {
737                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
738
739                 if (err)
740                         return err;
741         }
742
743         return 0;
744 }
745
746 enum mlx5_ib_width {
747         MLX5_IB_WIDTH_1X        = 1 << 0,
748         MLX5_IB_WIDTH_2X        = 1 << 1,
749         MLX5_IB_WIDTH_4X        = 1 << 2,
750         MLX5_IB_WIDTH_8X        = 1 << 3,
751         MLX5_IB_WIDTH_12X       = 1 << 4
752 };
753
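/* Firmware reports the active link width as a MLX5_IB_WIDTH_* bitmask;
 * convert it to the verbs IB_WIDTH_* encoding.  2X has no IB equivalent here
 * and is rejected.
 */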
754 static int translate_active_width(struct ib_device *ibdev, u8 active_width,
755                                   u8 *ib_width)
756 {
757         struct mlx5_ib_dev *dev = to_mdev(ibdev);
758         int err = 0;
759
760         if (active_width & MLX5_IB_WIDTH_1X) {
761                 *ib_width = IB_WIDTH_1X;
762         } else if (active_width & MLX5_IB_WIDTH_2X) {
763                 mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
764                             (int)active_width);
765                 err = -EINVAL;
766         } else if (active_width & MLX5_IB_WIDTH_4X) {
767                 *ib_width = IB_WIDTH_4X;
768         } else if (active_width & MLX5_IB_WIDTH_8X) {
769                 *ib_width = IB_WIDTH_8X;
770         } else if (active_width & MLX5_IB_WIDTH_12X) {
771                 *ib_width = IB_WIDTH_12X;
772         } else {
773                 mlx5_ib_dbg(dev, "Invalid active_width %d\n",
774                             (int)active_width);
775                 err = -EINVAL;
776         }
777
778         return err;
779 }
780
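/* Map an MTU in bytes to the corresponding IB MTU enum value
 * (256 -> IB_MTU_256 ... 4096 -> IB_MTU_4096); return -1 for anything else.
 */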
781 static int mlx5_mtu_to_ib_mtu(int mtu)
782 {
783         switch (mtu) {
784         case 256: return 1;
785         case 512: return 2;
786         case 1024: return 3;
787         case 2048: return 4;
788         case 4096: return 5;
789         default:
790                 pr_warn("invalid mtu\n");
791                 return -1;
792         }
793 }
794
795 enum ib_max_vl_num {
796         __IB_MAX_VL_0           = 1,
797         __IB_MAX_VL_0_1         = 2,
798         __IB_MAX_VL_0_3         = 3,
799         __IB_MAX_VL_0_7         = 4,
800         __IB_MAX_VL_0_14        = 5,
801 };
802
803 enum mlx5_vl_hw_cap {
804         MLX5_VL_HW_0    = 1,
805         MLX5_VL_HW_0_1  = 2,
806         MLX5_VL_HW_0_2  = 3,
807         MLX5_VL_HW_0_3  = 4,
808         MLX5_VL_HW_0_4  = 5,
809         MLX5_VL_HW_0_5  = 6,
810         MLX5_VL_HW_0_6  = 7,
811         MLX5_VL_HW_0_7  = 8,
812         MLX5_VL_HW_0_14 = 15
813 };
814
815 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
816                                 u8 *max_vl_num)
817 {
818         switch (vl_hw_cap) {
819         case MLX5_VL_HW_0:
820                 *max_vl_num = __IB_MAX_VL_0;
821                 break;
822         case MLX5_VL_HW_0_1:
823                 *max_vl_num = __IB_MAX_VL_0_1;
824                 break;
825         case MLX5_VL_HW_0_3:
826                 *max_vl_num = __IB_MAX_VL_0_3;
827                 break;
828         case MLX5_VL_HW_0_7:
829                 *max_vl_num = __IB_MAX_VL_0_7;
830                 break;
831         case MLX5_VL_HW_0_14:
832                 *max_vl_num = __IB_MAX_VL_0_14;
833                 break;
834
835         default:
836                 return -EINVAL;
837         }
838
839         return 0;
840 }
841
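/*
 * Query port attributes for an InfiniBand link-layer port: most fields come
 * from the HCA vport context, plus separate queries for the operational link
 * width, IB protocol/speed, max and operational MTU, and VL capability.
 */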
842 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
843                                struct ib_port_attr *props)
844 {
845         struct mlx5_ib_dev *dev = to_mdev(ibdev);
846         struct mlx5_core_dev *mdev = dev->mdev;
847         struct mlx5_hca_vport_context *rep;
848         u16 max_mtu;
849         u16 oper_mtu;
850         int err;
851         u8 ib_link_width_oper;
852         u8 vl_hw_cap;
853
854         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
855         if (!rep) {
856                 err = -ENOMEM;
857                 goto out;
858         }
859
860         /* props is zeroed by the caller; avoid zeroing it here */
861
862         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
863         if (err)
864                 goto out;
865
866         props->lid              = rep->lid;
867         props->lmc              = rep->lmc;
868         props->sm_lid           = rep->sm_lid;
869         props->sm_sl            = rep->sm_sl;
870         props->state            = rep->vport_state;
871         props->phys_state       = rep->port_physical_state;
872         props->port_cap_flags   = rep->cap_mask1;
873         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
874         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
875         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
876         props->bad_pkey_cntr    = rep->pkey_violation_counter;
877         props->qkey_viol_cntr   = rep->qkey_violation_counter;
878         props->subnet_timeout   = rep->subnet_timeout;
879         props->init_type_reply  = rep->init_type_reply;
880         props->grh_required     = rep->grh_required;
881
882         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
883         if (err)
884                 goto out;
885
886         err = translate_active_width(ibdev, ib_link_width_oper,
887                                      &props->active_width);
888         if (err)
889                 goto out;
890         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
891         if (err)
892                 goto out;
893
894         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
895
896         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
897
898         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
899
900         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
901
902         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
903         if (err)
904                 goto out;
905
906         err = translate_max_vl_num(ibdev, vl_hw_cap,
907                                    &props->max_vl_num);
908 out:
909         kfree(rep);
910         return err;
911 }
912
913 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
914                        struct ib_port_attr *props)
915 {
916         switch (mlx5_get_vport_access_method(ibdev)) {
917         case MLX5_VPORT_ACCESS_METHOD_MAD:
918                 return mlx5_query_mad_ifc_port(ibdev, port, props);
919
920         case MLX5_VPORT_ACCESS_METHOD_HCA:
921                 return mlx5_query_hca_port(ibdev, port, props);
922
923         case MLX5_VPORT_ACCESS_METHOD_NIC:
924                 return mlx5_query_port_roce(ibdev, port, props);
925
926         default:
927                 return -EINVAL;
928         }
929 }
930
931 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
932                              union ib_gid *gid)
933 {
934         struct mlx5_ib_dev *dev = to_mdev(ibdev);
935         struct mlx5_core_dev *mdev = dev->mdev;
936
937         switch (mlx5_get_vport_access_method(ibdev)) {
938         case MLX5_VPORT_ACCESS_METHOD_MAD:
939                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
940
941         case MLX5_VPORT_ACCESS_METHOD_HCA:
942                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
943
944         default:
945                 return -EINVAL;
946         }
947
948 }
949
950 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
951                               u16 *pkey)
952 {
953         struct mlx5_ib_dev *dev = to_mdev(ibdev);
954         struct mlx5_core_dev *mdev = dev->mdev;
955
956         switch (mlx5_get_vport_access_method(ibdev)) {
957         case MLX5_VPORT_ACCESS_METHOD_MAD:
958                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
959
960         case MLX5_VPORT_ACCESS_METHOD_HCA:
961         case MLX5_VPORT_ACCESS_METHOD_NIC:
962                 return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
963                                                  pkey);
964         default:
965                 return -EINVAL;
966         }
967 }
968
969 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
970                                  struct ib_device_modify *props)
971 {
972         struct mlx5_ib_dev *dev = to_mdev(ibdev);
973         struct mlx5_reg_node_desc in;
974         struct mlx5_reg_node_desc out;
975         int err;
976
977         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
978                 return -EOPNOTSUPP;
979
980         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
981                 return 0;
982
983         /*
984          * If possible, pass the node desc to FW so that it can generate
985          * a trap 144.  If the command fails, just ignore the error.
986          */
987         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
988         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
989                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
990         if (err)
991                 return err;
992
993         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
994
995         return err;
996 }
997
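/*
 * Update port capability bits through the HCA vport context: @mask selects
 * the bits being changed and @value their new state; requesting a bit that is
 * not changeable per cap_mask1_perm fails with -EINVAL.
 */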
998 static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
999                                 u32 value)
1000 {
1001         struct mlx5_hca_vport_context ctx = {};
1002         int err;
1003
1004         err = mlx5_query_hca_vport_context(dev->mdev, 0,
1005                                            port_num, 0, &ctx);
1006         if (err)
1007                 return err;
1008
1009         if (~ctx.cap_mask1_perm & mask) {
1010                 mlx5_ib_warn(dev, "trying to change bitmask 0x%X, but only bits 0x%X may be changed\n",
1011                              mask, ctx.cap_mask1_perm);
1012                 return -EINVAL;
1013         }
1014
1015         ctx.cap_mask1 = value;
1016         ctx.cap_mask1_perm = mask;
1017         err = mlx5_core_modify_hca_vport_context(dev->mdev, 0,
1018                                                  port_num, 0, &ctx);
1019
1020         return err;
1021 }
1022
1023 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1024                                struct ib_port_modify *props)
1025 {
1026         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1027         struct ib_port_attr attr;
1028         u32 tmp;
1029         int err;
1030         u32 change_mask;
1031         u32 value;
1032         bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1033                       IB_LINK_LAYER_INFINIBAND);
1034
1035         if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1036                 change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1037                 value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1038                 return set_port_caps_atomic(dev, port, change_mask, value);
1039         }
1040
1041         mutex_lock(&dev->cap_mask_mutex);
1042
1043         err = ib_query_port(ibdev, port, &attr);
1044         if (err)
1045                 goto out;
1046
1047         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1048                 ~props->clr_port_cap_mask;
1049
1050         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1051
1052 out:
1053         mutex_unlock(&dev->cap_mask_mutex);
1054         return err;
1055 }
1056
1057 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1058 {
1059         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1060                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1061 }
1062
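/*
 * Work out how many "system pages" of UARs are needed for the number of
 * blue-flame registers requested by userspace: the request is rounded up to a
 * whole number of pages (updating req->total_num_bfregs) and validated
 * against MLX5_MAX_BFREGS.
 */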
1063 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1064                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1065                              u32 *num_sys_pages)
1066 {
1067         int uars_per_sys_page;
1068         int bfregs_per_sys_page;
1069         int ref_bfregs = req->total_num_bfregs;
1070
1071         if (req->total_num_bfregs == 0)
1072                 return -EINVAL;
1073
1074         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1075         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1076
1077         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1078                 return -ENOMEM;
1079
1080         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1081         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1082         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1083         *num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1084
1085         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1086                 return -EINVAL;
1087
1088         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n",
1089                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1090                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1091                     req->total_num_bfregs, *num_sys_pages);
1092
1093         return 0;
1094 }
1095
1096 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1097 {
1098         struct mlx5_bfreg_info *bfregi;
1099         int err;
1100         int i;
1101
1102         bfregi = &context->bfregi;
1103         for (i = 0; i < bfregi->num_sys_pages; i++) {
1104                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1105                 if (err)
1106                         goto error;
1107
1108                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1109         }
1110         return 0;
1111
1112 error:
1113         for (--i; i >= 0; i--)
1114                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1115                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1116
1117         return err;
1118 }
1119
1120 static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1121 {
1122         struct mlx5_bfreg_info *bfregi;
1123         int err;
1124         int i;
1125
1126         bfregi = &context->bfregi;
1127         for (i = 0; i < bfregi->num_sys_pages; i++) {
1128                 err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1129                 if (err) {
1130                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1131                         return err;
1132                 }
1133         }
1134         return 0;
1135 }
1136
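/*
 * Allocate a user context: parse the v0/v2 allocation request, size and
 * allocate the per-context bfreg/UAR pool, set up the XLT update page and
 * (if supported) a transport domain, then report the negotiated parameters
 * (CQE version, UAR layout, core clock offset, ...) back to userspace.
 */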
1137 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
1138                                                   struct ib_udata *udata)
1139 {
1140         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1141         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1142         struct mlx5_ib_alloc_ucontext_resp resp = {};
1143         struct mlx5_ib_ucontext *context;
1144         struct mlx5_bfreg_info *bfregi;
1145         int ver;
1146         int err;
1147         size_t reqlen;
1148         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1149                                      max_cqe_version);
1150         bool lib_uar_4k;
1151
1152         if (!dev->ib_active)
1153                 return ERR_PTR(-EAGAIN);
1154
1155         if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
1156                 return ERR_PTR(-EINVAL);
1157
1158         reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
1159         if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1160                 ver = 0;
1161         else if (reqlen >= min_req_v2)
1162                 ver = 2;
1163         else
1164                 return ERR_PTR(-EINVAL);
1165
1166         err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
1167         if (err)
1168                 return ERR_PTR(err);
1169
1170         if (req.flags)
1171                 return ERR_PTR(-EINVAL);
1172
1173         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1174                 return ERR_PTR(-EOPNOTSUPP);
1175
1176         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1177                                     MLX5_NON_FP_BFREGS_PER_UAR);
1178         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1179                 return ERR_PTR(-EINVAL);
1180
1181         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1182         if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1183                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1184         resp.cache_line_size = cache_line_size();
1185         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1186         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1187         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1188         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1189         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1190         resp.cqe_version = min_t(__u8,
1191                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1192                                  req.max_cqe_version);
1193         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1194                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1195         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1196                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1197         resp.response_length = min(offsetof(typeof(resp), response_length) +
1198                                    sizeof(resp.response_length), udata->outlen);
1199
1200         context = kzalloc(sizeof(*context), GFP_KERNEL);
1201         if (!context)
1202                 return ERR_PTR(-ENOMEM);
1203
1204         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1205         bfregi = &context->bfregi;
1206
1207         /* updates req->total_num_bfregs */
1208         err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages);
1209         if (err)
1210                 goto out_ctx;
1211
1212         mutex_init(&bfregi->lock);
1213         bfregi->lib_uar_4k = lib_uar_4k;
1214         bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count),
1215                                 GFP_KERNEL);
1216         if (!bfregi->count) {
1217                 err = -ENOMEM;
1218                 goto out_ctx;
1219         }
1220
1221         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1222                                     sizeof(*bfregi->sys_pages),
1223                                     GFP_KERNEL);
1224         if (!bfregi->sys_pages) {
1225                 err = -ENOMEM;
1226                 goto out_count;
1227         }
1228
1229         err = allocate_uars(dev, context);
1230         if (err)
1231                 goto out_sys_pages;
1232
1233 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1234         context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
1235 #endif
1236
1237         context->upd_xlt_page = __get_free_page(GFP_KERNEL);
1238         if (!context->upd_xlt_page) {
1239                 err = -ENOMEM;
1240                 goto out_uars;
1241         }
1242         mutex_init(&context->upd_xlt_page_mutex);
1243
1244         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
1245                 err = mlx5_core_alloc_transport_domain(dev->mdev,
1246                                                        &context->tdn);
1247                 if (err)
1248                         goto out_page;
1249         }
1250
1251         INIT_LIST_HEAD(&context->vma_private_list);
1252         INIT_LIST_HEAD(&context->db_page_list);
1253         mutex_init(&context->db_page_mutex);
1254
1255         resp.tot_bfregs = req.total_num_bfregs;
1256         resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
1257
1258         if (field_avail(typeof(resp), cqe_version, udata->outlen))
1259                 resp.response_length += sizeof(resp.cqe_version);
1260
1261         if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1262                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1263                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1264                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1265         }
1266
1267         /*
1268          * We don't want to expose information from the PCI bar that is located
1269          * after 4096 bytes, so if the arch only supports larger pages, let's
1270          * pretend we don't support reading the HCA's core clock. This is also
1271          * forced by mmap function.
1272          */
1273         if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1274                 if (PAGE_SIZE <= 4096) {
1275                         resp.comp_mask |=
1276                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1277                         resp.hca_core_clock_offset =
1278                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1279                 }
1280                 resp.response_length += sizeof(resp.hca_core_clock_offset) +
1281                                         sizeof(resp.reserved2);
1282         }
1283
1284         if (field_avail(typeof(resp), log_uar_size, udata->outlen))
1285                 resp.response_length += sizeof(resp.log_uar_size);
1286
1287         if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
1288                 resp.response_length += sizeof(resp.num_uars_per_page);
1289
1290         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1291         if (err)
1292                 goto out_td;
1293
1294         bfregi->ver = ver;
1295         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1296         context->cqe_version = resp.cqe_version;
1297         context->lib_caps = req.lib_caps;
1298         print_lib_caps(dev, context->lib_caps);
1299
1300         return &context->ibucontext;
1301
1302 out_td:
1303         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1304                 mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1305
1306 out_page:
1307         free_page(context->upd_xlt_page);
1308
1309 out_uars:
1310         deallocate_uars(dev, context);
1311
1312 out_sys_pages:
1313         kfree(bfregi->sys_pages);
1314
1315 out_count:
1316         kfree(bfregi->count);
1317
1318 out_ctx:
1319         kfree(context);
1320
1321         return ERR_PTR(err);
1322 }
1323
1324 static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1325 {
1326         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1327         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1328         struct mlx5_bfreg_info *bfregi;
1329
1330         bfregi = &context->bfregi;
1331         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1332                 mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1333
1334         free_page(context->upd_xlt_page);
1335         deallocate_uars(dev, context);
1336         kfree(bfregi->sys_pages);
1337         kfree(bfregi->count);
1338         kfree(context);
1339
1340         return 0;
1341 }
1342
1343 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
1344                                  struct mlx5_bfreg_info *bfregi,
1345                                  int idx)
1346 {
1347         int fw_uars_per_page;
1348
1349         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
1350
1351         return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) +
1352                         bfregi->sys_pages[idx] / fw_uars_per_page;
1353 }
1354
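/*
 * An mmap offset encodes a command in its high bits (above
 * MLX5_IB_MMAP_CMD_SHIFT) and a command-specific argument/index in its low
 * bits; the helpers below decode the two parts.
 */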
1355 static int get_command(unsigned long offset)
1356 {
1357         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1358 }
1359
1360 static int get_arg(unsigned long offset)
1361 {
1362         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1363 }
1364
1365 static int get_index(unsigned long offset)
1366 {
1367         return get_arg(offset);
1368 }
1369
1370 static void  mlx5_ib_vma_open(struct vm_area_struct *area)
1371 {
1372         /* vma_open is called when a new VMA is created on top of our VMA.  This
1373          * is done through either the mremap flow or split_vma (usually due to
1374          * mlock, madvise, munmap, etc.).  We do not support cloning of the VMA,
1375          * as this VMA is strongly hardware related.  Therefore we set the
1376          * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
1377          * calling us again and trying to do incorrect actions.  We assume that
1378          * the original VMA is exactly a single page, and therefore no
1379          * "splitting" operation will happen to it.
1380          */
1381         area->vm_ops = NULL;
1382 }
1383
1384 static void  mlx5_ib_vma_close(struct vm_area_struct *area)
1385 {
1386         struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
1387
1388         /* It's guaranteed that all VMAs opened on a FD are closed before the
1389          * file itself is closed, therefore no sync is needed with the regular
1390          * closing flow (e.g. mlx5_ib_dealloc_ucontext).
1391          * However, a sync is needed with accesses to the vma made as part of
1392          * mlx5_ib_disassociate_ucontext.
1393          * The close operation is usually called under mm->mmap_sem, except when
1394          * the process is exiting.
1395          * The exiting case is handled explicitly as part of
1396          * mlx5_ib_disassociate_ucontext.
1397          */
1398         mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
1399
1400         /* Set the vma context pointer to NULL in the mlx5_ib driver's
1401          * private data, to protect against a race with
1402          * mlx5_ib_disassociate_ucontext().
1403          */
1404         mlx5_ib_vma_priv_data->vma = NULL;
1405         list_del(&mlx5_ib_vma_priv_data->list);
1406         kfree(mlx5_ib_vma_priv_data);
1407 }
1408
1409 static const struct vm_operations_struct mlx5_ib_vm_ops = {
1410         .open = mlx5_ib_vma_open,
1411         .close = mlx5_ib_vma_close
1412 };
1413
1414 static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
1415                                 struct mlx5_ib_ucontext *ctx)
1416 {
1417         struct mlx5_ib_vma_private_data *vma_prv;
1418         struct list_head *vma_head = &ctx->vma_private_list;
1419
1420         vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
1421         if (!vma_prv)
1422                 return -ENOMEM;
1423
1424         vma_prv->vma = vma;
1425         vma->vm_private_data = vma_prv;
1426         vma->vm_ops = &mlx5_ib_vm_ops;
1427
1428         list_add(&vma_prv->list, vma_head);
1429
1430         return 0;
1431 }
1432
1433 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1434 {
1435         int ret;
1436         struct vm_area_struct *vma;
1437         struct mlx5_ib_vma_private_data *vma_private, *n;
1438         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1439         struct task_struct *owning_process  = NULL;
1440         struct mm_struct   *owning_mm       = NULL;
1441
1442         owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
1443         if (!owning_process)
1444                 return;
1445
1446         owning_mm = get_task_mm(owning_process);
1447         if (!owning_mm) {
1448                 pr_info("no mm, disassociate ucontext is pending task termination\n");
1449                 while (1) {
1450                         put_task_struct(owning_process);
1451                         usleep_range(1000, 2000);
1452                         owning_process = get_pid_task(ibcontext->tgid,
1453                                                       PIDTYPE_PID);
1454                         if (!owning_process ||
1455                             owning_process->state == TASK_DEAD) {
1456                                 pr_info("disassociate ucontext done, task was terminated\n");
1457                                 /* In case the task was dead, we need to
1458                                  * release the task struct.
1459                                  */
1460                                 if (owning_process)
1461                                         put_task_struct(owning_process);
1462                                 return;
1463                         }
1464                 }
1465         }
1466
1467         /* We need to protect against a race with closing the vma as part
1468          * of mlx5_ib_vma_close().
1469          */
1470         down_read(&owning_mm->mmap_sem);
1471         list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
1472                                  list) {
1473                 vma = vma_private->vma;
1474                 ret = zap_vma_ptes(vma, vma->vm_start,
1475                                    PAGE_SIZE);
1476                 WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
1477                 /* The context is going to be destroyed, so vm_ops
1478                  * must not be accessed any more.
1479                  */
1480                 vma->vm_ops = NULL;
1481                 list_del(&vma_private->list);
1482                 kfree(vma_private);
1483         }
1484         up_read(&owning_mm->mmap_sem);
1485         mmput(owning_mm);
1486         put_task_struct(owning_process);
1487 }
1488
1489 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1490 {
1491         switch (cmd) {
1492         case MLX5_IB_MMAP_WC_PAGE:
1493                 return "WC";
1494         case MLX5_IB_MMAP_REGULAR_PAGE:
1495                 return "best effort WC";
1496         case MLX5_IB_MMAP_NC_PAGE:
1497                 return "NC";
1498         default:
1499                 return NULL;
1500         }
1501 }
1502
1503 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1504                     struct vm_area_struct *vma,
1505                     struct mlx5_ib_ucontext *context)
1506 {
1507         struct mlx5_bfreg_info *bfregi = &context->bfregi;
1508         int err;
1509         unsigned long idx;
1510         phys_addr_t pfn, pa;
1511         pgprot_t prot;
1512         int uars_per_page;
1513
1514         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1515                 return -EINVAL;
1516
1517         uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
1518         idx = get_index(vma->vm_pgoff);
1519         if (idx % uars_per_page ||
1520             idx * uars_per_page >= bfregi->num_sys_pages) {
1521                 mlx5_ib_warn(dev, "invalid uar index %lu\n", idx);
1522                 return -EINVAL;
1523         }
1524
1525         switch (cmd) {
1526         case MLX5_IB_MMAP_WC_PAGE:
1527 /* Some architectures don't support WC memory */
1528 #if defined(CONFIG_X86)
1529                 if (!pat_enabled())
1530                         return -EPERM;
1531 #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
1532                         return -EPERM;
1533 #endif
1534         /* fall through */
1535         case MLX5_IB_MMAP_REGULAR_PAGE:
1536                 /* For MLX5_IB_MMAP_REGULAR_PAGE make a best effort to get WC */
1537                 prot = pgprot_writecombine(vma->vm_page_prot);
1538                 break;
1539         case MLX5_IB_MMAP_NC_PAGE:
1540                 prot = pgprot_noncached(vma->vm_page_prot);
1541                 break;
1542         default:
1543                 return -EINVAL;
1544         }
1545
1546         pfn = uar_index2pfn(dev, bfregi, idx);
1547         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
1548
1549         vma->vm_page_prot = prot;
1550         err = io_remap_pfn_range(vma, vma->vm_start, pfn,
1551                                  PAGE_SIZE, vma->vm_page_prot);
1552         if (err) {
1553                 mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
1554                             err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
1555                 return -EAGAIN;
1556         }
1557
1558         pa = pfn << PAGE_SHIFT;
1559         mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
1560                     vma->vm_start, &pa);
1561
1562         return mlx5_ib_set_vma_data(vma, context);
1563 }
1564
1565 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1566 {
1567         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1568         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1569         unsigned long command;
1570         phys_addr_t pfn;
1571
1572         command = get_command(vma->vm_pgoff);
1573         switch (command) {
1574         case MLX5_IB_MMAP_WC_PAGE:
1575         case MLX5_IB_MMAP_NC_PAGE:
1576         case MLX5_IB_MMAP_REGULAR_PAGE:
1577                 return uar_mmap(dev, command, vma, context);
1578
1579         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
1580                 return -ENOSYS;
1581
1582         case MLX5_IB_MMAP_CORE_CLOCK:
1583                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1584                         return -EINVAL;
1585
1586                 if (vma->vm_flags & VM_WRITE)
1587                         return -EPERM;
1588
1589                 /* Don't expose information to user space that it shouldn't have */
1590                 if (PAGE_SIZE > 4096)
1591                         return -EOPNOTSUPP;
1592
1593                 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1594                 pfn = (dev->mdev->iseg_base +
1595                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
1596                         PAGE_SHIFT;
1597                 if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1598                                        PAGE_SIZE, vma->vm_page_prot))
1599                         return -EAGAIN;
1600
1601                 mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
1602                             vma->vm_start,
1603                             (unsigned long long)pfn << PAGE_SHIFT);
1604                 break;
1605
1606         default:
1607                 return -EINVAL;
1608         }
1609
1610         return 0;
1611 }
1612
1613 static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1614                                       struct ib_ucontext *context,
1615                                       struct ib_udata *udata)
1616 {
1617         struct mlx5_ib_alloc_pd_resp resp;
1618         struct mlx5_ib_pd *pd;
1619         int err;
1620
1621         pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1622         if (!pd)
1623                 return ERR_PTR(-ENOMEM);
1624
1625         err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1626         if (err) {
1627                 kfree(pd);
1628                 return ERR_PTR(err);
1629         }
1630
1631         if (context) {
1632                 resp.pdn = pd->pdn;
1633                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1634                         mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1635                         kfree(pd);
1636                         return ERR_PTR(-EFAULT);
1637                 }
1638         }
1639
1640         return &pd->ibpd;
1641 }
1642
1643 static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1644 {
1645         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1646         struct mlx5_ib_pd *mpd = to_mpd(pd);
1647
1648         mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1649         kfree(mpd);
1650
1651         return 0;
1652 }
1653
1654 enum {
1655         MATCH_CRITERIA_ENABLE_OUTER_BIT,
1656         MATCH_CRITERIA_ENABLE_MISC_BIT,
1657         MATCH_CRITERIA_ENABLE_INNER_BIT
1658 };
1659
1660 #define HEADER_IS_ZERO(match_criteria, headers)                            \
1661         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
1662                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
1663
1664 static u8 get_match_criteria_enable(u32 *match_criteria)
1665 {
1666         u8 match_criteria_enable;
1667
1668         match_criteria_enable =
1669                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
1670                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
1671         match_criteria_enable |=
1672                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
1673                 MATCH_CRITERIA_ENABLE_MISC_BIT;
1674         match_criteria_enable |=
1675                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
1676                 MATCH_CRITERIA_ENABLE_INNER_BIT;
1677
1678         return match_criteria_enable;
1679 }
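
/* Illustration (not driver code): a spec whose criteria mask fields in
 * outer_headers and in misc_parameters, but leave inner_headers all zero,
 * yields
 *
 *   match_criteria_enable == BIT(MATCH_CRITERIA_ENABLE_OUTER_BIT) |
 *                            BIT(MATCH_CRITERIA_ENABLE_MISC_BIT)
 *
 * so only the header sets that actually carry a mask are enabled for
 * matching in the FTE.
 */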
1680
1681 static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
1682 {
1683         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
1684         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
1685 }
1686
1687 static void set_flow_label(void *misc_c, void *misc_v, u8 mask, u8 val,
1688                            bool inner)
1689 {
1690         if (inner) {
1691                 MLX5_SET(fte_match_set_misc,
1692                          misc_c, inner_ipv6_flow_label, mask);
1693                 MLX5_SET(fte_match_set_misc,
1694                          misc_v, inner_ipv6_flow_label, val);
1695         } else {
1696                 MLX5_SET(fte_match_set_misc,
1697                          misc_c, outer_ipv6_flow_label, mask);
1698                 MLX5_SET(fte_match_set_misc,
1699                          misc_v, outer_ipv6_flow_label, val);
1700         }
1701 }
1702
1703 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
1704 {
1705         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
1706         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
1707         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
1708         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
1709 }
1710
1711 #define LAST_ETH_FIELD vlan_tag
1712 #define LAST_IB_FIELD sl
1713 #define LAST_IPV4_FIELD tos
1714 #define LAST_IPV6_FIELD traffic_class
1715 #define LAST_TCP_UDP_FIELD src_port
1716 #define LAST_TUNNEL_FIELD tunnel_id
1717 #define LAST_FLOW_TAG_FIELD tag_id
1718
1719 /* 'field' is the last field of 'filter' supported by the driver */
1720 #define FIELDS_NOT_SUPPORTED(filter, field)\
1721         memchr_inv((void *)&filter.field  +\
1722                    sizeof(filter.field), 0,\
1723                    sizeof(filter) -\
1724                    offsetof(typeof(filter), field) -\
1725                    sizeof(filter.field))
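
/* Illustration: the uverbs filter structs may grow over time.  If user space
 * passes a filter that sets any byte located after the last field this
 * driver understands (e.g. after LAST_ETH_FIELD in the eth filter),
 * FIELDS_NOT_SUPPORTED() evaluates to non-zero and parse_flow_attr() below
 * rejects the spec with -EOPNOTSUPP instead of silently ignoring the unknown
 * fields.
 */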
1726
1727 static int parse_flow_attr(u32 *match_c, u32 *match_v,
1728                            const union ib_flow_spec *ib_spec, u32 *tag_id)
1729 {
1730         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
1731                                            misc_parameters);
1732         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
1733                                            misc_parameters);
1734         void *headers_c;
1735         void *headers_v;
1736
1737         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
1738                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1739                                          inner_headers);
1740                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1741                                          inner_headers);
1742         } else {
1743                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1744                                          outer_headers);
1745                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1746                                          outer_headers);
1747         }
1748
1749         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
1750         case IB_FLOW_SPEC_ETH:
1751                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
1752                         return -EOPNOTSUPP;
1753
1754                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1755                                              dmac_47_16),
1756                                 ib_spec->eth.mask.dst_mac);
1757                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1758                                              dmac_47_16),
1759                                 ib_spec->eth.val.dst_mac);
1760
1761                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1762                                              smac_47_16),
1763                                 ib_spec->eth.mask.src_mac);
1764                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1765                                              smac_47_16),
1766                                 ib_spec->eth.val.src_mac);
1767
1768                 if (ib_spec->eth.mask.vlan_tag) {
1769                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1770                                  vlan_tag, 1);
1771                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1772                                  vlan_tag, 1);
1773
1774                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1775                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
1776                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1777                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
1778
1779                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1780                                  first_cfi,
1781                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
1782                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1783                                  first_cfi,
1784                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
1785
1786                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1787                                  first_prio,
1788                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
1789                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1790                                  first_prio,
1791                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
1792                 }
1793                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1794                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
1795                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1796                          ethertype, ntohs(ib_spec->eth.val.ether_type));
1797                 break;
1798         case IB_FLOW_SPEC_IPV4:
1799                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
1800                         return -EOPNOTSUPP;
1801
1802                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1803                          ethertype, 0xffff);
1804                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1805                          ethertype, ETH_P_IP);
1806
1807                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1808                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
1809                        &ib_spec->ipv4.mask.src_ip,
1810                        sizeof(ib_spec->ipv4.mask.src_ip));
1811                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1812                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
1813                        &ib_spec->ipv4.val.src_ip,
1814                        sizeof(ib_spec->ipv4.val.src_ip));
1815                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1816                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1817                        &ib_spec->ipv4.mask.dst_ip,
1818                        sizeof(ib_spec->ipv4.mask.dst_ip));
1819                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1820                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1821                        &ib_spec->ipv4.val.dst_ip,
1822                        sizeof(ib_spec->ipv4.val.dst_ip));
1823
1824                 set_tos(headers_c, headers_v,
1825                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
1826
1827                 set_proto(headers_c, headers_v,
1828                           ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
1829                 break;
1830         case IB_FLOW_SPEC_IPV6:
1831                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
1832                         return -EOPNOTSUPP;
1833
1834                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1835                          ethertype, 0xffff);
1836                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1837                          ethertype, ETH_P_IPV6);
1838
1839                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1840                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
1841                        &ib_spec->ipv6.mask.src_ip,
1842                        sizeof(ib_spec->ipv6.mask.src_ip));
1843                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1844                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
1845                        &ib_spec->ipv6.val.src_ip,
1846                        sizeof(ib_spec->ipv6.val.src_ip));
1847                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1848                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1849                        &ib_spec->ipv6.mask.dst_ip,
1850                        sizeof(ib_spec->ipv6.mask.dst_ip));
1851                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1852                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1853                        &ib_spec->ipv6.val.dst_ip,
1854                        sizeof(ib_spec->ipv6.val.dst_ip));
1855
1856                 set_tos(headers_c, headers_v,
1857                         ib_spec->ipv6.mask.traffic_class,
1858                         ib_spec->ipv6.val.traffic_class);
1859
1860                 set_proto(headers_c, headers_v,
1861                           ib_spec->ipv6.mask.next_hdr,
1862                           ib_spec->ipv6.val.next_hdr);
1863
1864                 set_flow_label(misc_params_c, misc_params_v,
1865                                ntohl(ib_spec->ipv6.mask.flow_label),
1866                                ntohl(ib_spec->ipv6.val.flow_label),
1867                                ib_spec->type & IB_FLOW_SPEC_INNER);
1868
1869                 break;
1870         case IB_FLOW_SPEC_TCP:
1871                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1872                                          LAST_TCP_UDP_FIELD))
1873                         return -EOPNOTSUPP;
1874
1875                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
1876                          0xff);
1877                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
1878                          IPPROTO_TCP);
1879
1880                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
1881                          ntohs(ib_spec->tcp_udp.mask.src_port));
1882                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
1883                          ntohs(ib_spec->tcp_udp.val.src_port));
1884
1885                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
1886                          ntohs(ib_spec->tcp_udp.mask.dst_port));
1887                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
1888                          ntohs(ib_spec->tcp_udp.val.dst_port));
1889                 break;
1890         case IB_FLOW_SPEC_UDP:
1891                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1892                                          LAST_TCP_UDP_FIELD))
1893                         return -EOPNOTSUPP;
1894
1895                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
1896                          0xff);
1897                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
1898                          IPPROTO_UDP);
1899
1900                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
1901                          ntohs(ib_spec->tcp_udp.mask.src_port));
1902                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
1903                          ntohs(ib_spec->tcp_udp.val.src_port));
1904
1905                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
1906                          ntohs(ib_spec->tcp_udp.mask.dst_port));
1907                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
1908                          ntohs(ib_spec->tcp_udp.val.dst_port));
1909                 break;
1910         case IB_FLOW_SPEC_VXLAN_TUNNEL:
1911                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
1912                                          LAST_TUNNEL_FIELD))
1913                         return -EOPNOTSUPP;
1914
1915                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
1916                          ntohl(ib_spec->tunnel.mask.tunnel_id));
1917                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
1918                          ntohl(ib_spec->tunnel.val.tunnel_id));
1919                 break;
1920         case IB_FLOW_SPEC_ACTION_TAG:
1921                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
1922                                          LAST_FLOW_TAG_FIELD))
1923                         return -EOPNOTSUPP;
1924                 if (ib_spec->flow_tag.tag_id >= BIT(24))
1925                         return -EINVAL;
1926
1927                 *tag_id = ib_spec->flow_tag.tag_id;
1928                 break;
1929         default:
1930                 return -EINVAL;
1931         }
1932
1933         return 0;
1934 }
1935
1936 /* A flow that could catch both multicast and unicast packets must not
1937  * fall into the multicast flow steering table, since such a rule
1938  * could steal other multicast packets.
1939  */
1940 static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
1941 {
1942         struct ib_flow_spec_eth *eth_spec;
1943
1944         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
1945             ib_attr->size < sizeof(struct ib_flow_attr) +
1946             sizeof(struct ib_flow_spec_eth) ||
1947             ib_attr->num_of_specs < 1)
1948                 return false;
1949
1950         eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
1951         if (eth_spec->type != IB_FLOW_SPEC_ETH ||
1952             eth_spec->size != sizeof(*eth_spec))
1953                 return false;
1954
1955         return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
1956                is_multicast_ether_addr(eth_spec->val.dst_mac);
1957 }
1958
1959 static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
1960 {
1961         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
1962         bool has_ipv4_spec = false;
1963         bool eth_type_ipv4 = true;
1964         unsigned int spec_index;
1965
1966         /* Validate that ethertype is correct */
1967         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1968                 if (ib_spec->type == IB_FLOW_SPEC_ETH &&
1969                     ib_spec->eth.mask.ether_type) {
1970                         if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
1971                               ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
1972                                 eth_type_ipv4 = false;
1973                 } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
1974                         has_ipv4_spec = true;
1975                 }
1976                 ib_spec = (void *)ib_spec + ib_spec->size;
1977         }
1978         return !has_ipv4_spec || eth_type_ipv4;
1979 }
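
/* Illustration (hypothetical flow, not driver code): an attribute list that
 * combines an ETH spec matching ether_type == ETH_P_IPV6 under a 0xffff mask
 * with an IB_FLOW_SPEC_IPV4 spec is rejected by is_valid_attr(), since the
 * L2 ethertype contradicts the L3 header being matched.
 */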
1980
1981 static void put_flow_table(struct mlx5_ib_dev *dev,
1982                            struct mlx5_ib_flow_prio *prio, bool ft_added)
1983 {
1984         prio->refcount -= !!ft_added;
1985         if (!prio->refcount) {
1986                 mlx5_destroy_flow_table(prio->flow_table);
1987                 prio->flow_table = NULL;
1988         }
1989 }
1990
1991 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
1992 {
1993         struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
1994         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
1995                                                           struct mlx5_ib_flow_handler,
1996                                                           ibflow);
1997         struct mlx5_ib_flow_handler *iter, *tmp;
1998
1999         mutex_lock(&dev->flow_db.lock);
2000
2001         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
2002                 mlx5_del_flow_rules(iter->rule);
2003                 put_flow_table(dev, iter->prio, true);
2004                 list_del(&iter->list);
2005                 kfree(iter);
2006         }
2007
2008         mlx5_del_flow_rules(handler->rule);
2009         put_flow_table(dev, handler->prio, true);
2010         mutex_unlock(&dev->flow_db.lock);
2011
2012         kfree(handler);
2013
2014         return 0;
2015 }
2016
2017 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
2018 {
2019         priority *= 2;
2020         if (!dont_trap)
2021                 priority++;
2022         return priority;
2023 }
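
/* Example: IB flow priority p is expanded into the core priority pair
 * (2p, 2p + 1).  A don't-trap rule uses 2p while a regular rule uses 2p + 1,
 * so for the same user priority the don't-trap variant sits in the
 * lower-numbered, earlier-evaluated slot.
 */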
2024
2025 enum flow_table_type {
2026         MLX5_IB_FT_RX,
2027         MLX5_IB_FT_TX
2028 };
2029
2030 #define MLX5_FS_MAX_TYPES        10
2031 #define MLX5_FS_MAX_ENTRIES      32000UL
2032 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
2033                                                 struct ib_flow_attr *flow_attr,
2034                                                 enum flow_table_type ft_type)
2035 {
2036         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
2037         struct mlx5_flow_namespace *ns = NULL;
2038         struct mlx5_ib_flow_prio *prio;
2039         struct mlx5_flow_table *ft;
2040         int num_entries;
2041         int num_groups;
2042         int priority;
2043         int err = 0;
2044
2045         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2046                 if (flow_is_multicast_only(flow_attr) &&
2047                     !dont_trap)
2048                         priority = MLX5_IB_FLOW_MCAST_PRIO;
2049                 else
2050                         priority = ib_prio_to_core_prio(flow_attr->priority,
2051                                                         dont_trap);
2052                 ns = mlx5_get_flow_namespace(dev->mdev,
2053                                              MLX5_FLOW_NAMESPACE_BYPASS);
2054                 num_entries = MLX5_FS_MAX_ENTRIES;
2055                 num_groups = MLX5_FS_MAX_TYPES;
2056                 prio = &dev->flow_db.prios[priority];
2057         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2058                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2059                 ns = mlx5_get_flow_namespace(dev->mdev,
2060                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
2061                 build_leftovers_ft_param(&priority,
2062                                          &num_entries,
2063                                          &num_groups);
2064                 prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
2065         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2066                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
2067                                         allow_sniffer_and_nic_rx_shared_tir))
2068                         return ERR_PTR(-ENOTSUPP);
2069
2070                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
2071                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
2072                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
2073
2074                 prio = &dev->flow_db.sniffer[ft_type];
2075                 priority = 0;
2076                 num_entries = 1;
2077                 num_groups = 1;
2078         }
2079
2080         if (!ns)
2081                 return ERR_PTR(-ENOTSUPP);
2082
2083         ft = prio->flow_table;
2084         if (!ft) {
2085                 ft = mlx5_create_auto_grouped_flow_table(ns, priority,
2086                                                          num_entries,
2087                                                          num_groups,
2088                                                          0, 0);
2089
2090                 if (!IS_ERR(ft)) {
2091                         prio->refcount = 0;
2092                         prio->flow_table = ft;
2093                 } else {
2094                         err = PTR_ERR(ft);
2095                 }
2096         }
2097
2098         return err ? ERR_PTR(err) : prio;
2099 }
2100
2101 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
2102                                                      struct mlx5_ib_flow_prio *ft_prio,
2103                                                      const struct ib_flow_attr *flow_attr,
2104                                                      struct mlx5_flow_destination *dst)
2105 {
2106         struct mlx5_flow_table  *ft = ft_prio->flow_table;
2107         struct mlx5_ib_flow_handler *handler;
2108         struct mlx5_flow_act flow_act = {0};
2109         struct mlx5_flow_spec *spec;
2110         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
2111         unsigned int spec_index;
2112         u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
2113         int err = 0;
2114
2115         if (!is_valid_attr(flow_attr))
2116                 return ERR_PTR(-EINVAL);
2117
2118         spec = mlx5_vzalloc(sizeof(*spec));
2119         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
2120         if (!handler || !spec) {
2121                 err = -ENOMEM;
2122                 goto free;
2123         }
2124
2125         INIT_LIST_HEAD(&handler->list);
2126
2127         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
2128                 err = parse_flow_attr(spec->match_criteria,
2129                                       spec->match_value, ib_flow, &flow_tag);
2130                 if (err < 0)
2131                         goto free;
2132
2133                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
2134         }
2135
2136         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
2137         flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
2138                 MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
2139
2140         if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG &&
2141             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2142              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
2143                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
2144                              flow_tag, flow_attr->type);
2145                 err = -EINVAL;
2146                 goto free;
2147         }
2148         flow_act.flow_tag = flow_tag;
2149         handler->rule = mlx5_add_flow_rules(ft, spec,
2150                                             &flow_act,
2151                                             dst, 1);
2152
2153         if (IS_ERR(handler->rule)) {
2154                 err = PTR_ERR(handler->rule);
2155                 goto free;
2156         }
2157
2158         ft_prio->refcount++;
2159         handler->prio = ft_prio;
2160
2161         ft_prio->flow_table = ft;
2162 free:
2163         if (err)
2164                 kfree(handler);
2165         kvfree(spec);
2166         return err ? ERR_PTR(err) : handler;
2167 }
2168
2169 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
2170                                                           struct mlx5_ib_flow_prio *ft_prio,
2171                                                           struct ib_flow_attr *flow_attr,
2172                                                           struct mlx5_flow_destination *dst)
2173 {
2174         struct mlx5_ib_flow_handler *handler_dst = NULL;
2175         struct mlx5_ib_flow_handler *handler = NULL;
2176
2177         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
2178         if (!IS_ERR(handler)) {
2179                 handler_dst = create_flow_rule(dev, ft_prio,
2180                                                flow_attr, dst);
2181                 if (IS_ERR(handler_dst)) {
2182                         mlx5_del_flow_rules(handler->rule);
2183                         ft_prio->refcount--;
2184                         kfree(handler);
2185                         handler = handler_dst;
2186                 } else {
2187                         list_add(&handler_dst->list, &handler->list);
2188                 }
2189         }
2190
2191         return handler;
2192 }
2193 enum {
2194         LEFTOVERS_MC,
2195         LEFTOVERS_UC,
2196 };
2197
2198 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
2199                                                           struct mlx5_ib_flow_prio *ft_prio,
2200                                                           struct ib_flow_attr *flow_attr,
2201                                                           struct mlx5_flow_destination *dst)
2202 {
2203         struct mlx5_ib_flow_handler *handler_ucast = NULL;
2204         struct mlx5_ib_flow_handler *handler = NULL;
2205
2206         static struct {
2207                 struct ib_flow_attr     flow_attr;
2208                 struct ib_flow_spec_eth eth_flow;
2209         } leftovers_specs[] = {
2210                 [LEFTOVERS_MC] = {
2211                         .flow_attr = {
2212                                 .num_of_specs = 1,
2213                                 .size = sizeof(leftovers_specs[0])
2214                         },
2215                         .eth_flow = {
2216                                 .type = IB_FLOW_SPEC_ETH,
2217                                 .size = sizeof(struct ib_flow_spec_eth),
2218                                 .mask = {.dst_mac = {0x1} },
2219                                 .val =  {.dst_mac = {0x1} }
2220                         }
2221                 },
2222                 [LEFTOVERS_UC] = {
2223                         .flow_attr = {
2224                                 .num_of_specs = 1,
2225                                 .size = sizeof(leftovers_specs[0])
2226                         },
2227                         .eth_flow = {
2228                                 .type = IB_FLOW_SPEC_ETH,
2229                                 .size = sizeof(struct ib_flow_spec_eth),
2230                                 .mask = {.dst_mac = {0x1} },
2231                                 .val = {.dst_mac = {} }
2232                         }
2233                 }
2234         };
2235
2236         handler = create_flow_rule(dev, ft_prio,
2237                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
2238                                    dst);
2239         if (!IS_ERR(handler) &&
2240             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
2241                 handler_ucast = create_flow_rule(dev, ft_prio,
2242                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
2243                                                  dst);
2244                 if (IS_ERR(handler_ucast)) {
2245                         mlx5_del_flow_rules(handler->rule);
2246                         ft_prio->refcount--;
2247                         kfree(handler);
2248                         handler = handler_ucast;
2249                 } else {
2250                         list_add(&handler_ucast->list, &handler->list);
2251                 }
2252         }
2253
2254         return handler;
2255 }
2256
2257 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
2258                                                         struct mlx5_ib_flow_prio *ft_rx,
2259                                                         struct mlx5_ib_flow_prio *ft_tx,
2260                                                         struct mlx5_flow_destination *dst)
2261 {
2262         struct mlx5_ib_flow_handler *handler_rx;
2263         struct mlx5_ib_flow_handler *handler_tx;
2264         int err;
2265         static const struct ib_flow_attr flow_attr  = {
2266                 .num_of_specs = 0,
2267                 .size = sizeof(flow_attr)
2268         };
2269
2270         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
2271         if (IS_ERR(handler_rx)) {
2272                 err = PTR_ERR(handler_rx);
2273                 goto err;
2274         }
2275
2276         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
2277         if (IS_ERR(handler_tx)) {
2278                 err = PTR_ERR(handler_tx);
2279                 goto err_tx;
2280         }
2281
2282         list_add(&handler_tx->list, &handler_rx->list);
2283
2284         return handler_rx;
2285
2286 err_tx:
2287         mlx5_del_flow_rules(handler_rx->rule);
2288         ft_rx->refcount--;
2289         kfree(handler_rx);
2290 err:
2291         return ERR_PTR(err);
2292 }
2293
2294 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
2295                                            struct ib_flow_attr *flow_attr,
2296                                            int domain)
2297 {
2298         struct mlx5_ib_dev *dev = to_mdev(qp->device);
2299         struct mlx5_ib_qp *mqp = to_mqp(qp);
2300         struct mlx5_ib_flow_handler *handler = NULL;
2301         struct mlx5_flow_destination *dst = NULL;
2302         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
2303         struct mlx5_ib_flow_prio *ft_prio;
2304         int err;
2305
2306         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
2307                 return ERR_PTR(-ENOSPC);
2308
2309         if (domain != IB_FLOW_DOMAIN_USER ||
2310             flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
2311             (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
2312                 return ERR_PTR(-EINVAL);
2313
2314         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
2315         if (!dst)
2316                 return ERR_PTR(-ENOMEM);
2317
2318         mutex_lock(&dev->flow_db.lock);
2319
2320         ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX);
2321         if (IS_ERR(ft_prio)) {
2322                 err = PTR_ERR(ft_prio);
2323                 goto unlock;
2324         }
2325         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2326                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
2327                 if (IS_ERR(ft_prio_tx)) {
2328                         err = PTR_ERR(ft_prio_tx);
2329                         ft_prio_tx = NULL;
2330                         goto destroy_ft;
2331                 }
2332         }
2333
2334         dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
2335         if (mqp->flags & MLX5_IB_QP_RSS)
2336                 dst->tir_num = mqp->rss_qp.tirn;
2337         else
2338                 dst->tir_num = mqp->raw_packet_qp.rq.tirn;
2339
2340         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2341                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
2342                         handler = create_dont_trap_rule(dev, ft_prio,
2343                                                         flow_attr, dst);
2344                 } else {
2345                         handler = create_flow_rule(dev, ft_prio, flow_attr,
2346                                                    dst);
2347                 }
2348         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2349                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2350                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
2351                                                 dst);
2352         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2353                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
2354         } else {
2355                 err = -EINVAL;
2356                 goto destroy_ft;
2357         }
2358
2359         if (IS_ERR(handler)) {
2360                 err = PTR_ERR(handler);
2361                 handler = NULL;
2362                 goto destroy_ft;
2363         }
2364
2365         mutex_unlock(&dev->flow_db.lock);
2366         kfree(dst);
2367
2368         return &handler->ibflow;
2369
2370 destroy_ft:
2371         put_flow_table(dev, ft_prio, false);
2372         if (ft_prio_tx)
2373                 put_flow_table(dev, ft_prio_tx, false);
2374 unlock:
2375         mutex_unlock(&dev->flow_db.lock);
2376         kfree(dst);
2377         kfree(handler);
2378         return ERR_PTR(err);
2379 }
2380
2381 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2382 {
2383         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2384         int err;
2385
2386         err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
2387         if (err)
2388                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2389                              ibqp->qp_num, gid->raw);
2390
2391         return err;
2392 }
2393
2394 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2395 {
2396         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2397         int err;
2398
2399         err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
2400         if (err)
2401                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2402                              ibqp->qp_num, gid->raw);
2403
2404         return err;
2405 }
2406
2407 static int init_node_data(struct mlx5_ib_dev *dev)
2408 {
2409         int err;
2410
2411         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2412         if (err)
2413                 return err;
2414
2415         dev->mdev->rev_id = dev->mdev->pdev->revision;
2416
2417         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2418 }
2419
2420 static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
2421                              char *buf)
2422 {
2423         struct mlx5_ib_dev *dev =
2424                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2425
2426         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
2427 }
2428
2429 static ssize_t show_reg_pages(struct device *device,
2430                               struct device_attribute *attr, char *buf)
2431 {
2432         struct mlx5_ib_dev *dev =
2433                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2434
2435         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2436 }
2437
2438 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2439                         char *buf)
2440 {
2441         struct mlx5_ib_dev *dev =
2442                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2443         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2444 }
2445
2446 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2447                         char *buf)
2448 {
2449         struct mlx5_ib_dev *dev =
2450                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2451         return sprintf(buf, "%x\n", dev->mdev->rev_id);
2452 }
2453
2454 static ssize_t show_board(struct device *device, struct device_attribute *attr,
2455                           char *buf)
2456 {
2457         struct mlx5_ib_dev *dev =
2458                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2459         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2460                        dev->mdev->board_id);
2461 }
2462
2463 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
2464 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
2465 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
2466 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
2467 static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
2468
2469 static struct device_attribute *mlx5_class_attributes[] = {
2470         &dev_attr_hw_rev,
2471         &dev_attr_hca_type,
2472         &dev_attr_board_id,
2473         &dev_attr_fw_pages,
2474         &dev_attr_reg_pages,
2475 };
2476
2477 static void pkey_change_handler(struct work_struct *work)
2478 {
2479         struct mlx5_ib_port_resources *ports =
2480                 container_of(work, struct mlx5_ib_port_resources,
2481                              pkey_change_work);
2482
2483         mutex_lock(&ports->devr->mutex);
2484         mlx5_ib_gsi_pkey_change(ports->gsi);
2485         mutex_unlock(&ports->devr->mutex);
2486 }
2487
2488 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2489 {
2490         struct mlx5_ib_qp *mqp;
2491         struct mlx5_ib_cq *send_mcq, *recv_mcq;
2492         struct mlx5_core_cq *mcq;
2493         struct list_head cq_armed_list;
2494         unsigned long flags_qp;
2495         unsigned long flags_cq;
2496         unsigned long flags;
2497
2498         INIT_LIST_HEAD(&cq_armed_list);
2499
2500         /* Go over the qp list residing on that ibdev, synced with create/destroy qp. */
2501         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2502         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2503                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2504                 if (mqp->sq.tail != mqp->sq.head) {
2505                         send_mcq = to_mcq(mqp->ibqp.send_cq);
2506                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
2507                         if (send_mcq->mcq.comp &&
2508                             mqp->ibqp.send_cq->comp_handler) {
2509                                 if (!send_mcq->mcq.reset_notify_added) {
2510                                         send_mcq->mcq.reset_notify_added = 1;
2511                                         list_add_tail(&send_mcq->mcq.reset_notify,
2512                                                       &cq_armed_list);
2513                                 }
2514                         }
2515                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2516                 }
2517                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2518                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2519                 /* no handling is needed for SRQ */
2520                 if (!mqp->ibqp.srq) {
2521                         if (mqp->rq.tail != mqp->rq.head) {
2522                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2523                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2524                                 if (recv_mcq->mcq.comp &&
2525                                     mqp->ibqp.recv_cq->comp_handler) {
2526                                         if (!recv_mcq->mcq.reset_notify_added) {
2527                                                 recv_mcq->mcq.reset_notify_added = 1;
2528                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
2529                                                               &cq_armed_list);
2530                                         }
2531                                 }
2532                                 spin_unlock_irqrestore(&recv_mcq->lock,
2533                                                        flags_cq);
2534                         }
2535                 }
2536                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2537         }
2538         /* At this point all inflight post-send requests have been caught by the
2539          * lock/unlock sequence above.  Now we need to arm all involved CQs.
2540          */
2541         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2542                 mcq->comp(mcq);
2543         }
2544         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2545 }
2546
2547 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
2548                           enum mlx5_dev_event event, unsigned long param)
2549 {
2550         struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
2551         struct ib_event ibev;
2552         bool fatal = false;
2553         u8 port = 0;
2554
2555         switch (event) {
2556         case MLX5_DEV_EVENT_SYS_ERROR:
2557                 ibev.event = IB_EVENT_DEVICE_FATAL;
2558                 mlx5_ib_handle_internal_error(ibdev);
2559                 fatal = true;
2560                 break;
2561
2562         case MLX5_DEV_EVENT_PORT_UP:
2563         case MLX5_DEV_EVENT_PORT_DOWN:
2564         case MLX5_DEV_EVENT_PORT_INITIALIZED:
2565                 port = (u8)param;
2566
2567                 /* In RoCE, port up/down events are handled in
2568                  * mlx5_netdev_event().
2569                  */
2570                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2571                         IB_LINK_LAYER_ETHERNET)
2572                         return;
2573
2574                 ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
2575                              IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2576                 break;
2577
2578         case MLX5_DEV_EVENT_LID_CHANGE:
2579                 ibev.event = IB_EVENT_LID_CHANGE;
2580                 port = (u8)param;
2581                 break;
2582
2583         case MLX5_DEV_EVENT_PKEY_CHANGE:
2584                 ibev.event = IB_EVENT_PKEY_CHANGE;
2585                 port = (u8)param;
2586
2587                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2588                 break;
2589
2590         case MLX5_DEV_EVENT_GUID_CHANGE:
2591                 ibev.event = IB_EVENT_GID_CHANGE;
2592                 port = (u8)param;
2593                 break;
2594
2595         case MLX5_DEV_EVENT_CLIENT_REREG:
2596                 ibev.event = IB_EVENT_CLIENT_REREGISTER;
2597                 port = (u8)param;
2598                 break;
2599         default:
2600                 return;
2601         }
2602
2603         ibev.device           = &ibdev->ib_dev;
2604         ibev.element.port_num = port;
2605
2606         if (port < 1 || port > ibdev->num_ports) {
2607                 mlx5_ib_warn(ibdev, "event on invalid port %d\n", port);
2608                 return;
2609         }
2610
2611         if (ibdev->ib_active)
2612                 ib_dispatch_event(&ibev);
2613
2614         if (fatal)
2615                 ibdev->ib_active = false;
2616 }
2617
2618 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
2619 {
2620         struct mlx5_hca_vport_context vport_ctx;
2621         int err;
2622         int port;
2623
2624         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2625                 dev->mdev->port_caps[port - 1].has_smi = false;
2626                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2627                     MLX5_CAP_PORT_TYPE_IB) {
2628                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
2629                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
2630                                                                    port, 0,
2631                                                                    &vport_ctx);
2632                                 if (err) {
2633                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
2634                                                     port, err);
2635                                         return err;
2636                                 }
2637                                 dev->mdev->port_caps[port - 1].has_smi =
2638                                         vport_ctx.has_smi;
2639                         } else {
2640                                 dev->mdev->port_caps[port - 1].has_smi = true;
2641                         }
2642                 }
2643         }
2644         return 0;
2645 }
2646
2647 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2648 {
2649         int port;
2650
2651         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
2652                 mlx5_query_ext_port_caps(dev, port);
2653 }
2654
2655 static int get_port_caps(struct mlx5_ib_dev *dev)
2656 {
2657         struct ib_device_attr *dprops = NULL;
2658         struct ib_port_attr *pprops = NULL;
2659         int err = -ENOMEM;
2660         int port;
2661         struct ib_udata uhw = {.inlen = 0, .outlen = 0};
2662
2663         pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
2664         if (!pprops)
2665                 goto out;
2666
2667         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2668         if (!dprops)
2669                 goto out;
2670
2671         err = set_has_smi_cap(dev);
2672         if (err)
2673                 goto out;
2674
2675         err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
2676         if (err) {
2677                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
2678                 goto out;
2679         }
2680
2681         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2682                 memset(pprops, 0, sizeof(*pprops));
2683                 err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2684                 if (err) {
2685                         mlx5_ib_warn(dev, "query_port %d failed %d\n",
2686                                      port, err);
2687                         break;
2688                 }
2689                 dev->mdev->port_caps[port - 1].pkey_table_len =
2690                                                 dprops->max_pkeys;
2691                 dev->mdev->port_caps[port - 1].gid_table_len =
2692                                                 pprops->gid_tbl_len;
2693                 mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
2694                             dprops->max_pkeys, pprops->gid_tbl_len);
2695         }
2696
2697 out:
2698         kfree(pprops);
2699         kfree(dprops);
2700
2701         return err;
2702 }
2703
2704 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
2705 {
2706         int err;
2707
2708         err = mlx5_mr_cache_cleanup(dev);
2709         if (err)
2710                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
2711
2712         mlx5_ib_destroy_qp(dev->umrc.qp);
2713         ib_free_cq(dev->umrc.cq);
2714         ib_dealloc_pd(dev->umrc.pd);
2715 }
2716
2717 enum {
2718         MAX_UMR_WR = 128,
2719 };
2720
2721 static int create_umr_res(struct mlx5_ib_dev *dev)
2722 {
2723         struct ib_qp_init_attr *init_attr = NULL;
2724         struct ib_qp_attr *attr = NULL;
2725         struct ib_pd *pd;
2726         struct ib_cq *cq;
2727         struct ib_qp *qp;
2728         int ret;
2729
2730         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
2731         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
2732         if (!attr || !init_attr) {
2733                 ret = -ENOMEM;
2734                 goto error_0;
2735         }
2736
2737         pd = ib_alloc_pd(&dev->ib_dev, 0);
2738         if (IS_ERR(pd)) {
2739                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
2740                 ret = PTR_ERR(pd);
2741                 goto error_0;
2742         }
2743
2744         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
2745         if (IS_ERR(cq)) {
2746                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
2747                 ret = PTR_ERR(cq);
2748                 goto error_2;
2749         }
2750
2751         init_attr->send_cq = cq;
2752         init_attr->recv_cq = cq;
2753         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
2754         init_attr->cap.max_send_wr = MAX_UMR_WR;
2755         init_attr->cap.max_send_sge = 1;
2756         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
2757         init_attr->port_num = 1;
2758         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
2759         if (IS_ERR(qp)) {
2760                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
2761                 ret = PTR_ERR(qp);
2762                 goto error_3;
2763         }
2764         qp->device     = &dev->ib_dev;
2765         qp->real_qp    = qp;
2766         qp->uobject    = NULL;
2767         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
2768
2769         attr->qp_state = IB_QPS_INIT;
2770         attr->port_num = 1;
2771         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
2772                                 IB_QP_PORT, NULL);
2773         if (ret) {
2774                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
2775                 goto error_4;
2776         }
2777
2778         memset(attr, 0, sizeof(*attr));
2779         attr->qp_state = IB_QPS_RTR;
2780         attr->path_mtu = IB_MTU_256;
2781
2782         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2783         if (ret) {
2784                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
2785                 goto error_4;
2786         }
2787
2788         memset(attr, 0, sizeof(*attr));
2789         attr->qp_state = IB_QPS_RTS;
2790         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2791         if (ret) {
2792                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
2793                 goto error_4;
2794         }
2795
2796         dev->umrc.qp = qp;
2797         dev->umrc.cq = cq;
2798         dev->umrc.pd = pd;
2799
2800         sema_init(&dev->umrc.sem, MAX_UMR_WR);
2801         ret = mlx5_mr_cache_init(dev);
2802         if (ret) {
2803                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2804                 goto error_4;
2805         }
2806
2807         kfree(attr);
2808         kfree(init_attr);
2809
2810         return 0;
2811
2812 error_4:
2813         mlx5_ib_destroy_qp(qp);
2814
2815 error_3:
2816         ib_free_cq(cq);
2817
2818 error_2:
2819         ib_dealloc_pd(pd);
2820
2821 error_0:
2822         kfree(attr);
2823         kfree(init_attr);
2824         return ret;
2825 }
2826
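/*
 * Build the shared resources kept in struct mlx5_ib_resources: a PD (p0),
 * a CQ (c0), two XRC domains (x0, x1), an XRC SRQ (s0) and a basic SRQ
 * (s1), and initialize a P_Key change work item for every port.
 */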
2827 static int create_dev_resources(struct mlx5_ib_resources *devr)
2828 {
2829         struct ib_srq_init_attr attr;
2830         struct mlx5_ib_dev *dev;
2831         struct ib_cq_init_attr cq_attr = {.cqe = 1};
2832         int port;
2833         int ret = 0;
2834
2835         dev = container_of(devr, struct mlx5_ib_dev, devr);
2836
2837         mutex_init(&devr->mutex);
2838
2839         devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2840         if (IS_ERR(devr->p0)) {
2841                 ret = PTR_ERR(devr->p0);
2842                 goto error0;
2843         }
2844         devr->p0->device  = &dev->ib_dev;
2845         devr->p0->uobject = NULL;
2846         atomic_set(&devr->p0->usecnt, 0);
2847
2848         devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2849         if (IS_ERR(devr->c0)) {
2850                 ret = PTR_ERR(devr->c0);
2851                 goto error1;
2852         }
2853         devr->c0->device        = &dev->ib_dev;
2854         devr->c0->uobject       = NULL;
2855         devr->c0->comp_handler  = NULL;
2856         devr->c0->event_handler = NULL;
2857         devr->c0->cq_context    = NULL;
2858         atomic_set(&devr->c0->usecnt, 0);
2859
2860         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2861         if (IS_ERR(devr->x0)) {
2862                 ret = PTR_ERR(devr->x0);
2863                 goto error2;
2864         }
2865         devr->x0->device = &dev->ib_dev;
2866         devr->x0->inode = NULL;
2867         atomic_set(&devr->x0->usecnt, 0);
2868         mutex_init(&devr->x0->tgt_qp_mutex);
2869         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2870
2871         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2872         if (IS_ERR(devr->x1)) {
2873                 ret = PTR_ERR(devr->x1);
2874                 goto error3;
2875         }
2876         devr->x1->device = &dev->ib_dev;
2877         devr->x1->inode = NULL;
2878         atomic_set(&devr->x1->usecnt, 0);
2879         mutex_init(&devr->x1->tgt_qp_mutex);
2880         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2881
2882         memset(&attr, 0, sizeof(attr));
2883         attr.attr.max_sge = 1;
2884         attr.attr.max_wr = 1;
2885         attr.srq_type = IB_SRQT_XRC;
2886         attr.ext.xrc.cq = devr->c0;
2887         attr.ext.xrc.xrcd = devr->x0;
2888
2889         devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2890         if (IS_ERR(devr->s0)) {
2891                 ret = PTR_ERR(devr->s0);
2892                 goto error4;
2893         }
2894         devr->s0->device        = &dev->ib_dev;
2895         devr->s0->pd            = devr->p0;
2896         devr->s0->uobject       = NULL;
2897         devr->s0->event_handler = NULL;
2898         devr->s0->srq_context   = NULL;
2899         devr->s0->srq_type      = IB_SRQT_XRC;
2900         devr->s0->ext.xrc.xrcd  = devr->x0;
2901         devr->s0->ext.xrc.cq    = devr->c0;
2902         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2903         atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2904         atomic_inc(&devr->p0->usecnt);
2905         atomic_set(&devr->s0->usecnt, 0);
2906
2907         memset(&attr, 0, sizeof(attr));
2908         attr.attr.max_sge = 1;
2909         attr.attr.max_wr = 1;
2910         attr.srq_type = IB_SRQT_BASIC;
2911         devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2912         if (IS_ERR(devr->s1)) {
2913                 ret = PTR_ERR(devr->s1);
2914                 goto error5;
2915         }
2916         devr->s1->device        = &dev->ib_dev;
2917         devr->s1->pd            = devr->p0;
2918         devr->s1->uobject       = NULL;
2919         devr->s1->event_handler = NULL;
2920         devr->s1->srq_context   = NULL;
2921         devr->s1->srq_type      = IB_SRQT_BASIC;
2922         devr->s1->ext.xrc.cq    = devr->c0;
2923         atomic_inc(&devr->p0->usecnt);
2924         atomic_set(&devr->s1->usecnt, 0);
2925
2926         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2927                 INIT_WORK(&devr->ports[port].pkey_change_work,
2928                           pkey_change_handler);
2929                 devr->ports[port].devr = devr;
2930         }
2931
2932         return 0;
2933
2934 error5:
2935         mlx5_ib_destroy_srq(devr->s0);
2936 error4:
2937         mlx5_ib_dealloc_xrcd(devr->x1);
2938 error3:
2939         mlx5_ib_dealloc_xrcd(devr->x0);
2940 error2:
2941         mlx5_ib_destroy_cq(devr->c0);
2942 error1:
2943         mlx5_ib_dealloc_pd(devr->p0);
2944 error0:
2945         return ret;
2946 }
2947
2948 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2949 {
2950         struct mlx5_ib_dev *dev =
2951                 container_of(devr, struct mlx5_ib_dev, devr);
2952         int port;
2953
2954         mlx5_ib_destroy_srq(devr->s1);
2955         mlx5_ib_destroy_srq(devr->s0);
2956         mlx5_ib_dealloc_xrcd(devr->x0);
2957         mlx5_ib_dealloc_xrcd(devr->x1);
2958         mlx5_ib_destroy_cq(devr->c0);
2959         mlx5_ib_dealloc_pd(devr->p0);
2960
2961         /* Make sure no P_Key change work items are still executing */
2962         for (port = 0; port < dev->num_ports; ++port)
2963                 cancel_work_sync(&devr->ports[port].pkey_change_work);
2964 }
2965
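/*
 * An InfiniBand port advertises RDMA_CORE_PORT_IBA_IB only.  An Ethernet
 * port always supports raw packet QPs; the RoCE v1/v2 flags are added only
 * when the device reports both the IPv4 and IPv6 L3 types together with
 * the matching roce_version capability bits.
 */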
2966 static u32 get_core_cap_flags(struct ib_device *ibdev)
2967 {
2968         struct mlx5_ib_dev *dev = to_mdev(ibdev);
2969         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2970         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2971         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2972         u32 ret = 0;
2973
2974         if (ll == IB_LINK_LAYER_INFINIBAND)
2975                 return RDMA_CORE_PORT_IBA_IB;
2976
2977         ret = RDMA_CORE_PORT_RAW_PACKET;
2978
2979         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2980                 return ret;
2981
2982         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2983                 return ret;
2984
2985         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2986                 ret |= RDMA_CORE_PORT_IBA_ROCE;
2987
2988         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
2989                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
2990
2991         return ret;
2992 }
2993
2994 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
2995                                struct ib_port_immutable *immutable)
2996 {
2997         struct ib_port_attr attr;
2998         struct mlx5_ib_dev *dev = to_mdev(ibdev);
2999         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
3000         int err;
3001
3002         immutable->core_cap_flags = get_core_cap_flags(ibdev);
3003
3004         err = ib_query_port(ibdev, port_num, &attr);
3005         if (err)
3006                 return err;
3007
3008         immutable->pkey_tbl_len = attr.pkey_tbl_len;
3009         immutable->gid_tbl_len = attr.gid_tbl_len;
3010         immutable->core_cap_flags = get_core_cap_flags(ibdev);
3011         if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
3012                 immutable->max_mad_size = IB_MGMT_MAD_SIZE;
3013
3014         return 0;
3015 }
3016
3017 static void get_dev_fw_str(struct ib_device *ibdev, char *str,
3018                            size_t str_len)
3019 {
3020         struct mlx5_ib_dev *dev =
3021                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
3022         snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
3023                        fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
3024 }
3025
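/*
 * If LAG is active, create the vport LAG object and a demux flow table in
 * the MLX5_FLOW_NAMESPACE_LAG namespace; both are released again in
 * mlx5_eth_lag_cleanup().
 */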
3026 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
3027 {
3028         struct mlx5_core_dev *mdev = dev->mdev;
3029         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
3030                                                                  MLX5_FLOW_NAMESPACE_LAG);
3031         struct mlx5_flow_table *ft;
3032         int err;
3033
3034         if (!ns || !mlx5_lag_is_active(mdev))
3035                 return 0;
3036
3037         err = mlx5_cmd_create_vport_lag(mdev);
3038         if (err)
3039                 return err;
3040
3041         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
3042         if (IS_ERR(ft)) {
3043                 err = PTR_ERR(ft);
3044                 goto err_destroy_vport_lag;
3045         }
3046
3047         dev->flow_db.lag_demux_ft = ft;
3048         return 0;
3049
3050 err_destroy_vport_lag:
3051         mlx5_cmd_destroy_vport_lag(mdev);
3052         return err;
3053 }
3054
3055 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
3056 {
3057         struct mlx5_core_dev *mdev = dev->mdev;
3058
3059         if (dev->flow_db.lag_demux_ft) {
3060                 mlx5_destroy_flow_table(dev->flow_db.lag_demux_ft);
3061                 dev->flow_db.lag_demux_ft = NULL;
3062
3063                 mlx5_cmd_destroy_vport_lag(mdev);
3064         }
3065 }
3066
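/*
 * dev->roce.nb carries the mlx5_netdev_event callback; these helpers
 * register and unregister it so netdev state changes can be tracked for
 * the Ethernet/RoCE port.
 */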
3067 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev)
3068 {
3069         int err;
3070
3071         dev->roce.nb.notifier_call = mlx5_netdev_event;
3072         err = register_netdevice_notifier(&dev->roce.nb);
3073         if (err) {
3074                 dev->roce.nb.notifier_call = NULL;
3075                 return err;
3076         }
3077
3078         return 0;
3079 }
3080
3081 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev)
3082 {
3083         if (dev->roce.nb.notifier_call) {
3084                 unregister_netdevice_notifier(&dev->roce.nb);
3085                 dev->roce.nb.notifier_call = NULL;
3086         }
3087 }
3088
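/*
 * Enable Ethernet support: register the netdev notifier, turn on RoCE in
 * the NIC vport context when the device supports it, and set up LAG.
 */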
3089 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
3090 {
3091         int err;
3092
3093         err = mlx5_add_netdev_notifier(dev);
3094         if (err)
3095                 return err;
3096
3097         if (MLX5_CAP_GEN(dev->mdev, roce)) {
3098                 err = mlx5_nic_vport_enable_roce(dev->mdev);
3099                 if (err)
3100                         goto err_unregister_netdevice_notifier;
3101         }
3102
3103         err = mlx5_eth_lag_init(dev);
3104         if (err)
3105                 goto err_disable_roce;
3106
3107         return 0;
3108
3109 err_disable_roce:
3110         if (MLX5_CAP_GEN(dev->mdev, roce))
3111                 mlx5_nic_vport_disable_roce(dev->mdev);
3112
3113 err_unregister_netdevice_notifier:
3114         mlx5_remove_netdev_notifier(dev);
3115         return err;
3116 }
3117
3118 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
3119 {
3120         mlx5_eth_lag_cleanup(dev);
3121         if (MLX5_CAP_GEN(dev->mdev, roce))
3122                 mlx5_nic_vport_disable_roce(dev->mdev);
3123 }
3124
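/*
 * Per-port queue counters exposed through the rdma_hw_stats interface.
 * The basic set is always present; out_of_sequence and the retransmission
 * counters are added only when the firmware reports the corresponding
 * capability bits.
 */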
3125 struct mlx5_ib_q_counter {
3126         const char *name;
3127         size_t offset;
3128 };
3129
3130 #define INIT_Q_COUNTER(_name)           \
3131         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
3132
3133 static const struct mlx5_ib_q_counter basic_q_cnts[] = {
3134         INIT_Q_COUNTER(rx_write_requests),
3135         INIT_Q_COUNTER(rx_read_requests),
3136         INIT_Q_COUNTER(rx_atomic_requests),
3137         INIT_Q_COUNTER(out_of_buffer),
3138 };
3139
3140 static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = {
3141         INIT_Q_COUNTER(out_of_sequence),
3142 };
3143
3144 static const struct mlx5_ib_q_counter retrans_q_cnts[] = {
3145         INIT_Q_COUNTER(duplicate_request),
3146         INIT_Q_COUNTER(rnr_nak_retry_err),
3147         INIT_Q_COUNTER(packet_seq_err),
3148         INIT_Q_COUNTER(implied_nak_seq_err),
3149         INIT_Q_COUNTER(local_ack_timeout_err),
3150 };
3151
3152 static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
3153 {
3154         unsigned int i;
3155
3156         for (i = 0; i < dev->num_ports; i++) {
3157                 mlx5_core_dealloc_q_counter(dev->mdev,
3158                                             dev->port[i].q_cnts.set_id);
3159                 kfree(dev->port[i].q_cnts.names);
3160                 kfree(dev->port[i].q_cnts.offsets);
3161         }
3162 }
3163
3164 static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev,
3165                                       const char ***names,
3166                                       size_t **offsets,
3167                                       u32 *num)
3168 {
3169         u32 num_counters;
3170
3171         num_counters = ARRAY_SIZE(basic_q_cnts);
3172
3173         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
3174                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
3175
3176         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
3177                 num_counters += ARRAY_SIZE(retrans_q_cnts);
3178
3179         *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL);
3180         if (!*names)
3181                 return -ENOMEM;
3182
3183         *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL);
3184         if (!*offsets)
3185                 goto err_names;
3186
3187         *num = num_counters;
3188
3189         return 0;
3190
3191 err_names:
3192         kfree(*names);
3193         return -ENOMEM;
3194 }
3195
3196 static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev,
3197                                     const char **names,
3198                                     size_t *offsets)
3199 {
3200         int i;
3201         int j = 0;
3202
3203         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
3204                 names[j] = basic_q_cnts[i].name;
3205                 offsets[j] = basic_q_cnts[i].offset;
3206         }
3207
3208         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
3209                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
3210                         names[j] = out_of_seq_q_cnts[i].name;
3211                         offsets[j] = out_of_seq_q_cnts[i].offset;
3212                 }
3213         }
3214
3215         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
3216                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
3217                         names[j] = retrans_q_cnts[i].name;
3218                         offsets[j] = retrans_q_cnts[i].offset;
3219                 }
3220         }
3221 }
3222
3223 static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
3224 {
3225         int i;
3226         int ret;
3227
3228         for (i = 0; i < dev->num_ports; i++) {
3229                 struct mlx5_ib_port *port = &dev->port[i];
3230
3231                 ret = mlx5_core_alloc_q_counter(dev->mdev,
3232                                                 &port->q_cnts.set_id);
3233                 if (ret) {
3234                         mlx5_ib_warn(dev,
3235                                      "couldn't allocate queue counter for port %d, err %d\n",
3236                                      i + 1, ret);
3237                         goto dealloc_counters;
3238                 }
3239
3240                 ret = __mlx5_ib_alloc_q_counters(dev,
3241                                                  &port->q_cnts.names,
3242                                                  &port->q_cnts.offsets,
3243                                                  &port->q_cnts.num_counters);
3244                 if (ret)
3245                         goto dealloc_counters;
3246
3247                 mlx5_ib_fill_q_counters(dev, port->q_cnts.names,
3248                                         port->q_cnts.offsets);
3249         }
3250
3251         return 0;
3252
3253 dealloc_counters:
3254         while (--i >= 0)
3255                 mlx5_core_dealloc_q_counter(dev->mdev,
3256                                             dev->port[i].q_cnts.set_id);
3257
3258         return ret;
3259 }
3260
3261 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
3262                                                     u8 port_num)
3263 {
3264         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3265         struct mlx5_ib_port *port;
3266
3267         /* We support only per port stats */
3268         if (port_num == 0)
3269                 return NULL;
3270         port = &dev->port[port_num - 1];
3271         return rdma_alloc_hw_stats_struct(port->q_cnts.names,
3272                                           port->q_cnts.num_counters,
3273                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
3274 }
3275
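/*
 * Read the port's queue counter set with QUERY_Q_COUNTER and copy each
 * 32-bit big-endian counter into stats->value[] using the cached offsets.
 */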
3276 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
3277                                 struct rdma_hw_stats *stats,
3278                                 u8 port_num, int index)
3279 {
3280         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3281         struct mlx5_ib_port *port = &dev->port[port_num - 1];
3282         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
3283         void *out;
3284         __be32 val;
3285         int ret;
3286         int i;
3287
3288         if (!stats)
3289                 return -ENOSYS;
3290
3291         out = mlx5_vzalloc(outlen);
3292         if (!out)
3293                 return -ENOMEM;
3294
3295         ret = mlx5_core_query_q_counter(dev->mdev,
3296                                         port->q_cnts.set_id, 0,
3297                                         out, outlen);
3298         if (ret)
3299                 goto free;
3300
3301         for (i = 0; i < port->q_cnts.num_counters; i++) {
3302                 val = *(__be32 *)(out + port->q_cnts.offsets[i]);
3303                 stats->value[i] = (u64)be32_to_cpu(val);
3304         }
3305
3306 free:
3307         kvfree(out);
3308         return port->q_cnts.num_counters;
3309 }
3310
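/*
 * Probe path.  Port capabilities are read first, Ethernet/RoCE support is
 * enabled before the shared device resources are created, and the UMR
 * resources and sysfs attributes are set up only after ib_register_device()
 * succeeds; the error labels unwind in reverse order.
 */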
3311 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
3312 {
3313         struct mlx5_ib_dev *dev;
3314         enum rdma_link_layer ll;
3315         int port_type_cap;
3316         const char *name;
3317         int err;
3318         int i;
3319
3320         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
3321         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
3322
3323         printk_once(KERN_INFO "%s", mlx5_version);
3324
3325         dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
3326         if (!dev)
3327                 return NULL;
3328
3329         dev->mdev = mdev;
3330
3331         dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
3332                             GFP_KERNEL);
3333         if (!dev->port)
3334                 goto err_dealloc;
3335
3336         rwlock_init(&dev->roce.netdev_lock);
3337         err = get_port_caps(dev);
3338         if (err)
3339                 goto err_free_port;
3340
3341         if (mlx5_use_mad_ifc(dev))
3342                 get_ext_port_caps(dev);
3343
3344         if (!mlx5_lag_is_active(mdev))
3345                 name = "mlx5_%d";
3346         else
3347                 name = "mlx5_bond_%d";
3348
3349         strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
3350         dev->ib_dev.owner               = THIS_MODULE;
3351         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
3352         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
3353         dev->num_ports          = MLX5_CAP_GEN(mdev, num_ports);
3354         dev->ib_dev.phys_port_cnt     = dev->num_ports;
3355         dev->ib_dev.num_comp_vectors    =
3356                 dev->mdev->priv.eq_table.num_comp_vectors;
3357         dev->ib_dev.dma_device  = &mdev->pdev->dev;
3358
3359         dev->ib_dev.uverbs_abi_ver      = MLX5_IB_UVERBS_ABI_VERSION;
3360         dev->ib_dev.uverbs_cmd_mask     =
3361                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
3362                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
3363                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
3364                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
3365                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
3366                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
3367                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
3368                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
3369                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
3370                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
3371                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
3372                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
3373                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
3374                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
3375                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
3376                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
3377                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
3378                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
3379                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
3380                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
3381                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
3382                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
3383                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
3384                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
3385                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
3386                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
3387         dev->ib_dev.uverbs_ex_cmd_mask =
3388                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
3389                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
3390                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
3391                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP);
3392
3393         dev->ib_dev.query_device        = mlx5_ib_query_device;
3394         dev->ib_dev.query_port          = mlx5_ib_query_port;
3395         dev->ib_dev.get_link_layer      = mlx5_ib_port_link_layer;
3396         if (ll == IB_LINK_LAYER_ETHERNET)
3397                 dev->ib_dev.get_netdev  = mlx5_ib_get_netdev;
3398         dev->ib_dev.query_gid           = mlx5_ib_query_gid;
3399         dev->ib_dev.add_gid             = mlx5_ib_add_gid;
3400         dev->ib_dev.del_gid             = mlx5_ib_del_gid;
3401         dev->ib_dev.query_pkey          = mlx5_ib_query_pkey;
3402         dev->ib_dev.modify_device       = mlx5_ib_modify_device;
3403         dev->ib_dev.modify_port         = mlx5_ib_modify_port;
3404         dev->ib_dev.alloc_ucontext      = mlx5_ib_alloc_ucontext;
3405         dev->ib_dev.dealloc_ucontext    = mlx5_ib_dealloc_ucontext;
3406         dev->ib_dev.mmap                = mlx5_ib_mmap;
3407         dev->ib_dev.alloc_pd            = mlx5_ib_alloc_pd;
3408         dev->ib_dev.dealloc_pd          = mlx5_ib_dealloc_pd;
3409         dev->ib_dev.create_ah           = mlx5_ib_create_ah;
3410         dev->ib_dev.query_ah            = mlx5_ib_query_ah;
3411         dev->ib_dev.destroy_ah          = mlx5_ib_destroy_ah;
3412         dev->ib_dev.create_srq          = mlx5_ib_create_srq;
3413         dev->ib_dev.modify_srq          = mlx5_ib_modify_srq;
3414         dev->ib_dev.query_srq           = mlx5_ib_query_srq;
3415         dev->ib_dev.destroy_srq         = mlx5_ib_destroy_srq;
3416         dev->ib_dev.post_srq_recv       = mlx5_ib_post_srq_recv;
3417         dev->ib_dev.create_qp           = mlx5_ib_create_qp;
3418         dev->ib_dev.modify_qp           = mlx5_ib_modify_qp;
3419         dev->ib_dev.query_qp            = mlx5_ib_query_qp;
3420         dev->ib_dev.destroy_qp          = mlx5_ib_destroy_qp;
3421         dev->ib_dev.post_send           = mlx5_ib_post_send;
3422         dev->ib_dev.post_recv           = mlx5_ib_post_recv;
3423         dev->ib_dev.create_cq           = mlx5_ib_create_cq;
3424         dev->ib_dev.modify_cq           = mlx5_ib_modify_cq;
3425         dev->ib_dev.resize_cq           = mlx5_ib_resize_cq;
3426         dev->ib_dev.destroy_cq          = mlx5_ib_destroy_cq;
3427         dev->ib_dev.poll_cq             = mlx5_ib_poll_cq;
3428         dev->ib_dev.req_notify_cq       = mlx5_ib_arm_cq;
3429         dev->ib_dev.get_dma_mr          = mlx5_ib_get_dma_mr;
3430         dev->ib_dev.reg_user_mr         = mlx5_ib_reg_user_mr;
3431         dev->ib_dev.rereg_user_mr       = mlx5_ib_rereg_user_mr;
3432         dev->ib_dev.dereg_mr            = mlx5_ib_dereg_mr;
3433         dev->ib_dev.attach_mcast        = mlx5_ib_mcg_attach;
3434         dev->ib_dev.detach_mcast        = mlx5_ib_mcg_detach;
3435         dev->ib_dev.process_mad         = mlx5_ib_process_mad;
3436         dev->ib_dev.alloc_mr            = mlx5_ib_alloc_mr;
3437         dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
3438         dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
3439         dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
3440         dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
3441         if (mlx5_core_is_pf(mdev)) {
3442                 dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
3443                 dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
3444                 dev->ib_dev.get_vf_stats        = mlx5_ib_get_vf_stats;
3445                 dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
3446         }
3447
3448         dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
3449
3450         mlx5_ib_internal_fill_odp_caps(dev);
3451
3452         if (MLX5_CAP_GEN(mdev, imaicl)) {
3453                 dev->ib_dev.alloc_mw            = mlx5_ib_alloc_mw;
3454                 dev->ib_dev.dealloc_mw          = mlx5_ib_dealloc_mw;
3455                 dev->ib_dev.uverbs_cmd_mask |=
3456                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
3457                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
3458         }
3459
3460         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
3461                 dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
3462                 dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
3463         }
3464
3465         if (MLX5_CAP_GEN(mdev, xrc)) {
3466                 dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
3467                 dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
3468                 dev->ib_dev.uverbs_cmd_mask |=
3469                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
3470                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
3471         }
3472
3473         if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
3474             IB_LINK_LAYER_ETHERNET) {
3475                 dev->ib_dev.create_flow = mlx5_ib_create_flow;
3476                 dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
3477                 dev->ib_dev.create_wq    = mlx5_ib_create_wq;
3478                 dev->ib_dev.modify_wq    = mlx5_ib_modify_wq;
3479                 dev->ib_dev.destroy_wq   = mlx5_ib_destroy_wq;
3480                 dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
3481                 dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
3482                 dev->ib_dev.uverbs_ex_cmd_mask |=
3483                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
3484                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
3485                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
3486                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
3487                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
3488                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
3489                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
3490         }
3491         err = init_node_data(dev);
3492         if (err)
3493                 goto err_free_port;
3494
3495         mutex_init(&dev->flow_db.lock);
3496         mutex_init(&dev->cap_mask_mutex);
3497         INIT_LIST_HEAD(&dev->qp_list);
3498         spin_lock_init(&dev->reset_flow_resource_lock);
3499
3500         if (ll == IB_LINK_LAYER_ETHERNET) {
3501                 err = mlx5_enable_eth(dev);
3502                 if (err)
3503                         goto err_free_port;
3504         }
3505
3506         err = create_dev_resources(&dev->devr);
3507         if (err)
3508                 goto err_disable_eth;
3509
3510         err = mlx5_ib_odp_init_one(dev);
3511         if (err)
3512                 goto err_rsrc;
3513
3514         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
3515                 err = mlx5_ib_alloc_q_counters(dev);
3516                 if (err)
3517                         goto err_odp;
3518         }
3519
3520         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
3521         if (!dev->mdev->priv.uar)
3522                 goto err_q_cnt;
3523
3524         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
3525         if (err)
3526                 goto err_uar_page;
3527
3528         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
3529         if (err)
3530                 goto err_bfreg;
3531
3532         err = ib_register_device(&dev->ib_dev, NULL);
3533         if (err)
3534                 goto err_fp_bfreg;
3535
3536         err = create_umr_res(dev);
3537         if (err)
3538                 goto err_dev;
3539
3540         for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
3541                 err = device_create_file(&dev->ib_dev.dev,
3542                                          mlx5_class_attributes[i]);
3543                 if (err)
3544                         goto err_umrc;
3545         }
3546
3547         dev->ib_active = true;
3548
3549         return dev;
3550
3551 err_umrc:
3552         destroy_umrc_res(dev);
3553
3554 err_dev:
3555         ib_unregister_device(&dev->ib_dev);
3556
3557 err_fp_bfreg:
3558         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3559
3560 err_bfreg:
3561         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3562
3563 err_uar_page:
3564         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
3565
3566 err_q_cnt:
3567         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
3568                 mlx5_ib_dealloc_q_counters(dev);
3569
3570 err_odp:
3571         mlx5_ib_odp_remove_one(dev);
3572
3573 err_rsrc:
3574         destroy_dev_resources(&dev->devr);
3575
3576 err_disable_eth:
3577         if (ll == IB_LINK_LAYER_ETHERNET) {
3578                 mlx5_disable_eth(dev);
3579                 mlx5_remove_netdev_notifier(dev);
3580         }
3581
3582 err_free_port:
3583         kfree(dev->port);
3584
3585 err_dealloc:
3586         ib_dealloc_device((struct ib_device *)dev);
3587
3588         return NULL;
3589 }
3590
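/*
 * Remove path: unregister the IB device and release everything allocated
 * in mlx5_ib_add().
 */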
3591 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
3592 {
3593         struct mlx5_ib_dev *dev = context;
3594         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
3595
3596         mlx5_remove_netdev_notifier(dev);
3597         ib_unregister_device(&dev->ib_dev);
3598         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3599         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3600         mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
3601         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
3602                 mlx5_ib_dealloc_q_counters(dev);
3603         destroy_umrc_res(dev);
3604         mlx5_ib_odp_remove_one(dev);
3605         destroy_dev_resources(&dev->devr);
3606         if (ll == IB_LINK_LAYER_ETHERNET)
3607                 mlx5_disable_eth(dev);
3608         kfree(dev->port);
3609         ib_dealloc_device(&dev->ib_dev);
3610 }
3611
3612 static struct mlx5_interface mlx5_ib_interface = {
3613         .add            = mlx5_ib_add,
3614         .remove         = mlx5_ib_remove,
3615         .event          = mlx5_ib_event,
3616 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
3617         .pfault         = mlx5_ib_pfault,
3618 #endif
3619         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
3620 };
3621
3622 static int __init mlx5_ib_init(void)
3623 {
3624         int err;
3625
3626         mlx5_ib_odp_init();
3627
3628         err = mlx5_register_interface(&mlx5_ib_interface);
3629
3630         return err;
3631 }
3632
3633 static void __exit mlx5_ib_cleanup(void)
3634 {
3635         mlx5_unregister_interface(&mlx5_ib_interface);
3636 }
3637
3638 module_init(mlx5_ib_init);
3639 module_exit(mlx5_ib_cleanup);