drivers/infiniband/hw/mlx5/main.c
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/highmem.h>
34 #include <linux/module.h>
35 #include <linux/init.h>
36 #include <linux/errno.h>
37 #include <linux/pci.h>
38 #include <linux/dma-mapping.h>
39 #include <linux/slab.h>
40 #if defined(CONFIG_X86)
41 #include <asm/pat.h>
42 #endif
43 #include <linux/sched.h>
44 #include <linux/sched/mm.h>
45 #include <linux/sched/task.h>
46 #include <linux/delay.h>
47 #include <rdma/ib_user_verbs.h>
48 #include <rdma/ib_addr.h>
49 #include <rdma/ib_cache.h>
50 #include <linux/mlx5/port.h>
51 #include <linux/mlx5/vport.h>
52 #include <linux/list.h>
53 #include <rdma/ib_smi.h>
54 #include <rdma/ib_umem.h>
55 #include <linux/in.h>
56 #include <linux/etherdevice.h>
57 #include <linux/mlx5/fs.h>
59 #include "mlx5_ib.h"
60
61 #define DRIVER_NAME "mlx5_ib"
62 #define DRIVER_VERSION "2.2-1"
63 #define DRIVER_RELDATE  "Feb 2014"
64
65 MODULE_AUTHOR("Eli Cohen <[email protected]>");
66 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
67 MODULE_LICENSE("Dual BSD/GPL");
68 MODULE_VERSION(DRIVER_VERSION);
69
70 static char mlx5_version[] =
71         DRIVER_NAME ": Mellanox Connect-IB InfiniBand driver v"
72         DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
73
74 enum {
75         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
76 };
77
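/* Map the HCA port_type capability to the corresponding RDMA link layer. */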
78 static enum rdma_link_layer
79 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
80 {
81         switch (port_type_cap) {
82         case MLX5_CAP_PORT_TYPE_IB:
83                 return IB_LINK_LAYER_INFINIBAND;
84         case MLX5_CAP_PORT_TYPE_ETH:
85                 return IB_LINK_LAYER_ETHERNET;
86         default:
87                 return IB_LINK_LAYER_UNSPECIFIED;
88         }
89 }
90
91 static enum rdma_link_layer
92 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
93 {
94         struct mlx5_ib_dev *dev = to_mdev(device);
95         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
96
97         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
98 }
99
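/*
 * Netdev notifier: cache the net_device backing the RoCE port on
 * (un)register, and translate carrier up/down on that netdev (or its LAG
 * master) into IB_EVENT_PORT_ACTIVE/IB_EVENT_PORT_ERR events on port 1.
 */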
100 static int mlx5_netdev_event(struct notifier_block *this,
101                              unsigned long event, void *ptr)
102 {
103         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
104         struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
105                                                  roce.nb);
106
107         switch (event) {
108         case NETDEV_REGISTER:
109         case NETDEV_UNREGISTER:
110                 write_lock(&ibdev->roce.netdev_lock);
111                 if (ndev->dev.parent == &ibdev->mdev->pdev->dev)
112                         ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
113                                              NULL : ndev;
114                 write_unlock(&ibdev->roce.netdev_lock);
115                 break;
116
117         case NETDEV_UP:
118         case NETDEV_DOWN: {
119                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
120                 struct net_device *upper = NULL;
121
122                 if (lag_ndev) {
123                         upper = netdev_master_upper_dev_get(lag_ndev);
124                         dev_put(lag_ndev);
125                 }
126
127                 if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
128                     && ibdev->ib_active) {
129                         struct ib_event ibev = { };
130
131                         ibev.device = &ibdev->ib_dev;
132                         ibev.event = (event == NETDEV_UP) ?
133                                      IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
134                         ibev.element.port_num = 1;
135                         ib_dispatch_event(&ibev);
136                 }
137                 break;
138         }
139
140         default:
141                 break;
142         }
143
144         return NOTIFY_DONE;
145 }
146
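/*
 * Return the net_device currently backing the RoCE port (the LAG netdev when
 * bonding is active). A reference is taken; the caller must dev_put() it.
 */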
147 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
148                                              u8 port_num)
149 {
150         struct mlx5_ib_dev *ibdev = to_mdev(device);
151         struct net_device *ndev;
152
153         ndev = mlx5_lag_get_roce_netdev(ibdev->mdev);
154         if (ndev)
155                 return ndev;
156
157         /* Ensure ndev does not disappear before we invoke dev_hold()
158          */
159         read_lock(&ibdev->roce.netdev_lock);
160         ndev = ibdev->roce.netdev;
161         if (ndev)
162                 dev_hold(ndev);
163         read_unlock(&ibdev->roce.netdev_lock);
164
165         return ndev;
166 }
167
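/*
 * Fill ib_port_attr for a RoCE (Ethernet) port: capability bits and table
 * sizes come from HCA caps, link state and MTU from the backing netdev.
 */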
168 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
169                                 struct ib_port_attr *props)
170 {
171         struct mlx5_ib_dev *dev = to_mdev(device);
172         struct net_device *ndev, *upper;
173         enum ib_mtu ndev_ib_mtu;
174         u16 qkey_viol_cntr;
175
176         /* props is zeroed by the caller; avoid zeroing it here */
177
178         props->port_cap_flags  |= IB_PORT_CM_SUP;
179         props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;
180
181         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
182                                                 roce_address_table_size);
183         props->max_mtu          = IB_MTU_4096;
184         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
185         props->pkey_tbl_len     = 1;
186         props->state            = IB_PORT_DOWN;
187         props->phys_state       = 3;
188
189         mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
190         props->qkey_viol_cntr = qkey_viol_cntr;
191
192         ndev = mlx5_ib_get_netdev(device, port_num);
193         if (!ndev)
194                 return 0;
195
196         if (mlx5_lag_is_active(dev->mdev)) {
197                 rcu_read_lock();
198                 upper = netdev_master_upper_dev_get_rcu(ndev);
199                 if (upper) {
200                         dev_put(ndev);
201                         ndev = upper;
202                         dev_hold(ndev);
203                 }
204                 rcu_read_unlock();
205         }
206
207         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
208                 props->state      = IB_PORT_ACTIVE;
209                 props->phys_state = 5;
210         }
211
212         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
213
214         dev_put(ndev);
215
216         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
217
218         props->active_width     = IB_WIDTH_4X;  /* TODO */
219         props->active_speed     = IB_SPEED_QDR; /* TODO */
220
221         return 0;
222 }
223
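/*
 * Pack a GID and its attributes (source MAC, optional VLAN, RoCE version,
 * L3 type and address) into the firmware roce_addr_layout.
 */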
224 static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
225                                      const struct ib_gid_attr *attr,
226                                      void *mlx5_addr)
227 {
228 #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
229         char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
230                                                source_l3_address);
231         void *mlx5_addr_mac     = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
232                                                source_mac_47_32);
233
234         if (!gid)
235                 return;
236
237         ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr);
238
239         if (is_vlan_dev(attr->ndev)) {
240                 MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
241                 MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev));
242         }
243
244         switch (attr->gid_type) {
245         case IB_GID_TYPE_IB:
246                 MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
247                 break;
248         case IB_GID_TYPE_ROCE_UDP_ENCAP:
249                 MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
250                 break;
251
252         default:
253                 WARN_ON(true);
254         }
255
256         if (attr->gid_type != IB_GID_TYPE_IB) {
257                 if (ipv6_addr_v4mapped((void *)gid))
258                         MLX5_SET_RA(mlx5_addr, roce_l3_type,
259                                     MLX5_ROCE_L3_TYPE_IPV4);
260                 else
261                         MLX5_SET_RA(mlx5_addr, roce_l3_type,
262                                     MLX5_ROCE_L3_TYPE_IPV6);
263         }
264
265         if ((attr->gid_type == IB_GID_TYPE_IB) ||
266             !ipv6_addr_v4mapped((void *)gid))
267                 memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
268         else
269                 memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
270 }
271
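/*
 * Program one entry of the HCA RoCE address (GID) table via the
 * SET_ROCE_ADDRESS command; only valid on Ethernet link layer ports.
 */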
272 static int set_roce_addr(struct ib_device *device, u8 port_num,
273                          unsigned int index,
274                          const union ib_gid *gid,
275                          const struct ib_gid_attr *attr)
276 {
277         struct mlx5_ib_dev *dev = to_mdev(device);
278         u32  in[MLX5_ST_SZ_DW(set_roce_address_in)]  = {0};
279         u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
280         void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
281         enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);
282
283         if (ll != IB_LINK_LAYER_ETHERNET)
284                 return -EINVAL;
285
286         ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);
287
288         MLX5_SET(set_roce_address_in, in, roce_address_index, index);
289         MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
290         return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
291 }
292
293 static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
294                            unsigned int index, const union ib_gid *gid,
295                            const struct ib_gid_attr *attr,
296                            __always_unused void **context)
297 {
298         return set_roce_addr(device, port_num, index, gid, attr);
299 }
300
301 static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
302                            unsigned int index, __always_unused void **context)
303 {
304         return set_roce_addr(device, port_num, index, NULL, NULL);
305 }
306
307 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
308                                int index)
309 {
310         struct ib_gid_attr attr;
311         union ib_gid gid;
312
313         if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
314                 return 0;
315
316         if (!attr.ndev)
317                 return 0;
318
319         dev_put(attr.ndev);
320
321         if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
322                 return 0;
323
324         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
325 }
326
327 int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
328                            int index, enum ib_gid_type *gid_type)
329 {
330         struct ib_gid_attr attr;
331         union ib_gid gid;
332         int ret;
333
334         ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr);
335         if (ret)
336                 return ret;
337
338         if (!attr.ndev)
339                 return -ENODEV;
340
341         dev_put(attr.ndev);
342
343         *gid_type = attr.gid_type;
344
345         return 0;
346 }
347
348 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
349 {
350         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
351                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
352         return 0;
353 }
354
355 enum {
356         MLX5_VPORT_ACCESS_METHOD_MAD,
357         MLX5_VPORT_ACCESS_METHOD_HCA,
358         MLX5_VPORT_ACCESS_METHOD_NIC,
359 };
360
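/*
 * Select how vport attributes are queried: MAD for IB links without ib_virt,
 * NIC vport commands for Ethernet, HCA vport commands otherwise.
 */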
361 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
362 {
363         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
364                 return MLX5_VPORT_ACCESS_METHOD_MAD;
365
366         if (mlx5_ib_port_link_layer(ibdev, 1) ==
367             IB_LINK_LAYER_ETHERNET)
368                 return MLX5_VPORT_ACCESS_METHOD_NIC;
369
370         return MLX5_VPORT_ACCESS_METHOD_HCA;
371 }
372
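/*
 * Advertise IB_ATOMIC_HCA only when the device supports 8-byte compare&swap
 * and fetch&add with responses in host endianness.
 */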
373 static void get_atomic_caps(struct mlx5_ib_dev *dev,
374                             struct ib_device_attr *props)
375 {
376         u8 tmp;
377         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
378         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
379         u8 atomic_req_8B_endianness_mode =
380                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);
381
382         /* Check whether HW supports standard 8-byte atomic operations and can
383          * respond in host endianness
384          */
385         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
386         if (((atomic_operations & tmp) == tmp) &&
387             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
388             (atomic_req_8B_endianness_mode)) {
389                 props->atomic_cap = IB_ATOMIC_HCA;
390         } else {
391                 props->atomic_cap = IB_ATOMIC_NONE;
392         }
393 }
394
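/* Query the system image GUID through the selected vport access method. */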
395 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
396                                         __be64 *sys_image_guid)
397 {
398         struct mlx5_ib_dev *dev = to_mdev(ibdev);
399         struct mlx5_core_dev *mdev = dev->mdev;
400         u64 tmp;
401         int err;
402
403         switch (mlx5_get_vport_access_method(ibdev)) {
404         case MLX5_VPORT_ACCESS_METHOD_MAD:
405                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
406                                                             sys_image_guid);
407
408         case MLX5_VPORT_ACCESS_METHOD_HCA:
409                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
410                 break;
411
412         case MLX5_VPORT_ACCESS_METHOD_NIC:
413                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
414                 break;
415
416         default:
417                 return -EINVAL;
418         }
419
420         if (!err)
421                 *sys_image_guid = cpu_to_be64(tmp);
422
423         return err;
424
425 }
426
427 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
428                                 u16 *max_pkeys)
429 {
430         struct mlx5_ib_dev *dev = to_mdev(ibdev);
431         struct mlx5_core_dev *mdev = dev->mdev;
432
433         switch (mlx5_get_vport_access_method(ibdev)) {
434         case MLX5_VPORT_ACCESS_METHOD_MAD:
435                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
436
437         case MLX5_VPORT_ACCESS_METHOD_HCA:
438         case MLX5_VPORT_ACCESS_METHOD_NIC:
439                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
440                                                 pkey_table_size));
441                 return 0;
442
443         default:
444                 return -EINVAL;
445         }
446 }
447
448 static int mlx5_query_vendor_id(struct ib_device *ibdev,
449                                 u32 *vendor_id)
450 {
451         struct mlx5_ib_dev *dev = to_mdev(ibdev);
452
453         switch (mlx5_get_vport_access_method(ibdev)) {
454         case MLX5_VPORT_ACCESS_METHOD_MAD:
455                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
456
457         case MLX5_VPORT_ACCESS_METHOD_HCA:
458         case MLX5_VPORT_ACCESS_METHOD_NIC:
459                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
460
461         default:
462                 return -EINVAL;
463         }
464 }
465
466 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
467                                 __be64 *node_guid)
468 {
469         u64 tmp;
470         int err;
471
472         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
473         case MLX5_VPORT_ACCESS_METHOD_MAD:
474                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
475
476         case MLX5_VPORT_ACCESS_METHOD_HCA:
477                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
478                 break;
479
480         case MLX5_VPORT_ACCESS_METHOD_NIC:
481                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
482                 break;
483
484         default:
485                 return -EINVAL;
486         }
487
488         if (!err)
489                 *node_guid = cpu_to_be64(tmp);
490
491         return err;
492 }
493
494 struct mlx5_reg_node_desc {
495         u8      desc[IB_DEVICE_NODE_DESC_MAX];
496 };
497
498 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
499 {
500         struct mlx5_reg_node_desc in;
501
502         if (mlx5_use_mad_ifc(dev))
503                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
504
505         memset(&in, 0, sizeof(in));
506
507         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
508                                     sizeof(struct mlx5_reg_node_desc),
509                                     MLX5_REG_NODE_DESC, 0, 0);
510 }
511
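/*
 * Implement the query_device() verb: derive ib_device_attr from HCA
 * capabilities and, when userspace leaves room in uhw, append mlx5-specific
 * caps (TSO, RSS, CQE compression, packet pacing) to the response.
 */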
512 static int mlx5_ib_query_device(struct ib_device *ibdev,
513                                 struct ib_device_attr *props,
514                                 struct ib_udata *uhw)
515 {
516         struct mlx5_ib_dev *dev = to_mdev(ibdev);
517         struct mlx5_core_dev *mdev = dev->mdev;
518         int err = -ENOMEM;
519         int max_sq_desc;
520         int max_rq_sg;
521         int max_sq_sg;
522         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
523         struct mlx5_ib_query_device_resp resp = {};
524         size_t resp_len;
525         u64 max_tso;
526
527         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
528         if (uhw->outlen && uhw->outlen < resp_len)
529                 return -EINVAL;
530         else
531                 resp.response_length = resp_len;
532
533         if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
534                 return -EINVAL;
535
536         memset(props, 0, sizeof(*props));
537         err = mlx5_query_system_image_guid(ibdev,
538                                            &props->sys_image_guid);
539         if (err)
540                 return err;
541
542         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
543         if (err)
544                 return err;
545
546         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
547         if (err)
548                 return err;
549
550         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
551                 (fw_rev_min(dev->mdev) << 16) |
552                 fw_rev_sub(dev->mdev);
553         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
554                 IB_DEVICE_PORT_ACTIVE_EVENT             |
555                 IB_DEVICE_SYS_IMAGE_GUID                |
556                 IB_DEVICE_RC_RNR_NAK_GEN;
557
558         if (MLX5_CAP_GEN(mdev, pkv))
559                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
560         if (MLX5_CAP_GEN(mdev, qkv))
561                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
562         if (MLX5_CAP_GEN(mdev, apm))
563                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
564         if (MLX5_CAP_GEN(mdev, xrc))
565                 props->device_cap_flags |= IB_DEVICE_XRC;
566         if (MLX5_CAP_GEN(mdev, imaicl)) {
567                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
568                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
569                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
570                 /* We support 'Gappy' memory registration too */
571                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
572         }
573         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
574         if (MLX5_CAP_GEN(mdev, sho)) {
575                 props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
576                 /* At this stage no support for signature handover */
577                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
578                                       IB_PROT_T10DIF_TYPE_2 |
579                                       IB_PROT_T10DIF_TYPE_3;
580                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
581                                        IB_GUARD_T10DIF_CSUM;
582         }
583         if (MLX5_CAP_GEN(mdev, block_lb_mc))
584                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
585
586         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
587                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
588                         /* Legacy bit to support old userspace libraries */
589                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
590                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
591                 }
592
593                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
594                         props->raw_packet_caps |=
595                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
596
597                 if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
598                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
599                         if (max_tso) {
600                                 resp.tso_caps.max_tso = 1 << max_tso;
601                                 resp.tso_caps.supported_qpts |=
602                                         1 << IB_QPT_RAW_PACKET;
603                                 resp.response_length += sizeof(resp.tso_caps);
604                         }
605                 }
606
607                 if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
608                         resp.rss_caps.rx_hash_function =
609                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
610                         resp.rss_caps.rx_hash_fields_mask =
611                                                 MLX5_RX_HASH_SRC_IPV4 |
612                                                 MLX5_RX_HASH_DST_IPV4 |
613                                                 MLX5_RX_HASH_SRC_IPV6 |
614                                                 MLX5_RX_HASH_DST_IPV6 |
615                                                 MLX5_RX_HASH_SRC_PORT_TCP |
616                                                 MLX5_RX_HASH_DST_PORT_TCP |
617                                                 MLX5_RX_HASH_SRC_PORT_UDP |
618                                                 MLX5_RX_HASH_DST_PORT_UDP;
619                         resp.response_length += sizeof(resp.rss_caps);
620                 }
621         } else {
622                 if (field_avail(typeof(resp), tso_caps, uhw->outlen))
623                         resp.response_length += sizeof(resp.tso_caps);
624                 if (field_avail(typeof(resp), rss_caps, uhw->outlen))
625                         resp.response_length += sizeof(resp.rss_caps);
626         }
627
628         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
629                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
630                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
631         }
632
633         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
634             MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
635                 /* Legacy bit to support old userspace libraries */
636                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
637                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
638         }
639
640         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
641                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
642
643         props->vendor_part_id      = mdev->pdev->device;
644         props->hw_ver              = mdev->pdev->revision;
645
646         props->max_mr_size         = ~0ull;
647         props->page_size_cap       = ~(min_page_size - 1);
648         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
649         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
650         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
651                      sizeof(struct mlx5_wqe_data_seg);
652         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
653         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
654                      sizeof(struct mlx5_wqe_raddr_seg)) /
655                 sizeof(struct mlx5_wqe_data_seg);
656         props->max_sge = min(max_rq_sg, max_sq_sg);
657         props->max_sge_rd          = MLX5_MAX_SGE_RD;
658         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
659         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
660         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
661         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
662         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
663         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
664         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
665         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
666         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
667         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
668         props->max_srq_sge         = max_rq_sg - 1;
669         props->max_fast_reg_page_list_len =
670                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
671         get_atomic_caps(dev, props);
672         props->masked_atomic_cap   = IB_ATOMIC_NONE;
673         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
674         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
675         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
676                                            props->max_mcast_grp;
677         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
678         props->max_ah = INT_MAX;
679         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
680         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
681
682 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
683         if (MLX5_CAP_GEN(mdev, pg))
684                 props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
685         props->odp_caps = dev->odp_caps;
686 #endif
687
688         if (MLX5_CAP_GEN(mdev, cd))
689                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
690
691         if (!mlx5_core_is_pf(mdev))
692                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
693
694         if (mlx5_ib_port_link_layer(ibdev, 1) ==
695             IB_LINK_LAYER_ETHERNET) {
696                 props->rss_caps.max_rwq_indirection_tables =
697                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
698                 props->rss_caps.max_rwq_indirection_table_size =
699                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
700                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
701                 props->max_wq_type_rq =
702                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
703         }
704
705         if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
706                 resp.cqe_comp_caps.max_num =
707                         MLX5_CAP_GEN(dev->mdev, cqe_compression) ?
708                         MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0;
709                 resp.cqe_comp_caps.supported_format =
710                         MLX5_IB_CQE_RES_FORMAT_HASH |
711                         MLX5_IB_CQE_RES_FORMAT_CSUM;
712                 resp.response_length += sizeof(resp.cqe_comp_caps);
713         }
714
715         if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen)) {
716                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
717                     MLX5_CAP_GEN(mdev, qos)) {
718                         resp.packet_pacing_caps.qp_rate_limit_max =
719                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
720                         resp.packet_pacing_caps.qp_rate_limit_min =
721                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
722                         resp.packet_pacing_caps.supported_qpts |=
723                                 1 << IB_QPT_RAW_PACKET;
724                 }
725                 resp.response_length += sizeof(resp.packet_pacing_caps);
726         }
727
728         if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
729                         uhw->outlen)) {
730                 resp.mlx5_ib_support_multi_pkt_send_wqes =
731                         MLX5_CAP_ETH(mdev, multi_pkt_send_wqe);
732                 resp.response_length +=
733                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
734         }
735
736         if (field_avail(typeof(resp), reserved, uhw->outlen))
737                 resp.response_length += sizeof(resp.reserved);
738
739         if (uhw->outlen) {
740                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
741
742                 if (err)
743                         return err;
744         }
745
746         return 0;
747 }
748
749 enum mlx5_ib_width {
750         MLX5_IB_WIDTH_1X        = 1 << 0,
751         MLX5_IB_WIDTH_2X        = 1 << 1,
752         MLX5_IB_WIDTH_4X        = 1 << 2,
753         MLX5_IB_WIDTH_8X        = 1 << 3,
754         MLX5_IB_WIDTH_12X       = 1 << 4
755 };
756
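/*
 * Convert the MLX5_IB_WIDTH_* bitmask reported by firmware into an
 * IB_WIDTH_* value; 2X has no IB spec equivalent and is rejected.
 */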
757 static int translate_active_width(struct ib_device *ibdev, u8 active_width,
758                                   u8 *ib_width)
759 {
760         struct mlx5_ib_dev *dev = to_mdev(ibdev);
761         int err = 0;
762
763         if (active_width & MLX5_IB_WIDTH_1X) {
764                 *ib_width = IB_WIDTH_1X;
765         } else if (active_width & MLX5_IB_WIDTH_2X) {
766                 mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
767                             (int)active_width);
768                 err = -EINVAL;
769         } else if (active_width & MLX5_IB_WIDTH_4X) {
770                 *ib_width = IB_WIDTH_4X;
771         } else if (active_width & MLX5_IB_WIDTH_8X) {
772                 *ib_width = IB_WIDTH_8X;
773         } else if (active_width & MLX5_IB_WIDTH_12X) {
774                 *ib_width = IB_WIDTH_12X;
775         } else {
776                 mlx5_ib_dbg(dev, "Invalid active_width %d\n",
777                             (int)active_width);
778                 err = -EINVAL;
779         }
780
781         return err;
782 }
783
784 static int mlx5_mtu_to_ib_mtu(int mtu)
785 {
786         switch (mtu) {
787         case 256: return 1;
788         case 512: return 2;
789         case 1024: return 3;
790         case 2048: return 4;
791         case 4096: return 5;
792         default:
793                 pr_warn("invalid mtu\n");
794                 return -1;
795         }
796 }
797
798 enum ib_max_vl_num {
799         __IB_MAX_VL_0           = 1,
800         __IB_MAX_VL_0_1         = 2,
801         __IB_MAX_VL_0_3         = 3,
802         __IB_MAX_VL_0_7         = 4,
803         __IB_MAX_VL_0_14        = 5,
804 };
805
806 enum mlx5_vl_hw_cap {
807         MLX5_VL_HW_0    = 1,
808         MLX5_VL_HW_0_1  = 2,
809         MLX5_VL_HW_0_2  = 3,
810         MLX5_VL_HW_0_3  = 4,
811         MLX5_VL_HW_0_4  = 5,
812         MLX5_VL_HW_0_5  = 6,
813         MLX5_VL_HW_0_6  = 7,
814         MLX5_VL_HW_0_7  = 8,
815         MLX5_VL_HW_0_14 = 15
816 };
817
818 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
819                                 u8 *max_vl_num)
820 {
821         switch (vl_hw_cap) {
822         case MLX5_VL_HW_0:
823                 *max_vl_num = __IB_MAX_VL_0;
824                 break;
825         case MLX5_VL_HW_0_1:
826                 *max_vl_num = __IB_MAX_VL_0_1;
827                 break;
828         case MLX5_VL_HW_0_3:
829                 *max_vl_num = __IB_MAX_VL_0_3;
830                 break;
831         case MLX5_VL_HW_0_7:
832                 *max_vl_num = __IB_MAX_VL_0_7;
833                 break;
834         case MLX5_VL_HW_0_14:
835                 *max_vl_num = __IB_MAX_VL_0_14;
836                 break;
837
838         default:
839                 return -EINVAL;
840         }
841
842         return 0;
843 }
844
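/*
 * Query IB port attributes from the HCA vport context plus the operational
 * link width, protocol, MTU and VL capability queries.
 */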
845 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
846                                struct ib_port_attr *props)
847 {
848         struct mlx5_ib_dev *dev = to_mdev(ibdev);
849         struct mlx5_core_dev *mdev = dev->mdev;
850         struct mlx5_hca_vport_context *rep;
851         u16 max_mtu;
852         u16 oper_mtu;
853         int err;
854         u8 ib_link_width_oper;
855         u8 vl_hw_cap;
856
857         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
858         if (!rep) {
859                 err = -ENOMEM;
860                 goto out;
861         }
862
863         /* props is zeroed by the caller; avoid zeroing it here */
864
865         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
866         if (err)
867                 goto out;
868
869         props->lid              = rep->lid;
870         props->lmc              = rep->lmc;
871         props->sm_lid           = rep->sm_lid;
872         props->sm_sl            = rep->sm_sl;
873         props->state            = rep->vport_state;
874         props->phys_state       = rep->port_physical_state;
875         props->port_cap_flags   = rep->cap_mask1;
876         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
877         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
878         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
879         props->bad_pkey_cntr    = rep->pkey_violation_counter;
880         props->qkey_viol_cntr   = rep->qkey_violation_counter;
881         props->subnet_timeout   = rep->subnet_timeout;
882         props->init_type_reply  = rep->init_type_reply;
883         props->grh_required     = rep->grh_required;
884
885         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
886         if (err)
887                 goto out;
888
889         err = translate_active_width(ibdev, ib_link_width_oper,
890                                      &props->active_width);
891         if (err)
892                 goto out;
893         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
894         if (err)
895                 goto out;
896
897         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
898
899         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
900
901         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
902
903         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
904
905         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
906         if (err)
907                 goto out;
908
909         err = translate_max_vl_num(ibdev, vl_hw_cap,
910                                    &props->max_vl_num);
911 out:
912         kfree(rep);
913         return err;
914 }
915
916 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
917                        struct ib_port_attr *props)
918 {
919         switch (mlx5_get_vport_access_method(ibdev)) {
920         case MLX5_VPORT_ACCESS_METHOD_MAD:
921                 return mlx5_query_mad_ifc_port(ibdev, port, props);
922
923         case MLX5_VPORT_ACCESS_METHOD_HCA:
924                 return mlx5_query_hca_port(ibdev, port, props);
925
926         case MLX5_VPORT_ACCESS_METHOD_NIC:
927                 return mlx5_query_port_roce(ibdev, port, props);
928
929         default:
930                 return -EINVAL;
931         }
932 }
933
934 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
935                              union ib_gid *gid)
936 {
937         struct mlx5_ib_dev *dev = to_mdev(ibdev);
938         struct mlx5_core_dev *mdev = dev->mdev;
939
940         switch (mlx5_get_vport_access_method(ibdev)) {
941         case MLX5_VPORT_ACCESS_METHOD_MAD:
942                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
943
944         case MLX5_VPORT_ACCESS_METHOD_HCA:
945                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
946
947         default:
948                 return -EINVAL;
949         }
950
951 }
952
953 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
954                               u16 *pkey)
955 {
956         struct mlx5_ib_dev *dev = to_mdev(ibdev);
957         struct mlx5_core_dev *mdev = dev->mdev;
958
959         switch (mlx5_get_vport_access_method(ibdev)) {
960         case MLX5_VPORT_ACCESS_METHOD_MAD:
961                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
962
963         case MLX5_VPORT_ACCESS_METHOD_HCA:
964         case MLX5_VPORT_ACCESS_METHOD_NIC:
965                 return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
966                                                  pkey);
967         default:
968                 return -EINVAL;
969         }
970 }
971
972 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
973                                  struct ib_device_modify *props)
974 {
975         struct mlx5_ib_dev *dev = to_mdev(ibdev);
976         struct mlx5_reg_node_desc in;
977         struct mlx5_reg_node_desc out;
978         int err;
979
980         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
981                 return -EOPNOTSUPP;
982
983         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
984                 return 0;
985
986         /*
987          * If possible, pass the node desc to FW so it can generate
988          * a trap 144 notice.  If the command fails, just ignore the error.
989          */
990         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
991         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
992                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
993         if (err)
994                 return err;
995
996         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
997
998         return err;
999 }
1000
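/*
 * Change port capability bits by writing cap_mask1/cap_mask1_perm through the
 * HCA vport context; fails if a requested bit is not modifiable.
 */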
1001 static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
1002                                 u32 value)
1003 {
1004         struct mlx5_hca_vport_context ctx = {};
1005         int err;
1006
1007         err = mlx5_query_hca_vport_context(dev->mdev, 0,
1008                                            port_num, 0, &ctx);
1009         if (err)
1010                 return err;
1011
1012         if (~ctx.cap_mask1_perm & mask) {
1013                 mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
1014                              mask, ctx.cap_mask1_perm);
1015                 return -EINVAL;
1016         }
1017
1018         ctx.cap_mask1 = value;
1019         ctx.cap_mask1_perm = mask;
1020         err = mlx5_core_modify_hca_vport_context(dev->mdev, 0,
1021                                                  port_num, 0, &ctx);
1022
1023         return err;
1024 }
1025
1026 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1027                                struct ib_port_modify *props)
1028 {
1029         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1030         struct ib_port_attr attr;
1031         u32 tmp;
1032         int err;
1033         u32 change_mask;
1034         u32 value;
1035         bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1036                       IB_LINK_LAYER_INFINIBAND);
1037
1038         if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1039                 change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1040                 value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1041                 return set_port_caps_atomic(dev, port, change_mask, value);
1042         }
1043
1044         mutex_lock(&dev->cap_mask_mutex);
1045
1046         err = ib_query_port(ibdev, port, &attr);
1047         if (err)
1048                 goto out;
1049
1050         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1051                 ~props->clr_port_cap_mask;
1052
1053         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1054
1055 out:
1056         mutex_unlock(&dev->cap_mask_mutex);
1057         return err;
1058 }
1059
1060 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1061 {
1062         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1063                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1064 }
1065
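/*
 * Round the requested number of bfregs up to whole system pages and report
 * how many UAR system pages the user context needs.
 */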
1066 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1067                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1068                              u32 *num_sys_pages)
1069 {
1070         int uars_per_sys_page;
1071         int bfregs_per_sys_page;
1072         int ref_bfregs = req->total_num_bfregs;
1073
1074         if (req->total_num_bfregs == 0)
1075                 return -EINVAL;
1076
1077         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1078         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1079
1080         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1081                 return -ENOMEM;
1082
1083         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1084         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1085         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1086         *num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1087
1088         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1089                 return -EINVAL;
1090
1091         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n",
1092                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1093                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1094                     req->total_num_bfregs, *num_sys_pages);
1095
1096         return 0;
1097 }
1098
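/* Allocate one firmware UAR per system page; unwind on failure. */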
1099 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1100 {
1101         struct mlx5_bfreg_info *bfregi;
1102         int err;
1103         int i;
1104
1105         bfregi = &context->bfregi;
1106         for (i = 0; i < bfregi->num_sys_pages; i++) {
1107                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1108                 if (err)
1109                         goto error;
1110
1111                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1112         }
1113         return 0;
1114
1115 error:
1116         for (--i; i >= 0; i--)
1117                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1118                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1119
1120         return err;
1121 }
1122
1123 static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1124 {
1125         struct mlx5_bfreg_info *bfregi;
1126         int err;
1127         int i;
1128
1129         bfregi = &context->bfregi;
1130         for (i = 0; i < bfregi->num_sys_pages; i++) {
1131                 err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1132                 if (err) {
1133                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1134                         return err;
1135                 }
1136         }
1137         return 0;
1138 }
1139
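/*
 * Create a user context: validate the v0/v2 request, size and allocate
 * bfregs/UARs and a transport domain (when supported), then report device
 * limits and the negotiated CQE version back to userspace.
 */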
1140 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
1141                                                   struct ib_udata *udata)
1142 {
1143         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1144         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1145         struct mlx5_ib_alloc_ucontext_resp resp = {};
1146         struct mlx5_ib_ucontext *context;
1147         struct mlx5_bfreg_info *bfregi;
1148         int ver;
1149         int err;
1150         size_t reqlen;
1151         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1152                                      max_cqe_version);
1153         bool lib_uar_4k;
1154
1155         if (!dev->ib_active)
1156                 return ERR_PTR(-EAGAIN);
1157
1158         if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
1159                 return ERR_PTR(-EINVAL);
1160
1161         reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
1162         if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1163                 ver = 0;
1164         else if (reqlen >= min_req_v2)
1165                 ver = 2;
1166         else
1167                 return ERR_PTR(-EINVAL);
1168
1169         err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
1170         if (err)
1171                 return ERR_PTR(err);
1172
1173         if (req.flags)
1174                 return ERR_PTR(-EINVAL);
1175
1176         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1177                 return ERR_PTR(-EOPNOTSUPP);
1178
1179         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1180                                     MLX5_NON_FP_BFREGS_PER_UAR);
1181         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1182                 return ERR_PTR(-EINVAL);
1183
1184         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1185         if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1186                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1187         resp.cache_line_size = cache_line_size();
1188         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1189         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1190         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1191         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1192         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1193         resp.cqe_version = min_t(__u8,
1194                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1195                                  req.max_cqe_version);
1196         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1197                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1198         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1199                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1200         resp.response_length = min(offsetof(typeof(resp), response_length) +
1201                                    sizeof(resp.response_length), udata->outlen);
1202
1203         context = kzalloc(sizeof(*context), GFP_KERNEL);
1204         if (!context)
1205                 return ERR_PTR(-ENOMEM);
1206
1207         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1208         bfregi = &context->bfregi;
1209
1210         /* updates req->total_num_bfregs */
1211         err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages);
1212         if (err)
1213                 goto out_ctx;
1214
1215         mutex_init(&bfregi->lock);
1216         bfregi->lib_uar_4k = lib_uar_4k;
1217         bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count),
1218                                 GFP_KERNEL);
1219         if (!bfregi->count) {
1220                 err = -ENOMEM;
1221                 goto out_ctx;
1222         }
1223
1224         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1225                                     sizeof(*bfregi->sys_pages),
1226                                     GFP_KERNEL);
1227         if (!bfregi->sys_pages) {
1228                 err = -ENOMEM;
1229                 goto out_count;
1230         }
1231
1232         err = allocate_uars(dev, context);
1233         if (err)
1234                 goto out_sys_pages;
1235
1236 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1237         context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
1238 #endif
1239
1240         context->upd_xlt_page = __get_free_page(GFP_KERNEL);
1241         if (!context->upd_xlt_page) {
1242                 err = -ENOMEM;
1243                 goto out_uars;
1244         }
1245         mutex_init(&context->upd_xlt_page_mutex);
1246
1247         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
1248                 err = mlx5_core_alloc_transport_domain(dev->mdev,
1249                                                        &context->tdn);
1250                 if (err)
1251                         goto out_page;
1252         }
1253
1254         INIT_LIST_HEAD(&context->vma_private_list);
1255         INIT_LIST_HEAD(&context->db_page_list);
1256         mutex_init(&context->db_page_mutex);
1257
1258         resp.tot_bfregs = req.total_num_bfregs;
1259         resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
1260
1261         if (field_avail(typeof(resp), cqe_version, udata->outlen))
1262                 resp.response_length += sizeof(resp.cqe_version);
1263
1264         if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1265                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1266                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1267                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1268         }
1269
1270         if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
1271                 if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1272                         mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
1273                         resp.eth_min_inline++;
1274                 }
1275                 resp.response_length += sizeof(resp.eth_min_inline);
1276         }
1277
1278         /*
1279          * We don't want to expose information from the PCI bar that is located
1280          * after 4096 bytes, so if the arch only supports larger pages, let's
1281          * pretend we don't support reading the HCA's core clock. This is also
1282          * pretend we don't support reading the HCA's core clock. This is also
1283          * enforced by the mmap function.
1284         if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1285                 if (PAGE_SIZE <= 4096) {
1286                         resp.comp_mask |=
1287                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1288                         resp.hca_core_clock_offset =
1289                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1290                 }
1291                 resp.response_length += sizeof(resp.hca_core_clock_offset) +
1292                                         sizeof(resp.reserved2);
1293         }
1294
1295         if (field_avail(typeof(resp), log_uar_size, udata->outlen))
1296                 resp.response_length += sizeof(resp.log_uar_size);
1297
1298         if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
1299                 resp.response_length += sizeof(resp.num_uars_per_page);
1300
1301         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1302         if (err)
1303                 goto out_td;
1304
1305         bfregi->ver = ver;
1306         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1307         context->cqe_version = resp.cqe_version;
1308         context->lib_caps = req.lib_caps;
1309         print_lib_caps(dev, context->lib_caps);
1310
1311         return &context->ibucontext;
1312
1313 out_td:
1314         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1315                 mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1316
1317 out_page:
1318         free_page(context->upd_xlt_page);
1319
1320 out_uars:
1321         deallocate_uars(dev, context);
1322
1323 out_sys_pages:
1324         kfree(bfregi->sys_pages);
1325
1326 out_count:
1327         kfree(bfregi->count);
1328
1329 out_ctx:
1330         kfree(context);
1331
1332         return ERR_PTR(err);
1333 }
1334
1335 static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1336 {
1337         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1338         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1339         struct mlx5_bfreg_info *bfregi;
1340
1341         bfregi = &context->bfregi;
1342         if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1343                 mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn);
1344
1345         free_page(context->upd_xlt_page);
1346         deallocate_uars(dev, context);
1347         kfree(bfregi->sys_pages);
1348         kfree(bfregi->count);
1349         kfree(context);
1350
1351         return 0;
1352 }
1353
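/*
 * Translate a UAR index into the PFN to be mapped, accounting for multiple
 * 4KB UARs packed into one system page when uar_4k is enabled.
 */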
1354 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
1355                                  struct mlx5_bfreg_info *bfregi,
1356                                  int idx)
1357 {
1358         int fw_uars_per_page;
1359
1360         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
1361
1362         return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) +
1363                         bfregi->sys_pages[idx] / fw_uars_per_page;
1364 }
1365
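/*
 * mmap offsets encode a command in the bits above MLX5_IB_MMAP_CMD_SHIFT and
 * an argument (index) in the bits below it.
 */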
1366 static int get_command(unsigned long offset)
1367 {
1368         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1369 }
1370
1371 static int get_arg(unsigned long offset)
1372 {
1373         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1374 }
1375
1376 static int get_index(unsigned long offset)
1377 {
1378         return get_arg(offset);
1379 }
1380
1381 static void  mlx5_ib_vma_open(struct vm_area_struct *area)
1382 {
1383         /* vma_open is called when a new VMA is created on top of our VMA.  This
1384          * is done through either mremap flow or split_vma (usually due to
1385          * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
1386          * as this VMA is strongly hardware related.  Therefore we set the
1387          * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
1388          * calling us again and trying to do incorrect actions.  We assume that
1389          * the original VMA size is exactly a single page, and therefore no
1390          * "splitting" operation will happen to it.
1391          */
1392         area->vm_ops = NULL;
1393 }
1394
1395 static void  mlx5_ib_vma_close(struct vm_area_struct *area)
1396 {
1397         struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
1398
1399         /* It's guaranteed that all VMAs opened on a FD are closed before the
1400          * file itself is closed, therefore no sync is needed with the regular
1401          * closing flow. (e.g. mlx5 ib_dealloc_ucontext)
1402          * However, we do need to synchronize with vma accesses made by
1403          * mlx5_ib_disassociate_ucontext.
1404          * The close operation is usually called under mm->mmap_sem, except when
1405          * the process is exiting.
1406          * The exiting case is handled explicitly as part of
1407          * mlx5_ib_disassociate_ucontext.
1408          */
1409         mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
1410
1411         /* Set the vma context pointer to NULL in the mlx5_ib driver's
1412          * private data to protect against a race with
1413          * mlx5_ib_disassociate_ucontext().
1414          */
1415         mlx5_ib_vma_priv_data->vma = NULL;
1416         list_del(&mlx5_ib_vma_priv_data->list);
1417         kfree(mlx5_ib_vma_priv_data);
1418 }
1419
1420 static const struct vm_operations_struct mlx5_ib_vm_ops = {
1421         .open = mlx5_ib_vma_open,
1422         .close = mlx5_ib_vma_close
1423 };
1424
1425 static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
1426                                 struct mlx5_ib_ucontext *ctx)
1427 {
1428         struct mlx5_ib_vma_private_data *vma_prv;
1429         struct list_head *vma_head = &ctx->vma_private_list;
1430
1431         vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
1432         if (!vma_prv)
1433                 return -ENOMEM;
1434
1435         vma_prv->vma = vma;
1436         vma->vm_private_data = vma_prv;
1437         vma->vm_ops =  &mlx5_ib_vm_ops;
1438
1439         list_add(&vma_prv->list, vma_head);
1440
1441         return 0;
1442 }
1443
1444 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
1445 {
1446         int ret;
1447         struct vm_area_struct *vma;
1448         struct mlx5_ib_vma_private_data *vma_private, *n;
1449         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1450         struct task_struct *owning_process  = NULL;
1451         struct mm_struct   *owning_mm       = NULL;
1452
1453         owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
1454         if (!owning_process)
1455                 return;
1456
1457         owning_mm = get_task_mm(owning_process);
1458         if (!owning_mm) {
1459                 pr_info("no mm, disassociate ucontext is pending task termination\n");
1460                 while (1) {
1461                         put_task_struct(owning_process);
1462                         usleep_range(1000, 2000);
1463                         owning_process = get_pid_task(ibcontext->tgid,
1464                                                       PIDTYPE_PID);
1465                         if (!owning_process ||
1466                             owning_process->state == TASK_DEAD) {
1467                                 pr_info("disassociate ucontext done, task was terminated\n");
1468                                 /* If the task was dead, we still need to
1469                                  * release the task struct.
1470                                  */
1471                                 if (owning_process)
1472                                         put_task_struct(owning_process);
1473                                 return;
1474                         }
1475                 }
1476         }
1477
1478         /* We need to protect against a race with the vma being closed
1479          * concurrently by mlx5_ib_vma_close().
1480          */
1481         down_read(&owning_mm->mmap_sem);
1482         list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
1483                                  list) {
1484                 vma = vma_private->vma;
1485                 ret = zap_vma_ptes(vma, vma->vm_start,
1486                                    PAGE_SIZE);
1487                 WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__);
1488                 /* The context is going to be destroyed, so the
1489                  * vm_ops must not be accessed any more.
1490                  */
1491                 vma->vm_ops = NULL;
1492                 list_del(&vma_private->list);
1493                 kfree(vma_private);
1494         }
1495         up_read(&owning_mm->mmap_sem);
1496         mmput(owning_mm);
1497         put_task_struct(owning_process);
1498 }
1499
1500 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1501 {
1502         switch (cmd) {
1503         case MLX5_IB_MMAP_WC_PAGE:
1504                 return "WC";
1505         case MLX5_IB_MMAP_REGULAR_PAGE:
1506                 return "best effort WC";
1507         case MLX5_IB_MMAP_NC_PAGE:
1508                 return "NC";
1509         default:
1510                 return NULL;
1511         }
1512 }
1513
1514 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1515                     struct vm_area_struct *vma,
1516                     struct mlx5_ib_ucontext *context)
1517 {
1518         struct mlx5_bfreg_info *bfregi = &context->bfregi;
1519         int err;
1520         unsigned long idx;
1521         phys_addr_t pfn, pa;
1522         pgprot_t prot;
1523         int uars_per_page;
1524
1525         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1526                 return -EINVAL;
1527
1528         uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
1529         idx = get_index(vma->vm_pgoff);
1530         if (idx % uars_per_page ||
1531             idx * uars_per_page >= bfregi->num_sys_pages) {
1532                 mlx5_ib_warn(dev, "invalid uar index %lu\n", idx);
1533                 return -EINVAL;
1534         }
1535
1536         switch (cmd) {
1537         case MLX5_IB_MMAP_WC_PAGE:
1538 /* Some architectures don't support WC memory */
1539 #if defined(CONFIG_X86)
1540                 if (!pat_enabled())
1541                         return -EPERM;
1542 #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
1543                         return -EPERM;
1544 #endif
1545         /* fall through */
1546         case MLX5_IB_MMAP_REGULAR_PAGE:
1547                 /* For MLX5_IB_MMAP_REGULAR_PAGE, make a best effort to get WC */
1548                 prot = pgprot_writecombine(vma->vm_page_prot);
1549                 break;
1550         case MLX5_IB_MMAP_NC_PAGE:
1551                 prot = pgprot_noncached(vma->vm_page_prot);
1552                 break;
1553         default:
1554                 return -EINVAL;
1555         }
1556
1557         pfn = uar_index2pfn(dev, bfregi, idx);
1558         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
1559
1560         vma->vm_page_prot = prot;
1561         err = io_remap_pfn_range(vma, vma->vm_start, pfn,
1562                                  PAGE_SIZE, vma->vm_page_prot);
1563         if (err) {
1564                 mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n",
1565                             err, vma->vm_start, &pfn, mmap_cmd2str(cmd));
1566                 return -EAGAIN;
1567         }
1568
1569         pa = pfn << PAGE_SHIFT;
1570         mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd),
1571                     vma->vm_start, &pa);
1572
1573         return mlx5_ib_set_vma_data(vma, context);
1574 }
1575
1576 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1577 {
1578         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1579         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1580         unsigned long command;
1581         phys_addr_t pfn;
1582
1583         command = get_command(vma->vm_pgoff);
1584         switch (command) {
1585         case MLX5_IB_MMAP_WC_PAGE:
1586         case MLX5_IB_MMAP_NC_PAGE:
1587         case MLX5_IB_MMAP_REGULAR_PAGE:
1588                 return uar_mmap(dev, command, vma, context);
1589
1590         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
1591                 return -ENOSYS;
1592
1593         case MLX5_IB_MMAP_CORE_CLOCK:
1594                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1595                         return -EINVAL;
1596
1597                 if (vma->vm_flags & VM_WRITE)
1598                         return -EPERM;
1599
1600                 /* Don't expose information to user space that it shouldn't have */
1601                 if (PAGE_SIZE > 4096)
1602                         return -EOPNOTSUPP;
1603
1604                 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1605                 pfn = (dev->mdev->iseg_base +
1606                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
1607                         PAGE_SHIFT;
1608                 if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1609                                        PAGE_SIZE, vma->vm_page_prot))
1610                         return -EAGAIN;
1611
1612                 mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n",
1613                             vma->vm_start,
1614                             (unsigned long long)pfn << PAGE_SHIFT);
1615                 break;
1616
1617         default:
1618                 return -EINVAL;
1619         }
1620
1621         return 0;
1622 }
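
/*
 * Rough user-space usage sketch for the MLX5_IB_MMAP_CORE_CLOCK case above
 * (assumed encoding and placeholder names -- uverbs_cmd_fd, page_size; real
 * applications go through the rdma-core mlx5 provider rather than calling
 * mmap() directly):
 *
 *	off_t pgoff = (off_t)MLX5_IB_MMAP_CORE_CLOCK << MLX5_IB_MMAP_CMD_SHIFT;
 *	void *clock = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
 *			   uverbs_cmd_fd, pgoff * page_size);
 *
 * The mapping must be read-only (the VM_WRITE check returns -EPERM) and is
 * refused on systems whose page size exceeds 4096 bytes.
 */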
1623
1624 static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1625                                       struct ib_ucontext *context,
1626                                       struct ib_udata *udata)
1627 {
1628         struct mlx5_ib_alloc_pd_resp resp;
1629         struct mlx5_ib_pd *pd;
1630         int err;
1631
1632         pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1633         if (!pd)
1634                 return ERR_PTR(-ENOMEM);
1635
1636         err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1637         if (err) {
1638                 kfree(pd);
1639                 return ERR_PTR(err);
1640         }
1641
1642         if (context) {
1643                 resp.pdn = pd->pdn;
1644                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1645                         mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1646                         kfree(pd);
1647                         return ERR_PTR(-EFAULT);
1648                 }
1649         }
1650
1651         return &pd->ibpd;
1652 }
1653
1654 static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1655 {
1656         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1657         struct mlx5_ib_pd *mpd = to_mpd(pd);
1658
1659         mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1660         kfree(mpd);
1661
1662         return 0;
1663 }
1664
1665 enum {
1666         MATCH_CRITERIA_ENABLE_OUTER_BIT,
1667         MATCH_CRITERIA_ENABLE_MISC_BIT,
1668         MATCH_CRITERIA_ENABLE_INNER_BIT
1669 };
1670
1671 #define HEADER_IS_ZERO(match_criteria, headers)                            \
1672         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
1673                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
1674
1675 static u8 get_match_criteria_enable(u32 *match_criteria)
1676 {
1677         u8 match_criteria_enable;
1678
1679         match_criteria_enable =
1680                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
1681                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
1682         match_criteria_enable |=
1683                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
1684                 MATCH_CRITERIA_ENABLE_MISC_BIT;
1685         match_criteria_enable |=
1686                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
1687                 MATCH_CRITERIA_ENABLE_INNER_BIT;
1688
1689         return match_criteria_enable;
1690 }
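
/*
 * Illustrative example: a match_criteria buffer with non-zero bits in its
 * outer_headers and misc_parameters sections but an all-zero inner_headers
 * section yields
 *
 *	match_criteria_enable = BIT(MATCH_CRITERIA_ENABLE_OUTER_BIT) |
 *				BIT(MATCH_CRITERIA_ENABLE_MISC_BIT) = 0x3
 */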
1691
1692 static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
1693 {
1694         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
1695         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
1696 }
1697
1698 static void set_flow_label(void *misc_c, void *misc_v, u8 mask, u8 val,
1699                            bool inner)
1700 {
1701         if (inner) {
1702                 MLX5_SET(fte_match_set_misc,
1703                          misc_c, inner_ipv6_flow_label, mask);
1704                 MLX5_SET(fte_match_set_misc,
1705                          misc_v, inner_ipv6_flow_label, val);
1706         } else {
1707                 MLX5_SET(fte_match_set_misc,
1708                          misc_c, outer_ipv6_flow_label, mask);
1709                 MLX5_SET(fte_match_set_misc,
1710                          misc_v, outer_ipv6_flow_label, val);
1711         }
1712 }
1713
1714 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
1715 {
1716         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
1717         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
1718         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
1719         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
1720 }
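
/*
 * Illustrative example for set_tos(): the TOS / traffic-class byte carries
 * the DSCP in its upper six bits and the ECN in its lower two, so a value
 * of 0xb8 (DSCP 46 "EF", ECN 0) is programmed as ip_dscp = 0xb8 >> 2 = 46,
 * while the ip_ecn field keeps only the two low-order bits.
 */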
1721
1722 #define LAST_ETH_FIELD vlan_tag
1723 #define LAST_IB_FIELD sl
1724 #define LAST_IPV4_FIELD tos
1725 #define LAST_IPV6_FIELD traffic_class
1726 #define LAST_TCP_UDP_FIELD src_port
1727 #define LAST_TUNNEL_FIELD tunnel_id
1728 #define LAST_FLOW_TAG_FIELD tag_id
1729
1730 /* 'field' is the last field in the filter that this driver supports */
1731 #define FIELDS_NOT_SUPPORTED(filter, field)\
1732         memchr_inv((void *)&filter.field  +\
1733                    sizeof(filter.field), 0,\
1734                    sizeof(filter) -\
1735                    offsetof(typeof(filter), field) -\
1736                    sizeof(filter.field))
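
/*
 * Illustrative example: for an IB_FLOW_SPEC_ETH spec,
 * FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD) runs memchr_inv()
 * over the bytes of the mask that follow the vlan_tag member; it evaluates
 * to non-NULL (i.e. "unsupported fields requested") if user space asked to
 * match on anything newer than the fields listed above, and to NULL when
 * all trailing bytes are zero.
 */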
1737
1738 static int parse_flow_attr(u32 *match_c, u32 *match_v,
1739                            const union ib_flow_spec *ib_spec, u32 *tag_id)
1740 {
1741         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
1742                                            misc_parameters);
1743         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
1744                                            misc_parameters);
1745         void *headers_c;
1746         void *headers_v;
1747
1748         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
1749                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1750                                          inner_headers);
1751                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1752                                          inner_headers);
1753         } else {
1754                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1755                                          outer_headers);
1756                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1757                                          outer_headers);
1758         }
1759
1760         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
1761         case IB_FLOW_SPEC_ETH:
1762                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
1763                         return -EOPNOTSUPP;
1764
1765                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1766                                              dmac_47_16),
1767                                 ib_spec->eth.mask.dst_mac);
1768                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1769                                              dmac_47_16),
1770                                 ib_spec->eth.val.dst_mac);
1771
1772                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1773                                              smac_47_16),
1774                                 ib_spec->eth.mask.src_mac);
1775                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1776                                              smac_47_16),
1777                                 ib_spec->eth.val.src_mac);
1778
1779                 if (ib_spec->eth.mask.vlan_tag) {
1780                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1781                                  cvlan_tag, 1);
1782                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1783                                  cvlan_tag, 1);
1784
1785                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1786                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
1787                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1788                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
1789
1790                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1791                                  first_cfi,
1792                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
1793                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1794                                  first_cfi,
1795                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
1796
1797                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1798                                  first_prio,
1799                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
1800                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1801                                  first_prio,
1802                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
1803                 }
1804                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1805                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
1806                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1807                          ethertype, ntohs(ib_spec->eth.val.ether_type));
1808                 break;
1809         case IB_FLOW_SPEC_IPV4:
1810                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
1811                         return -EOPNOTSUPP;
1812
1813                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1814                          ethertype, 0xffff);
1815                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1816                          ethertype, ETH_P_IP);
1817
1818                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1819                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
1820                        &ib_spec->ipv4.mask.src_ip,
1821                        sizeof(ib_spec->ipv4.mask.src_ip));
1822                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1823                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
1824                        &ib_spec->ipv4.val.src_ip,
1825                        sizeof(ib_spec->ipv4.val.src_ip));
1826                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1827                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1828                        &ib_spec->ipv4.mask.dst_ip,
1829                        sizeof(ib_spec->ipv4.mask.dst_ip));
1830                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1831                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1832                        &ib_spec->ipv4.val.dst_ip,
1833                        sizeof(ib_spec->ipv4.val.dst_ip));
1834
1835                 set_tos(headers_c, headers_v,
1836                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
1837
1838                 set_proto(headers_c, headers_v,
1839                           ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
1840                 break;
1841         case IB_FLOW_SPEC_IPV6:
1842                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
1843                         return -EOPNOTSUPP;
1844
1845                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
1846                          ethertype, 0xffff);
1847                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
1848                          ethertype, ETH_P_IPV6);
1849
1850                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1851                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
1852                        &ib_spec->ipv6.mask.src_ip,
1853                        sizeof(ib_spec->ipv6.mask.src_ip));
1854                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1855                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
1856                        &ib_spec->ipv6.val.src_ip,
1857                        sizeof(ib_spec->ipv6.val.src_ip));
1858                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
1859                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1860                        &ib_spec->ipv6.mask.dst_ip,
1861                        sizeof(ib_spec->ipv6.mask.dst_ip));
1862                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
1863                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1864                        &ib_spec->ipv6.val.dst_ip,
1865                        sizeof(ib_spec->ipv6.val.dst_ip));
1866
1867                 set_tos(headers_c, headers_v,
1868                         ib_spec->ipv6.mask.traffic_class,
1869                         ib_spec->ipv6.val.traffic_class);
1870
1871                 set_proto(headers_c, headers_v,
1872                           ib_spec->ipv6.mask.next_hdr,
1873                           ib_spec->ipv6.val.next_hdr);
1874
1875                 set_flow_label(misc_params_c, misc_params_v,
1876                                ntohl(ib_spec->ipv6.mask.flow_label),
1877                                ntohl(ib_spec->ipv6.val.flow_label),
1878                                ib_spec->type & IB_FLOW_SPEC_INNER);
1879
1880                 break;
1881         case IB_FLOW_SPEC_TCP:
1882                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1883                                          LAST_TCP_UDP_FIELD))
1884                         return -EOPNOTSUPP;
1885
1886                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
1887                          0xff);
1888                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
1889                          IPPROTO_TCP);
1890
1891                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
1892                          ntohs(ib_spec->tcp_udp.mask.src_port));
1893                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
1894                          ntohs(ib_spec->tcp_udp.val.src_port));
1895
1896                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
1897                          ntohs(ib_spec->tcp_udp.mask.dst_port));
1898                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
1899                          ntohs(ib_spec->tcp_udp.val.dst_port));
1900                 break;
1901         case IB_FLOW_SPEC_UDP:
1902                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1903                                          LAST_TCP_UDP_FIELD))
1904                         return -EOPNOTSUPP;
1905
1906                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
1907                          0xff);
1908                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
1909                          IPPROTO_UDP);
1910
1911                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
1912                          ntohs(ib_spec->tcp_udp.mask.src_port));
1913                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
1914                          ntohs(ib_spec->tcp_udp.val.src_port));
1915
1916                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
1917                          ntohs(ib_spec->tcp_udp.mask.dst_port));
1918                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
1919                          ntohs(ib_spec->tcp_udp.val.dst_port));
1920                 break;
1921         case IB_FLOW_SPEC_VXLAN_TUNNEL:
1922                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
1923                                          LAST_TUNNEL_FIELD))
1924                         return -EOPNOTSUPP;
1925
1926                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
1927                          ntohl(ib_spec->tunnel.mask.tunnel_id));
1928                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
1929                          ntohl(ib_spec->tunnel.val.tunnel_id));
1930                 break;
1931         case IB_FLOW_SPEC_ACTION_TAG:
1932                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
1933                                          LAST_FLOW_TAG_FIELD))
1934                         return -EOPNOTSUPP;
1935                 if (ib_spec->flow_tag.tag_id >= BIT(24))
1936                         return -EINVAL;
1937
1938                 *tag_id = ib_spec->flow_tag.tag_id;
1939                 break;
1940         default:
1941                 return -EINVAL;
1942         }
1943
1944         return 0;
1945 }
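
/*
 * Illustrative example for the VLAN handling in the IB_FLOW_SPEC_ETH case
 * above: an 802.1Q TCI of 0xb00b is split into first_vid = 0x00b (low 12
 * bits), first_cfi = 1 (bit 12) and first_prio = 5 (top 3 bits); the
 * MLX5_SET() field widths mask off the extra high bits left by the shifts.
 */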
1946
1947 /* A flow that could catch both multicast and unicast packets must not
1948  * be placed in the multicast flow steering table, since there such a
1949  * rule could steal other multicast packets.
1950  */
1951 static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
1952 {
1953         struct ib_flow_spec_eth *eth_spec;
1954
1955         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
1956             ib_attr->size < sizeof(struct ib_flow_attr) +
1957             sizeof(struct ib_flow_spec_eth) ||
1958             ib_attr->num_of_specs < 1)
1959                 return false;
1960
1961         eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
1962         if (eth_spec->type != IB_FLOW_SPEC_ETH ||
1963             eth_spec->size != sizeof(*eth_spec))
1964                 return false;
1965
1966         return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
1967                is_multicast_ether_addr(eth_spec->val.dst_mac);
1968 }
1969
1970 static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
1971 {
1972         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
1973         bool has_ipv4_spec = false;
1974         bool eth_type_ipv4 = true;
1975         unsigned int spec_index;
1976
1977         /* Validate that ethertype is correct */
1978         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1979                 if (ib_spec->type == IB_FLOW_SPEC_ETH &&
1980                     ib_spec->eth.mask.ether_type) {
1981                         if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
1982                               ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
1983                                 eth_type_ipv4 = false;
1984                 } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
1985                         has_ipv4_spec = true;
1986                 }
1987                 ib_spec = (void *)ib_spec + ib_spec->size;
1988         }
1989         return !has_ipv4_spec || eth_type_ipv4;
1990 }
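
/*
 * Illustrative example: is_valid_attr() rejects an attribute that combines
 * an IB_FLOW_SPEC_IPV4 spec with an IB_FLOW_SPEC_ETH spec whose ether_type
 * matches anything other than exactly ETH_P_IP (say mask 0xffff with value
 * ETH_P_IPV6), since the L2 and L3 parts of such a filter would contradict
 * each other.
 */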
1991
1992 static void put_flow_table(struct mlx5_ib_dev *dev,
1993                            struct mlx5_ib_flow_prio *prio, bool ft_added)
1994 {
1995         prio->refcount -= !!ft_added;
1996         if (!prio->refcount) {
1997                 mlx5_destroy_flow_table(prio->flow_table);
1998                 prio->flow_table = NULL;
1999         }
2000 }
2001
2002 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
2003 {
2004         struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
2005         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
2006                                                           struct mlx5_ib_flow_handler,
2007                                                           ibflow);
2008         struct mlx5_ib_flow_handler *iter, *tmp;
2009
2010         mutex_lock(&dev->flow_db.lock);
2011
2012         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
2013                 mlx5_del_flow_rules(iter->rule);
2014                 put_flow_table(dev, iter->prio, true);
2015                 list_del(&iter->list);
2016                 kfree(iter);
2017         }
2018
2019         mlx5_del_flow_rules(handler->rule);
2020         put_flow_table(dev, handler->prio, true);
2021         mutex_unlock(&dev->flow_db.lock);
2022
2023         kfree(handler);
2024
2025         return 0;
2026 }
2027
2028 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
2029 {
2030         priority *= 2;
2031         if (!dont_trap)
2032                 priority++;
2033         return priority;
2034 }
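
/*
 * Example: user priority 0 becomes core priority 1 (0 * 2 + 1) for a normal
 * rule and core priority 0 when IB_FLOW_ATTR_FLAGS_DONT_TRAP is set, so a
 * don't-trap rule is presumably evaluated ahead of an equivalent trapping
 * rule at the same user priority (a lower core priority matches first).
 */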
2035
2036 enum flow_table_type {
2037         MLX5_IB_FT_RX,
2038         MLX5_IB_FT_TX
2039 };
2040
2041 #define MLX5_FS_MAX_TYPES        10
2042 #define MLX5_FS_MAX_ENTRIES      32000UL
2043 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
2044                                                 struct ib_flow_attr *flow_attr,
2045                                                 enum flow_table_type ft_type)
2046 {
2047         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
2048         struct mlx5_flow_namespace *ns = NULL;
2049         struct mlx5_ib_flow_prio *prio;
2050         struct mlx5_flow_table *ft;
2051         int num_entries;
2052         int num_groups;
2053         int priority;
2054         int err = 0;
2055
2056         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2057                 if (flow_is_multicast_only(flow_attr) &&
2058                     !dont_trap)
2059                         priority = MLX5_IB_FLOW_MCAST_PRIO;
2060                 else
2061                         priority = ib_prio_to_core_prio(flow_attr->priority,
2062                                                         dont_trap);
2063                 ns = mlx5_get_flow_namespace(dev->mdev,
2064                                              MLX5_FLOW_NAMESPACE_BYPASS);
2065                 num_entries = MLX5_FS_MAX_ENTRIES;
2066                 num_groups = MLX5_FS_MAX_TYPES;
2067                 prio = &dev->flow_db.prios[priority];
2068         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2069                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2070                 ns = mlx5_get_flow_namespace(dev->mdev,
2071                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
2072                 build_leftovers_ft_param(&priority,
2073                                          &num_entries,
2074                                          &num_groups);
2075                 prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
2076         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2077                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
2078                                         allow_sniffer_and_nic_rx_shared_tir))
2079                         return ERR_PTR(-ENOTSUPP);
2080
2081                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
2082                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
2083                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
2084
2085                 prio = &dev->flow_db.sniffer[ft_type];
2086                 priority = 0;
2087                 num_entries = 1;
2088                 num_groups = 1;
2089         }
2090
2091         if (!ns)
2092                 return ERR_PTR(-ENOTSUPP);
2093
2094         ft = prio->flow_table;
2095         if (!ft) {
2096                 ft = mlx5_create_auto_grouped_flow_table(ns, priority,
2097                                                          num_entries,
2098                                                          num_groups,
2099                                                          0, 0);
2100
2101                 if (!IS_ERR(ft)) {
2102                         prio->refcount = 0;
2103                         prio->flow_table = ft;
2104                 } else {
2105                         err = PTR_ERR(ft);
2106                 }
2107         }
2108
2109         return err ? ERR_PTR(err) : prio;
2110 }
2111
2112 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
2113                                                      struct mlx5_ib_flow_prio *ft_prio,
2114                                                      const struct ib_flow_attr *flow_attr,
2115                                                      struct mlx5_flow_destination *dst)
2116 {
2117         struct mlx5_flow_table  *ft = ft_prio->flow_table;
2118         struct mlx5_ib_flow_handler *handler;
2119         struct mlx5_flow_act flow_act = {0};
2120         struct mlx5_flow_spec *spec;
2121         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
2122         unsigned int spec_index;
2123         u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
2124         int err = 0;
2125
2126         if (!is_valid_attr(flow_attr))
2127                 return ERR_PTR(-EINVAL);
2128
2129         spec = mlx5_vzalloc(sizeof(*spec));
2130         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
2131         if (!handler || !spec) {
2132                 err = -ENOMEM;
2133                 goto free;
2134         }
2135
2136         INIT_LIST_HEAD(&handler->list);
2137
2138         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
2139                 err = parse_flow_attr(spec->match_criteria,
2140                                       spec->match_value, ib_flow, &flow_tag);
2141                 if (err < 0)
2142                         goto free;
2143
2144                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
2145         }
2146
2147         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
2148         flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
2149                 MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
2150
2151         if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG &&
2152             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2153              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
2154                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
2155                              flow_tag, flow_attr->type);
2156                 err = -EINVAL;
2157                 goto free;
2158         }
2159         flow_act.flow_tag = flow_tag;
2160         handler->rule = mlx5_add_flow_rules(ft, spec,
2161                                             &flow_act,
2162                                             dst, 1);
2163
2164         if (IS_ERR(handler->rule)) {
2165                 err = PTR_ERR(handler->rule);
2166                 goto free;
2167         }
2168
2169         ft_prio->refcount++;
2170         handler->prio = ft_prio;
2171
2172         ft_prio->flow_table = ft;
2173 free:
2174         if (err)
2175                 kfree(handler);
2176         kvfree(spec);
2177         return err ? ERR_PTR(err) : handler;
2178 }
2179
2180 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
2181                                                           struct mlx5_ib_flow_prio *ft_prio,
2182                                                           struct ib_flow_attr *flow_attr,
2183                                                           struct mlx5_flow_destination *dst)
2184 {
2185         struct mlx5_ib_flow_handler *handler_dst = NULL;
2186         struct mlx5_ib_flow_handler *handler = NULL;
2187
2188         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
2189         if (!IS_ERR(handler)) {
2190                 handler_dst = create_flow_rule(dev, ft_prio,
2191                                                flow_attr, dst);
2192                 if (IS_ERR(handler_dst)) {
2193                         mlx5_del_flow_rules(handler->rule);
2194                         ft_prio->refcount--;
2195                         kfree(handler);
2196                         handler = handler_dst;
2197                 } else {
2198                         list_add(&handler_dst->list, &handler->list);
2199                 }
2200         }
2201
2202         return handler;
2203 }
2204 enum {
2205         LEFTOVERS_MC,
2206         LEFTOVERS_UC,
2207 };
2208
2209 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
2210                                                           struct mlx5_ib_flow_prio *ft_prio,
2211                                                           struct ib_flow_attr *flow_attr,
2212                                                           struct mlx5_flow_destination *dst)
2213 {
2214         struct mlx5_ib_flow_handler *handler_ucast = NULL;
2215         struct mlx5_ib_flow_handler *handler = NULL;
2216
2217         static struct {
2218                 struct ib_flow_attr     flow_attr;
2219                 struct ib_flow_spec_eth eth_flow;
2220         } leftovers_specs[] = {
2221                 [LEFTOVERS_MC] = {
2222                         .flow_attr = {
2223                                 .num_of_specs = 1,
2224                                 .size = sizeof(leftovers_specs[0])
2225                         },
2226                         .eth_flow = {
2227                                 .type = IB_FLOW_SPEC_ETH,
2228                                 .size = sizeof(struct ib_flow_spec_eth),
2229                                 .mask = {.dst_mac = {0x1} },
2230                                 .val =  {.dst_mac = {0x1} }
2231                         }
2232                 },
2233                 [LEFTOVERS_UC] = {
2234                         .flow_attr = {
2235                                 .num_of_specs = 1,
2236                                 .size = sizeof(leftovers_specs[0])
2237                         },
2238                         .eth_flow = {
2239                                 .type = IB_FLOW_SPEC_ETH,
2240                                 .size = sizeof(struct ib_flow_spec_eth),
2241                                 .mask = {.dst_mac = {0x1} },
2242                                 .val = {.dst_mac = {} }
2243                         }
2244                 }
2245         };
2246
2247         handler = create_flow_rule(dev, ft_prio,
2248                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
2249                                    dst);
2250         if (!IS_ERR(handler) &&
2251             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
2252                 handler_ucast = create_flow_rule(dev, ft_prio,
2253                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
2254                                                  dst);
2255                 if (IS_ERR(handler_ucast)) {
2256                         mlx5_del_flow_rules(handler->rule);
2257                         ft_prio->refcount--;
2258                         kfree(handler);
2259                         handler = handler_ucast;
2260                 } else {
2261                         list_add(&handler_ucast->list, &handler->list);
2262                 }
2263         }
2264
2265         return handler;
2266 }
2267
2268 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
2269                                                         struct mlx5_ib_flow_prio *ft_rx,
2270                                                         struct mlx5_ib_flow_prio *ft_tx,
2271                                                         struct mlx5_flow_destination *dst)
2272 {
2273         struct mlx5_ib_flow_handler *handler_rx;
2274         struct mlx5_ib_flow_handler *handler_tx;
2275         int err;
2276         static const struct ib_flow_attr flow_attr  = {
2277                 .num_of_specs = 0,
2278                 .size = sizeof(flow_attr)
2279         };
2280
2281         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
2282         if (IS_ERR(handler_rx)) {
2283                 err = PTR_ERR(handler_rx);
2284                 goto err;
2285         }
2286
2287         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
2288         if (IS_ERR(handler_tx)) {
2289                 err = PTR_ERR(handler_tx);
2290                 goto err_tx;
2291         }
2292
2293         list_add(&handler_tx->list, &handler_rx->list);
2294
2295         return handler_rx;
2296
2297 err_tx:
2298         mlx5_del_flow_rules(handler_rx->rule);
2299         ft_rx->refcount--;
2300         kfree(handler_rx);
2301 err:
2302         return ERR_PTR(err);
2303 }
2304
2305 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
2306                                            struct ib_flow_attr *flow_attr,
2307                                            int domain)
2308 {
2309         struct mlx5_ib_dev *dev = to_mdev(qp->device);
2310         struct mlx5_ib_qp *mqp = to_mqp(qp);
2311         struct mlx5_ib_flow_handler *handler = NULL;
2312         struct mlx5_flow_destination *dst = NULL;
2313         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
2314         struct mlx5_ib_flow_prio *ft_prio;
2315         int err;
2316
2317         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
2318                 return ERR_PTR(-ENOSPC);
2319
2320         if (domain != IB_FLOW_DOMAIN_USER ||
2321             flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
2322             (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
2323                 return ERR_PTR(-EINVAL);
2324
2325         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
2326         if (!dst)
2327                 return ERR_PTR(-ENOMEM);
2328
2329         mutex_lock(&dev->flow_db.lock);
2330
2331         ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX);
2332         if (IS_ERR(ft_prio)) {
2333                 err = PTR_ERR(ft_prio);
2334                 goto unlock;
2335         }
2336         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2337                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
2338                 if (IS_ERR(ft_prio_tx)) {
2339                         err = PTR_ERR(ft_prio_tx);
2340                         ft_prio_tx = NULL;
2341                         goto destroy_ft;
2342                 }
2343         }
2344
2345         dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
2346         if (mqp->flags & MLX5_IB_QP_RSS)
2347                 dst->tir_num = mqp->rss_qp.tirn;
2348         else
2349                 dst->tir_num = mqp->raw_packet_qp.rq.tirn;
2350
2351         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2352                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
2353                         handler = create_dont_trap_rule(dev, ft_prio,
2354                                                         flow_attr, dst);
2355                 } else {
2356                         handler = create_flow_rule(dev, ft_prio, flow_attr,
2357                                                    dst);
2358                 }
2359         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2360                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2361                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
2362                                                 dst);
2363         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2364                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
2365         } else {
2366                 err = -EINVAL;
2367                 goto destroy_ft;
2368         }
2369
2370         if (IS_ERR(handler)) {
2371                 err = PTR_ERR(handler);
2372                 handler = NULL;
2373                 goto destroy_ft;
2374         }
2375
2376         mutex_unlock(&dev->flow_db.lock);
2377         kfree(dst);
2378
2379         return &handler->ibflow;
2380
2381 destroy_ft:
2382         put_flow_table(dev, ft_prio, false);
2383         if (ft_prio_tx)
2384                 put_flow_table(dev, ft_prio_tx, false);
2385 unlock:
2386         mutex_unlock(&dev->flow_db.lock);
2387         kfree(dst);
2388         kfree(handler);
2389         return ERR_PTR(err);
2390 }
2391
2392 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2393 {
2394         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2395         int err;
2396
2397         err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
2398         if (err)
2399                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2400                              ibqp->qp_num, gid->raw);
2401
2402         return err;
2403 }
2404
2405 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2406 {
2407         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2408         int err;
2409
2410         err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
2411         if (err)
2412                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2413                              ibqp->qp_num, gid->raw);
2414
2415         return err;
2416 }
2417
2418 static int init_node_data(struct mlx5_ib_dev *dev)
2419 {
2420         int err;
2421
2422         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2423         if (err)
2424                 return err;
2425
2426         dev->mdev->rev_id = dev->mdev->pdev->revision;
2427
2428         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2429 }
2430
2431 static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
2432                              char *buf)
2433 {
2434         struct mlx5_ib_dev *dev =
2435                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2436
2437         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
2438 }
2439
2440 static ssize_t show_reg_pages(struct device *device,
2441                               struct device_attribute *attr, char *buf)
2442 {
2443         struct mlx5_ib_dev *dev =
2444                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2445
2446         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2447 }
2448
2449 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2450                         char *buf)
2451 {
2452         struct mlx5_ib_dev *dev =
2453                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2454         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2455 }
2456
2457 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2458                         char *buf)
2459 {
2460         struct mlx5_ib_dev *dev =
2461                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2462         return sprintf(buf, "%x\n", dev->mdev->rev_id);
2463 }
2464
2465 static ssize_t show_board(struct device *device, struct device_attribute *attr,
2466                           char *buf)
2467 {
2468         struct mlx5_ib_dev *dev =
2469                 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2470         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2471                        dev->mdev->board_id);
2472 }
2473
2474 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
2475 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
2476 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
2477 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
2478 static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
2479
2480 static struct device_attribute *mlx5_class_attributes[] = {
2481         &dev_attr_hw_rev,
2482         &dev_attr_hca_type,
2483         &dev_attr_board_id,
2484         &dev_attr_fw_pages,
2485         &dev_attr_reg_pages,
2486 };
2487
2488 static void pkey_change_handler(struct work_struct *work)
2489 {
2490         struct mlx5_ib_port_resources *ports =
2491                 container_of(work, struct mlx5_ib_port_resources,
2492                              pkey_change_work);
2493
2494         mutex_lock(&ports->devr->mutex);
2495         mlx5_ib_gsi_pkey_change(ports->gsi);
2496         mutex_unlock(&ports->devr->mutex);
2497 }
2498
2499 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2500 {
2501         struct mlx5_ib_qp *mqp;
2502         struct mlx5_ib_cq *send_mcq, *recv_mcq;
2503         struct mlx5_core_cq *mcq;
2504         struct list_head cq_armed_list;
2505         unsigned long flags_qp;
2506         unsigned long flags_cq;
2507         unsigned long flags;
2508
2509         INIT_LIST_HEAD(&cq_armed_list);
2510
2511         /* Go over the qp list residing on that ibdev, synced with create/destroy qp. */
2512         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2513         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2514                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2515                 if (mqp->sq.tail != mqp->sq.head) {
2516                         send_mcq = to_mcq(mqp->ibqp.send_cq);
2517                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
2518                         if (send_mcq->mcq.comp &&
2519                             mqp->ibqp.send_cq->comp_handler) {
2520                                 if (!send_mcq->mcq.reset_notify_added) {
2521                                         send_mcq->mcq.reset_notify_added = 1;
2522                                         list_add_tail(&send_mcq->mcq.reset_notify,
2523                                                       &cq_armed_list);
2524                                 }
2525                         }
2526                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2527                 }
2528                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2529                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2530                 /* no handling is needed for SRQ */
2531                 if (!mqp->ibqp.srq) {
2532                         if (mqp->rq.tail != mqp->rq.head) {
2533                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2534                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2535                                 if (recv_mcq->mcq.comp &&
2536                                     mqp->ibqp.recv_cq->comp_handler) {
2537                                         if (!recv_mcq->mcq.reset_notify_added) {
2538                                                 recv_mcq->mcq.reset_notify_added = 1;
2539                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
2540                                                               &cq_armed_list);
2541                                         }
2542                                 }
2543                                 spin_unlock_irqrestore(&recv_mcq->lock,
2544                                                        flags_cq);
2545                         }
2546                 }
2547                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2548         }
2549         /* At this point all in-flight post-sends were set to be executed, as we
2550          * took and released the locks above.  Now we need to arm all involved CQs.
2551          */
2552         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2553                 mcq->comp(mcq);
2554         }
2555         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2556 }
2557
2558 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
2559                           enum mlx5_dev_event event, unsigned long param)
2560 {
2561         struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
2562         struct ib_event ibev;
2563         bool fatal = false;
2564         u8 port = 0;
2565
2566         switch (event) {
2567         case MLX5_DEV_EVENT_SYS_ERROR:
2568                 ibev.event = IB_EVENT_DEVICE_FATAL;
2569                 mlx5_ib_handle_internal_error(ibdev);
2570                 fatal = true;
2571                 break;
2572
2573         case MLX5_DEV_EVENT_PORT_UP:
2574         case MLX5_DEV_EVENT_PORT_DOWN:
2575         case MLX5_DEV_EVENT_PORT_INITIALIZED:
2576                 port = (u8)param;
2577
2578                 /* In RoCE, port up/down events are handled in
2579                  * mlx5_netdev_event().
2580                  */
2581                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2582                         IB_LINK_LAYER_ETHERNET)
2583                         return;
2584
2585                 ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
2586                              IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2587                 break;
2588
2589         case MLX5_DEV_EVENT_LID_CHANGE:
2590                 ibev.event = IB_EVENT_LID_CHANGE;
2591                 port = (u8)param;
2592                 break;
2593
2594         case MLX5_DEV_EVENT_PKEY_CHANGE:
2595                 ibev.event = IB_EVENT_PKEY_CHANGE;
2596                 port = (u8)param;
2597
2598                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2599                 break;
2600
2601         case MLX5_DEV_EVENT_GUID_CHANGE:
2602                 ibev.event = IB_EVENT_GID_CHANGE;
2603                 port = (u8)param;
2604                 break;
2605
2606         case MLX5_DEV_EVENT_CLIENT_REREG:
2607                 ibev.event = IB_EVENT_CLIENT_REREGISTER;
2608                 port = (u8)param;
2609                 break;
2610         default:
2611                 return;
2612         }
2613
2614         ibev.device           = &ibdev->ib_dev;
2615         ibev.element.port_num = port;
2616
2617         if (port < 1 || port > ibdev->num_ports) {
2618                 mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
2619                 return;
2620         }
2621
2622         if (ibdev->ib_active)
2623                 ib_dispatch_event(&ibev);
2624
2625         if (fatal)
2626                 ibdev->ib_active = false;
2627 }
2628
2629 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
2630 {
2631         struct mlx5_hca_vport_context vport_ctx;
2632         int err;
2633         int port;
2634
2635         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2636                 dev->mdev->port_caps[port - 1].has_smi = false;
2637                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2638                     MLX5_CAP_PORT_TYPE_IB) {
2639                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
2640                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
2641                                                                    port, 0,
2642                                                                    &vport_ctx);
2643                                 if (err) {
2644                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
2645                                                     port, err);
2646                                         return err;
2647                                 }
2648                                 dev->mdev->port_caps[port - 1].has_smi =
2649                                         vport_ctx.has_smi;
2650                         } else {
2651                                 dev->mdev->port_caps[port - 1].has_smi = true;
2652                         }
2653                 }
2654         }
2655         return 0;
2656 }
2657
2658 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2659 {
2660         int port;
2661
2662         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
2663                 mlx5_query_ext_port_caps(dev, port);
2664 }
2665
2666 static int get_port_caps(struct mlx5_ib_dev *dev)
2667 {
2668         struct ib_device_attr *dprops = NULL;
2669         struct ib_port_attr *pprops = NULL;
2670         int err = -ENOMEM;
2671         int port;
2672         struct ib_udata uhw = {.inlen = 0, .outlen = 0};
2673
2674         pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
2675         if (!pprops)
2676                 goto out;
2677
2678         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2679         if (!dprops)
2680                 goto out;
2681
2682         err = set_has_smi_cap(dev);
2683         if (err)
2684                 goto out;
2685
2686         err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
2687         if (err) {
2688                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
2689                 goto out;
2690         }
2691
2692         for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2693                 memset(pprops, 0, sizeof(*pprops));
2694                 err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2695                 if (err) {
2696                         mlx5_ib_warn(dev, "query_port %d failed %d\n",
2697                                      port, err);
2698                         break;
2699                 }
2700                 dev->mdev->port_caps[port - 1].pkey_table_len =
2701                                                 dprops->max_pkeys;
2702                 dev->mdev->port_caps[port - 1].gid_table_len =
2703                                                 pprops->gid_tbl_len;
2704                 mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
2705                             dprops->max_pkeys, pprops->gid_tbl_len);
2706         }
2707
2708 out:
2709         kfree(pprops);
2710         kfree(dprops);
2711
2712         return err;
2713 }
2714
2715 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
2716 {
2717         int err;
2718
2719         err = mlx5_mr_cache_cleanup(dev);
2720         if (err)
2721                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
2722
2723         mlx5_ib_destroy_qp(dev->umrc.qp);
2724         ib_free_cq(dev->umrc.cq);
2725         ib_dealloc_pd(dev->umrc.pd);
2726 }
2727
2728 enum {
2729         MAX_UMR_WR = 128,
2730 };
2731
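/*
 * Create the resources used for UMR (user memory registration) work
 * requests: a PD, a softirq-polled CQ and a driver-internal REG_UMR QP
 * that is moved by hand through INIT -> RTR -> RTS.  A semaphore
 * initialized to MAX_UMR_WR limits the number of outstanding UMR WRs,
 * and the MR cache is initialized last.
 */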
2732 static int create_umr_res(struct mlx5_ib_dev *dev)
2733 {
2734         struct ib_qp_init_attr *init_attr = NULL;
2735         struct ib_qp_attr *attr = NULL;
2736         struct ib_pd *pd;
2737         struct ib_cq *cq;
2738         struct ib_qp *qp;
2739         int ret;
2740
2741         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
2742         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
2743         if (!attr || !init_attr) {
2744                 ret = -ENOMEM;
2745                 goto error_0;
2746         }
2747
2748         pd = ib_alloc_pd(&dev->ib_dev, 0);
2749         if (IS_ERR(pd)) {
2750                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
2751                 ret = PTR_ERR(pd);
2752                 goto error_0;
2753         }
2754
2755         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
2756         if (IS_ERR(cq)) {
2757                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
2758                 ret = PTR_ERR(cq);
2759                 goto error_2;
2760         }
2761
2762         init_attr->send_cq = cq;
2763         init_attr->recv_cq = cq;
2764         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
2765         init_attr->cap.max_send_wr = MAX_UMR_WR;
2766         init_attr->cap.max_send_sge = 1;
2767         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
2768         init_attr->port_num = 1;
2769         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
2770         if (IS_ERR(qp)) {
2771                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
2772                 ret = PTR_ERR(qp);
2773                 goto error_3;
2774         }
2775         qp->device     = &dev->ib_dev;
2776         qp->real_qp    = qp;
2777         qp->uobject    = NULL;
2778         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
2779
2780         attr->qp_state = IB_QPS_INIT;
2781         attr->port_num = 1;
2782         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
2783                                 IB_QP_PORT, NULL);
2784         if (ret) {
2785                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
2786                 goto error_4;
2787         }
2788
2789         memset(attr, 0, sizeof(*attr));
2790         attr->qp_state = IB_QPS_RTR;
2791         attr->path_mtu = IB_MTU_256;
2792
2793         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2794         if (ret) {
2795                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
2796                 goto error_4;
2797         }
2798
2799         memset(attr, 0, sizeof(*attr));
2800         attr->qp_state = IB_QPS_RTS;
2801         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2802         if (ret) {
2803                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
2804                 goto error_4;
2805         }
2806
2807         dev->umrc.qp = qp;
2808         dev->umrc.cq = cq;
2809         dev->umrc.pd = pd;
2810
2811         sema_init(&dev->umrc.sem, MAX_UMR_WR);
2812         ret = mlx5_mr_cache_init(dev);
2813         if (ret) {
2814                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2815                 goto error_4;
2816         }
2817
2818         kfree(attr);
2819         kfree(init_attr);
2820
2821         return 0;
2822
2823 error_4:
2824         mlx5_ib_destroy_qp(qp);
2825
2826 error_3:
2827         ib_free_cq(cq);
2828
2829 error_2:
2830         ib_dealloc_pd(pd);
2831
2832 error_0:
2833         kfree(attr);
2834         kfree(init_attr);
2835         return ret;
2836 }
2837
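/*
 * Allocate the device-global verbs objects the driver uses internally:
 * PD p0, CQ c0, XRC domains x0/x1, an XRC SRQ s0 and a basic SRQ s1.
 * The objects are created through the driver entry points directly, so
 * their back-pointers and reference counts are filled in by hand.
 */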
2838 static int create_dev_resources(struct mlx5_ib_resources *devr)
2839 {
2840         struct ib_srq_init_attr attr;
2841         struct mlx5_ib_dev *dev;
2842         struct ib_cq_init_attr cq_attr = {.cqe = 1};
2843         int port;
2844         int ret = 0;
2845
2846         dev = container_of(devr, struct mlx5_ib_dev, devr);
2847
2848         mutex_init(&devr->mutex);
2849
2850         devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2851         if (IS_ERR(devr->p0)) {
2852                 ret = PTR_ERR(devr->p0);
2853                 goto error0;
2854         }
2855         devr->p0->device  = &dev->ib_dev;
2856         devr->p0->uobject = NULL;
2857         atomic_set(&devr->p0->usecnt, 0);
2858
2859         devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2860         if (IS_ERR(devr->c0)) {
2861                 ret = PTR_ERR(devr->c0);
2862                 goto error1;
2863         }
2864         devr->c0->device        = &dev->ib_dev;
2865         devr->c0->uobject       = NULL;
2866         devr->c0->comp_handler  = NULL;
2867         devr->c0->event_handler = NULL;
2868         devr->c0->cq_context    = NULL;
2869         atomic_set(&devr->c0->usecnt, 0);
2870
2871         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2872         if (IS_ERR(devr->x0)) {
2873                 ret = PTR_ERR(devr->x0);
2874                 goto error2;
2875         }
2876         devr->x0->device = &dev->ib_dev;
2877         devr->x0->inode = NULL;
2878         atomic_set(&devr->x0->usecnt, 0);
2879         mutex_init(&devr->x0->tgt_qp_mutex);
2880         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2881
2882         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2883         if (IS_ERR(devr->x1)) {
2884                 ret = PTR_ERR(devr->x1);
2885                 goto error3;
2886         }
2887         devr->x1->device = &dev->ib_dev;
2888         devr->x1->inode = NULL;
2889         atomic_set(&devr->x1->usecnt, 0);
2890         mutex_init(&devr->x1->tgt_qp_mutex);
2891         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2892
2893         memset(&attr, 0, sizeof(attr));
2894         attr.attr.max_sge = 1;
2895         attr.attr.max_wr = 1;
2896         attr.srq_type = IB_SRQT_XRC;
2897         attr.ext.xrc.cq = devr->c0;
2898         attr.ext.xrc.xrcd = devr->x0;
2899
2900         devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2901         if (IS_ERR(devr->s0)) {
2902                 ret = PTR_ERR(devr->s0);
2903                 goto error4;
2904         }
2905         devr->s0->device        = &dev->ib_dev;
2906         devr->s0->pd            = devr->p0;
2907         devr->s0->uobject       = NULL;
2908         devr->s0->event_handler = NULL;
2909         devr->s0->srq_context   = NULL;
2910         devr->s0->srq_type      = IB_SRQT_XRC;
2911         devr->s0->ext.xrc.xrcd  = devr->x0;
2912         devr->s0->ext.xrc.cq    = devr->c0;
2913         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2914         atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2915         atomic_inc(&devr->p0->usecnt);
2916         atomic_set(&devr->s0->usecnt, 0);
2917
2918         memset(&attr, 0, sizeof(attr));
2919         attr.attr.max_sge = 1;
2920         attr.attr.max_wr = 1;
2921         attr.srq_type = IB_SRQT_BASIC;
2922         devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2923         if (IS_ERR(devr->s1)) {
2924                 ret = PTR_ERR(devr->s1);
2925                 goto error5;
2926         }
2927         devr->s1->device        = &dev->ib_dev;
2928         devr->s1->pd            = devr->p0;
2929         devr->s1->uobject       = NULL;
2930         devr->s1->event_handler = NULL;
2931         devr->s1->srq_context   = NULL;
2932         devr->s1->srq_type      = IB_SRQT_BASIC;
2933         devr->s1->ext.xrc.cq    = devr->c0;
2934         atomic_inc(&devr->p0->usecnt);
2935         atomic_set(&devr->s1->usecnt, 0);
2936
2937         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2938                 INIT_WORK(&devr->ports[port].pkey_change_work,
2939                           pkey_change_handler);
2940                 devr->ports[port].devr = devr;
2941         }
2942
2943         return 0;
2944
2945 error5:
2946         mlx5_ib_destroy_srq(devr->s0);
2947 error4:
2948         mlx5_ib_dealloc_xrcd(devr->x1);
2949 error3:
2950         mlx5_ib_dealloc_xrcd(devr->x0);
2951 error2:
2952         mlx5_ib_destroy_cq(devr->c0);
2953 error1:
2954         mlx5_ib_dealloc_pd(devr->p0);
2955 error0:
2956         return ret;
2957 }
2958
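/* Release everything created by create_dev_resources(). */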
2959 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2960 {
2961         struct mlx5_ib_dev *dev =
2962                 container_of(devr, struct mlx5_ib_dev, devr);
2963         int port;
2964
2965         mlx5_ib_destroy_srq(devr->s1);
2966         mlx5_ib_destroy_srq(devr->s0);
2967         mlx5_ib_dealloc_xrcd(devr->x0);
2968         mlx5_ib_dealloc_xrcd(devr->x1);
2969         mlx5_ib_destroy_cq(devr->c0);
2970         mlx5_ib_dealloc_pd(devr->p0);
2971
2972         /* Make sure no change P_Key work items are still executing */
2973         for (port = 0; port < dev->num_ports; ++port)
2974                 cancel_work_sync(&devr->ports[port].pkey_change_work);
2975 }
2976
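/*
 * Derive the RDMA core port capability flags: IB ports report IBA_IB;
 * Ethernet ports report RAW_PACKET, plus the RoCE v1/v2 flags when the
 * device supports both IPv4 and IPv6 L3 types and the corresponding
 * RoCE versions.
 */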
2977 static u32 get_core_cap_flags(struct ib_device *ibdev)
2978 {
2979         struct mlx5_ib_dev *dev = to_mdev(ibdev);
2980         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2981         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2982         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2983         u32 ret = 0;
2984
2985         if (ll == IB_LINK_LAYER_INFINIBAND)
2986                 return RDMA_CORE_PORT_IBA_IB;
2987
2988         ret = RDMA_CORE_PORT_RAW_PACKET;
2989
2990         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2991                 return ret;
2992
2993         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2994                 return ret;
2995
2996         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2997                 ret |= RDMA_CORE_PORT_IBA_ROCE;
2998
2999         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
3000                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
3001
3002         return ret;
3003 }
3004
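/* Fill in the immutable port attributes reported to the RDMA core. */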
3005 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
3006                                struct ib_port_immutable *immutable)
3007 {
3008         struct ib_port_attr attr;
3009         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3010         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
3011         int err;
3012
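        /*
         * core_cap_flags is populated before ib_query_port() as well,
         * since the query path may consult the port's protocol flags;
         * it is assigned again below together with the table lengths.
         */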
3013         immutable->core_cap_flags = get_core_cap_flags(ibdev);
3014
3015         err = ib_query_port(ibdev, port_num, &attr);
3016         if (err)
3017                 return err;
3018
3019         immutable->pkey_tbl_len = attr.pkey_tbl_len;
3020         immutable->gid_tbl_len = attr.gid_tbl_len;
3021         immutable->core_cap_flags = get_core_cap_flags(ibdev);
3022         if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
3023                 immutable->max_mad_size = IB_MGMT_MAD_SIZE;
3024
3025         return 0;
3026 }
3027
3028 static void get_dev_fw_str(struct ib_device *ibdev, char *str,
3029                            size_t str_len)
3030 {
3031         struct mlx5_ib_dev *dev =
3032                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
3033         snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
3034                        fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
3035 }
3036
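/*
 * When LAG is active, create the vport LAG and a LAG demux flow table
 * in the LAG flow namespace; this is skipped when LAG is inactive or
 * the namespace is unavailable.
 */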
3037 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
3038 {
3039         struct mlx5_core_dev *mdev = dev->mdev;
3040         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
3041                                                                  MLX5_FLOW_NAMESPACE_LAG);
3042         struct mlx5_flow_table *ft;
3043         int err;
3044
3045         if (!ns || !mlx5_lag_is_active(mdev))
3046                 return 0;
3047
3048         err = mlx5_cmd_create_vport_lag(mdev);
3049         if (err)
3050                 return err;
3051
3052         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
3053         if (IS_ERR(ft)) {
3054                 err = PTR_ERR(ft);
3055                 goto err_destroy_vport_lag;
3056         }
3057
3058         dev->flow_db.lag_demux_ft = ft;
3059         return 0;
3060
3061 err_destroy_vport_lag:
3062         mlx5_cmd_destroy_vport_lag(mdev);
3063         return err;
3064 }
3065
3066 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
3067 {
3068         struct mlx5_core_dev *mdev = dev->mdev;
3069
3070         if (dev->flow_db.lag_demux_ft) {
3071                 mlx5_destroy_flow_table(dev->flow_db.lag_demux_ft);
3072                 dev->flow_db.lag_demux_ft = NULL;
3073
3074                 mlx5_cmd_destroy_vport_lag(mdev);
3075         }
3076 }
3077
3078 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev)
3079 {
3080         int err;
3081
3082         dev->roce.nb.notifier_call = mlx5_netdev_event;
3083         err = register_netdevice_notifier(&dev->roce.nb);
3084         if (err) {
3085                 dev->roce.nb.notifier_call = NULL;
3086                 return err;
3087         }
3088
3089         return 0;
3090 }
3091
3092 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev)
3093 {
3094         if (dev->roce.nb.notifier_call) {
3095                 unregister_netdevice_notifier(&dev->roce.nb);
3096                 dev->roce.nb.notifier_call = NULL;
3097         }
3098 }
3099
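/*
 * Ethernet link-layer setup: register the netdev notifier, enable RoCE
 * on the NIC vport when the device supports it, and initialize LAG.
 */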
3100 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
3101 {
3102         int err;
3103
3104         err = mlx5_add_netdev_notifier(dev);
3105         if (err)
3106                 return err;
3107
3108         if (MLX5_CAP_GEN(dev->mdev, roce)) {
3109                 err = mlx5_nic_vport_enable_roce(dev->mdev);
3110                 if (err)
3111                         goto err_unregister_netdevice_notifier;
3112         }
3113
3114         err = mlx5_eth_lag_init(dev);
3115         if (err)
3116                 goto err_disable_roce;
3117
3118         return 0;
3119
3120 err_disable_roce:
3121         if (MLX5_CAP_GEN(dev->mdev, roce))
3122                 mlx5_nic_vport_disable_roce(dev->mdev);
3123
3124 err_unregister_netdevice_notifier:
3125         mlx5_remove_netdev_notifier(dev);
3126         return err;
3127 }
3128
3129 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
3130 {
3131         mlx5_eth_lag_cleanup(dev);
3132         if (MLX5_CAP_GEN(dev->mdev, roce))
3133                 mlx5_nic_vport_disable_roce(dev->mdev);
3134 }
3135
3136 struct mlx5_ib_q_counter {
3137         const char *name;
3138         size_t offset;
3139 };
3140
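/*
 * Q counters exposed through the RDMA hw_stats interface.  The basic
 * group is always present; the out-of-sequence and retransmission
 * groups are added only when the firmware advertises the matching
 * capabilities.
 */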
3141 #define INIT_Q_COUNTER(_name)           \
3142         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
3143
3144 static const struct mlx5_ib_q_counter basic_q_cnts[] = {
3145         INIT_Q_COUNTER(rx_write_requests),
3146         INIT_Q_COUNTER(rx_read_requests),
3147         INIT_Q_COUNTER(rx_atomic_requests),
3148         INIT_Q_COUNTER(out_of_buffer),
3149 };
3150
3151 static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = {
3152         INIT_Q_COUNTER(out_of_sequence),
3153 };
3154
3155 static const struct mlx5_ib_q_counter retrans_q_cnts[] = {
3156         INIT_Q_COUNTER(duplicate_request),
3157         INIT_Q_COUNTER(rnr_nak_retry_err),
3158         INIT_Q_COUNTER(packet_seq_err),
3159         INIT_Q_COUNTER(implied_nak_seq_err),
3160         INIT_Q_COUNTER(local_ack_timeout_err),
3161 };
3162
3163 static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
3164 {
3165         unsigned int i;
3166
3167         for (i = 0; i < dev->num_ports; i++) {
3168                 mlx5_core_dealloc_q_counter(dev->mdev,
3169                                             dev->port[i].q_cnts.set_id);
3170                 kfree(dev->port[i].q_cnts.names);
3171                 kfree(dev->port[i].q_cnts.offsets);
3172         }
3173 }
3174
3175 static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev,
3176                                       const char ***names,
3177                                       size_t **offsets,
3178                                       u32 *num)
3179 {
3180         u32 num_counters;
3181
3182         num_counters = ARRAY_SIZE(basic_q_cnts);
3183
3184         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
3185                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
3186
3187         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
3188                 num_counters += ARRAY_SIZE(retrans_q_cnts);
3189
3190         *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL);
3191         if (!*names)
3192                 return -ENOMEM;
3193
3194         *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL);
3195         if (!*offsets)
3196                 goto err_names;
3197
3198         *num = num_counters;
3199
3200         return 0;
3201
3202 err_names:
3203         kfree(*names);
3204         return -ENOMEM;
3205 }
3206
3207 static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev,
3208                                     const char **names,
3209                                     size_t *offsets)
3210 {
3211         int i;
3212         int j = 0;
3213
3214         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
3215                 names[j] = basic_q_cnts[i].name;
3216                 offsets[j] = basic_q_cnts[i].offset;
3217         }
3218
3219         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
3220                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
3221                         names[j] = out_of_seq_q_cnts[i].name;
3222                         offsets[j] = out_of_seq_q_cnts[i].offset;
3223                 }
3224         }
3225
3226         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
3227                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
3228                         names[j] = retrans_q_cnts[i].name;
3229                         offsets[j] = retrans_q_cnts[i].offset;
3230                 }
3231         }
3232 }
3233
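/*
 * Allocate one Q counter set per port and build the name/offset arrays
 * used later to parse the QUERY_Q_COUNTER output.
 */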
3234 static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
3235 {
3236         int i;
3237         int ret;
3238
3239         for (i = 0; i < dev->num_ports; i++) {
3240                 struct mlx5_ib_port *port = &dev->port[i];
3241
3242                 ret = mlx5_core_alloc_q_counter(dev->mdev,
3243                                                 &port->q_cnts.set_id);
3244                 if (ret) {
3245                         mlx5_ib_warn(dev,
3246                                      "couldn't allocate queue counter for port %d, err %d\n",
3247                                      i + 1, ret);
3248                         goto dealloc_counters;
3249                 }
3250
3251                 ret = __mlx5_ib_alloc_q_counters(dev,
3252                                                  &port->q_cnts.names,
3253                                                  &port->q_cnts.offsets,
3254                                                  &port->q_cnts.num_counters);
3255                 if (ret)
3256                         goto dealloc_counters;
3257
3258                 mlx5_ib_fill_q_counters(dev, port->q_cnts.names,
3259                                         port->q_cnts.offsets);
3260         }
3261
3262         return 0;
3263
3264 dealloc_counters:
3265         while (--i >= 0)
3266                 mlx5_core_dealloc_q_counter(dev->mdev,
3267                                             dev->port[i].q_cnts.set_id);
3268
3269         return ret;
3270 }
3271
3272 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
3273                                                     u8 port_num)
3274 {
3275         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3276         struct mlx5_ib_port *port = &dev->port[port_num - 1];
3277
3278         /* We support only per port stats */
3279         if (port_num == 0)
3280                 return NULL;
3281
3282         return rdma_alloc_hw_stats_struct(port->q_cnts.names,
3283                                           port->q_cnts.num_counters,
3284                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
3285 }
3286
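/*
 * Read the per-port Q counter values from firmware and copy them into
 * the rdma_hw_stats array at the offsets recorded at allocation time.
 */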
3287 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
3288                                 struct rdma_hw_stats *stats,
3289                                 u8 port_num, int index)
3290 {
3291         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3292         struct mlx5_ib_port *port = &dev->port[port_num - 1];
3293         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
3294         void *out;
3295         __be32 val;
3296         int ret;
3297         int i;
3298
3299         if (!stats)
3300                 return -ENOSYS;
3301
3302         out = mlx5_vzalloc(outlen);
3303         if (!out)
3304                 return -ENOMEM;
3305
3306         ret = mlx5_core_query_q_counter(dev->mdev,
3307                                         port->q_cnts.set_id, 0,
3308                                         out, outlen);
3309         if (ret)
3310                 goto free;
3311
3312         for (i = 0; i < port->q_cnts.num_counters; i++) {
3313                 val = *(__be32 *)(out + port->q_cnts.offsets[i]);
3314                 stats->value[i] = (u64)be32_to_cpu(val);
3315         }
3316
3317 free:
3318         kvfree(out);
3319         return port->q_cnts.num_counters;
3320 }
3321
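/*
 * Main probe path: allocate the IB device, cache port capabilities,
 * wire up the verbs callbacks according to device capabilities and
 * link layer, create the internal resources (Ethernet/RoCE, device
 * resources, ODP, Q counters, UAR page, blue-flame registers),
 * register with the RDMA core, then create the UMR resources and the
 * sysfs attributes.
 */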
3322 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
3323 {
3324         struct mlx5_ib_dev *dev;
3325         enum rdma_link_layer ll;
3326         int port_type_cap;
3327         const char *name;
3328         int err;
3329         int i;
3330
3331         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
3332         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
3333
3334         printk_once(KERN_INFO "%s", mlx5_version);
3335
3336         dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
3337         if (!dev)
3338                 return NULL;
3339
3340         dev->mdev = mdev;
3341
3342         dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
3343                             GFP_KERNEL);
3344         if (!dev->port)
3345                 goto err_dealloc;
3346
3347         rwlock_init(&dev->roce.netdev_lock);
3348         err = get_port_caps(dev);
3349         if (err)
3350                 goto err_free_port;
3351
3352         if (mlx5_use_mad_ifc(dev))
3353                 get_ext_port_caps(dev);
3354
3355         if (!mlx5_lag_is_active(mdev))
3356                 name = "mlx5_%d";
3357         else
3358                 name = "mlx5_bond_%d";
3359
3360         strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX);
3361         dev->ib_dev.owner               = THIS_MODULE;
3362         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
3363         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
3364         dev->num_ports          = MLX5_CAP_GEN(mdev, num_ports);
3365         dev->ib_dev.phys_port_cnt     = dev->num_ports;
3366         dev->ib_dev.num_comp_vectors    =
3367                 dev->mdev->priv.eq_table.num_comp_vectors;
3368         dev->ib_dev.dev.parent          = &mdev->pdev->dev;
3369
3370         dev->ib_dev.uverbs_abi_ver      = MLX5_IB_UVERBS_ABI_VERSION;
3371         dev->ib_dev.uverbs_cmd_mask     =
3372                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
3373                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
3374                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
3375                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
3376                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
3377                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
3378                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
3379                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
3380                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
3381                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
3382                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
3383                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
3384                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
3385                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
3386                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
3387                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
3388                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
3389                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
3390                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
3391                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
3392                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
3393                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
3394                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
3395                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
3396                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
3397                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
3398         dev->ib_dev.uverbs_ex_cmd_mask =
3399                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
3400                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
3401                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
3402                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP);
3403
3404         dev->ib_dev.query_device        = mlx5_ib_query_device;
3405         dev->ib_dev.query_port          = mlx5_ib_query_port;
3406         dev->ib_dev.get_link_layer      = mlx5_ib_port_link_layer;
3407         if (ll == IB_LINK_LAYER_ETHERNET)
3408                 dev->ib_dev.get_netdev  = mlx5_ib_get_netdev;
3409         dev->ib_dev.query_gid           = mlx5_ib_query_gid;
3410         dev->ib_dev.add_gid             = mlx5_ib_add_gid;
3411         dev->ib_dev.del_gid             = mlx5_ib_del_gid;
3412         dev->ib_dev.query_pkey          = mlx5_ib_query_pkey;
3413         dev->ib_dev.modify_device       = mlx5_ib_modify_device;
3414         dev->ib_dev.modify_port         = mlx5_ib_modify_port;
3415         dev->ib_dev.alloc_ucontext      = mlx5_ib_alloc_ucontext;
3416         dev->ib_dev.dealloc_ucontext    = mlx5_ib_dealloc_ucontext;
3417         dev->ib_dev.mmap                = mlx5_ib_mmap;
3418         dev->ib_dev.alloc_pd            = mlx5_ib_alloc_pd;
3419         dev->ib_dev.dealloc_pd          = mlx5_ib_dealloc_pd;
3420         dev->ib_dev.create_ah           = mlx5_ib_create_ah;
3421         dev->ib_dev.query_ah            = mlx5_ib_query_ah;
3422         dev->ib_dev.destroy_ah          = mlx5_ib_destroy_ah;
3423         dev->ib_dev.create_srq          = mlx5_ib_create_srq;
3424         dev->ib_dev.modify_srq          = mlx5_ib_modify_srq;
3425         dev->ib_dev.query_srq           = mlx5_ib_query_srq;
3426         dev->ib_dev.destroy_srq         = mlx5_ib_destroy_srq;
3427         dev->ib_dev.post_srq_recv       = mlx5_ib_post_srq_recv;
3428         dev->ib_dev.create_qp           = mlx5_ib_create_qp;
3429         dev->ib_dev.modify_qp           = mlx5_ib_modify_qp;
3430         dev->ib_dev.query_qp            = mlx5_ib_query_qp;
3431         dev->ib_dev.destroy_qp          = mlx5_ib_destroy_qp;
3432         dev->ib_dev.post_send           = mlx5_ib_post_send;
3433         dev->ib_dev.post_recv           = mlx5_ib_post_recv;
3434         dev->ib_dev.create_cq           = mlx5_ib_create_cq;
3435         dev->ib_dev.modify_cq           = mlx5_ib_modify_cq;
3436         dev->ib_dev.resize_cq           = mlx5_ib_resize_cq;
3437         dev->ib_dev.destroy_cq          = mlx5_ib_destroy_cq;
3438         dev->ib_dev.poll_cq             = mlx5_ib_poll_cq;
3439         dev->ib_dev.req_notify_cq       = mlx5_ib_arm_cq;
3440         dev->ib_dev.get_dma_mr          = mlx5_ib_get_dma_mr;
3441         dev->ib_dev.reg_user_mr         = mlx5_ib_reg_user_mr;
3442         dev->ib_dev.rereg_user_mr       = mlx5_ib_rereg_user_mr;
3443         dev->ib_dev.dereg_mr            = mlx5_ib_dereg_mr;
3444         dev->ib_dev.attach_mcast        = mlx5_ib_mcg_attach;
3445         dev->ib_dev.detach_mcast        = mlx5_ib_mcg_detach;
3446         dev->ib_dev.process_mad         = mlx5_ib_process_mad;
3447         dev->ib_dev.alloc_mr            = mlx5_ib_alloc_mr;
3448         dev->ib_dev.map_mr_sg           = mlx5_ib_map_mr_sg;
3449         dev->ib_dev.check_mr_status     = mlx5_ib_check_mr_status;
3450         dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
3451         dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
3452         if (mlx5_core_is_pf(mdev)) {
3453                 dev->ib_dev.get_vf_config       = mlx5_ib_get_vf_config;
3454                 dev->ib_dev.set_vf_link_state   = mlx5_ib_set_vf_link_state;
3455                 dev->ib_dev.get_vf_stats        = mlx5_ib_get_vf_stats;
3456                 dev->ib_dev.set_vf_guid         = mlx5_ib_set_vf_guid;
3457         }
3458
3459         dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
3460
3461         mlx5_ib_internal_fill_odp_caps(dev);
3462
3463         if (MLX5_CAP_GEN(mdev, imaicl)) {
3464                 dev->ib_dev.alloc_mw            = mlx5_ib_alloc_mw;
3465                 dev->ib_dev.dealloc_mw          = mlx5_ib_dealloc_mw;
3466                 dev->ib_dev.uverbs_cmd_mask |=
3467                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
3468                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
3469         }
3470
3471         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
3472                 dev->ib_dev.get_hw_stats        = mlx5_ib_get_hw_stats;
3473                 dev->ib_dev.alloc_hw_stats      = mlx5_ib_alloc_hw_stats;
3474         }
3475
3476         if (MLX5_CAP_GEN(mdev, xrc)) {
3477                 dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
3478                 dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
3479                 dev->ib_dev.uverbs_cmd_mask |=
3480                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
3481                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
3482         }
3483
3484         if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
3485             IB_LINK_LAYER_ETHERNET) {
3486                 dev->ib_dev.create_flow = mlx5_ib_create_flow;
3487                 dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
3488                 dev->ib_dev.create_wq    = mlx5_ib_create_wq;
3489                 dev->ib_dev.modify_wq    = mlx5_ib_modify_wq;
3490                 dev->ib_dev.destroy_wq   = mlx5_ib_destroy_wq;
3491                 dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
3492                 dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
3493                 dev->ib_dev.uverbs_ex_cmd_mask |=
3494                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
3495                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
3496                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
3497                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
3498                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
3499                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
3500                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
3501         }
3502         err = init_node_data(dev);
3503         if (err)
3504                 goto err_free_port;
3505
3506         mutex_init(&dev->flow_db.lock);
3507         mutex_init(&dev->cap_mask_mutex);
3508         INIT_LIST_HEAD(&dev->qp_list);
3509         spin_lock_init(&dev->reset_flow_resource_lock);
3510
3511         if (ll == IB_LINK_LAYER_ETHERNET) {
3512                 err = mlx5_enable_eth(dev);
3513                 if (err)
3514                         goto err_free_port;
3515         }
3516
3517         err = create_dev_resources(&dev->devr);
3518         if (err)
3519                 goto err_disable_eth;
3520
3521         err = mlx5_ib_odp_init_one(dev);
3522         if (err)
3523                 goto err_rsrc;
3524
3525         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
3526                 err = mlx5_ib_alloc_q_counters(dev);
3527                 if (err)
3528                         goto err_odp;
3529         }
3530
3531         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
3532         if (!dev->mdev->priv.uar)
3533                 goto err_q_cnt;
3534
3535         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
3536         if (err)
3537                 goto err_uar_page;
3538
3539         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
3540         if (err)
3541                 goto err_bfreg;
3542
3543         err = ib_register_device(&dev->ib_dev, NULL);
3544         if (err)
3545                 goto err_fp_bfreg;
3546
3547         err = create_umr_res(dev);
3548         if (err)
3549                 goto err_dev;
3550
3551         for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
3552                 err = device_create_file(&dev->ib_dev.dev,
3553                                          mlx5_class_attributes[i]);
3554                 if (err)
3555                         goto err_umrc;
3556         }
3557
3558         dev->ib_active = true;
3559
3560         return dev;
3561
3562 err_umrc:
3563         destroy_umrc_res(dev);
3564
3565 err_dev:
3566         ib_unregister_device(&dev->ib_dev);
3567
3568 err_fp_bfreg:
3569         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3570
3571 err_bfreg:
3572         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3573
3574 err_uar_page:
3575         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
3576
3577 err_q_cnt:
3578         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
3579                 mlx5_ib_dealloc_q_counters(dev);
3580
3581 err_odp:
3582         mlx5_ib_odp_remove_one(dev);
3583
3584 err_rsrc:
3585         destroy_dev_resources(&dev->devr);
3586
3587 err_disable_eth:
3588         if (ll == IB_LINK_LAYER_ETHERNET) {
3589                 mlx5_disable_eth(dev);
3590                 mlx5_remove_netdev_notifier(dev);
3591         }
3592
3593 err_free_port:
3594         kfree(dev->port);
3595
3596 err_dealloc:
3597         ib_dealloc_device((struct ib_device *)dev);
3598
3599         return NULL;
3600 }
3601
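/* Undo everything set up in mlx5_ib_add(). */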
3602 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
3603 {
3604         struct mlx5_ib_dev *dev = context;
3605         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
3606
3607         mlx5_remove_netdev_notifier(dev);
3608         ib_unregister_device(&dev->ib_dev);
3609         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
3610         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
3611         mlx5_put_uars_page(dev->mdev, mdev->priv.uar);
3612         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
3613                 mlx5_ib_dealloc_q_counters(dev);
3614         destroy_umrc_res(dev);
3615         mlx5_ib_odp_remove_one(dev);
3616         destroy_dev_resources(&dev->devr);
3617         if (ll == IB_LINK_LAYER_ETHERNET)
3618                 mlx5_disable_eth(dev);
3619         kfree(dev->port);
3620         ib_dealloc_device(&dev->ib_dev);
3621 }
3622
3623 static struct mlx5_interface mlx5_ib_interface = {
3624         .add            = mlx5_ib_add,
3625         .remove         = mlx5_ib_remove,
3626         .event          = mlx5_ib_event,
3627 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
3628         .pfault         = mlx5_ib_pfault,
3629 #endif
3630         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
3631 };
3632
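/* Module init: initialize ODP and register as an mlx5 core interface. */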
3633 static int __init mlx5_ib_init(void)
3634 {
3635         int err;
3636
3637         mlx5_ib_odp_init();
3638
3639         err = mlx5_register_interface(&mlx5_ib_interface);
3640
3641         return err;
3642 }
3643
3644 static void __exit mlx5_ib_cleanup(void)
3645 {
3646         mlx5_unregister_interface(&mlx5_ib_interface);
3647 }
3648
3649 module_init(mlx5_ib_init);
3650 module_exit(mlx5_ib_cleanup);