[linux.git] / drivers / infiniband / hw / mlx5 / main.c
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/module.h>
36 #include <linux/init.h>
37 #include <linux/errno.h>
38 #include <linux/pci.h>
39 #include <linux/dma-mapping.h>
40 #include <linux/slab.h>
41 #include <linux/bitmap.h>
42 #include <linux/sched.h>
43 #include <linux/sched/mm.h>
44 #include <linux/sched/task.h>
45 #include <linux/delay.h>
46 #include <rdma/ib_user_verbs.h>
47 #include <rdma/ib_addr.h>
48 #include <rdma/ib_cache.h>
49 #include <linux/mlx5/port.h>
50 #include <linux/mlx5/vport.h>
51 #include <linux/mlx5/fs.h>
52 #include <linux/mlx5/eswitch.h>
53 #include <linux/list.h>
54 #include <rdma/ib_smi.h>
55 #include <rdma/ib_umem.h>
56 #include <linux/in.h>
57 #include <linux/etherdevice.h>
58 #include "mlx5_ib.h"
59 #include "ib_rep.h"
60 #include "cmd.h"
61 #include "srq.h"
62 #include <linux/mlx5/fs_helpers.h>
63 #include <linux/mlx5/accel.h>
64 #include <rdma/uverbs_std_types.h>
65 #include <rdma/mlx5_user_ioctl_verbs.h>
66 #include <rdma/mlx5_user_ioctl_cmds.h>
67 #include <rdma/ib_umem_odp.h>
68
69 #define UVERBS_MODULE_NAME mlx5_ib
70 #include <rdma/uverbs_named_ioctl.h>
71
72 #define DRIVER_NAME "mlx5_ib"
73 #define DRIVER_VERSION "5.0-0"
74
75 MODULE_AUTHOR("Eli Cohen <[email protected]>");
76 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
77 MODULE_LICENSE("Dual BSD/GPL");
78
79 static char mlx5_version[] =
80         DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
81         DRIVER_VERSION "\n";
82
83 struct mlx5_ib_event_work {
84         struct work_struct      work;
85         union {
86                 struct mlx5_ib_dev            *dev;
87                 struct mlx5_ib_multiport_info *mpi;
88         };
89         bool                    is_slave;
90         unsigned int            event;
91         void                    *param;
92 };
93
94 enum {
95         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
96 };
97
98 static struct workqueue_struct *mlx5_ib_event_wq;
99 static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
100 static LIST_HEAD(mlx5_ib_dev_list);
101 /*
102  * This mutex should be held when accessing either of the above lists
103  */
104 static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
105
106 /* We can't use an array for xlt_emergency_page because dma_map_single
107  * doesn't work on kernel module memory.
108  */
109 static unsigned long xlt_emergency_page;
110 static struct mutex xlt_emergency_page_mutex;
111
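/*
 * Return the IB device currently affiliated with this multiport info.
 * The multiport mutex is taken so the affiliation cannot change while the
 * pointer is read.
 */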
112 struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
113 {
114         struct mlx5_ib_dev *dev;
115
116         mutex_lock(&mlx5_ib_multiport_mutex);
117         dev = mpi->ibdev;
118         mutex_unlock(&mlx5_ib_multiport_mutex);
119         return dev;
120 }
121
122 static enum rdma_link_layer
123 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
124 {
125         switch (port_type_cap) {
126         case MLX5_CAP_PORT_TYPE_IB:
127                 return IB_LINK_LAYER_INFINIBAND;
128         case MLX5_CAP_PORT_TYPE_ETH:
129                 return IB_LINK_LAYER_ETHERNET;
130         default:
131                 return IB_LINK_LAYER_UNSPECIFIED;
132         }
133 }
134
135 static enum rdma_link_layer
136 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
137 {
138         struct mlx5_ib_dev *dev = to_mdev(device);
139         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
140
141         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
142 }
143
144 static int get_port_state(struct ib_device *ibdev,
145                           u8 port_num,
146                           enum ib_port_state *state)
147 {
148         struct ib_port_attr attr;
149         int ret;
150
151         memset(&attr, 0, sizeof(attr));
152         ret = ibdev->ops.query_port(ibdev, port_num, &attr);
153         if (!ret)
154                 *state = attr.state;
155         return ret;
156 }
157
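/*
 * Walk the device's ports and return the RoCE state of the port whose
 * representor netdev matches @ndev. On a match the 1-based IB port number is
 * returned through @port_num; NULL is returned if no port matches.
 */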
158 static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
159                                            struct net_device *ndev,
160                                            u8 *port_num)
161 {
162         struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
163         struct net_device *rep_ndev;
164         struct mlx5_ib_port *port;
165         int i;
166
167         for (i = 0; i < dev->num_ports; i++) {
168                 port  = &dev->port[i];
169                 if (!port->rep)
170                         continue;
171
172                 read_lock(&port->roce.netdev_lock);
173                 rep_ndev = mlx5_ib_get_rep_netdev(esw,
174                                                   port->rep->vport);
175                 if (rep_ndev == ndev) {
176                         read_unlock(&port->roce.netdev_lock);
177                         *port_num = i + 1;
178                         return &port->roce;
179                 }
180                 read_unlock(&port->roce.netdev_lock);
181         }
182
183         return NULL;
184 }
185
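/*
 * Netdev notifier: keep roce->netdev in sync on register/unregister and
 * translate carrier/state changes on the tracked netdev (or its LAG master)
 * into IB_EVENT_PORT_ACTIVE / IB_EVENT_PORT_ERR events.
 */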
186 static int mlx5_netdev_event(struct notifier_block *this,
187                              unsigned long event, void *ptr)
188 {
189         struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
190         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
191         u8 port_num = roce->native_port_num;
192         struct mlx5_core_dev *mdev;
193         struct mlx5_ib_dev *ibdev;
194
195         ibdev = roce->dev;
196         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
197         if (!mdev)
198                 return NOTIFY_DONE;
199
200         switch (event) {
201         case NETDEV_REGISTER:
202                 /* Should already be registered during the load */
203                 if (ibdev->is_rep)
204                         break;
205                 write_lock(&roce->netdev_lock);
206                 if (ndev->dev.parent == mdev->device)
207                         roce->netdev = ndev;
208                 write_unlock(&roce->netdev_lock);
209                 break;
210
211         case NETDEV_UNREGISTER:
212                 /* In case of reps, ib device goes away before the netdevs */
213                 write_lock(&roce->netdev_lock);
214                 if (roce->netdev == ndev)
215                         roce->netdev = NULL;
216                 write_unlock(&roce->netdev_lock);
217                 break;
218
219         case NETDEV_CHANGE:
220         case NETDEV_UP:
221         case NETDEV_DOWN: {
222                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
223                 struct net_device *upper = NULL;
224
225                 if (lag_ndev) {
226                         upper = netdev_master_upper_dev_get(lag_ndev);
227                         dev_put(lag_ndev);
228                 }
229
230                 if (ibdev->is_rep)
231                         roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
232                 if (!roce)
233                         return NOTIFY_DONE;
234                 if ((upper == ndev || (!upper && ndev == roce->netdev))
235                     && ibdev->ib_active) {
236                         struct ib_event ibev = { };
237                         enum ib_port_state port_state;
238
239                         if (get_port_state(&ibdev->ib_dev, port_num,
240                                            &port_state))
241                                 goto done;
242
243                         if (roce->last_port_state == port_state)
244                                 goto done;
245
246                         roce->last_port_state = port_state;
247                         ibev.device = &ibdev->ib_dev;
248                         if (port_state == IB_PORT_DOWN)
249                                 ibev.event = IB_EVENT_PORT_ERR;
250                         else if (port_state == IB_PORT_ACTIVE)
251                                 ibev.event = IB_EVENT_PORT_ACTIVE;
252                         else
253                                 goto done;
254
255                         ibev.element.port_num = port_num;
256                         ib_dispatch_event(&ibev);
257                 }
258                 break;
259         }
260
261         default:
262                 break;
263         }
264 done:
265         mlx5_ib_put_native_port_mdev(ibdev, port_num);
266         return NOTIFY_DONE;
267 }
268
269 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
270                                              u8 port_num)
271 {
272         struct mlx5_ib_dev *ibdev = to_mdev(device);
273         struct net_device *ndev;
274         struct mlx5_core_dev *mdev;
275
276         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
277         if (!mdev)
278                 return NULL;
279
280         ndev = mlx5_lag_get_roce_netdev(mdev);
281         if (ndev)
282                 goto out;
283
284         /* Ensure ndev does not disappear before we invoke dev_hold()
285          */
286         read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
287         ndev = ibdev->port[port_num - 1].roce.netdev;
288         if (ndev)
289                 dev_hold(ndev);
290         read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
291
292 out:
293         mlx5_ib_put_native_port_mdev(ibdev, port_num);
294         return ndev;
295 }
296
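/*
 * Resolve the mlx5_core_dev backing an IB port. With multiport enabled on an
 * Ethernet port, the affiliated mpi's mdev is returned and (unless it is the
 * master) a reference is taken that must be dropped with
 * mlx5_ib_put_native_port_mdev(); otherwise the device's own mdev is returned
 * with no reference. Callers in this file (e.g. mlx5_ib_get_netdev()) follow
 * the pattern:
 *
 *	mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
 *	if (!mdev)
 *		return NULL;
 *	...use mdev...
 *	mlx5_ib_put_native_port_mdev(ibdev, port_num);
 */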
297 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
298                                                    u8 ib_port_num,
299                                                    u8 *native_port_num)
300 {
301         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
302                                                           ib_port_num);
303         struct mlx5_core_dev *mdev = NULL;
304         struct mlx5_ib_multiport_info *mpi;
305         struct mlx5_ib_port *port;
306
307         if (!mlx5_core_mp_enabled(ibdev->mdev) ||
308             ll != IB_LINK_LAYER_ETHERNET) {
309                 if (native_port_num)
310                         *native_port_num = ib_port_num;
311                 return ibdev->mdev;
312         }
313
314         if (native_port_num)
315                 *native_port_num = 1;
316
317         port = &ibdev->port[ib_port_num - 1];
318         if (!port)
319                 return NULL;
320
321         spin_lock(&port->mp.mpi_lock);
322         mpi = ibdev->port[ib_port_num - 1].mp.mpi;
323         if (mpi && !mpi->unaffiliate) {
324                 mdev = mpi->mdev;
325                 /* If it's the master, no need to refcount; it'll exist
326                  * as long as the ib_dev exists.
327                  */
328                 if (!mpi->is_master)
329                         mpi->mdev_refcnt++;
330         }
331         spin_unlock(&port->mp.mpi_lock);
332
333         return mdev;
334 }
335
336 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
337 {
338         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
339                                                           port_num);
340         struct mlx5_ib_multiport_info *mpi;
341         struct mlx5_ib_port *port;
342
343         if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
344                 return;
345
346         port = &ibdev->port[port_num - 1];
347
348         spin_lock(&port->mp.mpi_lock);
349         mpi = ibdev->port[port_num - 1].mp.mpi;
350         if (mpi->is_master)
351                 goto out;
352
353         mpi->mdev_refcnt--;
354         if (mpi->unaffiliate)
355                 complete(&mpi->unref_comp);
356 out:
357         spin_unlock(&port->mp.mpi_lock);
358 }
359
360 static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, u8 *active_speed,
361                                            u8 *active_width)
362 {
363         switch (eth_proto_oper) {
364         case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
365         case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
366         case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
367         case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
368                 *active_width = IB_WIDTH_1X;
369                 *active_speed = IB_SPEED_SDR;
370                 break;
371         case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
372         case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
373         case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
374         case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
375         case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
376         case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
377         case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
378                 *active_width = IB_WIDTH_1X;
379                 *active_speed = IB_SPEED_QDR;
380                 break;
381         case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
382         case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
383         case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
384                 *active_width = IB_WIDTH_1X;
385                 *active_speed = IB_SPEED_EDR;
386                 break;
387         case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
388         case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
389         case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
390         case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
391                 *active_width = IB_WIDTH_4X;
392                 *active_speed = IB_SPEED_QDR;
393                 break;
394         case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
395         case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
396         case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
397                 *active_width = IB_WIDTH_1X;
398                 *active_speed = IB_SPEED_HDR;
399                 break;
400         case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
401                 *active_width = IB_WIDTH_4X;
402                 *active_speed = IB_SPEED_FDR;
403                 break;
404         case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
405         case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
406         case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
407         case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
408                 *active_width = IB_WIDTH_4X;
409                 *active_speed = IB_SPEED_EDR;
410                 break;
411         default:
412                 return -EINVAL;
413         }
414
415         return 0;
416 }
417
418 static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
419                                         u8 *active_width)
420 {
421         switch (eth_proto_oper) {
422         case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
423         case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
424                 *active_width = IB_WIDTH_1X;
425                 *active_speed = IB_SPEED_SDR;
426                 break;
427         case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
428                 *active_width = IB_WIDTH_1X;
429                 *active_speed = IB_SPEED_DDR;
430                 break;
431         case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
432                 *active_width = IB_WIDTH_1X;
433                 *active_speed = IB_SPEED_QDR;
434                 break;
435         case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
436                 *active_width = IB_WIDTH_4X;
437                 *active_speed = IB_SPEED_QDR;
438                 break;
439         case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
440                 *active_width = IB_WIDTH_1X;
441                 *active_speed = IB_SPEED_EDR;
442                 break;
443         case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
444                 *active_width = IB_WIDTH_2X;
445                 *active_speed = IB_SPEED_EDR;
446                 break;
447         case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
448                 *active_width = IB_WIDTH_1X;
449                 *active_speed = IB_SPEED_HDR;
450                 break;
451         case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
452                 *active_width = IB_WIDTH_4X;
453                 *active_speed = IB_SPEED_EDR;
454                 break;
455         case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
456                 *active_width = IB_WIDTH_2X;
457                 *active_speed = IB_SPEED_HDR;
458                 break;
459         case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
460                 *active_width = IB_WIDTH_4X;
461                 *active_speed = IB_SPEED_HDR;
462                 break;
463         default:
464                 return -EINVAL;
465         }
466
467         return 0;
468 }
469
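/*
 * Translate a PTYS eth_proto_oper bitmask into the IB speed/width pair
 * reported to users, selecting the extended or legacy protocol table
 * according to @ext.
 */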
470 static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
471                                     u8 *active_width, bool ext)
472 {
473         return ext ?
474                 translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
475                                              active_width) :
476                 translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
477                                                 active_width);
478 }
479
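/*
 * Fill ib_port_attr for an Ethernet (RoCE) port: active speed and width come
 * from the PTYS register, while port state and active MTU are derived from
 * the associated netdev (or its upper LAG device when LAG is active).
 */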
480 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
481                                 struct ib_port_attr *props)
482 {
483         struct mlx5_ib_dev *dev = to_mdev(device);
484         u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
485         struct mlx5_core_dev *mdev;
486         struct net_device *ndev, *upper;
487         enum ib_mtu ndev_ib_mtu;
488         bool put_mdev = true;
489         u16 qkey_viol_cntr;
490         u32 eth_prot_oper;
491         u8 mdev_port_num;
492         bool ext;
493         int err;
494
495         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
496         if (!mdev) {
497                 /* This means the port isn't affiliated yet. Get the
498                  * info for the master port instead.
499                  */
500                 put_mdev = false;
501                 mdev = dev->mdev;
502                 mdev_port_num = 1;
503                 port_num = 1;
504         }
505
506         /* Possible bad flows are checked before filling out props, so in
507          * case of an error props will still be zeroed out.
508          * Use the native port in case of reps.
509          */
510         if (dev->is_rep)
511                 err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
512                                            1);
513         else
514                 err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
515                                            mdev_port_num);
516         if (err)
517                 goto out;
518         ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
519         eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
520
521         props->active_width     = IB_WIDTH_4X;
522         props->active_speed     = IB_SPEED_QDR;
523
524         translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
525                                  &props->active_width, ext);
526
527         props->port_cap_flags |= IB_PORT_CM_SUP;
528         props->ip_gids = true;
529
530         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
531                                                 roce_address_table_size);
532         props->max_mtu          = IB_MTU_4096;
533         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
534         props->pkey_tbl_len     = 1;
535         props->state            = IB_PORT_DOWN;
536         props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
537
538         mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
539         props->qkey_viol_cntr = qkey_viol_cntr;
540
541         /* If this is a stub query for an unaffiliated port, stop here */
542         if (!put_mdev)
543                 goto out;
544
545         ndev = mlx5_ib_get_netdev(device, port_num);
546         if (!ndev)
547                 goto out;
548
549         if (dev->lag_active) {
550                 rcu_read_lock();
551                 upper = netdev_master_upper_dev_get_rcu(ndev);
552                 if (upper) {
553                         dev_put(ndev);
554                         ndev = upper;
555                         dev_hold(ndev);
556                 }
557                 rcu_read_unlock();
558         }
559
560         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
561                 props->state      = IB_PORT_ACTIVE;
562                 props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
563         }
564
565         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
566
567         dev_put(ndev);
568
569         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
570 out:
571         if (put_mdev)
572                 mlx5_ib_put_native_port_mdev(dev, port_num);
573         return err;
574 }
575
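/*
 * Program a RoCE GID table entry in the device. RoCE version and L3 type are
 * derived from the GID attribute; mlx5_ib_del_gid() passes a NULL gid to
 * clear an entry.
 */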
576 static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
577                          unsigned int index, const union ib_gid *gid,
578                          const struct ib_gid_attr *attr)
579 {
580         enum ib_gid_type gid_type = IB_GID_TYPE_IB;
581         u16 vlan_id = 0xffff;
582         u8 roce_version = 0;
583         u8 roce_l3_type = 0;
584         u8 mac[ETH_ALEN];
585         int ret;
586
587         if (gid) {
588                 gid_type = attr->gid_type;
589                 ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
590                 if (ret)
591                         return ret;
592         }
593
594         switch (gid_type) {
595         case IB_GID_TYPE_IB:
596                 roce_version = MLX5_ROCE_VERSION_1;
597                 break;
598         case IB_GID_TYPE_ROCE_UDP_ENCAP:
599                 roce_version = MLX5_ROCE_VERSION_2;
600                 if (ipv6_addr_v4mapped((void *)gid))
601                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
602                 else
603                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
604                 break;
605
606         default:
607                 mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
608         }
609
610         return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
611                                       roce_l3_type, gid->raw, mac,
612                                       vlan_id < VLAN_CFI_MASK, vlan_id,
613                                       port_num);
614 }
615
616 static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
617                            __always_unused void **context)
618 {
619         return set_roce_addr(to_mdev(attr->device), attr->port_num,
620                              attr->index, &attr->gid, attr);
621 }
622
623 static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
624                            __always_unused void **context)
625 {
626         return set_roce_addr(to_mdev(attr->device), attr->port_num,
627                              attr->index, NULL, NULL);
628 }
629
630 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
631                                const struct ib_gid_attr *attr)
632 {
633         if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
634                 return 0;
635
636         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
637 }
638
639 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
640 {
641         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
642                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
643         return 0;
644 }
645
646 enum {
647         MLX5_VPORT_ACCESS_METHOD_MAD,
648         MLX5_VPORT_ACCESS_METHOD_HCA,
649         MLX5_VPORT_ACCESS_METHOD_NIC,
650 };
651
652 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
653 {
654         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
655                 return MLX5_VPORT_ACCESS_METHOD_MAD;
656
657         if (mlx5_ib_port_link_layer(ibdev, 1) ==
658             IB_LINK_LAYER_ETHERNET)
659                 return MLX5_VPORT_ACCESS_METHOD_NIC;
660
661         return MLX5_VPORT_ACCESS_METHOD_HCA;
662 }
663
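/*
 * Advertise IB_ATOMIC_HCA only if the device supports both 8-byte
 * compare-and-swap and fetch-and-add on the QP and can respond in host
 * endianness; otherwise report IB_ATOMIC_NONE.
 */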
664 static void get_atomic_caps(struct mlx5_ib_dev *dev,
665                             u8 atomic_size_qp,
666                             struct ib_device_attr *props)
667 {
668         u8 tmp;
669         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
670         u8 atomic_req_8B_endianness_mode =
671                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
672
673         /* Check if HW supports 8 bytes standard atomic operations and is
674          * capable of responding in host endianness.
675          */
676         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
677         if (((atomic_operations & tmp) == tmp) &&
678             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
679             (atomic_req_8B_endianness_mode)) {
680                 props->atomic_cap = IB_ATOMIC_HCA;
681         } else {
682                 props->atomic_cap = IB_ATOMIC_NONE;
683         }
684 }
685
686 static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
687                                struct ib_device_attr *props)
688 {
689         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
690
691         get_atomic_caps(dev, atomic_size_qp, props);
692 }
693
694 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
695                                         __be64 *sys_image_guid)
696 {
697         struct mlx5_ib_dev *dev = to_mdev(ibdev);
698         struct mlx5_core_dev *mdev = dev->mdev;
699         u64 tmp;
700         int err;
701
702         switch (mlx5_get_vport_access_method(ibdev)) {
703         case MLX5_VPORT_ACCESS_METHOD_MAD:
704                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
705                                                             sys_image_guid);
706
707         case MLX5_VPORT_ACCESS_METHOD_HCA:
708                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
709                 break;
710
711         case MLX5_VPORT_ACCESS_METHOD_NIC:
712                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
713                 break;
714
715         default:
716                 return -EINVAL;
717         }
718
719         if (!err)
720                 *sys_image_guid = cpu_to_be64(tmp);
721
722         return err;
723
724 }
725
726 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
727                                 u16 *max_pkeys)
728 {
729         struct mlx5_ib_dev *dev = to_mdev(ibdev);
730         struct mlx5_core_dev *mdev = dev->mdev;
731
732         switch (mlx5_get_vport_access_method(ibdev)) {
733         case MLX5_VPORT_ACCESS_METHOD_MAD:
734                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
735
736         case MLX5_VPORT_ACCESS_METHOD_HCA:
737         case MLX5_VPORT_ACCESS_METHOD_NIC:
738                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
739                                                 pkey_table_size));
740                 return 0;
741
742         default:
743                 return -EINVAL;
744         }
745 }
746
747 static int mlx5_query_vendor_id(struct ib_device *ibdev,
748                                 u32 *vendor_id)
749 {
750         struct mlx5_ib_dev *dev = to_mdev(ibdev);
751
752         switch (mlx5_get_vport_access_method(ibdev)) {
753         case MLX5_VPORT_ACCESS_METHOD_MAD:
754                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
755
756         case MLX5_VPORT_ACCESS_METHOD_HCA:
757         case MLX5_VPORT_ACCESS_METHOD_NIC:
758                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
759
760         default:
761                 return -EINVAL;
762         }
763 }
764
765 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
766                                 __be64 *node_guid)
767 {
768         u64 tmp;
769         int err;
770
771         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
772         case MLX5_VPORT_ACCESS_METHOD_MAD:
773                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
774
775         case MLX5_VPORT_ACCESS_METHOD_HCA:
776                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
777                 break;
778
779         case MLX5_VPORT_ACCESS_METHOD_NIC:
780                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
781                 break;
782
783         default:
784                 return -EINVAL;
785         }
786
787         if (!err)
788                 *node_guid = cpu_to_be64(tmp);
789
790         return err;
791 }
792
793 struct mlx5_reg_node_desc {
794         u8      desc[IB_DEVICE_NODE_DESC_MAX];
795 };
796
797 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
798 {
799         struct mlx5_reg_node_desc in;
800
801         if (mlx5_use_mad_ifc(dev))
802                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
803
804         memset(&in, 0, sizeof(in));
805
806         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
807                                     sizeof(struct mlx5_reg_node_desc),
808                                     MLX5_REG_NODE_DESC, 0, 0);
809 }
810
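/*
 * ib_device_attr query: core attributes are filled from the HCA capability
 * pages, and each vendor-specific response field is added only when the
 * caller's output buffer (uhw_outlen) is large enough to hold it.
 */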
811 static int mlx5_ib_query_device(struct ib_device *ibdev,
812                                 struct ib_device_attr *props,
813                                 struct ib_udata *uhw)
814 {
815         size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
816         struct mlx5_ib_dev *dev = to_mdev(ibdev);
817         struct mlx5_core_dev *mdev = dev->mdev;
818         int err = -ENOMEM;
819         int max_sq_desc;
820         int max_rq_sg;
821         int max_sq_sg;
822         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
823         bool raw_support = !mlx5_core_mp_enabled(mdev);
824         struct mlx5_ib_query_device_resp resp = {};
825         size_t resp_len;
826         u64 max_tso;
827
828         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
829         if (uhw_outlen && uhw_outlen < resp_len)
830                 return -EINVAL;
831
832         resp.response_length = resp_len;
833
834         if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
835                 return -EINVAL;
836
837         memset(props, 0, sizeof(*props));
838         err = mlx5_query_system_image_guid(ibdev,
839                                            &props->sys_image_guid);
840         if (err)
841                 return err;
842
843         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
844         if (err)
845                 return err;
846
847         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
848         if (err)
849                 return err;
850
851         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
852                 (fw_rev_min(dev->mdev) << 16) |
853                 fw_rev_sub(dev->mdev);
854         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
855                 IB_DEVICE_PORT_ACTIVE_EVENT             |
856                 IB_DEVICE_SYS_IMAGE_GUID                |
857                 IB_DEVICE_RC_RNR_NAK_GEN;
858
859         if (MLX5_CAP_GEN(mdev, pkv))
860                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
861         if (MLX5_CAP_GEN(mdev, qkv))
862                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
863         if (MLX5_CAP_GEN(mdev, apm))
864                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
865         if (MLX5_CAP_GEN(mdev, xrc))
866                 props->device_cap_flags |= IB_DEVICE_XRC;
867         if (MLX5_CAP_GEN(mdev, imaicl)) {
868                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
869                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
870                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
871                 /* We support 'Gappy' memory registration too */
872                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
873         }
874         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
875         if (MLX5_CAP_GEN(mdev, sho)) {
876                 props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
877                 /* At this stage no support for signature handover */
878                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
879                                       IB_PROT_T10DIF_TYPE_2 |
880                                       IB_PROT_T10DIF_TYPE_3;
881                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
882                                        IB_GUARD_T10DIF_CSUM;
883         }
884         if (MLX5_CAP_GEN(mdev, block_lb_mc))
885                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
886
887         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
888                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
889                         /* Legacy bit to support old userspace libraries */
890                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
891                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
892                 }
893
894                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
895                         props->raw_packet_caps |=
896                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
897
898                 if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
899                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
900                         if (max_tso) {
901                                 resp.tso_caps.max_tso = 1 << max_tso;
902                                 resp.tso_caps.supported_qpts |=
903                                         1 << IB_QPT_RAW_PACKET;
904                                 resp.response_length += sizeof(resp.tso_caps);
905                         }
906                 }
907
908                 if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
909                         resp.rss_caps.rx_hash_function =
910                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
911                         resp.rss_caps.rx_hash_fields_mask =
912                                                 MLX5_RX_HASH_SRC_IPV4 |
913                                                 MLX5_RX_HASH_DST_IPV4 |
914                                                 MLX5_RX_HASH_SRC_IPV6 |
915                                                 MLX5_RX_HASH_DST_IPV6 |
916                                                 MLX5_RX_HASH_SRC_PORT_TCP |
917                                                 MLX5_RX_HASH_DST_PORT_TCP |
918                                                 MLX5_RX_HASH_SRC_PORT_UDP |
919                                                 MLX5_RX_HASH_DST_PORT_UDP |
920                                                 MLX5_RX_HASH_INNER;
921                         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
922                             MLX5_ACCEL_IPSEC_CAP_DEVICE)
923                                 resp.rss_caps.rx_hash_fields_mask |=
924                                         MLX5_RX_HASH_IPSEC_SPI;
925                         resp.response_length += sizeof(resp.rss_caps);
926                 }
927         } else {
928                 if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
929                         resp.response_length += sizeof(resp.tso_caps);
930                 if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
931                         resp.response_length += sizeof(resp.rss_caps);
932         }
933
934         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
935                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
936                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
937         }
938
939         if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
940             MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
941             raw_support)
942                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
943
944         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
945             MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
946                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
947
948         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
949             MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
950             raw_support) {
951                 /* Legacy bit to support old userspace libraries */
952                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
953                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
954         }
955
956         if (MLX5_CAP_DEV_MEM(mdev, memic)) {
957                 props->max_dm_size =
958                         MLX5_CAP_DEV_MEM(mdev, max_memic_size);
959         }
960
961         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
962                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
963
964         if (MLX5_CAP_GEN(mdev, end_pad))
965                 props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
966
967         props->vendor_part_id      = mdev->pdev->device;
968         props->hw_ver              = mdev->pdev->revision;
969
970         props->max_mr_size         = ~0ull;
971         props->page_size_cap       = ~(min_page_size - 1);
972         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
973         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
974         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
975                      sizeof(struct mlx5_wqe_data_seg);
976         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
977         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
978                      sizeof(struct mlx5_wqe_raddr_seg)) /
979                 sizeof(struct mlx5_wqe_data_seg);
980         props->max_send_sge = max_sq_sg;
981         props->max_recv_sge = max_rq_sg;
982         props->max_sge_rd          = MLX5_MAX_SGE_RD;
983         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
984         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
985         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
986         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
987         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
988         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
989         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
990         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
991         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
992         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
993         props->max_srq_sge         = max_rq_sg - 1;
994         props->max_fast_reg_page_list_len =
995                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
996         props->max_pi_fast_reg_page_list_len =
997                 props->max_fast_reg_page_list_len / 2;
998         props->max_sgl_rd =
999                 MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
1000         get_atomic_caps_qp(dev, props);
1001         props->masked_atomic_cap   = IB_ATOMIC_NONE;
1002         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
1003         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
1004         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
1005                                            props->max_mcast_grp;
1006         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
1007         props->max_ah = INT_MAX;
1008         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
1009         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
1010
1011         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1012                 if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
1013                         props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
1014                 props->odp_caps = dev->odp_caps;
1015                 if (!uhw) {
1016                         /* ODP for kernel QPs is not implemented for receive
1017                          * WQEs and SRQ WQEs
1018                          */
1019                         props->odp_caps.per_transport_caps.rc_odp_caps &=
1020                                 ~(IB_ODP_SUPPORT_READ |
1021                                   IB_ODP_SUPPORT_SRQ_RECV);
1022                         props->odp_caps.per_transport_caps.uc_odp_caps &=
1023                                 ~(IB_ODP_SUPPORT_READ |
1024                                   IB_ODP_SUPPORT_SRQ_RECV);
1025                         props->odp_caps.per_transport_caps.ud_odp_caps &=
1026                                 ~(IB_ODP_SUPPORT_READ |
1027                                   IB_ODP_SUPPORT_SRQ_RECV);
1028                         props->odp_caps.per_transport_caps.xrc_odp_caps &=
1029                                 ~(IB_ODP_SUPPORT_READ |
1030                                   IB_ODP_SUPPORT_SRQ_RECV);
1031                 }
1032         }
1033
1034         if (MLX5_CAP_GEN(mdev, cd))
1035                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
1036
1037         if (mlx5_core_is_vf(mdev))
1038                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
1039
1040         if (mlx5_ib_port_link_layer(ibdev, 1) ==
1041             IB_LINK_LAYER_ETHERNET && raw_support) {
1042                 props->rss_caps.max_rwq_indirection_tables =
1043                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
1044                 props->rss_caps.max_rwq_indirection_table_size =
1045                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
1046                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
1047                 props->max_wq_type_rq =
1048                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
1049         }
1050
1051         if (MLX5_CAP_GEN(mdev, tag_matching)) {
1052                 props->tm_caps.max_num_tags =
1053                         (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
1054                 props->tm_caps.max_ops =
1055                         1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1056                 props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
1057         }
1058
1059         if (MLX5_CAP_GEN(mdev, tag_matching) &&
1060             MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
1061                 props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
1062                 props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
1063         }
1064
1065         if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
1066                 props->cq_caps.max_cq_moderation_count =
1067                                                 MLX5_MAX_CQ_COUNT;
1068                 props->cq_caps.max_cq_moderation_period =
1069                                                 MLX5_MAX_CQ_PERIOD;
1070         }
1071
1072         if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
1073                 resp.response_length += sizeof(resp.cqe_comp_caps);
1074
1075                 if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
1076                         resp.cqe_comp_caps.max_num =
1077                                 MLX5_CAP_GEN(dev->mdev,
1078                                              cqe_compression_max_num);
1079
1080                         resp.cqe_comp_caps.supported_format =
1081                                 MLX5_IB_CQE_RES_FORMAT_HASH |
1082                                 MLX5_IB_CQE_RES_FORMAT_CSUM;
1083
1084                         if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
1085                                 resp.cqe_comp_caps.supported_format |=
1086                                         MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
1087                 }
1088         }
1089
1090         if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
1091             raw_support) {
1092                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
1093                     MLX5_CAP_GEN(mdev, qos)) {
1094                         resp.packet_pacing_caps.qp_rate_limit_max =
1095                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
1096                         resp.packet_pacing_caps.qp_rate_limit_min =
1097                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
1098                         resp.packet_pacing_caps.supported_qpts |=
1099                                 1 << IB_QPT_RAW_PACKET;
1100                         if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
1101                             MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
1102                                 resp.packet_pacing_caps.cap_flags |=
1103                                         MLX5_IB_PP_SUPPORT_BURST;
1104                 }
1105                 resp.response_length += sizeof(resp.packet_pacing_caps);
1106         }
1107
1108         if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
1109             uhw_outlen) {
1110                 if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
1111                         resp.mlx5_ib_support_multi_pkt_send_wqes =
1112                                 MLX5_IB_ALLOW_MPW;
1113
1114                 if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
1115                         resp.mlx5_ib_support_multi_pkt_send_wqes |=
1116                                 MLX5_IB_SUPPORT_EMPW;
1117
1118                 resp.response_length +=
1119                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
1120         }
1121
1122         if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
1123                 resp.response_length += sizeof(resp.flags);
1124
1125                 if (MLX5_CAP_GEN(mdev, cqe_compression_128))
1126                         resp.flags |=
1127                                 MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
1128
1129                 if (MLX5_CAP_GEN(mdev, cqe_128_always))
1130                         resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
1131                 if (MLX5_CAP_GEN(mdev, qp_packet_based))
1132                         resp.flags |=
1133                                 MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
1134
1135                 resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
1136         }
1137
1138         if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
1139                 resp.response_length += sizeof(resp.sw_parsing_caps);
1140                 if (MLX5_CAP_ETH(mdev, swp)) {
1141                         resp.sw_parsing_caps.sw_parsing_offloads |=
1142                                 MLX5_IB_SW_PARSING;
1143
1144                         if (MLX5_CAP_ETH(mdev, swp_csum))
1145                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1146                                         MLX5_IB_SW_PARSING_CSUM;
1147
1148                         if (MLX5_CAP_ETH(mdev, swp_lso))
1149                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1150                                         MLX5_IB_SW_PARSING_LSO;
1151
1152                         if (resp.sw_parsing_caps.sw_parsing_offloads)
1153                                 resp.sw_parsing_caps.supported_qpts =
1154                                         BIT(IB_QPT_RAW_PACKET);
1155                 }
1156         }
1157
1158         if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
1159             raw_support) {
1160                 resp.response_length += sizeof(resp.striding_rq_caps);
1161                 if (MLX5_CAP_GEN(mdev, striding_rq)) {
1162                         resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
1163                                 MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1164                         resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
1165                                 MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
1166                         if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
1167                                 resp.striding_rq_caps
1168                                         .min_single_wqe_log_num_of_strides =
1169                                         MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1170                         else
1171                                 resp.striding_rq_caps
1172                                         .min_single_wqe_log_num_of_strides =
1173                                         MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1174                         resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
1175                                 MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
1176                         resp.striding_rq_caps.supported_qpts =
1177                                 BIT(IB_QPT_RAW_PACKET);
1178                 }
1179         }
1180
1181         if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
1182                 resp.response_length += sizeof(resp.tunnel_offloads_caps);
1183                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
1184                         resp.tunnel_offloads_caps |=
1185                                 MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
1186                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
1187                         resp.tunnel_offloads_caps |=
1188                                 MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
1189                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
1190                         resp.tunnel_offloads_caps |=
1191                                 MLX5_IB_TUNNELED_OFFLOADS_GRE;
1192                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
1193                         resp.tunnel_offloads_caps |=
1194                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
1195                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
1196                         resp.tunnel_offloads_caps |=
1197                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
1198         }
1199
1200         if (uhw_outlen) {
1201                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
1202
1203                 if (err)
1204                         return err;
1205         }
1206
1207         return 0;
1208 }
1209
1210 enum mlx5_ib_width {
1211         MLX5_IB_WIDTH_1X        = 1 << 0,
1212         MLX5_IB_WIDTH_2X        = 1 << 1,
1213         MLX5_IB_WIDTH_4X        = 1 << 2,
1214         MLX5_IB_WIDTH_8X        = 1 << 3,
1215         MLX5_IB_WIDTH_12X       = 1 << 4
1216 };
1217
1218 static void translate_active_width(struct ib_device *ibdev, u8 active_width,
1219                                   u8 *ib_width)
1220 {
1221         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1222
1223         if (active_width & MLX5_IB_WIDTH_1X)
1224                 *ib_width = IB_WIDTH_1X;
1225         else if (active_width & MLX5_IB_WIDTH_2X)
1226                 *ib_width = IB_WIDTH_2X;
1227         else if (active_width & MLX5_IB_WIDTH_4X)
1228                 *ib_width = IB_WIDTH_4X;
1229         else if (active_width & MLX5_IB_WIDTH_8X)
1230                 *ib_width = IB_WIDTH_8X;
1231         else if (active_width & MLX5_IB_WIDTH_12X)
1232                 *ib_width = IB_WIDTH_12X;
1233         else {
1234                 mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
1235                             (int)active_width);
1236                 *ib_width = IB_WIDTH_4X;
1237         }
1238
1239         return;
1240 }
1241
1242 static int mlx5_mtu_to_ib_mtu(int mtu)
1243 {
1244         switch (mtu) {
1245         case 256: return 1;
1246         case 512: return 2;
1247         case 1024: return 3;
1248         case 2048: return 4;
1249         case 4096: return 5;
1250         default:
1251                 pr_warn("invalid mtu\n");
1252                 return -1;
1253         }
1254 }
1255
1256 enum ib_max_vl_num {
1257         __IB_MAX_VL_0           = 1,
1258         __IB_MAX_VL_0_1         = 2,
1259         __IB_MAX_VL_0_3         = 3,
1260         __IB_MAX_VL_0_7         = 4,
1261         __IB_MAX_VL_0_14        = 5,
1262 };
1263
1264 enum mlx5_vl_hw_cap {
1265         MLX5_VL_HW_0    = 1,
1266         MLX5_VL_HW_0_1  = 2,
1267         MLX5_VL_HW_0_2  = 3,
1268         MLX5_VL_HW_0_3  = 4,
1269         MLX5_VL_HW_0_4  = 5,
1270         MLX5_VL_HW_0_5  = 6,
1271         MLX5_VL_HW_0_6  = 7,
1272         MLX5_VL_HW_0_7  = 8,
1273         MLX5_VL_HW_0_14 = 15
1274 };
1275
1276 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
1277                                 u8 *max_vl_num)
1278 {
1279         switch (vl_hw_cap) {
1280         case MLX5_VL_HW_0:
1281                 *max_vl_num = __IB_MAX_VL_0;
1282                 break;
1283         case MLX5_VL_HW_0_1:
1284                 *max_vl_num = __IB_MAX_VL_0_1;
1285                 break;
1286         case MLX5_VL_HW_0_3:
1287                 *max_vl_num = __IB_MAX_VL_0_3;
1288                 break;
1289         case MLX5_VL_HW_0_7:
1290                 *max_vl_num = __IB_MAX_VL_0_7;
1291                 break;
1292         case MLX5_VL_HW_0_14:
1293                 *max_vl_num = __IB_MAX_VL_0_14;
1294                 break;
1295
1296         default:
1297                 return -EINVAL;
1298         }
1299
1300         return 0;
1301 }
1302
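/*
 * Fill ib_port_attr for an IB link-layer port from the HCA vport context
 * together with per-port queries for link width, speed, MTU and VL
 * capabilities.
 */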
1303 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
1304                                struct ib_port_attr *props)
1305 {
1306         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1307         struct mlx5_core_dev *mdev = dev->mdev;
1308         struct mlx5_hca_vport_context *rep;
1309         u16 max_mtu;
1310         u16 oper_mtu;
1311         int err;
1312         u8 ib_link_width_oper;
1313         u8 vl_hw_cap;
1314
1315         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1316         if (!rep) {
1317                 err = -ENOMEM;
1318                 goto out;
1319         }
1320
1321         /* props is zeroed by the caller; avoid zeroing it here */
1322
1323         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
1324         if (err)
1325                 goto out;
1326
1327         props->lid              = rep->lid;
1328         props->lmc              = rep->lmc;
1329         props->sm_lid           = rep->sm_lid;
1330         props->sm_sl            = rep->sm_sl;
1331         props->state            = rep->vport_state;
1332         props->phys_state       = rep->port_physical_state;
1333         props->port_cap_flags   = rep->cap_mask1;
1334         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
1335         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
1336         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
1337         props->bad_pkey_cntr    = rep->pkey_violation_counter;
1338         props->qkey_viol_cntr   = rep->qkey_violation_counter;
1339         props->subnet_timeout   = rep->subnet_timeout;
1340         props->init_type_reply  = rep->init_type_reply;
1341
1342         if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
1343                 props->port_cap_flags2 = rep->cap_mask2;
1344
1345         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
1346         if (err)
1347                 goto out;
1348
1349         translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
1350
1351         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
1352         if (err)
1353                 goto out;
1354
1355         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
1356
1357         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
1358
1359         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
1360
1361         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
1362
1363         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
1364         if (err)
1365                 goto out;
1366
1367         err = translate_max_vl_num(ibdev, vl_hw_cap,
1368                                    &props->max_vl_num);
1369 out:
1370         kfree(rep);
1371         return err;
1372 }
1373
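/*
 * Generic query_port entry point: dispatch to the MAD, HCA or RoCE (NIC)
 * query path depending on the vport access method, then subtract the GIDs
 * reserved by the core from the reported GID table length.
 */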
1374 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
1375                        struct ib_port_attr *props)
1376 {
1377         unsigned int count;
1378         int ret;
1379
1380         switch (mlx5_get_vport_access_method(ibdev)) {
1381         case MLX5_VPORT_ACCESS_METHOD_MAD:
1382                 ret = mlx5_query_mad_ifc_port(ibdev, port, props);
1383                 break;
1384
1385         case MLX5_VPORT_ACCESS_METHOD_HCA:
1386                 ret = mlx5_query_hca_port(ibdev, port, props);
1387                 break;
1388
1389         case MLX5_VPORT_ACCESS_METHOD_NIC:
1390                 ret = mlx5_query_port_roce(ibdev, port, props);
1391                 break;
1392
1393         default:
1394                 ret = -EINVAL;
1395         }
1396
1397         if (!ret && props) {
1398                 struct mlx5_ib_dev *dev = to_mdev(ibdev);
1399                 struct mlx5_core_dev *mdev;
1400                 bool put_mdev = true;
1401
1402                 mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
1403                 if (!mdev) {
1404                         /* If the port isn't affiliated yet, query the master.
1405                          * The master and slave will have the same values.
1406                          */
1407                         mdev = dev->mdev;
1408                         port = 1;
1409                         put_mdev = false;
1410                 }
1411                 count = mlx5_core_reserved_gids_count(mdev);
1412                 if (put_mdev)
1413                         mlx5_ib_put_native_port_mdev(dev, port);
1414                 props->gid_tbl_len -= count;
1415         }
1416         return ret;
1417 }
1418
1419 static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
1420                                   struct ib_port_attr *props)
1421 {
1422         int ret;
1423
1424         /* Only link layer == Ethernet is valid for representors,
1425          * and we always use port 1.
1426          */
1427         ret = mlx5_query_port_roce(ibdev, port, props);
1428         if (ret || !props)
1429                 return ret;
1430
1431         /* We don't support GIDs */
1432         props->gid_tbl_len = 0;
1433
1434         return ret;
1435 }
1436
1437 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
1438                              union ib_gid *gid)
1439 {
1440         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1441         struct mlx5_core_dev *mdev = dev->mdev;
1442
1443         switch (mlx5_get_vport_access_method(ibdev)) {
1444         case MLX5_VPORT_ACCESS_METHOD_MAD:
1445                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1446
1447         case MLX5_VPORT_ACCESS_METHOD_HCA:
1448                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
1449
1450         default:
1451                 return -EINVAL;
1452         }
1453
1454 }
1455
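/*
 * Read a PKey table entry for @port.  On a multi-port device the query is
 * issued against the native mdev owning the port; if the port isn't
 * affiliated yet, the master device is used instead.
 */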
1456 static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
1457                                    u16 index, u16 *pkey)
1458 {
1459         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1460         struct mlx5_core_dev *mdev;
1461         bool put_mdev = true;
1462         u8 mdev_port_num;
1463         int err;
1464
1465         mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
1466         if (!mdev) {
1467                 /* The port isn't affiliated yet; get the PKey from the master
1468                  * port. For RoCE the PKey tables will be the same.
1469                  */
1470                 put_mdev = false;
1471                 mdev = dev->mdev;
1472                 mdev_port_num = 1;
1473         }
1474
1475         err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
1476                                         index, pkey);
1477         if (put_mdev)
1478                 mlx5_ib_put_native_port_mdev(dev, port);
1479
1480         return err;
1481 }
1482
1483 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1484                               u16 *pkey)
1485 {
1486         switch (mlx5_get_vport_access_method(ibdev)) {
1487         case MLX5_VPORT_ACCESS_METHOD_MAD:
1488                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1489
1490         case MLX5_VPORT_ACCESS_METHOD_HCA:
1491         case MLX5_VPORT_ACCESS_METHOD_NIC:
1492                 return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
1493         default:
1494                 return -EINVAL;
1495         }
1496 }
1497
1498 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1499                                  struct ib_device_modify *props)
1500 {
1501         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1502         struct mlx5_reg_node_desc in;
1503         struct mlx5_reg_node_desc out;
1504         int err;
1505
1506         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1507                 return -EOPNOTSUPP;
1508
1509         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1510                 return 0;
1511
1512         /*
1513          * If possible, pass the node desc to FW so it can generate a
1514          * trap 144 (node description changed).  If the cmd fails, just ignore.
1515          */
1516         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1517         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1518                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1519         if (err)
1520                 return err;
1521
1522         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1523
1524         return err;
1525 }
1526
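/*
 * Atomically update the port capability bits through the HCA vport
 * context: every bit in @mask must be modifiable according to
 * cap_mask1_perm, otherwise the request is rejected.
 */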
1527 static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
1528                                 u32 value)
1529 {
1530         struct mlx5_hca_vport_context ctx = {};
1531         struct mlx5_core_dev *mdev;
1532         u8 mdev_port_num;
1533         int err;
1534
1535         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
1536         if (!mdev)
1537                 return -ENODEV;
1538
1539         err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
1540         if (err)
1541                 goto out;
1542
1543         if (~ctx.cap_mask1_perm & mask) {
1544                 mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
1545                              mask, ctx.cap_mask1_perm);
1546                 err = -EINVAL;
1547                 goto out;
1548         }
1549
1550         ctx.cap_mask1 = value;
1551         ctx.cap_mask1_perm = mask;
1552         err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
1553                                                  0, &ctx);
1554
1555 out:
1556         mlx5_ib_put_native_port_mdev(dev, port_num);
1557
1558         return err;
1559 }
1560
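/*
 * For IB links, either update the capability mask atomically through the
 * vport context (when ib_virt is supported) or read-modify-write the port
 * caps under cap_mask_mutex.  Ethernet ports are a no-op here.
 */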
1561 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1562                                struct ib_port_modify *props)
1563 {
1564         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1565         struct ib_port_attr attr;
1566         u32 tmp;
1567         int err;
1568         u32 change_mask;
1569         u32 value;
1570         bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1571                       IB_LINK_LAYER_INFINIBAND);
1572
1573         /* The CM layer calls ib_modify_port() regardless of the link layer. For
1574          * Ethernet ports, qkey violations and port capabilities are meaningless.
1575          */
1576         if (!is_ib)
1577                 return 0;
1578
1579         if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1580                 change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1581                 value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1582                 return set_port_caps_atomic(dev, port, change_mask, value);
1583         }
1584
1585         mutex_lock(&dev->cap_mask_mutex);
1586
1587         err = ib_query_port(ibdev, port, &attr);
1588         if (err)
1589                 goto out;
1590
1591         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1592                 ~props->clr_port_cap_mask;
1593
1594         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1595
1596 out:
1597         mutex_unlock(&dev->cap_mask_mutex);
1598         return err;
1599 }
1600
1601 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1602 {
1603         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1604                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1605 }
1606
1607 static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1608 {
1609         /* A large system page without 4K UAR support might limit the dynamic size */
1610         if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1611                 return MLX5_MIN_DYN_BFREGS;
1612
1613         return MLX5_MAX_DYN_BFREGS;
1614 }
1615
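/*
 * Split the user-requested bfreg count into statically allocated system
 * pages plus a dynamic region, rounding the request up to a whole number
 * of system pages.  Updates req->total_num_bfregs and fills in @bfregi.
 */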
1616 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1617                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1618                              struct mlx5_bfreg_info *bfregi)
1619 {
1620         int uars_per_sys_page;
1621         int bfregs_per_sys_page;
1622         int ref_bfregs = req->total_num_bfregs;
1623
1624         if (req->total_num_bfregs == 0)
1625                 return -EINVAL;
1626
1627         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1628         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1629
1630         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1631                 return -ENOMEM;
1632
1633         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1634         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1635         /* This holds the required static allocation requested by the user */
1636         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1637         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1638                 return -EINVAL;
1639
1640         bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1641         bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1642         bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1643         bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1644
1645         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1646                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1647                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1648                     req->total_num_bfregs, bfregi->total_num_bfregs,
1649                     bfregi->num_sys_pages);
1650
1651         return 0;
1652 }
1653
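/*
 * Allocate one UAR from firmware for every static system page of the
 * context and mark the dynamic slots as invalid; on failure, free the
 * UARs that were already allocated.
 */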
1654 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1655 {
1656         struct mlx5_bfreg_info *bfregi;
1657         int err;
1658         int i;
1659
1660         bfregi = &context->bfregi;
1661         for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1662                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1663                 if (err)
1664                         goto error;
1665
1666                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1667         }
1668
1669         for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1670                 bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1671
1672         return 0;
1673
1674 error:
1675         for (--i; i >= 0; i--)
1676                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1677                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1678
1679         return err;
1680 }
1681
1682 static void deallocate_uars(struct mlx5_ib_dev *dev,
1683                             struct mlx5_ib_ucontext *context)
1684 {
1685         struct mlx5_bfreg_info *bfregi;
1686         int i;
1687
1688         bfregi = &context->bfregi;
1689         for (i = 0; i < bfregi->num_sys_pages; i++)
1690                 if (i < bfregi->num_static_sys_pages ||
1691                     bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1692                         mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1693 }
1694
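/*
 * Reference-count the users of local loopback (transport domains and QPs)
 * and enable it in the NIC vport context once a second transport domain or
 * the first QP shows up; mlx5_ib_disable_lb() undoes this symmetrically.
 */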
1695 int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1696 {
1697         int err = 0;
1698
1699         mutex_lock(&dev->lb.mutex);
1700         if (td)
1701                 dev->lb.user_td++;
1702         if (qp)
1703                 dev->lb.qps++;
1704
1705         if (dev->lb.user_td == 2 ||
1706             dev->lb.qps == 1) {
1707                 if (!dev->lb.enabled) {
1708                         err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1709                         dev->lb.enabled = true;
1710                 }
1711         }
1712
1713         mutex_unlock(&dev->lb.mutex);
1714
1715         return err;
1716 }
1717
1718 void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1719 {
1720         mutex_lock(&dev->lb.mutex);
1721         if (td)
1722                 dev->lb.user_td--;
1723         if (qp)
1724                 dev->lb.qps--;
1725
1726         if (dev->lb.user_td == 1 &&
1727             dev->lb.qps == 0) {
1728                 if (dev->lb.enabled) {
1729                         mlx5_nic_vport_update_local_lb(dev->mdev, false);
1730                         dev->lb.enabled = false;
1731                 }
1732         }
1733
1734         mutex_unlock(&dev->lb.mutex);
1735 }
1736
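/*
 * Allocate a transport domain for the ucontext (if the device supports
 * transport domains) and, on Ethernet devices that can control local
 * loopback, take a loopback reference for the new domain.
 */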
1737 static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1738                                           u16 uid)
1739 {
1740         int err;
1741
1742         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1743                 return 0;
1744
1745         err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
1746         if (err)
1747                 return err;
1748
1749         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1750             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1751              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1752                 return err;
1753
1754         return mlx5_ib_enable_lb(dev, true, false);
1755 }
1756
1757 static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1758                                              u16 uid)
1759 {
1760         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1761                 return;
1762
1763         mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
1764
1765         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1766             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1767              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1768                 return;
1769
1770         mlx5_ib_disable_lb(dev, true, false);
1771 }
1772
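/*
 * Create a user context: parse the (v0 or v2) allocation request, size and
 * allocate the UAR/bfreg tables (unless the library manages UARs
 * dynamically), optionally create a DEVX uid, allocate a transport domain
 * and report the device capabilities back through the response structure.
 */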
1773 static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1774                                   struct ib_udata *udata)
1775 {
1776         struct ib_device *ibdev = uctx->device;
1777         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1778         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1779         struct mlx5_ib_alloc_ucontext_resp resp = {};
1780         struct mlx5_core_dev *mdev = dev->mdev;
1781         struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1782         struct mlx5_bfreg_info *bfregi;
1783         int ver;
1784         int err;
1785         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1786                                      max_cqe_version);
1787         u32 dump_fill_mkey;
1788         bool lib_uar_4k;
1789         bool lib_uar_dyn;
1790
1791         if (!dev->ib_active)
1792                 return -EAGAIN;
1793
1794         if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1795                 ver = 0;
1796         else if (udata->inlen >= min_req_v2)
1797                 ver = 2;
1798         else
1799                 return -EINVAL;
1800
1801         err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1802         if (err)
1803                 return err;
1804
1805         if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1806                 return -EOPNOTSUPP;
1807
1808         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1809                 return -EOPNOTSUPP;
1810
1811         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1812                                     MLX5_NON_FP_BFREGS_PER_UAR);
1813         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1814                 return -EINVAL;
1815
1816         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1817         if (dev->wc_support)
1818                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1819         resp.cache_line_size = cache_line_size();
1820         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1821         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1822         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1823         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1824         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1825         resp.cqe_version = min_t(__u8,
1826                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1827                                  req.max_cqe_version);
1828         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1829                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1830         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1831                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1832         resp.response_length = min(offsetof(typeof(resp), response_length) +
1833                                    sizeof(resp.response_length), udata->outlen);
1834
1835         if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) {
1836                 if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS))
1837                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
1838                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
1839                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
1840                 if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
1841                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
1842                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
1843                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
1844                 /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
1845         }
1846
1847         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1848         lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
1849         bfregi = &context->bfregi;
1850
1851         if (lib_uar_dyn) {
1852                 bfregi->lib_uar_dyn = lib_uar_dyn;
1853                 goto uar_done;
1854         }
1855
1856         /* updates req->total_num_bfregs */
1857         err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
1858         if (err)
1859                 goto out_ctx;
1860
1861         mutex_init(&bfregi->lock);
1862         bfregi->lib_uar_4k = lib_uar_4k;
1863         bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
1864                                 GFP_KERNEL);
1865         if (!bfregi->count) {
1866                 err = -ENOMEM;
1867                 goto out_ctx;
1868         }
1869
1870         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1871                                     sizeof(*bfregi->sys_pages),
1872                                     GFP_KERNEL);
1873         if (!bfregi->sys_pages) {
1874                 err = -ENOMEM;
1875                 goto out_count;
1876         }
1877
1878         err = allocate_uars(dev, context);
1879         if (err)
1880                 goto out_sys_pages;
1881
1882 uar_done:
1883         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1884                 err = mlx5_ib_devx_create(dev, true);
1885                 if (err < 0)
1886                         goto out_uars;
1887                 context->devx_uid = err;
1888         }
1889
1890         err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
1891                                              context->devx_uid);
1892         if (err)
1893                 goto out_devx;
1894
1895         if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1896                 err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
1897                 if (err)
1898                         goto out_mdev;
1899         }
1900
1901         INIT_LIST_HEAD(&context->db_page_list);
1902         mutex_init(&context->db_page_mutex);
1903
1904         resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs;
1905         resp.num_ports = dev->num_ports;
1906
1907         if (offsetofend(typeof(resp), cqe_version) <= udata->outlen)
1908                 resp.response_length += sizeof(resp.cqe_version);
1909
1910         if (offsetofend(typeof(resp), cmds_supp_uhw) <= udata->outlen) {
1911                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1912                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1913                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1914         }
1915
1916         if (offsetofend(typeof(resp), eth_min_inline) <= udata->outlen) {
1917                 if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1918                         mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
1919                         resp.eth_min_inline++;
1920                 }
1921                 resp.response_length += sizeof(resp.eth_min_inline);
1922         }
1923
1924         if (offsetofend(typeof(resp), clock_info_versions) <= udata->outlen) {
1925                 if (mdev->clock_info)
1926                         resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
1927                 resp.response_length += sizeof(resp.clock_info_versions);
1928         }
1929
1930         /*
1931          * We don't want to expose information from the PCI BAR that is located
1932          * after the first 4096 bytes, so if the arch only supports larger pages,
1933          * pretend we don't support reading the HCA's core clock. This is also
1934          * enforced by the mmap handler.
1935          */
1936         if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) {
1937                 if (PAGE_SIZE <= 4096) {
1938                         resp.comp_mask |=
1939                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1940                         resp.hca_core_clock_offset =
1941                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1942                 }
1943                 resp.response_length += sizeof(resp.hca_core_clock_offset);
1944         }
1945
1946         if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen)
1947                 resp.response_length += sizeof(resp.log_uar_size);
1948
1949         if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen)
1950                 resp.response_length += sizeof(resp.num_uars_per_page);
1951
1952         if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) {
1953                 resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
1954                 resp.response_length += sizeof(resp.num_dyn_bfregs);
1955         }
1956
1957         if (offsetofend(typeof(resp), dump_fill_mkey) <= udata->outlen) {
1958                 if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1959                         resp.dump_fill_mkey = dump_fill_mkey;
1960                         resp.comp_mask |=
1961                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
1962                 }
1963                 resp.response_length += sizeof(resp.dump_fill_mkey);
1964         }
1965
1966         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1967         if (err)
1968                 goto out_mdev;
1969
1970         bfregi->ver = ver;
1971         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1972         context->cqe_version = resp.cqe_version;
1973         context->lib_caps = req.lib_caps;
1974         print_lib_caps(dev, context->lib_caps);
1975
1976         if (dev->lag_active) {
1977                 u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
1978
1979                 atomic_set(&context->tx_port_affinity,
1980                            atomic_add_return(
1981                                    1, &dev->port[port].roce.tx_port_affinity));
1982         }
1983
1984         return 0;
1985
1986 out_mdev:
1987         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1988 out_devx:
1989         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
1990                 mlx5_ib_devx_destroy(dev, context->devx_uid);
1991
1992 out_uars:
1993         deallocate_uars(dev, context);
1994
1995 out_sys_pages:
1996         kfree(bfregi->sys_pages);
1997
1998 out_count:
1999         kfree(bfregi->count);
2000
2001 out_ctx:
2002         return err;
2003 }
2004
2005 static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
2006 {
2007         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2008         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2009         struct mlx5_bfreg_info *bfregi;
2010
2011         bfregi = &context->bfregi;
2012         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
2013
2014         if (context->devx_uid)
2015                 mlx5_ib_devx_destroy(dev, context->devx_uid);
2016
2017         deallocate_uars(dev, context);
2018         kfree(bfregi->sys_pages);
2019         kfree(bfregi->count);
2020 }
2021
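/*
 * A system page may hold several firmware UARs when 4K UARs are in use;
 * translate a UAR index into the PFN or physical address of the page that
 * contains it.
 */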
2022 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
2023                                  int uar_idx)
2024 {
2025         int fw_uars_per_page;
2026
2027         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
2028
2029         return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
2030 }
2031
2032 static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
2033                                  int uar_idx)
2034 {
2035         unsigned int fw_uars_per_page;
2036
2037         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
2038                                 MLX5_UARS_IN_PAGE : 1;
2039
2040         return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
2041 }
2042
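/*
 * The mmap offset encodes a command in the bits above MLX5_IB_MMAP_CMD_SHIFT
 * and an argument (usually an index) in the bits below it.
 */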
2043 static int get_command(unsigned long offset)
2044 {
2045         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
2046 }
2047
2048 static int get_arg(unsigned long offset)
2049 {
2050         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
2051 }
2052
2053 static int get_index(unsigned long offset)
2054 {
2055         return get_arg(offset);
2056 }
2057
2058 /* The index resides in an extra byte to enable values larger than 255 */
2059 static int get_extended_index(unsigned long offset)
2060 {
2061         return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
2062 }
2063
2064
2065 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
2066 {
2067 }
2068
2069 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
2070 {
2071         switch (cmd) {
2072         case MLX5_IB_MMAP_WC_PAGE:
2073                 return "WC";
2074         case MLX5_IB_MMAP_REGULAR_PAGE:
2075                 return "best effort WC";
2076         case MLX5_IB_MMAP_NC_PAGE:
2077                 return "NC";
2078         case MLX5_IB_MMAP_DEVICE_MEM:
2079                 return "Device Memory";
2080         default:
2081                 return NULL;
2082         }
2083 }
2084
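/*
 * Map the device's clock_info page read-only into user space; writable or
 * executable mappings are refused.
 */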
2085 static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
2086                                         struct vm_area_struct *vma,
2087                                         struct mlx5_ib_ucontext *context)
2088 {
2089         if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
2090             !(vma->vm_flags & VM_SHARED))
2091                 return -EINVAL;
2092
2093         if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
2094                 return -EOPNOTSUPP;
2095
2096         if (vma->vm_flags & (VM_WRITE | VM_EXEC))
2097                 return -EPERM;
2098         vma->vm_flags &= ~VM_MAYWRITE;
2099
2100         if (!dev->mdev->clock_info)
2101                 return -EOPNOTSUPP;
2102
2103         return vm_insert_page(vma, vma->vm_start,
2104                               virt_to_page(dev->mdev->clock_info));
2105 }
2106
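/*
 * Release the resource backing an rdma_user_mmap entry once it is no
 * longer in use: device memory (MEMIC) allocations, VAR pages and
 * dynamically allocated UARs each need their own teardown.
 */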
2107 static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
2108 {
2109         struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
2110         struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
2111         struct mlx5_var_table *var_table = &dev->var_table;
2112         struct mlx5_ib_dm *mdm;
2113
2114         switch (mentry->mmap_flag) {
2115         case MLX5_IB_MMAP_TYPE_MEMIC:
2116                 mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
2117                 mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
2118                                        mdm->size);
2119                 kfree(mdm);
2120                 break;
2121         case MLX5_IB_MMAP_TYPE_VAR:
2122                 mutex_lock(&var_table->bitmap_lock);
2123                 clear_bit(mentry->page_idx, var_table->bitmap);
2124                 mutex_unlock(&var_table->bitmap_lock);
2125                 kfree(mentry);
2126                 break;
2127         case MLX5_IB_MMAP_TYPE_UAR_WC:
2128         case MLX5_IB_MMAP_TYPE_UAR_NC:
2129                 mlx5_cmd_free_uar(dev->mdev, mentry->page_idx);
2130                 kfree(mentry);
2131                 break;
2132         default:
2133                 WARN_ON(true);
2134         }
2135 }
2136
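/*
 * Map one UAR page (WC, best-effort WC or NC) into user space.  For
 * MLX5_IB_MMAP_ALLOC_WC a new UAR is allocated from firmware on demand and
 * recorded in bfregi->sys_pages; otherwise the statically allocated UAR
 * for the given index is used.
 */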
2137 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
2138                     struct vm_area_struct *vma,
2139                     struct mlx5_ib_ucontext *context)
2140 {
2141         struct mlx5_bfreg_info *bfregi = &context->bfregi;
2142         int err;
2143         unsigned long idx;
2144         phys_addr_t pfn;
2145         pgprot_t prot;
2146         u32 bfreg_dyn_idx = 0;
2147         u32 uar_index;
2148         int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
2149         int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
2150                                 bfregi->num_static_sys_pages;
2151
2152         if (bfregi->lib_uar_dyn)
2153                 return -EINVAL;
2154
2155         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2156                 return -EINVAL;
2157
2158         if (dyn_uar)
2159                 idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
2160         else
2161                 idx = get_index(vma->vm_pgoff);
2162
2163         if (idx >= max_valid_idx) {
2164                 mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
2165                              idx, max_valid_idx);
2166                 return -EINVAL;
2167         }
2168
2169         switch (cmd) {
2170         case MLX5_IB_MMAP_WC_PAGE:
2171         case MLX5_IB_MMAP_ALLOC_WC:
2172         case MLX5_IB_MMAP_REGULAR_PAGE:
2173                 /* For MLX5_IB_MMAP_REGULAR_PAGE make a best effort to get WC */
2174                 prot = pgprot_writecombine(vma->vm_page_prot);
2175                 break;
2176         case MLX5_IB_MMAP_NC_PAGE:
2177                 prot = pgprot_noncached(vma->vm_page_prot);
2178                 break;
2179         default:
2180                 return -EINVAL;
2181         }
2182
2183         if (dyn_uar) {
2184                 int uars_per_page;
2185
2186                 uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
2187                 bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
2188                 if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
2189                         mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
2190                                      bfreg_dyn_idx, bfregi->total_num_bfregs);
2191                         return -EINVAL;
2192                 }
2193
2194                 mutex_lock(&bfregi->lock);
2195                 /* Fail if the UAR is already allocated; the first bfreg
2196                  * index of each page holds its count.
2197                  */
2198                 if (bfregi->count[bfreg_dyn_idx]) {
2199                         mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
2200                         mutex_unlock(&bfregi->lock);
2201                         return -EINVAL;
2202                 }
2203
2204                 bfregi->count[bfreg_dyn_idx]++;
2205                 mutex_unlock(&bfregi->lock);
2206
2207                 err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
2208                 if (err) {
2209                         mlx5_ib_warn(dev, "UAR alloc failed\n");
2210                         goto free_bfreg;
2211                 }
2212         } else {
2213                 uar_index = bfregi->sys_pages[idx];
2214         }
2215
2216         pfn = uar_index2pfn(dev, uar_index);
2217         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
2218
2219         err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
2220                                 prot, NULL);
2221         if (err) {
2222                 mlx5_ib_err(dev,
2223                             "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
2224                             err, mmap_cmd2str(cmd));
2225                 goto err;
2226         }
2227
2228         if (dyn_uar)
2229                 bfregi->sys_pages[idx] = uar_index;
2230         return 0;
2231
2232 err:
2233         if (!dyn_uar)
2234                 return err;
2235
2236         mlx5_cmd_free_uar(dev->mdev, idx);
2237
2238 free_bfreg:
2239         mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
2240
2241         return err;
2242 }
2243
2244 static int add_dm_mmap_entry(struct ib_ucontext *context,
2245                              struct mlx5_ib_dm *mdm,
2246                              u64 address)
2247 {
2248         mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
2249         mdm->mentry.address = address;
2250         return rdma_user_mmap_entry_insert_range(
2251                         context, &mdm->mentry.rdma_entry,
2252                         mdm->size,
2253                         MLX5_IB_MMAP_DEVICE_MEM << 16,
2254                         (MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
2255 }
2256
2257 static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
2258 {
2259         unsigned long idx;
2260         u8 command;
2261
2262         command = get_command(vma->vm_pgoff);
2263         idx = get_extended_index(vma->vm_pgoff);
2264
2265         return (command << 16 | idx);
2266 }
2267
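/*
 * Generic fallback for mmap offsets that carry an rdma_user_mmap entry:
 * look the entry up by pgoff, pick WC or NC caching according to the entry
 * type and do the io remap.
 */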
2268 static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
2269                                struct vm_area_struct *vma,
2270                                struct ib_ucontext *ucontext)
2271 {
2272         struct mlx5_user_mmap_entry *mentry;
2273         struct rdma_user_mmap_entry *entry;
2274         unsigned long pgoff;
2275         pgprot_t prot;
2276         phys_addr_t pfn;
2277         int ret;
2278
2279         pgoff = mlx5_vma_to_pgoff(vma);
2280         entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
2281         if (!entry)
2282                 return -EINVAL;
2283
2284         mentry = to_mmmap(entry);
2285         pfn = (mentry->address >> PAGE_SHIFT);
2286         if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
2287             mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
2288                 prot = pgprot_noncached(vma->vm_page_prot);
2289         else
2290                 prot = pgprot_writecombine(vma->vm_page_prot);
2291         ret = rdma_user_mmap_io(ucontext, vma, pfn,
2292                                 entry->npages * PAGE_SIZE,
2293                                 prot,
2294                                 entry);
2295         rdma_user_mmap_entry_put(&mentry->rdma_entry);
2296         return ret;
2297 }
2298
2299 static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
2300 {
2301         u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
2302         u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
2303
2304         return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
2305                 (index & 0xFF)) << PAGE_SHIFT;
2306 }
2307
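/*
 * Top-level mmap dispatcher: UAR commands go through uar_mmap(), the core
 * clock and clock info pages have dedicated handlers, contiguous-page
 * mapping is not supported, and anything else is resolved through the
 * rdma_user_mmap entry machinery.
 */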
2308 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
2309 {
2310         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2311         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2312         unsigned long command;
2313         phys_addr_t pfn;
2314
2315         command = get_command(vma->vm_pgoff);
2316         switch (command) {
2317         case MLX5_IB_MMAP_WC_PAGE:
2318         case MLX5_IB_MMAP_ALLOC_WC:
2319                 if (!dev->wc_support)
2320                         return -EPERM;
2321                 fallthrough;
2322         case MLX5_IB_MMAP_NC_PAGE:
2323         case MLX5_IB_MMAP_REGULAR_PAGE:
2324                 return uar_mmap(dev, command, vma, context);
2325
2326         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
2327                 return -ENOSYS;
2328
2329         case MLX5_IB_MMAP_CORE_CLOCK:
2330                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2331                         return -EINVAL;
2332
2333                 if (vma->vm_flags & VM_WRITE)
2334                         return -EPERM;
2335                 vma->vm_flags &= ~VM_MAYWRITE;
2336
2337                 /* Don't expose information to user space that it shouldn't have */
2338                 if (PAGE_SIZE > 4096)
2339                         return -EOPNOTSUPP;
2340
2341                 pfn = (dev->mdev->iseg_base +
2342                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
2343                         PAGE_SHIFT;
2344                 return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
2345                                          PAGE_SIZE,
2346                                          pgprot_noncached(vma->vm_page_prot),
2347                                          NULL);
2348         case MLX5_IB_MMAP_CLOCK_INFO:
2349                 return mlx5_ib_mmap_clock_info_page(dev, vma, context);
2350
2351         default:
2352                 return mlx5_ib_mmap_offset(dev, vma, ibcontext);
2353         }
2354
2355         return 0;
2356 }
2357
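/*
 * Check that the requested device-memory type is usable: MEMIC requires
 * the memic capability, while the SW ICM types additionally require
 * CAP_SYS_RAWIO and CAP_NET_RAW plus SW-owned steering support.
 */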
2358 static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
2359                                         u32 type)
2360 {
2361         switch (type) {
2362         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2363                 if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
2364                         return -EOPNOTSUPP;
2365                 break;
2366         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2367         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2368                 if (!capable(CAP_SYS_RAWIO) ||
2369                     !capable(CAP_NET_RAW))
2370                         return -EPERM;
2371
2372                 if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
2373                       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner)))
2374                         return -EOPNOTSUPP;
2375                 break;
2376         }
2377
2378         return 0;
2379 }
2380
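/*
 * Allocate device memory (MEMIC), expose it through an rdma_user_mmap
 * entry and return the page index and start offset to user space.
 */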
2381 static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
2382                                  struct mlx5_ib_dm *dm,
2383                                  struct ib_dm_alloc_attr *attr,
2384                                  struct uverbs_attr_bundle *attrs)
2385 {
2386         struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
2387         u64 start_offset;
2388         u16 page_idx;
2389         int err;
2390         u64 address;
2391
2392         dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
2393
2394         err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
2395                                    dm->size, attr->alignment);
2396         if (err)
2397                 return err;
2398
2399         address = dm->dev_addr & PAGE_MASK;
2400         err = add_dm_mmap_entry(ctx, dm, address);
2401         if (err)
2402                 goto err_dealloc;
2403
2404         page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
2405         err = uverbs_copy_to(attrs,
2406                              MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
2407                              &page_idx,
2408                              sizeof(page_idx));
2409         if (err)
2410                 goto err_copy;
2411
2412         start_offset = dm->dev_addr & ~PAGE_MASK;
2413         err = uverbs_copy_to(attrs,
2414                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2415                              &start_offset, sizeof(start_offset));
2416         if (err)
2417                 goto err_copy;
2418
2419         return 0;
2420
2421 err_copy:
2422         rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
2423 err_dealloc:
2424         mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
2425
2426         return err;
2427 }
2428
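/*
 * Allocate a chunk of SW ICM of the requested type and hand the device
 * address back to user space; the allocation is rounded up to the ICM
 * block size and then to a power of two.
 */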
2429 static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
2430                                   struct mlx5_ib_dm *dm,
2431                                   struct ib_dm_alloc_attr *attr,
2432                                   struct uverbs_attr_bundle *attrs,
2433                                   int type)
2434 {
2435         struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
2436         u64 act_size;
2437         int err;
2438
2439         /* Allocation size must be a multiple of the basic block size
2440          * and a power of 2.
2441          */
2442         act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
2443         act_size = roundup_pow_of_two(act_size);
2444
2445         dm->size = act_size;
2446         err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
2447                                    to_mucontext(ctx)->devx_uid, &dm->dev_addr,
2448                                    &dm->icm_dm.obj_id);
2449         if (err)
2450                 return err;
2451
2452         err = uverbs_copy_to(attrs,
2453                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2454                              &dm->dev_addr, sizeof(dm->dev_addr));
2455         if (err)
2456                 mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
2457                                        to_mucontext(ctx)->devx_uid, dm->dev_addr,
2458                                        dm->icm_dm.obj_id);
2459
2460         return err;
2461 }
2462
2463 struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
2464                                struct ib_ucontext *context,
2465                                struct ib_dm_alloc_attr *attr,
2466                                struct uverbs_attr_bundle *attrs)
2467 {
2468         struct mlx5_ib_dm *dm;
2469         enum mlx5_ib_uapi_dm_type type;
2470         int err;
2471
2472         err = uverbs_get_const_default(&type, attrs,
2473                                        MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
2474                                        MLX5_IB_UAPI_DM_TYPE_MEMIC);
2475         if (err)
2476                 return ERR_PTR(err);
2477
2478         mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
2479                     type, attr->length, attr->alignment);
2480
2481         err = check_dm_type_support(to_mdev(ibdev), type);
2482         if (err)
2483                 return ERR_PTR(err);
2484
2485         dm = kzalloc(sizeof(*dm), GFP_KERNEL);
2486         if (!dm)
2487                 return ERR_PTR(-ENOMEM);
2488
2489         dm->type = type;
2490
2491         switch (type) {
2492         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2493                 err = handle_alloc_dm_memic(context, dm,
2494                                             attr,
2495                                             attrs);
2496                 break;
2497         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2498                 err = handle_alloc_dm_sw_icm(context, dm,
2499                                              attr, attrs,
2500                                              MLX5_SW_ICM_TYPE_STEERING);
2501                 break;
2502         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2503                 err = handle_alloc_dm_sw_icm(context, dm,
2504                                              attr, attrs,
2505                                              MLX5_SW_ICM_TYPE_HEADER_MODIFY);
2506                 break;
2507         default:
2508                 err = -EOPNOTSUPP;
2509         }
2510
2511         if (err)
2512                 goto err_free;
2513
2514         return &dm->ibdm;
2515
2516 err_free:
2517         kfree(dm);
2518         return ERR_PTR(err);
2519 }
2520
2521 int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
2522 {
2523         struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
2524                 &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
2525         struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
2526         struct mlx5_ib_dm *dm = to_mdm(ibdm);
2527         int ret;
2528
2529         switch (dm->type) {
2530         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2531                 rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
2532                 return 0;
2533         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2534                 ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
2535                                              dm->size, ctx->devx_uid, dm->dev_addr,
2536                                              dm->icm_dm.obj_id);
2537                 if (ret)
2538                         return ret;
2539                 break;
2540         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2541                 ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
2542                                              dm->size, ctx->devx_uid, dm->dev_addr,
2543                                              dm->icm_dm.obj_id);
2544                 if (ret)
2545                         return ret;
2546                 break;
2547         default:
2548                 return -EOPNOTSUPP;
2549         }
2550
2551         kfree(dm);
2552
2553         return 0;
2554 }
2555
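/*
 * Allocate a protection domain via the ALLOC_PD firmware command, tagging
 * it with the caller's DEVX uid (if any), and copy the PDN back to user
 * space when udata is present.
 */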
2556 static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
2557 {
2558         struct mlx5_ib_pd *pd = to_mpd(ibpd);
2559         struct ib_device *ibdev = ibpd->device;
2560         struct mlx5_ib_alloc_pd_resp resp;
2561         int err;
2562         u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
2563         u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
2564         u16 uid = 0;
2565         struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
2566                 udata, struct mlx5_ib_ucontext, ibucontext);
2567
2568         uid = context ? context->devx_uid : 0;
2569         MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
2570         MLX5_SET(alloc_pd_in, in, uid, uid);
2571         err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
2572                             out, sizeof(out));
2573         if (err)
2574                 return err;
2575
2576         pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
2577         pd->uid = uid;
2578         if (udata) {
2579                 resp.pdn = pd->pdn;
2580                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
2581                         mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
2582                         return -EFAULT;
2583                 }
2584         }
2585
2586         return 0;
2587 }
2588
2589 static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
2590 {
2591         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
2592         struct mlx5_ib_pd *mpd = to_mpd(pd);
2593
2594         mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
2595 }
2596
2597 enum {
2598         MATCH_CRITERIA_ENABLE_OUTER_BIT,
2599         MATCH_CRITERIA_ENABLE_MISC_BIT,
2600         MATCH_CRITERIA_ENABLE_INNER_BIT,
2601         MATCH_CRITERIA_ENABLE_MISC2_BIT
2602 };
2603
2604 #define HEADER_IS_ZERO(match_criteria, headers)                            \
2605         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
2606                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))
2607
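/*
 * Build the match_criteria_enable bitmask by checking which header groups
 * (outer, misc, inner, misc2) of the match criteria are non-zero.
 */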
2608 static u8 get_match_criteria_enable(u32 *match_criteria)
2609 {
2610         u8 match_criteria_enable;
2611
2612         match_criteria_enable =
2613                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
2614                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
2615         match_criteria_enable |=
2616                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
2617                 MATCH_CRITERIA_ENABLE_MISC_BIT;
2618         match_criteria_enable |=
2619                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
2620                 MATCH_CRITERIA_ENABLE_INNER_BIT;
2621         match_criteria_enable |=
2622                 (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
2623                 MATCH_CRITERIA_ENABLE_MISC2_BIT;
2624
2625         return match_criteria_enable;
2626 }
2627
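/*
 * Set the IP protocol match; if one is already programmed it must agree
 * with the requested mask/value, otherwise the spec is rejected.
 */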
2628 static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
2629 {
2630         u8 entry_mask;
2631         u8 entry_val;
2632         int err = 0;
2633
2634         if (!mask)
2635                 goto out;
2636
2637         entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
2638                               ip_protocol);
2639         entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
2640                              ip_protocol);
2641         if (!entry_mask) {
2642                 MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
2643                 MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
2644                 goto out;
2645         }
2646         /* Don't override an existing IP protocol match */
2647         if (mask != entry_mask || val != entry_val)
2648                 err = -EINVAL;
2649 out:
2650         return err;
2651 }
2652
2653 static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
2654                            bool inner)
2655 {
2656         if (inner) {
2657                 MLX5_SET(fte_match_set_misc,
2658                          misc_c, inner_ipv6_flow_label, mask);
2659                 MLX5_SET(fte_match_set_misc,
2660                          misc_v, inner_ipv6_flow_label, val);
2661         } else {
2662                 MLX5_SET(fte_match_set_misc,
2663                          misc_c, outer_ipv6_flow_label, mask);
2664                 MLX5_SET(fte_match_set_misc,
2665                          misc_v, outer_ipv6_flow_label, val);
2666         }
2667 }
2668
2669 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
2670 {
2671         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
2672         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
2673         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
2674         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
2675 }
2676
2677 static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
2678 {
2679         if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
2680             !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
2681                 return -EOPNOTSUPP;
2682
2683         if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
2684             !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
2685                 return -EOPNOTSUPP;
2686
2687         if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
2688             !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
2689                 return -EOPNOTSUPP;
2690
2691         if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
2692             !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
2693                 return -EOPNOTSUPP;
2694
2695         return 0;
2696 }
2697
2698 #define LAST_ETH_FIELD vlan_tag
2699 #define LAST_IB_FIELD sl
2700 #define LAST_IPV4_FIELD tos
2701 #define LAST_IPV6_FIELD traffic_class
2702 #define LAST_TCP_UDP_FIELD src_port
2703 #define LAST_TUNNEL_FIELD tunnel_id
2704 #define LAST_FLOW_TAG_FIELD tag_id
2705 #define LAST_DROP_FIELD size
2706 #define LAST_COUNTERS_FIELD counters
2707
2708 /* Field is the last supported field */
2709 #define FIELDS_NOT_SUPPORTED(filter, field)\
2710         memchr_inv((void *)&filter.field  +\
2711                    sizeof(filter.field), 0,\
2712                    sizeof(filter) -\
2713                    offsetof(typeof(filter), field) -\
2714                    sizeof(filter.field))
2715
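/*
 * Translate an mlx5_ib flow action (ESP crypto, modify header, decap or
 * packet reformat) into mlx5_flow_act flags, refusing combinations where
 * the same action bit would be set twice.
 */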
2716 int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
2717                            bool is_egress,
2718                            struct mlx5_flow_act *action)
2719 {
2720
2721         switch (maction->ib_action.type) {
2722         case IB_FLOW_ACTION_ESP:
2723                 if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
2724                                       MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
2725                         return -EINVAL;
2726                 /* Currently only AES_GCM keymat is supported by the driver */
2727                 action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
2728                 action->action |= is_egress ?
2729                         MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
2730                         MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
2731                 return 0;
2732         case IB_FLOW_ACTION_UNSPECIFIED:
2733                 if (maction->flow_action_raw.sub_type ==
2734                     MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
2735                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
2736                                 return -EINVAL;
2737                         action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
2738                         action->modify_hdr =
2739                                 maction->flow_action_raw.modify_hdr;
2740                         return 0;
2741                 }
2742                 if (maction->flow_action_raw.sub_type ==
2743                     MLX5_IB_FLOW_ACTION_DECAP) {
2744                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
2745                                 return -EINVAL;
2746                         action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
2747                         return 0;
2748                 }
2749                 if (maction->flow_action_raw.sub_type ==
2750                     MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
2751                         if (action->action &
2752                             MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
2753                                 return -EINVAL;
2754                         action->action |=
2755                                 MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
2756                         action->pkt_reformat =
2757                                 maction->flow_action_raw.pkt_reformat;
2758                         return 0;
2759                 }
2760                 fallthrough;
2761         default:
2762                 return -EOPNOTSUPP;
2763         }
2764 }
2765
2766 static int parse_flow_attr(struct mlx5_core_dev *mdev,
2767                            struct mlx5_flow_spec *spec,
2768                            const union ib_flow_spec *ib_spec,
2769                            const struct ib_flow_attr *flow_attr,
2770                            struct mlx5_flow_act *action, u32 prev_type)
2771 {
2772         struct mlx5_flow_context *flow_context = &spec->flow_context;
2773         u32 *match_c = spec->match_criteria;
2774         u32 *match_v = spec->match_value;
2775         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
2776                                            misc_parameters);
2777         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
2778                                            misc_parameters);
2779         void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
2780                                             misc_parameters_2);
2781         void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
2782                                             misc_parameters_2);
2783         void *headers_c;
2784         void *headers_v;
2785         int match_ipv;
2786         int ret;
2787
2788         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
2789                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2790                                          inner_headers);
2791                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2792                                          inner_headers);
2793                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2794                                         ft_field_support.inner_ip_version);
2795         } else {
2796                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2797                                          outer_headers);
2798                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2799                                          outer_headers);
2800                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2801                                         ft_field_support.outer_ip_version);
2802         }
2803
2804         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
2805         case IB_FLOW_SPEC_ETH:
2806                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
2807                         return -EOPNOTSUPP;
2808
2809                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2810                                              dmac_47_16),
2811                                 ib_spec->eth.mask.dst_mac);
2812                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2813                                              dmac_47_16),
2814                                 ib_spec->eth.val.dst_mac);
2815
2816                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2817                                              smac_47_16),
2818                                 ib_spec->eth.mask.src_mac);
2819                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2820                                              smac_47_16),
2821                                 ib_spec->eth.val.src_mac);
2822
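                     /*
                      * The 802.1Q TCI in ->vlan_tag packs the VID in bits 0-11,
                      * CFI in bit 12 and PCP in bits 13-15; the shifts below pull
                      * out CFI and priority and rely on MLX5_SET() to mask the
                      * value to the width of the target field.
                      */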
2823                 if (ib_spec->eth.mask.vlan_tag) {
2824                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2825                                  cvlan_tag, 1);
2826                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2827                                  cvlan_tag, 1);
2828
2829                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2830                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
2831                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2832                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
2833
2834                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2835                                  first_cfi,
2836                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
2837                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2838                                  first_cfi,
2839                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
2840
2841                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2842                                  first_prio,
2843                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
2844                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2845                                  first_prio,
2846                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
2847                 }
2848                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2849                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
2850                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2851                          ethertype, ntohs(ib_spec->eth.val.ether_type));
2852                 break;
2853         case IB_FLOW_SPEC_IPV4:
2854                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
2855                         return -EOPNOTSUPP;
2856
2857                 if (match_ipv) {
2858                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2859                                  ip_version, 0xf);
2860                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2861                                  ip_version, MLX5_FS_IPV4_VERSION);
2862                 } else {
2863                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2864                                  ethertype, 0xffff);
2865                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2866                                  ethertype, ETH_P_IP);
2867                 }
2868
2869                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2870                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2871                        &ib_spec->ipv4.mask.src_ip,
2872                        sizeof(ib_spec->ipv4.mask.src_ip));
2873                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2874                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2875                        &ib_spec->ipv4.val.src_ip,
2876                        sizeof(ib_spec->ipv4.val.src_ip));
2877                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2878                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2879                        &ib_spec->ipv4.mask.dst_ip,
2880                        sizeof(ib_spec->ipv4.mask.dst_ip));
2881                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2882                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2883                        &ib_spec->ipv4.val.dst_ip,
2884                        sizeof(ib_spec->ipv4.val.dst_ip));
2885
2886                 set_tos(headers_c, headers_v,
2887                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
2888
2889                 if (set_proto(headers_c, headers_v,
2890                               ib_spec->ipv4.mask.proto,
2891                               ib_spec->ipv4.val.proto))
2892                         return -EINVAL;
2893                 break;
2894         case IB_FLOW_SPEC_IPV6:
2895                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
2896                         return -EOPNOTSUPP;
2897
2898                 if (match_ipv) {
2899                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2900                                  ip_version, 0xf);
2901                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2902                                  ip_version, MLX5_FS_IPV6_VERSION);
2903                 } else {
2904                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2905                                  ethertype, 0xffff);
2906                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2907                                  ethertype, ETH_P_IPV6);
2908                 }
2909
2910                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2911                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2912                        &ib_spec->ipv6.mask.src_ip,
2913                        sizeof(ib_spec->ipv6.mask.src_ip));
2914                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2915                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2916                        &ib_spec->ipv6.val.src_ip,
2917                        sizeof(ib_spec->ipv6.val.src_ip));
2918                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2919                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2920                        &ib_spec->ipv6.mask.dst_ip,
2921                        sizeof(ib_spec->ipv6.mask.dst_ip));
2922                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2923                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2924                        &ib_spec->ipv6.val.dst_ip,
2925                        sizeof(ib_spec->ipv6.val.dst_ip));
2926
2927                 set_tos(headers_c, headers_v,
2928                         ib_spec->ipv6.mask.traffic_class,
2929                         ib_spec->ipv6.val.traffic_class);
2930
2931                 if (set_proto(headers_c, headers_v,
2932                               ib_spec->ipv6.mask.next_hdr,
2933                               ib_spec->ipv6.val.next_hdr))
2934                         return -EINVAL;
2935
2936                 set_flow_label(misc_params_c, misc_params_v,
2937                                ntohl(ib_spec->ipv6.mask.flow_label),
2938                                ntohl(ib_spec->ipv6.val.flow_label),
2939                                ib_spec->type & IB_FLOW_SPEC_INNER);
2940                 break;
2941         case IB_FLOW_SPEC_ESP:
2942                 if (ib_spec->esp.mask.seq)
2943                         return -EOPNOTSUPP;
2944
2945                 MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
2946                          ntohl(ib_spec->esp.mask.spi));
2947                 MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
2948                          ntohl(ib_spec->esp.val.spi));
2949                 break;
2950         case IB_FLOW_SPEC_TCP:
2951                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2952                                          LAST_TCP_UDP_FIELD))
2953                         return -EOPNOTSUPP;
2954
2955                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
2956                         return -EINVAL;
2957
2958                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
2959                          ntohs(ib_spec->tcp_udp.mask.src_port));
2960                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
2961                          ntohs(ib_spec->tcp_udp.val.src_port));
2962
2963                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
2964                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2965                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
2966                          ntohs(ib_spec->tcp_udp.val.dst_port));
2967                 break;
2968         case IB_FLOW_SPEC_UDP:
2969                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2970                                          LAST_TCP_UDP_FIELD))
2971                         return -EOPNOTSUPP;
2972
2973                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
2974                         return -EINVAL;
2975
2976                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
2977                          ntohs(ib_spec->tcp_udp.mask.src_port));
2978                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
2979                          ntohs(ib_spec->tcp_udp.val.src_port));
2980
2981                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
2982                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2983                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
2984                          ntohs(ib_spec->tcp_udp.val.dst_port));
2985                 break;
2986         case IB_FLOW_SPEC_GRE:
2987                 if (ib_spec->gre.mask.c_ks_res0_ver)
2988                         return -EOPNOTSUPP;
2989
2990                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
2991                         return -EINVAL;
2992
2993                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
2994                          0xff);
2995                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
2996                          IPPROTO_GRE);
2997
2998                 MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
2999                          ntohs(ib_spec->gre.mask.protocol));
3000                 MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
3001                          ntohs(ib_spec->gre.val.protocol));
3002
3003                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
3004                                     gre_key.nvgre.hi),
3005                        &ib_spec->gre.mask.key,
3006                        sizeof(ib_spec->gre.mask.key));
3007                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
3008                                     gre_key.nvgre.hi),
3009                        &ib_spec->gre.val.key,
3010                        sizeof(ib_spec->gre.val.key));
3011                 break;
3012         case IB_FLOW_SPEC_MPLS:
3013                 switch (prev_type) {
3014                 case IB_FLOW_SPEC_UDP:
3015                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3016                                                    ft_field_support.outer_first_mpls_over_udp),
3017                                                    &ib_spec->mpls.mask.tag))
3018                                 return -EOPNOTSUPP;
3019
3020                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3021                                             outer_first_mpls_over_udp),
3022                                &ib_spec->mpls.val.tag,
3023                                sizeof(ib_spec->mpls.val.tag));
3024                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3025                                             outer_first_mpls_over_udp),
3026                                &ib_spec->mpls.mask.tag,
3027                                sizeof(ib_spec->mpls.mask.tag));
3028                         break;
3029                 case IB_FLOW_SPEC_GRE:
3030                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3031                                                    ft_field_support.outer_first_mpls_over_gre),
3032                                                    &ib_spec->mpls.mask.tag))
3033                                 return -EOPNOTSUPP;
3034
3035                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3036                                             outer_first_mpls_over_gre),
3037                                &ib_spec->mpls.val.tag,
3038                                sizeof(ib_spec->mpls.val.tag));
3039                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3040                                             outer_first_mpls_over_gre),
3041                                &ib_spec->mpls.mask.tag,
3042                                sizeof(ib_spec->mpls.mask.tag));
3043                         break;
3044                 default:
3045                         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
3046                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3047                                                            ft_field_support.inner_first_mpls),
3048                                                            &ib_spec->mpls.mask.tag))
3049                                         return -EOPNOTSUPP;
3050
3051                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3052                                                     inner_first_mpls),
3053                                        &ib_spec->mpls.val.tag,
3054                                        sizeof(ib_spec->mpls.val.tag));
3055                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3056                                                     inner_first_mpls),
3057                                        &ib_spec->mpls.mask.tag,
3058                                        sizeof(ib_spec->mpls.mask.tag));
3059                         } else {
3060                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3061                                                            ft_field_support.outer_first_mpls),
3062                                                            &ib_spec->mpls.mask.tag))
3063                                         return -EOPNOTSUPP;
3064
3065                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
3066                                                     outer_first_mpls),
3067                                        &ib_spec->mpls.val.tag,
3068                                        sizeof(ib_spec->mpls.val.tag));
3069                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
3070                                                     outer_first_mpls),
3071                                        &ib_spec->mpls.mask.tag,
3072                                        sizeof(ib_spec->mpls.mask.tag));
3073                         }
3074                 }
3075                 break;
3076         case IB_FLOW_SPEC_VXLAN_TUNNEL:
3077                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
3078                                          LAST_TUNNEL_FIELD))
3079                         return -EOPNOTSUPP;
3080
3081                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
3082                          ntohl(ib_spec->tunnel.mask.tunnel_id));
3083                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
3084                          ntohl(ib_spec->tunnel.val.tunnel_id));
3085                 break;
3086         case IB_FLOW_SPEC_ACTION_TAG:
3087                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
3088                                          LAST_FLOW_TAG_FIELD))
3089                         return -EOPNOTSUPP;
3090                 if (ib_spec->flow_tag.tag_id >= BIT(24))
3091                         return -EINVAL;
3092
3093                 flow_context->flow_tag = ib_spec->flow_tag.tag_id;
3094                 flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
3095                 break;
3096         case IB_FLOW_SPEC_ACTION_DROP:
3097                 if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
3098                                          LAST_DROP_FIELD))
3099                         return -EOPNOTSUPP;
3100                 action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
3101                 break;
3102         case IB_FLOW_SPEC_ACTION_HANDLE:
3103                 ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
3104                         flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
3105                 if (ret)
3106                         return ret;
3107                 break;
3108         case IB_FLOW_SPEC_ACTION_COUNT:
3109                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
3110                                          LAST_COUNTERS_FIELD))
3111                         return -EOPNOTSUPP;
3112
3113                 /* for now, only one counters spec per flow is supported */
3114                 if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
3115                         return -EINVAL;
3116
3117                 action->counters = ib_spec->flow_count.counters;
3118                 action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
3119                 break;
3120         default:
3121                 return -EINVAL;
3122         }
3123
3124         return 0;
3125 }
3126
3127 /* A flow that could catch both multicast and unicast packets is not
3128  * placed in the multicast flow steering table, since such a rule
3129  * could steal other multicast packets.
3130  */
3131 static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
3132 {
3133         union ib_flow_spec *flow_spec;
3134
3135         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
3136             ib_attr->num_of_specs < 1)
3137                 return false;
3138
3139         flow_spec = (union ib_flow_spec *)(ib_attr + 1);
3140         if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
3141                 struct ib_flow_spec_ipv4 *ipv4_spec;
3142
3143                 ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
3144                 if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
3145                         return true;
3146
3147                 return false;
3148         }
3149
3150         if (flow_spec->type == IB_FLOW_SPEC_ETH) {
3151                 struct ib_flow_spec_eth *eth_spec;
3152
3153                 eth_spec = (struct ib_flow_spec_eth *)flow_spec;
3154                 return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
3155                        is_multicast_ether_addr(eth_spec->val.dst_mac);
3156         }
3157
3158         return false;
3159 }
3160
3161 enum valid_spec {
3162         VALID_SPEC_INVALID,
3163         VALID_SPEC_VALID,
3164         VALID_SPEC_NA,
3165 };
3166
3167 static enum valid_spec
3168 is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
3169                      const struct mlx5_flow_spec *spec,
3170                      const struct mlx5_flow_act *flow_act,
3171                      bool egress)
3172 {
3173         const u32 *match_c = spec->match_criteria;
3174         bool is_crypto =
3175                 (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
3176                                      MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
3177         bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
3178         bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
3179
3180         /*
3181          * Currently only crypto is supported in egress. Once regular egress
3182          * rules are supported, keep returning VALID_SPEC_NA for non-crypto specs.
3183          */
3184         if (!is_crypto)
3185                 return VALID_SPEC_NA;
3186
3187         return is_crypto && is_ipsec &&
3188                 (!egress || (!is_drop &&
3189                              !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
3190                 VALID_SPEC_VALID : VALID_SPEC_INVALID;
3191 }
3192
3193 static bool is_valid_spec(struct mlx5_core_dev *mdev,
3194                           const struct mlx5_flow_spec *spec,
3195                           const struct mlx5_flow_act *flow_act,
3196                           bool egress)
3197 {
3198         /* We currently only support IPsec egress flows */
3199         return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
3200 }
3201
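     /*
      * Walk the flow specs once and verify that an explicit ethertype match
      * is consistent with the IPv4/IPv6 spec that accompanies it.  An MPLS
      * ethertype next to an IP spec is only accepted when the device can
      * match on ip_version instead of ethertype.
      */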
3202 static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
3203                                const struct ib_flow_attr *flow_attr,
3204                                bool check_inner)
3205 {
3206         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
3207         int match_ipv = check_inner ?
3208                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3209                                         ft_field_support.inner_ip_version) :
3210                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3211                                         ft_field_support.outer_ip_version);
3212         int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
3213         bool ipv4_spec_valid, ipv6_spec_valid;
3214         unsigned int ip_spec_type = 0;
3215         bool has_ethertype = false;
3216         unsigned int spec_index;
3217         bool mask_valid = true;
3218         u16 eth_type = 0;
3219         bool type_valid;
3220
3221         /* Validate that ethertype is correct */
3222         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3223                 if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
3224                     ib_spec->eth.mask.ether_type) {
3225                         mask_valid = (ib_spec->eth.mask.ether_type ==
3226                                       htons(0xffff));
3227                         has_ethertype = true;
3228                         eth_type = ntohs(ib_spec->eth.val.ether_type);
3229                 } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
3230                            (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
3231                         ip_spec_type = ib_spec->type;
3232                 }
3233                 ib_spec = (void *)ib_spec + ib_spec->size;
3234         }
3235
3236         type_valid = (!has_ethertype) || (!ip_spec_type);
3237         if (!type_valid && mask_valid) {
3238                 ipv4_spec_valid = (eth_type == ETH_P_IP) &&
3239                         (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
3240                 ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
3241                         (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
3242
3243                 type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
3244                              (((eth_type == ETH_P_MPLS_UC) ||
3245                                (eth_type == ETH_P_MPLS_MC)) && match_ipv);
3246         }
3247
3248         return type_valid;
3249 }
3250
3251 static bool is_valid_attr(struct mlx5_core_dev *mdev,
3252                           const struct ib_flow_attr *flow_attr)
3253 {
3254         return is_valid_ethertype(mdev, flow_attr, false) &&
3255                is_valid_ethertype(mdev, flow_attr, true);
3256 }
3257
3258 static void put_flow_table(struct mlx5_ib_dev *dev,
3259                            struct mlx5_ib_flow_prio *prio, bool ft_added)
3260 {
3261         prio->refcount -= !!ft_added;
3262         if (!prio->refcount) {
3263                 mlx5_destroy_flow_table(prio->flow_table);
3264                 prio->flow_table = NULL;
3265         }
3266 }
3267
3268 static void counters_clear_description(struct ib_counters *counters)
3269 {
3270         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3271
3272         mutex_lock(&mcounters->mcntrs_mutex);
3273         kfree(mcounters->counters_data);
3274         mcounters->counters_data = NULL;
3275         mcounters->cntrs_max_index = 0;
3276         mutex_unlock(&mcounters->mcntrs_mutex);
3277 }
3278
3279 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
3280 {
3281         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
3282                                                           struct mlx5_ib_flow_handler,
3283                                                           ibflow);
3284         struct mlx5_ib_flow_handler *iter, *tmp;
3285         struct mlx5_ib_dev *dev = handler->dev;
3286
3287         mutex_lock(&dev->flow_db->lock);
3288
3289         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
3290                 mlx5_del_flow_rules(iter->rule);
3291                 put_flow_table(dev, iter->prio, true);
3292                 list_del(&iter->list);
3293                 kfree(iter);
3294         }
3295
3296         mlx5_del_flow_rules(handler->rule);
3297         put_flow_table(dev, handler->prio, true);
3298         if (handler->ibcounters &&
3299             atomic_read(&handler->ibcounters->usecnt) == 1)
3300                 counters_clear_description(handler->ibcounters);
3301
3302         mutex_unlock(&dev->flow_db->lock);
3303         if (handler->flow_matcher)
3304                 atomic_dec(&handler->flow_matcher->usecnt);
3305         kfree(handler);
3306
3307         return 0;
3308 }
3309
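     /*
      * Each IB flow priority maps onto a pair of core priorities:
      * 2 * priority for don't-trap rules and 2 * priority + 1 for regular
      * rules, so both kinds can coexist at the same user priority.
      */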
3310 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
3311 {
3312         priority *= 2;
3313         if (!dont_trap)
3314                 priority++;
3315         return priority;
3316 }
3317
3318 enum flow_table_type {
3319         MLX5_IB_FT_RX,
3320         MLX5_IB_FT_TX
3321 };
3322
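     /*
      * Limits for the auto-grouped bypass flow tables created below: up to
      * 2^16 flow table entries spread over at most 6 flow groups.
      */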
3323 #define MLX5_FS_MAX_TYPES        6
3324 #define MLX5_FS_MAX_ENTRIES      BIT(16)
3325
3326 static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
3327                                            struct mlx5_ib_flow_prio *prio,
3328                                            int priority,
3329                                            int num_entries, int num_groups,
3330                                            u32 flags)
3331 {
3332         struct mlx5_flow_table_attr ft_attr = {};
3333         struct mlx5_flow_table *ft;
3334
3335         ft_attr.prio = priority;
3336         ft_attr.max_fte = num_entries;
3337         ft_attr.flags = flags;
3338         ft_attr.autogroup.max_num_groups = num_groups;
3339         ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
3340         if (IS_ERR(ft))
3341                 return ERR_CAST(ft);
3342
3343         prio->flow_table = ft;
3344         prio->refcount = 0;
3345         return prio;
3346 }
3347
3348 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
3349                                                 struct ib_flow_attr *flow_attr,
3350                                                 enum flow_table_type ft_type)
3351 {
3352         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
3353         struct mlx5_flow_namespace *ns = NULL;
3354         struct mlx5_ib_flow_prio *prio;
3355         struct mlx5_flow_table *ft;
3356         int max_table_size;
3357         int num_entries;
3358         int num_groups;
3359         bool esw_encap;
3360         u32 flags = 0;
3361         int priority;
3362
3363         max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3364                                                        log_max_ft_size));
3365         esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
3366                 DEVLINK_ESWITCH_ENCAP_MODE_NONE;
3367         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3368                 enum mlx5_flow_namespace_type fn_type;
3369
3370                 if (flow_is_multicast_only(flow_attr) &&
3371                     !dont_trap)
3372                         priority = MLX5_IB_FLOW_MCAST_PRIO;
3373                 else
3374                         priority = ib_prio_to_core_prio(flow_attr->priority,
3375                                                         dont_trap);
3376                 if (ft_type == MLX5_IB_FT_RX) {
3377                         fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
3378                         prio = &dev->flow_db->prios[priority];
3379                         if (!dev->is_rep && !esw_encap &&
3380                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
3381                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3382                         if (!dev->is_rep && !esw_encap &&
3383                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3384                                         reformat_l3_tunnel_to_l2))
3385                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3386                 } else {
3387                         max_table_size =
3388                                 BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
3389                                                               log_max_ft_size));
3390                         fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
3391                         prio = &dev->flow_db->egress_prios[priority];
3392                         if (!dev->is_rep && !esw_encap &&
3393                             MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
3394                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3395                 }
3396                 ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
3397                 num_entries = MLX5_FS_MAX_ENTRIES;
3398                 num_groups = MLX5_FS_MAX_TYPES;
3399         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3400                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3401                 ns = mlx5_get_flow_namespace(dev->mdev,
3402                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
3403                 build_leftovers_ft_param(&priority,
3404                                          &num_entries,
3405                                          &num_groups);
3406                 prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
3407         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3408                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
3409                                         allow_sniffer_and_nic_rx_shared_tir))
3410                         return ERR_PTR(-ENOTSUPP);
3411
3412                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
3413                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
3414                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
3415
3416                 prio = &dev->flow_db->sniffer[ft_type];
3417                 priority = 0;
3418                 num_entries = 1;
3419                 num_groups = 1;
3420         }
3421
3422         if (!ns)
3423                 return ERR_PTR(-ENOTSUPP);
3424
3425         max_table_size = min_t(int, num_entries, max_table_size);
3426
3427         ft = prio->flow_table;
3428         if (!ft)
3429                 return _get_prio(ns, prio, priority, max_table_size, num_groups,
3430                                  flags);
3431
3432         return prio;
3433 }
3434
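     /*
      * For flows attached to an underlay QP, also match on the BTH
      * destination QP number so the rule only catches traffic of that
      * underlay QPN; the match is added only when the device supports the
      * bth_dst_qp field.
      */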
3435 static void set_underlay_qp(struct mlx5_ib_dev *dev,
3436                             struct mlx5_flow_spec *spec,
3437                             u32 underlay_qpn)
3438 {
3439         void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
3440                                            spec->match_criteria,
3441                                            misc_parameters);
3442         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3443                                            misc_parameters);
3444
3445         if (underlay_qpn &&
3446             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3447                                       ft_field_support.bth_dst_qp)) {
3448                 MLX5_SET(fte_match_set_misc,
3449                          misc_params_v, bth_dst_qp, underlay_qpn);
3450                 MLX5_SET(fte_match_set_misc,
3451                          misc_params_c, bth_dst_qp, 0xffffff);
3452         }
3453 }
3454
3455 static int read_flow_counters(struct ib_device *ibdev,
3456                               struct mlx5_read_counters_attr *read_attr)
3457 {
3458         struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
3459         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3460
3461         return mlx5_fc_query(dev->mdev, fc,
3462                              &read_attr->out[IB_COUNTER_PACKETS],
3463                              &read_attr->out[IB_COUNTER_BYTES]);
3464 }
3465
3466 /* flow counters currently expose two counters: packets and bytes */
3467 #define FLOW_COUNTERS_NUM 2
3468 static int counters_set_description(struct ib_counters *counters,
3469                                     enum mlx5_ib_counters_type counters_type,
3470                                     struct mlx5_ib_flow_counters_desc *desc_data,
3471                                     u32 ncounters)
3472 {
3473         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3474         u32 cntrs_max_index = 0;
3475         int i;
3476
3477         if (counters_type != MLX5_IB_COUNTERS_FLOW)
3478                 return -EINVAL;
3479
3480         /* init the fields for the object */
3481         mcounters->type = counters_type;
3482         mcounters->read_counters = read_flow_counters;
3483         mcounters->counters_num = FLOW_COUNTERS_NUM;
3484         mcounters->ncounters = ncounters;
3485         /* each counter entry has both a description and an index pair */
3486         for (i = 0; i < ncounters; i++) {
3487                 if (desc_data[i].description > IB_COUNTER_BYTES)
3488                         return -EINVAL;
3489
3490                 if (cntrs_max_index <= desc_data[i].index)
3491                         cntrs_max_index = desc_data[i].index + 1;
3492         }
3493
3494         mutex_lock(&mcounters->mcntrs_mutex);
3495         mcounters->counters_data = desc_data;
3496         mcounters->cntrs_max_index = cntrs_max_index;
3497         mutex_unlock(&mcounters->mcntrs_mutex);
3498
3499         return 0;
3500 }
3501
3502 #define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
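     /*
      * Bind an ib_counters object to a flow: the HW flow counter is created
      * on first use, and the description/index pairs copied from userspace
      * (if any) are recorded on the object.  Attaching a counters object
      * that was never described is rejected.
      */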
3503 static int flow_counters_set_data(struct ib_counters *ibcounters,
3504                                   struct mlx5_ib_create_flow *ucmd)
3505 {
3506         struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
3507         struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
3508         struct mlx5_ib_flow_counters_desc *desc_data = NULL;
3509         bool hw_hndl = false;
3510         int ret = 0;
3511
3512         if (ucmd && ucmd->ncounters_data != 0) {
3513                 cntrs_data = ucmd->data;
3514                 if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
3515                         return -EINVAL;
3516
3517                 desc_data = kcalloc(cntrs_data->ncounters,
3518                                     sizeof(*desc_data),
3519                                     GFP_KERNEL);
3520                 if (!desc_data)
3521                         return -ENOMEM;
3522
3523                 if (copy_from_user(desc_data,
3524                                    u64_to_user_ptr(cntrs_data->counters_data),
3525                                    sizeof(*desc_data) * cntrs_data->ncounters)) {
3526                         ret = -EFAULT;
3527                         goto free;
3528                 }
3529         }
3530
3531         if (!mcounters->hw_cntrs_hndl) {
3532                 mcounters->hw_cntrs_hndl = mlx5_fc_create(
3533                         to_mdev(ibcounters->device)->mdev, false);
3534                 if (IS_ERR(mcounters->hw_cntrs_hndl)) {
3535                         ret = PTR_ERR(mcounters->hw_cntrs_hndl);
3536                         goto free;
3537                 }
3538                 hw_hndl = true;
3539         }
3540
3541         if (desc_data) {
3542                 /* counters already bound to at least one flow */
3543                 if (mcounters->cntrs_max_index) {
3544                         ret = -EINVAL;
3545                         goto free_hndl;
3546                 }
3547
3548                 ret = counters_set_description(ibcounters,
3549                                                MLX5_IB_COUNTERS_FLOW,
3550                                                desc_data,
3551                                                cntrs_data->ncounters);
3552                 if (ret)
3553                         goto free_hndl;
3554
3555         } else if (!mcounters->cntrs_max_index) {
3556                 /* counters not bound yet, must have udata passed */
3557                 ret = -EINVAL;
3558                 goto free_hndl;
3559         }
3560
3561         return 0;
3562
3563 free_hndl:
3564         if (hw_hndl) {
3565                 mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
3566                                 mcounters->hw_cntrs_hndl);
3567                 mcounters->hw_cntrs_hndl = NULL;
3568         }
3569 free:
3570         kfree(desc_data);
3571         return ret;
3572 }
3573
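     /*
      * On a switchdev representor, restrict the rule to traffic coming from
      * the rep's vport: match on the vport metadata register when metadata
      * matching is enabled on the eswitch, otherwise on source_port directly.
      */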
3574 static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
3575                                          struct mlx5_flow_spec *spec,
3576                                          struct mlx5_eswitch_rep *rep)
3577 {
3578         struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
3579         void *misc;
3580
3581         if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
3582                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3583                                     misc_parameters_2);
3584
3585                 MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
3586                          mlx5_eswitch_get_vport_metadata_for_match(esw,
3587                                                                    rep->vport));
3588                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3589                                     misc_parameters_2);
3590
3591                 MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0);
3592         } else {
3593                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3594                                     misc_parameters);
3595
3596                 MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
3597
3598                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3599                                     misc_parameters);
3600
3601                 MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
3602         }
3603 }
3604
3605 static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
3606                                                       struct mlx5_ib_flow_prio *ft_prio,
3607                                                       const struct ib_flow_attr *flow_attr,
3608                                                       struct mlx5_flow_destination *dst,
3609                                                       u32 underlay_qpn,
3610                                                       struct mlx5_ib_create_flow *ucmd)
3611 {
3612         struct mlx5_flow_table  *ft = ft_prio->flow_table;
3613         struct mlx5_ib_flow_handler *handler;
3614         struct mlx5_flow_act flow_act = {};
3615         struct mlx5_flow_spec *spec;
3616         struct mlx5_flow_destination dest_arr[2] = {};
3617         struct mlx5_flow_destination *rule_dst = dest_arr;
3618         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
3619         unsigned int spec_index;
3620         u32 prev_type = 0;
3621         int err = 0;
3622         int dest_num = 0;
3623         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3624
3625         if (!is_valid_attr(dev->mdev, flow_attr))
3626                 return ERR_PTR(-EINVAL);
3627
3628         if (dev->is_rep && is_egress)
3629                 return ERR_PTR(-EINVAL);
3630
3631         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
3632         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
3633         if (!handler || !spec) {
3634                 err = -ENOMEM;
3635                 goto free;
3636         }
3637
3638         INIT_LIST_HEAD(&handler->list);
3639
3640         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3641                 err = parse_flow_attr(dev->mdev, spec,
3642                                       ib_flow, flow_attr, &flow_act,
3643                                       prev_type);
3644                 if (err < 0)
3645                         goto free;
3646
3647                 prev_type = ((union ib_flow_spec *)ib_flow)->type;
3648                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
3649         }
3650
3651         if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
3652                 memcpy(&dest_arr[0], dst, sizeof(*dst));
3653                 dest_num++;
3654         }
3655
3656         if (!flow_is_multicast_only(flow_attr))
3657                 set_underlay_qp(dev, spec, underlay_qpn);
3658
3659         if (dev->is_rep) {
3660                 struct mlx5_eswitch_rep *rep;
3661
3662                 rep = dev->port[flow_attr->port - 1].rep;
3663                 if (!rep) {
3664                         err = -EINVAL;
3665                         goto free;
3666                 }
3667
3668                 mlx5_ib_set_rule_source_port(dev, spec, rep);
3669         }
3670
3671         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
3672
3673         if (is_egress &&
3674             !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
3675                 err = -EINVAL;
3676                 goto free;
3677         }
3678
3679         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
3680                 struct mlx5_ib_mcounters *mcounters;
3681
3682                 err = flow_counters_set_data(flow_act.counters, ucmd);
3683                 if (err)
3684                         goto free;
3685
3686                 mcounters = to_mcounters(flow_act.counters);
3687                 handler->ibcounters = flow_act.counters;
3688                 dest_arr[dest_num].type =
3689                         MLX5_FLOW_DESTINATION_TYPE_COUNTER;
3690                 dest_arr[dest_num].counter_id =
3691                         mlx5_fc_id(mcounters->hw_cntrs_hndl);
3692                 dest_num++;
3693         }
3694
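             /*
              * Drop rules need no forwarding destination (unless a counter was
              * added above); egress rules use ALLOW, and all other rules either
              * forward to the destination or, with no destination, to the next
              * flow priority.
              */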
3695         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
3696                 if (!dest_num)
3697                         rule_dst = NULL;
3698         } else {
3699                 if (is_egress)
3700                         flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
3701                 else
3702                         flow_act.action |=
3703                                 dest_num ?  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
3704                                         MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
3705         }
3706
3707         if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
3708             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3709              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3710                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x aren't allowed in leftovers\n",
3711                              spec->flow_context.flow_tag, flow_attr->type);
3712                 err = -EINVAL;
3713                 goto free;
3714         }
3715         handler->rule = mlx5_add_flow_rules(ft, spec,
3716                                             &flow_act,
3717                                             rule_dst, dest_num);
3718
3719         if (IS_ERR(handler->rule)) {
3720                 err = PTR_ERR(handler->rule);
3721                 goto free;
3722         }
3723
3724         ft_prio->refcount++;
3725         handler->prio = ft_prio;
3726         handler->dev = dev;
3727
3728         ft_prio->flow_table = ft;
3729 free:
3730         if (err && handler) {
3731                 if (handler->ibcounters &&
3732                     atomic_read(&handler->ibcounters->usecnt) == 1)
3733                         counters_clear_description(handler->ibcounters);
3734                 kfree(handler);
3735         }
3736         kvfree(spec);
3737         return err ? ERR_PTR(err) : handler;
3738 }
3739
3740 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
3741                                                      struct mlx5_ib_flow_prio *ft_prio,
3742                                                      const struct ib_flow_attr *flow_attr,
3743                                                      struct mlx5_flow_destination *dst)
3744 {
3745         return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
3746 }
3747
3748 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
3749                                                           struct mlx5_ib_flow_prio *ft_prio,
3750                                                           struct ib_flow_attr *flow_attr,
3751                                                           struct mlx5_flow_destination *dst)
3752 {
3753         struct mlx5_ib_flow_handler *handler_dst = NULL;
3754         struct mlx5_ib_flow_handler *handler = NULL;
3755
3756         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
3757         if (!IS_ERR(handler)) {
3758                 handler_dst = create_flow_rule(dev, ft_prio,
3759                                                flow_attr, dst);
3760                 if (IS_ERR(handler_dst)) {
3761                         mlx5_del_flow_rules(handler->rule);
3762                         ft_prio->refcount--;
3763                         kfree(handler);
3764                         handler = handler_dst;
3765                 } else {
3766                         list_add(&handler_dst->list, &handler->list);
3767                 }
3768         }
3769
3770         return handler;
3771 }
3772 enum {
3773         LEFTOVERS_MC,
3774         LEFTOVERS_UC,
3775 };
3776
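     /*
      * Default (leftovers) rules: the MC entry matches any destination MAC
      * with the multicast bit set (mask and value dst_mac[0] = 0x1), the UC
      * entry any destination MAC with that bit clear.
      */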
3777 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
3778                                                           struct mlx5_ib_flow_prio *ft_prio,
3779                                                           struct ib_flow_attr *flow_attr,
3780                                                           struct mlx5_flow_destination *dst)
3781 {
3782         struct mlx5_ib_flow_handler *handler_ucast = NULL;
3783         struct mlx5_ib_flow_handler *handler = NULL;
3784
3785         static struct {
3786                 struct ib_flow_attr     flow_attr;
3787                 struct ib_flow_spec_eth eth_flow;
3788         } leftovers_specs[] = {
3789                 [LEFTOVERS_MC] = {
3790                         .flow_attr = {
3791                                 .num_of_specs = 1,
3792                                 .size = sizeof(leftovers_specs[0])
3793                         },
3794                         .eth_flow = {
3795                                 .type = IB_FLOW_SPEC_ETH,
3796                                 .size = sizeof(struct ib_flow_spec_eth),
3797                                 .mask = {.dst_mac = {0x1} },
3798                                 .val =  {.dst_mac = {0x1} }
3799                         }
3800                 },
3801                 [LEFTOVERS_UC] = {
3802                         .flow_attr = {
3803                                 .num_of_specs = 1,
3804                                 .size = sizeof(leftovers_specs[0])
3805                         },
3806                         .eth_flow = {
3807                                 .type = IB_FLOW_SPEC_ETH,
3808                                 .size = sizeof(struct ib_flow_spec_eth),
3809                                 .mask = {.dst_mac = {0x1} },
3810                                 .val = {.dst_mac = {} }
3811                         }
3812                 }
3813         };
3814
3815         handler = create_flow_rule(dev, ft_prio,
3816                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
3817                                    dst);
3818         if (!IS_ERR(handler) &&
3819             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
3820                 handler_ucast = create_flow_rule(dev, ft_prio,
3821                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
3822                                                  dst);
3823                 if (IS_ERR(handler_ucast)) {
3824                         mlx5_del_flow_rules(handler->rule);
3825                         ft_prio->refcount--;
3826                         kfree(handler);
3827                         handler = handler_ucast;
3828                 } else {
3829                         list_add(&handler_ucast->list, &handler->list);
3830                 }
3831         }
3832
3833         return handler;
3834 }
3835
3836 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
3837                                                         struct mlx5_ib_flow_prio *ft_rx,
3838                                                         struct mlx5_ib_flow_prio *ft_tx,
3839                                                         struct mlx5_flow_destination *dst)
3840 {
3841         struct mlx5_ib_flow_handler *handler_rx;
3842         struct mlx5_ib_flow_handler *handler_tx;
3843         int err;
3844         static const struct ib_flow_attr flow_attr = {
3845                 .num_of_specs = 0,
3846                 .size = sizeof(flow_attr)
3847         };
3848
3849         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
3850         if (IS_ERR(handler_rx)) {
3851                 err = PTR_ERR(handler_rx);
3852                 goto err;
3853         }
3854
3855         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
3856         if (IS_ERR(handler_tx)) {
3857                 err = PTR_ERR(handler_tx);
3858                 goto err_tx;
3859         }
3860
3861         list_add(&handler_tx->list, &handler_rx->list);
3862
3863         return handler_rx;
3864
3865 err_tx:
3866         mlx5_del_flow_rules(handler_rx->rule);
3867         ft_rx->refcount--;
3868         kfree(handler_rx);
3869 err:
3870         return ERR_PTR(err);
3871 }
3872
3873 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
3874                                            struct ib_flow_attr *flow_attr,
3875                                            int domain,
3876                                            struct ib_udata *udata)
3877 {
3878         struct mlx5_ib_dev *dev = to_mdev(qp->device);
3879         struct mlx5_ib_qp *mqp = to_mqp(qp);
3880         struct mlx5_ib_flow_handler *handler = NULL;
3881         struct mlx5_flow_destination *dst = NULL;
3882         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
3883         struct mlx5_ib_flow_prio *ft_prio;
3884         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3885         struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
3886         size_t min_ucmd_sz, required_ucmd_sz;
3887         int err;
3888         int underlay_qpn;
3889
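             /*
              * Parse the optional user command: copy the fixed-size header
              * first to learn how many counters-data elements follow, then
              * copy the full command.  Trailing bytes beyond the expected
              * size must be cleared so the layout can be extended later.
              */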
3890         if (udata && udata->inlen) {
3891                 min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
3892                                 sizeof(ucmd_hdr.reserved);
3893                 if (udata->inlen < min_ucmd_sz)
3894                         return ERR_PTR(-EOPNOTSUPP);
3895
3896                 err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
3897                 if (err)
3898                         return ERR_PTR(err);
3899
3900                 /* currently only one counters data block is supported */
3901                 if (ucmd_hdr.ncounters_data > 1)
3902                         return ERR_PTR(-EINVAL);
3903
3904                 required_ucmd_sz = min_ucmd_sz +
3905                         sizeof(struct mlx5_ib_flow_counters_data) *
3906                         ucmd_hdr.ncounters_data;
3907                 if (udata->inlen > required_ucmd_sz &&
3908                     !ib_is_udata_cleared(udata, required_ucmd_sz,
3909                                          udata->inlen - required_ucmd_sz))
3910                         return ERR_PTR(-EOPNOTSUPP);
3911
3912                 ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
3913                 if (!ucmd)
3914                         return ERR_PTR(-ENOMEM);
3915
3916                 err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
3917                 if (err)
3918                         goto free_ucmd;
3919         }
3920
3921         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
3922                 err = -ENOMEM;
3923                 goto free_ucmd;
3924         }
3925
3926         if (domain != IB_FLOW_DOMAIN_USER ||
3927             flow_attr->port > dev->num_ports ||
3928             (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
3929                                   IB_FLOW_ATTR_FLAGS_EGRESS))) {
3930                 err = -EINVAL;
3931                 goto free_ucmd;
3932         }
3933
3934         if (is_egress &&
3935             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3936              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3937                 err = -EINVAL;
3938                 goto free_ucmd;
3939         }
3940
3941         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
3942         if (!dst) {
3943                 err = -ENOMEM;
3944                 goto free_ucmd;
3945         }
3946
3947         mutex_lock(&dev->flow_db->lock);
3948
3949         ft_prio = get_flow_table(dev, flow_attr,
3950                                  is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
3951         if (IS_ERR(ft_prio)) {
3952                 err = PTR_ERR(ft_prio);
3953                 goto unlock;
3954         }
3955         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3956                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
3957                 if (IS_ERR(ft_prio_tx)) {
3958                         err = PTR_ERR(ft_prio_tx);
3959                         ft_prio_tx = NULL;
3960                         goto destroy_ft;
3961                 }
3962         }
3963
3964         if (is_egress) {
3965                 dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
3966         } else {
3967                 dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
3968                 if (mqp->flags & MLX5_IB_QP_RSS)
3969                         dst->tir_num = mqp->rss_qp.tirn;
3970                 else
3971                         dst->tir_num = mqp->raw_packet_qp.rq.tirn;
3972         }
3973
3974         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3975                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
3976                         handler = create_dont_trap_rule(dev, ft_prio,
3977                                                         flow_attr, dst);
3978                 } else {
3979                         underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ?
3980                                         mqp->underlay_qpn : 0;
3981                         handler = _create_flow_rule(dev, ft_prio, flow_attr,
3982                                                     dst, underlay_qpn, ucmd);
3983                 }
3984         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3985                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3986                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
3987                                                 dst);
3988         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3989                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
3990         } else {
3991                 err = -EINVAL;
3992                 goto destroy_ft;
3993         }
3994
3995         if (IS_ERR(handler)) {
3996                 err = PTR_ERR(handler);
3997                 handler = NULL;
3998                 goto destroy_ft;
3999         }
4000
4001         mutex_unlock(&dev->flow_db->lock);
4002         kfree(dst);
4003         kfree(ucmd);
4004
4005         return &handler->ibflow;
4006
4007 destroy_ft:
4008         put_flow_table(dev, ft_prio, false);
4009         if (ft_prio_tx)
4010                 put_flow_table(dev, ft_prio_tx, false);
4011 unlock:
4012         mutex_unlock(&dev->flow_db->lock);
4013         kfree(dst);
4014 free_ucmd:
4015         kfree(ucmd);
4016         return ERR_PTR(err);
4017 }
4018
4019 static struct mlx5_ib_flow_prio *
4020 _get_flow_table(struct mlx5_ib_dev *dev,
4021                 struct mlx5_ib_flow_matcher *fs_matcher,
4022                 bool mcast)
4023 {
4024         struct mlx5_flow_namespace *ns = NULL;
4025         struct mlx5_ib_flow_prio *prio = NULL;
4026         int max_table_size = 0;
4027         bool esw_encap;
4028         u32 flags = 0;
4029         int priority;
4030
4031         if (mcast)
4032                 priority = MLX5_IB_FLOW_MCAST_PRIO;
4033         else
4034                 priority = ib_prio_to_core_prio(fs_matcher->priority, false);
4035
4036         esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
4037                 DEVLINK_ESWITCH_ENCAP_MODE_NONE;
4038         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
4039                 max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
4040                                         log_max_ft_size));
4041                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
4042                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
4043                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
4044                                               reformat_l3_tunnel_to_l2) &&
4045                     !esw_encap)
4046                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
4047         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
4048                 max_table_size = BIT(
4049                         MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
4050                 if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
4051                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
4052         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
4053                 max_table_size = BIT(
4054                         MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
4055                 if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
4056                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
4057                 if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
4058                     esw_encap)
4059                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
4060                 priority = FDB_BYPASS_PATH;
4061         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
4062                 max_table_size =
4063                         BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
4064                                                        log_max_ft_size));
4065                 priority = fs_matcher->priority;
4066         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
4067                 max_table_size =
4068                         BIT(MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
4069                                                        log_max_ft_size));
4070                 priority = fs_matcher->priority;
4071         }
4072
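        /*
         * Cap the size advertised by the device; e.g. a log_max_ft_size of
         * 16 yields BIT(16) == 65536 entries, which is then clamped to
         * MLX5_FS_MAX_ENTRIES below.
         */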
4073         max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
4074
4075         ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
4076         if (!ns)
4077                 return ERR_PTR(-ENOTSUPP);
4078
4079         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
4080                 prio = &dev->flow_db->prios[priority];
4081         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
4082                 prio = &dev->flow_db->egress_prios[priority];
4083         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
4084                 prio = &dev->flow_db->fdb;
4085         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
4086                 prio = &dev->flow_db->rdma_rx[priority];
4087         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)
4088                 prio = &dev->flow_db->rdma_tx[priority];
4089
4090         if (!prio)
4091                 return ERR_PTR(-EINVAL);
4092
4093         if (prio->flow_table)
4094                 return prio;
4095
4096         return _get_prio(ns, prio, priority, max_table_size,
4097                          MLX5_FS_MAX_TYPES, flags);
4098 }
4099
4100 static struct mlx5_ib_flow_handler *
4101 _create_raw_flow_rule(struct mlx5_ib_dev *dev,
4102                       struct mlx5_ib_flow_prio *ft_prio,
4103                       struct mlx5_flow_destination *dst,
4104                       struct mlx5_ib_flow_matcher  *fs_matcher,
4105                       struct mlx5_flow_context *flow_context,
4106                       struct mlx5_flow_act *flow_act,
4107                       void *cmd_in, int inlen,
4108                       int dst_num)
4109 {
4110         struct mlx5_ib_flow_handler *handler;
4111         struct mlx5_flow_spec *spec;
4112         struct mlx5_flow_table *ft = ft_prio->flow_table;
4113         int err = 0;
4114
4115         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
4116         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
4117         if (!handler || !spec) {
4118                 err = -ENOMEM;
4119                 goto free;
4120         }
4121
4122         INIT_LIST_HEAD(&handler->list);
4123
4124         memcpy(spec->match_value, cmd_in, inlen);
4125         memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
4126                fs_matcher->mask_len);
4127         spec->match_criteria_enable = fs_matcher->match_criteria_enable;
4128         spec->flow_context = *flow_context;
4129
4130         handler->rule = mlx5_add_flow_rules(ft, spec,
4131                                             flow_act, dst, dst_num);
4132
4133         if (IS_ERR(handler->rule)) {
4134                 err = PTR_ERR(handler->rule);
4135                 goto free;
4136         }
4137
4138         ft_prio->refcount++;
4139         handler->prio = ft_prio;
4140         handler->dev = dev;
4141         ft_prio->flow_table = ft;
4142
4143 free:
4144         if (err)
4145                 kfree(handler);
4146         kvfree(spec);
4147         return err ? ERR_PTR(err) : handler;
4148 }
4149
4150 static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
4151                                 void *match_v)
4152 {
4153         void *match_c;
4154         void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
4155         void *dmac, *dmac_mask;
4156         void *ipv4, *ipv4_mask;
4157
4158         if (!(fs_matcher->match_criteria_enable &
4159               (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
4160                 return false;
4161
4162         match_c = fs_matcher->matcher_mask.match_params;
4163         match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
4164                                            outer_headers);
4165         match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
4166                                            outer_headers);
4167
4168         dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
4169                             dmac_47_16);
4170         dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
4171                                  dmac_47_16);
4172
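        /*
         * An Ethernet address is multicast when the least significant bit
         * of its first octet is set (e.g. 01:00:5e:xx:xx:xx for IPv4
         * multicast), which is what is_multicast_ether_addr() tests, and
         * ipv4_is_multicast() matches the 224.0.0.0/4 range.  Both the
         * value and the mask must look multicast for the rule to be
         * steered to the multicast prio.
         */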
4173         if (is_multicast_ether_addr(dmac) &&
4174             is_multicast_ether_addr(dmac_mask))
4175                 return true;
4176
4177         ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
4178                             dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
4179
4180         ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
4181                                  dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
4182
4183         if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
4184             ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
4185                 return true;
4186
4187         return false;
4188 }
4189
4190 struct mlx5_ib_flow_handler *
4191 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
4192                         struct mlx5_ib_flow_matcher *fs_matcher,
4193                         struct mlx5_flow_context *flow_context,
4194                         struct mlx5_flow_act *flow_act,
4195                         u32 counter_id,
4196                         void *cmd_in, int inlen, int dest_id,
4197                         int dest_type)
4198 {
4199         struct mlx5_flow_destination *dst;
4200         struct mlx5_ib_flow_prio *ft_prio;
4201         struct mlx5_ib_flow_handler *handler;
4202         int dst_num = 0;
4203         bool mcast;
4204         int err;
4205
4206         if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
4207                 return ERR_PTR(-EOPNOTSUPP);
4208
4209         if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
4210                 return ERR_PTR(-ENOMEM);
4211
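        /*
         * At most two destinations are ever needed here: the forwarding
         * destination (TIR, flow table or port) and, when
         * MLX5_FLOW_CONTEXT_ACTION_COUNT is set, a flow counter.
         */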
4212         dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
4213         if (!dst)
4214                 return ERR_PTR(-ENOMEM);
4215
4216         mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
4217         mutex_lock(&dev->flow_db->lock);
4218
4219         ft_prio = _get_flow_table(dev, fs_matcher, mcast);
4220         if (IS_ERR(ft_prio)) {
4221                 err = PTR_ERR(ft_prio);
4222                 goto unlock;
4223         }
4224
4225         if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
4226                 dst[dst_num].type = dest_type;
4227                 dst[dst_num].tir_num = dest_id;
4228                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
4229         } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
4230                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
4231                 dst[dst_num].ft_num = dest_id;
4232                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
4233         } else {
4234                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
4235                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
4236         }
4237
4238         dst_num++;
4239
4240         if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
4241                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
4242                 dst[dst_num].counter_id = counter_id;
4243                 dst_num++;
4244         }
4245
4246         handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
4247                                         flow_context, flow_act,
4248                                         cmd_in, inlen, dst_num);
4249
4250         if (IS_ERR(handler)) {
4251                 err = PTR_ERR(handler);
4252                 goto destroy_ft;
4253         }
4254
4255         mutex_unlock(&dev->flow_db->lock);
4256         atomic_inc(&fs_matcher->usecnt);
4257         handler->flow_matcher = fs_matcher;
4258
4259         kfree(dst);
4260
4261         return handler;
4262
4263 destroy_ft:
4264         put_flow_table(dev, ft_prio, false);
4265 unlock:
4266         mutex_unlock(&dev->flow_db->lock);
4267         kfree(dst);
4268
4269         return ERR_PTR(err);
4270 }
4271
4272 static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
4273 {
4274         u32 flags = 0;
4275
4276         if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
4277                 flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
4278
4279         return flags;
4280 }
4281
4282 #define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED      MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
4283 static struct ib_flow_action *
4284 mlx5_ib_create_flow_action_esp(struct ib_device *device,
4285                                const struct ib_flow_action_attrs_esp *attr,
4286                                struct uverbs_attr_bundle *attrs)
4287 {
4288         struct mlx5_ib_dev *mdev = to_mdev(device);
4289         struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
4290         struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
4291         struct mlx5_ib_flow_action *action;
4292         u64 action_flags;
4293         u64 flags;
4294         int err = 0;
4295
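        /*
         * ((LAST_SUPPORTED << 1) - 1) builds a mask of every flag bit up to
         * and including the last supported one; with only
         * MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA supported, the
         * mask is that single bit, so any other flag is rejected here.
         */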
4296         err = uverbs_get_flags64(
4297                 &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
4298                 ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
4299         if (err)
4300                 return ERR_PTR(err);
4301
4302         flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
4303
4304         /* We currently support only a subset of the standard features: only a
4305          * keymat of type AES_GCM with icv_len == 16, iv_algo == SEQ and ESN
4306          * (with overlap). Full offload mode isn't supported.
4307          */
4308         if (!attr->keymat || attr->replay || attr->encap ||
4309             attr->spi || attr->seq || attr->tfc_pad ||
4310             attr->hard_limit_pkts ||
4311             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4312                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
4313                 return ERR_PTR(-EOPNOTSUPP);
4314
4315         if (attr->keymat->protocol !=
4316             IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
4317                 return ERR_PTR(-EOPNOTSUPP);
4318
4319         aes_gcm = &attr->keymat->keymat.aes_gcm;
4320
4321         if (aes_gcm->icv_len != 16 ||
4322             aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
4323                 return ERR_PTR(-EOPNOTSUPP);
4324
4325         action = kmalloc(sizeof(*action), GFP_KERNEL);
4326         if (!action)
4327                 return ERR_PTR(-ENOMEM);
4328
4329         action->esp_aes_gcm.ib_flags = attr->flags;
4330         memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
4331                sizeof(accel_attrs.keymat.aes_gcm.aes_key));
4332         accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
4333         memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
4334                sizeof(accel_attrs.keymat.aes_gcm.salt));
4335         memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
4336                sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
4337         accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
4338         accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
4339         accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
4340
4341         accel_attrs.esn = attr->esn;
4342         if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
4343                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
4344         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4345                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4346
4347         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
4348                 accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
4349
4350         action->esp_aes_gcm.ctx =
4351                 mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
4352         if (IS_ERR(action->esp_aes_gcm.ctx)) {
4353                 err = PTR_ERR(action->esp_aes_gcm.ctx);
4354                 goto err_parse;
4355         }
4356
4357         action->esp_aes_gcm.ib_flags = attr->flags;
4358
4359         return &action->ib_action;
4360
4361 err_parse:
4362         kfree(action);
4363         return ERR_PTR(err);
4364 }
4365
4366 static int
4367 mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
4368                                const struct ib_flow_action_attrs_esp *attr,
4369                                struct uverbs_attr_bundle *attrs)
4370 {
4371         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4372         struct mlx5_accel_esp_xfrm_attrs accel_attrs;
4373         int err = 0;
4374
4375         if (attr->keymat || attr->replay || attr->encap ||
4376             attr->spi || attr->seq || attr->tfc_pad ||
4377             attr->hard_limit_pkts ||
4378             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4379                              IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
4380                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
4381                 return -EOPNOTSUPP;
4382
4383         /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
4384          * be modified.
4385          */
4386         if (!(maction->esp_aes_gcm.ib_flags &
4387               IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
4388             attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4389                            IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
4390                 return -EINVAL;
4391
4392         memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
4393                sizeof(accel_attrs));
4394
4395         accel_attrs.esn = attr->esn;
4396         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4397                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4398         else
4399                 accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4400
4401         err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
4402                                          &accel_attrs);
4403         if (err)
4404                 return err;
4405
4406         maction->esp_aes_gcm.ib_flags &=
4407                 ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4408         maction->esp_aes_gcm.ib_flags |=
4409                 attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4410
4411         return 0;
4412 }
4413
4414 static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
4415 {
4416         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4417
4418         switch (action->type) {
4419         case IB_FLOW_ACTION_ESP:
4420                 /*
4421                  * Only aes_gcm is supported for now, so we implicitly know it is
4422                  * the underlying crypto.
4423                  */
4424                 mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
4425                 break;
4426         case IB_FLOW_ACTION_UNSPECIFIED:
4427                 mlx5_ib_destroy_flow_action_raw(maction);
4428                 break;
4429         default:
4430                 WARN_ON(true);
4431                 break;
4432         }
4433
4434         kfree(maction);
4435         return 0;
4436 }
4437
4438 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4439 {
4440         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4441         struct mlx5_ib_qp *mqp = to_mqp(ibqp);
4442         int err;
4443         u16 uid;
4444
4445         uid = ibqp->pd ?
4446                 to_mpd(ibqp->pd)->uid : 0;
4447
4448         if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
4449                 mlx5_ib_dbg(dev, "Attaching a multicast group to an underlay QP is not supported\n");
4450                 return -EOPNOTSUPP;
4451         }
4452
4453         err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4454         if (err)
4455                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
4456                              ibqp->qp_num, gid->raw);
4457
4458         return err;
4459 }
4460
4461 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4462 {
4463         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4464         int err;
4465         u16 uid;
4466
4467         uid = ibqp->pd ?
4468                 to_mpd(ibqp->pd)->uid : 0;
4469         err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4470         if (err)
4471                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
4472                              ibqp->qp_num, gid->raw);
4473
4474         return err;
4475 }
4476
4477 static int init_node_data(struct mlx5_ib_dev *dev)
4478 {
4479         int err;
4480
4481         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
4482         if (err)
4483                 return err;
4484
4485         dev->mdev->rev_id = dev->mdev->pdev->revision;
4486
4487         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
4488 }
4489
4490 static ssize_t fw_pages_show(struct device *device,
4491                              struct device_attribute *attr, char *buf)
4492 {
4493         struct mlx5_ib_dev *dev =
4494                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4495
4496         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
4497 }
4498 static DEVICE_ATTR_RO(fw_pages);
4499
4500 static ssize_t reg_pages_show(struct device *device,
4501                               struct device_attribute *attr, char *buf)
4502 {
4503         struct mlx5_ib_dev *dev =
4504                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4505
4506         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
4507 }
4508 static DEVICE_ATTR_RO(reg_pages);
4509
4510 static ssize_t hca_type_show(struct device *device,
4511                              struct device_attribute *attr, char *buf)
4512 {
4513         struct mlx5_ib_dev *dev =
4514                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4515
4516         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
4517 }
4518 static DEVICE_ATTR_RO(hca_type);
4519
4520 static ssize_t hw_rev_show(struct device *device,
4521                            struct device_attribute *attr, char *buf)
4522 {
4523         struct mlx5_ib_dev *dev =
4524                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4525
4526         return sprintf(buf, "%x\n", dev->mdev->rev_id);
4527 }
4528 static DEVICE_ATTR_RO(hw_rev);
4529
4530 static ssize_t board_id_show(struct device *device,
4531                              struct device_attribute *attr, char *buf)
4532 {
4533         struct mlx5_ib_dev *dev =
4534                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4535
4536         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
4537                        dev->mdev->board_id);
4538 }
4539 static DEVICE_ATTR_RO(board_id);
4540
4541 static struct attribute *mlx5_class_attributes[] = {
4542         &dev_attr_hw_rev.attr,
4543         &dev_attr_hca_type.attr,
4544         &dev_attr_board_id.attr,
4545         &dev_attr_fw_pages.attr,
4546         &dev_attr_reg_pages.attr,
4547         NULL,
4548 };
4549
4550 static const struct attribute_group mlx5_attr_group = {
4551         .attrs = mlx5_class_attributes,
4552 };
4553
4554 static void pkey_change_handler(struct work_struct *work)
4555 {
4556         struct mlx5_ib_port_resources *ports =
4557                 container_of(work, struct mlx5_ib_port_resources,
4558                              pkey_change_work);
4559
4560         mutex_lock(&ports->devr->mutex);
4561         mlx5_ib_gsi_pkey_change(ports->gsi);
4562         mutex_unlock(&ports->devr->mutex);
4563 }
4564
4565 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
4566 {
4567         struct mlx5_ib_qp *mqp;
4568         struct mlx5_ib_cq *send_mcq, *recv_mcq;
4569         struct mlx5_core_cq *mcq;
4570         struct list_head cq_armed_list;
4571         unsigned long flags_qp;
4572         unsigned long flags_cq;
4573         unsigned long flags;
4574
4575         INIT_LIST_HEAD(&cq_armed_list);
4576
4577         /* Walk the QP list residing on this ibdev, synchronized with QP create/destroy. */
4578         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
4579         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
4580                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
4581                 if (mqp->sq.tail != mqp->sq.head) {
4582                         send_mcq = to_mcq(mqp->ibqp.send_cq);
4583                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
4584                         if (send_mcq->mcq.comp &&
4585                             mqp->ibqp.send_cq->comp_handler) {
4586                                 if (!send_mcq->mcq.reset_notify_added) {
4587                                         send_mcq->mcq.reset_notify_added = 1;
4588                                         list_add_tail(&send_mcq->mcq.reset_notify,
4589                                                       &cq_armed_list);
4590                                 }
4591                         }
4592                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
4593                 }
4594                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
4595                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
4596                 /* no handling is needed for SRQ */
4597                 if (!mqp->ibqp.srq) {
4598                         if (mqp->rq.tail != mqp->rq.head) {
4599                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
4600                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
4601                                 if (recv_mcq->mcq.comp &&
4602                                     mqp->ibqp.recv_cq->comp_handler) {
4603                                         if (!recv_mcq->mcq.reset_notify_added) {
4604                                                 recv_mcq->mcq.reset_notify_added = 1;
4605                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
4606                                                               &cq_armed_list);
4607                                         }
4608                                 }
4609                                 spin_unlock_irqrestore(&recv_mcq->lock,
4610                                                        flags_cq);
4611                         }
4612                 }
4613                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
4614         }
4615         /* At this point all in-flight post-send work has been flushed by the
4616          * lock/unlock of the locks above; now arm all of the involved CQs.
4617          */
4618         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
4619                 mcq->comp(mcq, NULL);
4620         }
4621         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
4622 }
4623
4624 static void delay_drop_handler(struct work_struct *work)
4625 {
4626         int err;
4627         struct mlx5_ib_delay_drop *delay_drop =
4628                 container_of(work, struct mlx5_ib_delay_drop,
4629                              delay_drop_work);
4630
4631         atomic_inc(&delay_drop->events_cnt);
4632
4633         mutex_lock(&delay_drop->lock);
4634         err = mlx5_core_set_delay_drop(delay_drop->dev->mdev,
4635                                        delay_drop->timeout);
4636         if (err) {
4637                 mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
4638                              delay_drop->timeout);
4639                 delay_drop->activate = false;
4640         }
4641         mutex_unlock(&delay_drop->lock);
4642 }
4643
4644 static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4645                                  struct ib_event *ibev)
4646 {
4647         u8 port = (eqe->data.port.port >> 4) & 0xf;
4648
4649         switch (eqe->sub_type) {
4650         case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
4651                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4652                                             IB_LINK_LAYER_ETHERNET)
4653                         schedule_work(&ibdev->delay_drop.delay_drop_work);
4654                 break;
4655         default: /* do nothing */
4656                 return;
4657         }
4658 }
4659
4660 static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4661                               struct ib_event *ibev)
4662 {
4663         u8 port = (eqe->data.port.port >> 4) & 0xf;
4664
4665         ibev->element.port_num = port;
4666
4667         switch (eqe->sub_type) {
4668         case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
4669         case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
4670         case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
4671                 /* In RoCE, port up/down events are handled in
4672                  * mlx5_netdev_event().
4673                  */
4674                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4675                                             IB_LINK_LAYER_ETHERNET)
4676                         return -EINVAL;
4677
4678                 ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
4679                                 IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
4680                 break;
4681
4682         case MLX5_PORT_CHANGE_SUBTYPE_LID:
4683                 ibev->event = IB_EVENT_LID_CHANGE;
4684                 break;
4685
4686         case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
4687                 ibev->event = IB_EVENT_PKEY_CHANGE;
4688                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
4689                 break;
4690
4691         case MLX5_PORT_CHANGE_SUBTYPE_GUID:
4692                 ibev->event = IB_EVENT_GID_CHANGE;
4693                 break;
4694
4695         case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
4696                 ibev->event = IB_EVENT_CLIENT_REREGISTER;
4697                 break;
4698         default:
4699                 return -EINVAL;
4700         }
4701
4702         return 0;
4703 }
4704
4705 static void mlx5_ib_handle_event(struct work_struct *_work)
4706 {
4707         struct mlx5_ib_event_work *work =
4708                 container_of(_work, struct mlx5_ib_event_work, work);
4709         struct mlx5_ib_dev *ibdev;
4710         struct ib_event ibev;
4711         bool fatal = false;
4712
4713         if (work->is_slave) {
4714                 ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
4715                 if (!ibdev)
4716                         goto out;
4717         } else {
4718                 ibdev = work->dev;
4719         }
4720
4721         switch (work->event) {
4722         case MLX5_DEV_EVENT_SYS_ERROR:
4723                 ibev.event = IB_EVENT_DEVICE_FATAL;
4724                 mlx5_ib_handle_internal_error(ibdev);
4725                 ibev.element.port_num  = (u8)(unsigned long)work->param;
4726                 fatal = true;
4727                 break;
4728         case MLX5_EVENT_TYPE_PORT_CHANGE:
4729                 if (handle_port_change(ibdev, work->param, &ibev))
4730                         goto out;
4731                 break;
4732         case MLX5_EVENT_TYPE_GENERAL_EVENT:
4733                 handle_general_event(ibdev, work->param, &ibev);
4734                 /* fall through */
4735         default:
4736                 goto out;
4737         }
4738
4739         ibev.device = &ibdev->ib_dev;
4740
4741         if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
4742                 mlx5_ib_warn(ibdev, "warning: event on port %d\n",  ibev.element.port_num);
4743                 goto out;
4744         }
4745
4746         if (ibdev->ib_active)
4747                 ib_dispatch_event(&ibev);
4748
4749         if (fatal)
4750                 ibdev->ib_active = false;
4751 out:
4752         kfree(work);
4753 }
4754
4755 static int mlx5_ib_event(struct notifier_block *nb,
4756                          unsigned long event, void *param)
4757 {
4758         struct mlx5_ib_event_work *work;
4759
4760         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4761         if (!work)
4762                 return NOTIFY_DONE;
4763
4764         INIT_WORK(&work->work, mlx5_ib_handle_event);
4765         work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
4766         work->is_slave = false;
4767         work->param = param;
4768         work->event = event;
4769
4770         queue_work(mlx5_ib_event_wq, &work->work);
4771
4772         return NOTIFY_OK;
4773 }
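/*
 * A minimal sketch (an assumption for illustration, not a copy of the
 * registration done elsewhere in this driver) of how a notifier block like
 * mdev_events would typically be hooked up to the core device:
 *
 *	dev->mdev_events.notifier_call = mlx5_ib_event;
 *	mlx5_notifier_register(dev->mdev, &dev->mdev_events);
 *
 * with a matching mlx5_notifier_unregister() on teardown.
 */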
4774
4775 static int mlx5_ib_event_slave_port(struct notifier_block *nb,
4776                                     unsigned long event, void *param)
4777 {
4778         struct mlx5_ib_event_work *work;
4779
4780         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4781         if (!work)
4782                 return NOTIFY_DONE;
4783
4784         INIT_WORK(&work->work, mlx5_ib_handle_event);
4785         work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
4786         work->is_slave = true;
4787         work->param = param;
4788         work->event = event;
4789         queue_work(mlx5_ib_event_wq, &work->work);
4790
4791         return NOTIFY_OK;
4792 }
4793
4794 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
4795 {
4796         struct mlx5_hca_vport_context vport_ctx;
4797         int err;
4798         int port;
4799
4800         for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
4801                 dev->mdev->port_caps[port - 1].has_smi = false;
4802                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
4803                     MLX5_CAP_PORT_TYPE_IB) {
4804                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
4805                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
4806                                                                    port, 0,
4807                                                                    &vport_ctx);
4808                                 if (err) {
4809                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
4810                                                     port, err);
4811                                         return err;
4812                                 }
4813                                 dev->mdev->port_caps[port - 1].has_smi =
4814                                         vport_ctx.has_smi;
4815                         } else {
4816                                 dev->mdev->port_caps[port - 1].has_smi = true;
4817                         }
4818                 }
4819         }
4820         return 0;
4821 }
4822
4823 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
4824 {
4825         int port;
4826
4827         for (port = 1; port <= dev->num_ports; port++)
4828                 mlx5_query_ext_port_caps(dev, port);
4829 }
4830
4831 static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4832 {
4833         struct ib_device_attr *dprops = NULL;
4834         struct ib_port_attr *pprops = NULL;
4835         int err = -ENOMEM;
4836
4837         pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
4838         if (!pprops)
4839                 goto out;
4840
4841         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
4842         if (!dprops)
4843                 goto out;
4844
4845         err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL);
4846         if (err) {
4847                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
4848                 goto out;
4849         }
4850
4851         err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
4852         if (err) {
4853                 mlx5_ib_warn(dev, "query_port %d failed %d\n",
4854                              port, err);
4855                 goto out;
4856         }
4857
4858         dev->mdev->port_caps[port - 1].pkey_table_len =
4859                                         dprops->max_pkeys;
4860         dev->mdev->port_caps[port - 1].gid_table_len =
4861                                         pprops->gid_tbl_len;
4862         mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
4863                     port, dprops->max_pkeys, pprops->gid_tbl_len);
4864
4865 out:
4866         kfree(pprops);
4867         kfree(dprops);
4868
4869         return err;
4870 }
4871
4872 static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4873 {
4874         /* For representors use port 1, as this is the only native
4875          * port
4876          */
4877         if (dev->is_rep)
4878                 return __get_port_caps(dev, 1);
4879         return __get_port_caps(dev, port);
4880 }
4881
4882 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
4883 {
4884         int err;
4885
4886         err = mlx5_mr_cache_cleanup(dev);
4887         if (err)
4888                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
4889
4890         if (dev->umrc.qp)
4891                 mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
4892         if (dev->umrc.cq)
4893                 ib_free_cq(dev->umrc.cq);
4894         if (dev->umrc.pd)
4895                 ib_dealloc_pd(dev->umrc.pd);
4896 }
4897
4898 enum {
4899         MAX_UMR_WR = 128,
4900 };
4901
4902 static int create_umr_res(struct mlx5_ib_dev *dev)
4903 {
4904         struct ib_qp_init_attr *init_attr = NULL;
4905         struct ib_qp_attr *attr = NULL;
4906         struct ib_pd *pd;
4907         struct ib_cq *cq;
4908         struct ib_qp *qp;
4909         int ret;
4910
4911         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
4912         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
4913         if (!attr || !init_attr) {
4914                 ret = -ENOMEM;
4915                 goto error_0;
4916         }
4917
4918         pd = ib_alloc_pd(&dev->ib_dev, 0);
4919         if (IS_ERR(pd)) {
4920                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
4921                 ret = PTR_ERR(pd);
4922                 goto error_0;
4923         }
4924
4925         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
4926         if (IS_ERR(cq)) {
4927                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
4928                 ret = PTR_ERR(cq);
4929                 goto error_2;
4930         }
4931
4932         init_attr->send_cq = cq;
4933         init_attr->recv_cq = cq;
4934         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
4935         init_attr->cap.max_send_wr = MAX_UMR_WR;
4936         init_attr->cap.max_send_sge = 1;
4937         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
4938         init_attr->port_num = 1;
4939         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
4940         if (IS_ERR(qp)) {
4941                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
4942                 ret = PTR_ERR(qp);
4943                 goto error_3;
4944         }
4945         qp->device     = &dev->ib_dev;
4946         qp->real_qp    = qp;
4947         qp->uobject    = NULL;
4948         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
4949         qp->send_cq    = init_attr->send_cq;
4950         qp->recv_cq    = init_attr->recv_cq;
4951
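        /*
         * Drive the kernel-owned UMR QP through the usual INIT -> RTR -> RTS
         * transitions; only a minimal attribute set is supplied at each step.
         */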
4952         attr->qp_state = IB_QPS_INIT;
4953         attr->port_num = 1;
4954         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
4955                                 IB_QP_PORT, NULL);
4956         if (ret) {
4957                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
4958                 goto error_4;
4959         }
4960
4961         memset(attr, 0, sizeof(*attr));
4962         attr->qp_state = IB_QPS_RTR;
4963         attr->path_mtu = IB_MTU_256;
4964
4965         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4966         if (ret) {
4967                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
4968                 goto error_4;
4969         }
4970
4971         memset(attr, 0, sizeof(*attr));
4972         attr->qp_state = IB_QPS_RTS;
4973         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4974         if (ret) {
4975                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
4976                 goto error_4;
4977         }
4978
4979         dev->umrc.qp = qp;
4980         dev->umrc.cq = cq;
4981         dev->umrc.pd = pd;
4982
4983         sema_init(&dev->umrc.sem, MAX_UMR_WR);
4984         ret = mlx5_mr_cache_init(dev);
4985         if (ret) {
4986                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
4987                 goto error_4;
4988         }
4989
4990         kfree(attr);
4991         kfree(init_attr);
4992
4993         return 0;
4994
4995 error_4:
4996         mlx5_ib_destroy_qp(qp, NULL);
4997         dev->umrc.qp = NULL;
4998
4999 error_3:
5000         ib_free_cq(cq);
5001         dev->umrc.cq = NULL;
5002
5003 error_2:
5004         ib_dealloc_pd(pd);
5005         dev->umrc.pd = NULL;
5006
5007 error_0:
5008         kfree(attr);
5009         kfree(init_attr);
5010         return ret;
5011 }
5012
5013 static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
5014 {
5015         switch (umr_fence_cap) {
5016         case MLX5_CAP_UMR_FENCE_NONE:
5017                 return MLX5_FENCE_MODE_NONE;
5018         case MLX5_CAP_UMR_FENCE_SMALL:
5019                 return MLX5_FENCE_MODE_INITIATOR_SMALL;
5020         default:
5021                 return MLX5_FENCE_MODE_STRONG_ORDERING;
5022         }
5023 }
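/*
 * The cap value is expected to come straight from the HCA capabilities,
 * e.g. (an illustrative call, matching how the helper is used elsewhere in
 * this driver):
 *
 *	dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(dev->mdev, umr_fence));
 *
 * NONE maps to no fence, SMALL to the initiator small fence, and anything
 * else falls back to strong ordering.
 */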
5024
5025 static int create_dev_resources(struct mlx5_ib_resources *devr)
5026 {
5027         struct ib_srq_init_attr attr;
5028         struct mlx5_ib_dev *dev;
5029         struct ib_device *ibdev;
5030         struct ib_cq_init_attr cq_attr = {.cqe = 1};
5031         int port;
5032         int ret = 0;
5033
5034         dev = container_of(devr, struct mlx5_ib_dev, devr);
5035         ibdev = &dev->ib_dev;
5036
5037         mutex_init(&devr->mutex);
5038
5039         devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
5040         if (!devr->p0)
5041                 return -ENOMEM;
5042
5043         devr->p0->device  = ibdev;
5044         devr->p0->uobject = NULL;
5045         atomic_set(&devr->p0->usecnt, 0);
5046
5047         ret = mlx5_ib_alloc_pd(devr->p0, NULL);
5048         if (ret)
5049                 goto error0;
5050
5051         devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
5052         if (!devr->c0) {
5053                 ret = -ENOMEM;
5054                 goto error1;
5055         }
5056
5057         devr->c0->device = &dev->ib_dev;
5058         atomic_set(&devr->c0->usecnt, 0);
5059
5060         ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
5061         if (ret)
5062                 goto err_create_cq;
5063
5064         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
5065         if (IS_ERR(devr->x0)) {
5066                 ret = PTR_ERR(devr->x0);
5067                 goto error2;
5068         }
5069         devr->x0->device = &dev->ib_dev;
5070         devr->x0->inode = NULL;
5071         atomic_set(&devr->x0->usecnt, 0);
5072         mutex_init(&devr->x0->tgt_qp_mutex);
5073         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
5074
5075         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
5076         if (IS_ERR(devr->x1)) {
5077                 ret = PTR_ERR(devr->x1);
5078                 goto error3;
5079         }
5080         devr->x1->device = &dev->ib_dev;
5081         devr->x1->inode = NULL;
5082         atomic_set(&devr->x1->usecnt, 0);
5083         mutex_init(&devr->x1->tgt_qp_mutex);
5084         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
5085
5086         memset(&attr, 0, sizeof(attr));
5087         attr.attr.max_sge = 1;
5088         attr.attr.max_wr = 1;
5089         attr.srq_type = IB_SRQT_XRC;
5090         attr.ext.cq = devr->c0;
5091         attr.ext.xrc.xrcd = devr->x0;
5092
5093         devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
5094         if (!devr->s0) {
5095                 ret = -ENOMEM;
5096                 goto error4;
5097         }
5098
5099         devr->s0->device        = &dev->ib_dev;
5100         devr->s0->pd            = devr->p0;
5101         devr->s0->srq_type      = IB_SRQT_XRC;
5102         devr->s0->ext.xrc.xrcd  = devr->x0;
5103         devr->s0->ext.cq        = devr->c0;
5104         ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
5105         if (ret)
5106                 goto err_create;
5107
5108         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
5109         atomic_inc(&devr->s0->ext.cq->usecnt);
5110         atomic_inc(&devr->p0->usecnt);
5111         atomic_set(&devr->s0->usecnt, 0);
5112
5113         memset(&attr, 0, sizeof(attr));
5114         attr.attr.max_sge = 1;
5115         attr.attr.max_wr = 1;
5116         attr.srq_type = IB_SRQT_BASIC;
5117         devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
5118         if (!devr->s1) {
5119                 ret = -ENOMEM;
5120                 goto error5;
5121         }
5122
5123         devr->s1->device        = &dev->ib_dev;
5124         devr->s1->pd            = devr->p0;
5125         devr->s1->srq_type      = IB_SRQT_BASIC;
5126         devr->s1->ext.cq        = devr->c0;
5127
5128         ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
5129         if (ret)
5130                 goto error6;
5131
5132         atomic_inc(&devr->p0->usecnt);
5133         atomic_set(&devr->s1->usecnt, 0);
5134
5135         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
5136                 INIT_WORK(&devr->ports[port].pkey_change_work,
5137                           pkey_change_handler);
5138                 devr->ports[port].devr = devr;
5139         }
5140
5141         return 0;
5142
5143 error6:
5144         kfree(devr->s1);
5145 error5:
5146         mlx5_ib_destroy_srq(devr->s0, NULL);
5147 err_create:
5148         kfree(devr->s0);
5149 error4:
5150         mlx5_ib_dealloc_xrcd(devr->x1, NULL);
5151 error3:
5152         mlx5_ib_dealloc_xrcd(devr->x0, NULL);
5153 error2:
5154         mlx5_ib_destroy_cq(devr->c0, NULL);
5155 err_create_cq:
5156         kfree(devr->c0);
5157 error1:
5158         mlx5_ib_dealloc_pd(devr->p0, NULL);
5159 error0:
5160         kfree(devr->p0);
5161         return ret;
5162 }
5163
5164 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
5165 {
5166         int port;
5167
5168         mlx5_ib_destroy_srq(devr->s1, NULL);
5169         kfree(devr->s1);
5170         mlx5_ib_destroy_srq(devr->s0, NULL);
5171         kfree(devr->s0);
5172         mlx5_ib_dealloc_xrcd(devr->x0, NULL);
5173         mlx5_ib_dealloc_xrcd(devr->x1, NULL);
5174         mlx5_ib_destroy_cq(devr->c0, NULL);
5175         kfree(devr->c0);
5176         mlx5_ib_dealloc_pd(devr->p0, NULL);
5177         kfree(devr->p0);
5178
5179         /* Make sure no P_Key change work items are still executing */
5180         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
5181                 cancel_work_sync(&devr->ports[port].pkey_change_work);
5182 }
5183
5184 static u32 get_core_cap_flags(struct ib_device *ibdev,
5185                               struct mlx5_hca_vport_context *rep)
5186 {
5187         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5188         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
5189         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
5190         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
5191         bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
5192         u32 ret = 0;
5193
5194         if (rep->grh_required)
5195                 ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
5196
5197         if (ll == IB_LINK_LAYER_INFINIBAND)
5198                 return ret | RDMA_CORE_PORT_IBA_IB;
5199
5200         if (raw_support)
5201                 ret |= RDMA_CORE_PORT_RAW_PACKET;
5202
5203         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
5204                 return ret;
5205
5206         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
5207                 return ret;
5208
5209         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
5210                 ret |= RDMA_CORE_PORT_IBA_ROCE;
5211
5212         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
5213                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
5214
5215         return ret;
5216 }
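/*
 * For example: an Ethernet port with raw packet support (multi-port not
 * enabled) whose RoCE caps advertise both IPv4 and IPv6 L3 types and both
 * RoCE versions ends up with RDMA_CORE_PORT_RAW_PACKET |
 * RDMA_CORE_PORT_IBA_ROCE | RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP, while an
 * InfiniBand link layer short-circuits to RDMA_CORE_PORT_IBA_IB (plus
 * GRH_REQUIRED when the vport context demands it).
 */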
5217
5218 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
5219                                struct ib_port_immutable *immutable)
5220 {
5221         struct ib_port_attr attr;
5222         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5223         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
5224         struct mlx5_hca_vport_context rep = {0};
5225         int err;
5226
5227         err = ib_query_port(ibdev, port_num, &attr);
5228         if (err)
5229                 return err;
5230
5231         if (ll == IB_LINK_LAYER_INFINIBAND) {
5232                 err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
5233                                                    &rep);
5234                 if (err)
5235                         return err;
5236         }
5237
5238         immutable->pkey_tbl_len = attr.pkey_tbl_len;
5239         immutable->gid_tbl_len = attr.gid_tbl_len;
5240         immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
5241         immutable->max_mad_size = IB_MGMT_MAD_SIZE;
5242
5243         return 0;
5244 }
5245
5246 static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num,
5247                                    struct ib_port_immutable *immutable)
5248 {
5249         struct ib_port_attr attr;
5250         int err;
5251
5252         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
5253
5254         err = ib_query_port(ibdev, port_num, &attr);
5255         if (err)
5256                 return err;
5257
5258         immutable->pkey_tbl_len = attr.pkey_tbl_len;
5259         immutable->gid_tbl_len = attr.gid_tbl_len;
5260         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
5261
5262         return 0;
5263 }
5264
5265 static void get_dev_fw_str(struct ib_device *ibdev, char *str)
5266 {
5267         struct mlx5_ib_dev *dev =
5268                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
5269         snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
5270                  fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
5271                  fw_rev_sub(dev->mdev));
5272 }
5273
5274 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
5275 {
5276         struct mlx5_core_dev *mdev = dev->mdev;
5277         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
5278                                                                  MLX5_FLOW_NAMESPACE_LAG);
5279         struct mlx5_flow_table *ft;
5280         int err;
5281
5282         if (!ns || !mlx5_lag_is_roce(mdev))
5283                 return 0;
5284
5285         err = mlx5_cmd_create_vport_lag(mdev);
5286         if (err)
5287                 return err;
5288
5289         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
5290         if (IS_ERR(ft)) {
5291                 err = PTR_ERR(ft);
5292                 goto err_destroy_vport_lag;
5293         }
5294
5295         dev->flow_db->lag_demux_ft = ft;
5296         dev->lag_active = true;
5297         return 0;
5298
5299 err_destroy_vport_lag:
5300         mlx5_cmd_destroy_vport_lag(mdev);
5301         return err;
5302 }
5303
5304 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
5305 {
5306         struct mlx5_core_dev *mdev = dev->mdev;
5307
5308         if (dev->lag_active) {
5309                 dev->lag_active = false;
5310
5311                 mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
5312                 dev->flow_db->lag_demux_ft = NULL;
5313
5314                 mlx5_cmd_destroy_vport_lag(mdev);
5315         }
5316 }
5317
5318 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5319 {
5320         int err;
5321
5322         dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
5323         err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
5324         if (err) {
5325                 dev->port[port_num].roce.nb.notifier_call = NULL;
5326                 return err;
5327         }
5328
5329         return 0;
5330 }
5331
5332 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5333 {
5334         if (dev->port[port_num].roce.nb.notifier_call) {
5335                 unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
5336                 dev->port[port_num].roce.nb.notifier_call = NULL;
5337         }
5338 }
5339
5340 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
5341 {
5342         int err;
5343
5344         err = mlx5_nic_vport_enable_roce(dev->mdev);
5345         if (err)
5346                 return err;
5347
5348         err = mlx5_eth_lag_init(dev);
5349         if (err)
5350                 goto err_disable_roce;
5351
5352         return 0;
5353
5354 err_disable_roce:
5355         mlx5_nic_vport_disable_roce(dev->mdev);
5356
5357         return err;
5358 }
5359
5360 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
5361 {
5362         mlx5_eth_lag_cleanup(dev);
5363         mlx5_nic_vport_disable_roce(dev->mdev);
5364 }
5365
5366 struct mlx5_ib_counter {
5367         const char *name;
5368         size_t offset;
5369 };
5370
5371 #define INIT_Q_COUNTER(_name)           \
5372         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
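/*
 * For example, INIT_Q_COUNTER(rx_write_requests) expands to:
 *
 *	{ .name = "rx_write_requests",
 *	  .offset = MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests) }
 */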
5373
5374 static const struct mlx5_ib_counter basic_q_cnts[] = {
5375         INIT_Q_COUNTER(rx_write_requests),
5376         INIT_Q_COUNTER(rx_read_requests),
5377         INIT_Q_COUNTER(rx_atomic_requests),
5378         INIT_Q_COUNTER(out_of_buffer),
5379 };
5380
5381 static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
5382         INIT_Q_COUNTER(out_of_sequence),
5383 };
5384
5385 static const struct mlx5_ib_counter retrans_q_cnts[] = {
5386         INIT_Q_COUNTER(duplicate_request),
5387         INIT_Q_COUNTER(rnr_nak_retry_err),
5388         INIT_Q_COUNTER(packet_seq_err),
5389         INIT_Q_COUNTER(implied_nak_seq_err),
5390         INIT_Q_COUNTER(local_ack_timeout_err),
5391 };
5392
5393 #define INIT_CONG_COUNTER(_name)                \
5394         { .name = #_name, .offset =     \
5395                 MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
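/*
 * Same idea as INIT_Q_COUNTER, except that the stored offset points at the
 * _high word of the counter, e.g.
 * MLX5_BYTE_OFF(query_cong_statistics_out, rp_cnp_ignored_high).
 */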
5396
5397 static const struct mlx5_ib_counter cong_cnts[] = {
5398         INIT_CONG_COUNTER(rp_cnp_ignored),
5399         INIT_CONG_COUNTER(rp_cnp_handled),
5400         INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
5401         INIT_CONG_COUNTER(np_cnp_sent),
5402 };
5403
5404 static const struct mlx5_ib_counter extended_err_cnts[] = {
5405         INIT_Q_COUNTER(resp_local_length_error),
5406         INIT_Q_COUNTER(resp_cqe_error),
5407         INIT_Q_COUNTER(req_cqe_error),
5408         INIT_Q_COUNTER(req_remote_invalid_request),
5409         INIT_Q_COUNTER(req_remote_access_errors),
5410         INIT_Q_COUNTER(resp_remote_access_errors),
5411         INIT_Q_COUNTER(resp_cqe_flush_error),
5412         INIT_Q_COUNTER(req_cqe_flush_error),
5413 };
5414
5415 static const struct mlx5_ib_counter roce_accl_cnts[] = {
5416         INIT_Q_COUNTER(roce_adp_retrans),
5417         INIT_Q_COUNTER(roce_adp_retrans_to),
5418         INIT_Q_COUNTER(roce_slow_restart),
5419         INIT_Q_COUNTER(roce_slow_restart_cnps),
5420         INIT_Q_COUNTER(roce_slow_restart_trans),
5421 };
5422
5423 #define INIT_EXT_PPCNT_COUNTER(_name)           \
5424         { .name = #_name, .offset =     \
5425         MLX5_BYTE_OFF(ppcnt_reg, \
5426                       counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
5427
5428 static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
5429         INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
5430 };
5431
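/*
 * Note on the counter tables above: the q-counter offsets point at 32-bit
 * big-endian fields of query_q_counter_out, while the congestion and
 * extended-ppcnt offsets point at the "_high" dword of a 64-bit counter, so a
 * single 64-bit big-endian read starting at that offset (see
 * mlx5_ib_query_ext_ppcnt_counters() below) yields the full value.
 */
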
5432 static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
5433 {
5434         return MLX5_ESWITCH_MANAGER(mdev) &&
5435                mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
5436                        MLX5_ESWITCH_OFFLOADS;
5437 }
5438
5439 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
5440 {
5441         int num_cnt_ports;
5442         int i;
5443
5444         num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
5445
5446         for (i = 0; i < num_cnt_ports; i++) {
5447                 if (dev->port[i].cnts.set_id_valid)
5448                         mlx5_core_dealloc_q_counter(dev->mdev,
5449                                                     dev->port[i].cnts.set_id);
5450                 kfree(dev->port[i].cnts.names);
5451                 kfree(dev->port[i].cnts.offsets);
5452         }
5453 }
5454
5455 static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
5456                                     struct mlx5_ib_counters *cnts)
5457 {
5458         u32 num_counters;
5459
5460         num_counters = ARRAY_SIZE(basic_q_cnts);
5461
5462         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
5463                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
5464
5465         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
5466                 num_counters += ARRAY_SIZE(retrans_q_cnts);
5467
5468         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
5469                 num_counters += ARRAY_SIZE(extended_err_cnts);
5470
5471         if (MLX5_CAP_GEN(dev->mdev, roce_accl))
5472                 num_counters += ARRAY_SIZE(roce_accl_cnts);
5473
5474         cnts->num_q_counters = num_counters;
5475
5476         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5477                 cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
5478                 num_counters += ARRAY_SIZE(cong_cnts);
5479         }
5480         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5481                 cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
5482                 num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
5483         }
5484         cnts->names = kcalloc(num_counters, sizeof(*cnts->names), GFP_KERNEL);
5485         if (!cnts->names)
5486                 return -ENOMEM;
5487
5488         cnts->offsets = kcalloc(num_counters,
5489                                 sizeof(*cnts->offsets), GFP_KERNEL);
5490         if (!cnts->offsets)
5491                 goto err_names;
5492
5493         return 0;
5494
5495 err_names:
5496         kfree(cnts->names);
5497         cnts->names = NULL;
5498         return -ENOMEM;
5499 }
5500
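/*
 * Note: cnts->names[] and cnts->offsets[] are parallel arrays.
 * __mlx5_ib_alloc_counters() above sizes them from the capability-dependent
 * tables, and mlx5_ib_fill_counters() below must walk those tables in exactly
 * the same order so that index i refers to the same counter in both arrays.
 */
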
5501 static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
5502                                   const char **names,
5503                                   size_t *offsets)
5504 {
5505         int i;
5506         int j = 0;
5507
5508         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
5509                 names[j] = basic_q_cnts[i].name;
5510                 offsets[j] = basic_q_cnts[i].offset;
5511         }
5512
5513         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
5514                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
5515                         names[j] = out_of_seq_q_cnts[i].name;
5516                         offsets[j] = out_of_seq_q_cnts[i].offset;
5517                 }
5518         }
5519
5520         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
5521                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
5522                         names[j] = retrans_q_cnts[i].name;
5523                         offsets[j] = retrans_q_cnts[i].offset;
5524                 }
5525         }
5526
5527         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
5528                 for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
5529                         names[j] = extended_err_cnts[i].name;
5530                         offsets[j] = extended_err_cnts[i].offset;
5531                 }
5532         }
5533
5534         if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
5535                 for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
5536                         names[j] = roce_accl_cnts[i].name;
5537                         offsets[j] = roce_accl_cnts[i].offset;
5538                 }
5539         }
5540
5541         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5542                 for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
5543                         names[j] = cong_cnts[i].name;
5544                         offsets[j] = cong_cnts[i].offset;
5545                 }
5546         }
5547
5548         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5549                 for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
5550                         names[j] = ext_ppcnt_cnts[i].name;
5551                         offsets[j] = ext_ppcnt_cnts[i].offset;
5552                 }
5553         }
5554 }
5555
5556 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
5557 {
5558         int num_cnt_ports;
5559         int err = 0;
5560         int i;
5561         bool is_shared;
5562
5563         is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
5564         num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
5565
5566         for (i = 0; i < num_cnt_ports; i++) {
5567                 err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
5568                 if (err)
5569                         goto err_alloc;
5570
5571                 mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
5572                                       dev->port[i].cnts.offsets);
5573
5574                 err = mlx5_cmd_alloc_q_counter(dev->mdev,
5575                                                &dev->port[i].cnts.set_id,
5576                                                is_shared ?
5577                                                MLX5_SHARED_RESOURCE_UID : 0);
5578                 if (err) {
5579                         mlx5_ib_warn(dev,
5580                                      "couldn't allocate queue counter for port %d, err %d\n",
5581                                      i + 1, err);
5582                         goto err_alloc;
5583                 }
5584                 dev->port[i].cnts.set_id_valid = true;
5585         }
5586         return 0;
5587
5588 err_alloc:
5589         mlx5_ib_dealloc_counters(dev);
5590         return err;
5591 }
5592
5593 static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
5594                                                    u8 port_num)
5595 {
5596         return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
5597                                                    &dev->port[port_num].cnts;
5598 }
5599
5600 /**
5601  * mlx5_ib_get_counters_id - Returns counters id to use for device+port
5602  * @dev:        Pointer to mlx5 IB device
5603  * @port_num:   Zero based port number
5604  *
5605  * mlx5_ib_get_counters_id() returns the counter set id to use for the
5606  * given device/port combination, in both the switchdev and non-switchdev
5607  * modes of the parent device.
5608  */
5609 u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
5610 {
5611         const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
5612
5613         return cnts->set_id;
5614 }
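
/*
 * A rough usage sketch (not from the original source): a hypothetical caller
 * would typically feed the returned set id into a QP context it is building,
 * e.g.:
 *
 *	u16 set_id = mlx5_ib_get_counters_id(dev, port_num - 1);
 *
 *	MLX5_SET(qpc, qpc_buf, counter_set_id, set_id);
 *
 * where port_num is one-based, hence the "- 1" to match the zero-based
 * parameter documented above, and "qpc_buf" is an assumed QP context buffer.
 */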
5615
5616 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
5617                                                     u8 port_num)
5618 {
5619         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5620         const struct mlx5_ib_counters *cnts;
5621         bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
5622
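        /*
         * In switchdev mode a single stats set is exposed for the whole device
         * (port_num == 0); otherwise stats are strictly per physical port
         * (port_num >= 1).  Reject the combinations that do not apply.
         */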
5623         if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
5624                 return NULL;
5625
5626         cnts = get_counters(dev, port_num - 1);
5627
5628         return rdma_alloc_hw_stats_struct(cnts->names,
5629                                           cnts->num_q_counters +
5630                                           cnts->num_cong_counters +
5631                                           cnts->num_ext_ppcnt_counters,
5632                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5633 }
5634
5635 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
5636                                     const struct mlx5_ib_counters *cnts,
5637                                     struct rdma_hw_stats *stats,
5638                                     u16 set_id)
5639 {
5640         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
5641         void *out;
5642         __be32 val;
5643         int ret, i;
5644
5645         out = kvzalloc(outlen, GFP_KERNEL);
5646         if (!out)
5647                 return -ENOMEM;
5648
5649         ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
5650         if (ret)
5651                 goto free;
5652
5653         for (i = 0; i < cnts->num_q_counters; i++) {
5654                 val = *(__be32 *)(out + cnts->offsets[i]);
5655                 stats->value[i] = (u64)be32_to_cpu(val);
5656         }
5657
5658 free:
5659         kvfree(out);
5660         return ret;
5661 }
5662
5663 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
5664                                             const struct mlx5_ib_counters *cnts,
5665                                             struct rdma_hw_stats *stats)
5666 {
5667         int offset = cnts->num_q_counters + cnts->num_cong_counters;
5668         int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
5669         int ret, i;
5670         void *out;
5671
5672         out = kvzalloc(sz, GFP_KERNEL);
5673         if (!out)
5674                 return -ENOMEM;
5675
5676         ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out);
5677         if (ret)
5678                 goto free;
5679
5680         for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
5681                 stats->value[i + offset] =
5682                         be64_to_cpup((__be64 *)(out +
5683                                     cnts->offsets[i + offset]));
5684 free:
5685         kvfree(out);
5686         return ret;
5687 }
5688
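/*
 * Note: per the ib_device get_hw_stats contract, a positive return value is
 * the number of counters populated in @stats (negative errno on failure).
 * Q counters and extended ppcnt counters are read from the master mdev;
 * congestion counters are queried through the LAG helper, but only once the
 * native port is affiliated, since an unaffiliated (down) port has nothing
 * to report.
 */
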
5689 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
5690                                 struct rdma_hw_stats *stats,
5691                                 u8 port_num, int index)
5692 {
5693         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5694         const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
5695         struct mlx5_core_dev *mdev;
5696         int ret, num_counters;
5697         u8 mdev_port_num;
5698
5699         if (!stats)
5700                 return -EINVAL;
5701
5702         num_counters = cnts->num_q_counters +
5703                        cnts->num_cong_counters +
5704                        cnts->num_ext_ppcnt_counters;
5705
5706         /* q_counters are per IB device, query the master mdev */
5707         ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
5708         if (ret)
5709                 return ret;
5710
5711         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5712                 ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
5713                 if (ret)
5714                         return ret;
5715         }
5716
5717         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5718                 mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
5719                                                     &mdev_port_num);
5720                 if (!mdev) {
5721                         /* If the port is not affiliated yet, it is in the
5722                          * down state and has no counters, so they would read
5723                          * as zero; there is no need to query the HCA.
5724                          */
5725                         goto done;
5726                 }
5727                 ret = mlx5_lag_query_cong_counters(dev->mdev,
5728                                                    stats->value +
5729                                                    cnts->num_q_counters,
5730                                                    cnts->num_cong_counters,
5731                                                    cnts->offsets +
5732                                                    cnts->num_q_counters);
5733
5734                 mlx5_ib_put_native_port_mdev(dev, port_num);
5735                 if (ret)
5736                         return ret;
5737         }
5738
5739 done:
5740         return num_counters;
5741 }
5742
5743 static struct rdma_hw_stats *
5744 mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
5745 {
5746         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5747         const struct mlx5_ib_counters *cnts =
5748                 get_counters(dev, counter->port - 1);
5749
5750         /* Q counters come first in the counters layout */
5751         return rdma_alloc_hw_stats_struct(cnts->names,
5752                                           cnts->num_q_counters,
5753                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5754 }
5755
5756 static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
5757 {
5758         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5759         const struct mlx5_ib_counters *cnts =
5760                 get_counters(dev, counter->port - 1);
5761
5762         return mlx5_ib_query_q_counters(dev->mdev, cnts,
5763                                         counter->stats, counter->id);
5764 }
5765
5766 static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
5767                                    struct ib_qp *qp)
5768 {
5769         struct mlx5_ib_dev *dev = to_mdev(qp->device);
5770         u16 cnt_set_id = 0;
5771         int err;
5772
5773         if (!counter->id) {
5774                 err = mlx5_cmd_alloc_q_counter(dev->mdev,
5775                                                &cnt_set_id,
5776                                                MLX5_SHARED_RESOURCE_UID);
5777                 if (err)
5778                         return err;
5779                 counter->id = cnt_set_id;
5780         }
5781
5782         err = mlx5_ib_qp_set_counter(qp, counter);
5783         if (err)
5784                 goto fail_set_counter;
5785
5786         return 0;
5787
5788 fail_set_counter:
5789         mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
5790         counter->id = 0;
5791
5792         return err;
5793 }
5794
5795 static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
5796 {
5797         return mlx5_ib_qp_set_counter(qp, NULL);
5798 }
5799
5800 static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
5801 {
5802         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5803
5804         return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
5805 }
5806
5807 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
5808                                  enum rdma_netdev_t type,
5809                                  struct rdma_netdev_alloc_params *params)
5810 {
5811         if (type != RDMA_NETDEV_IPOIB)
5812                 return -EOPNOTSUPP;
5813
5814         return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
5815 }
5816
5817 static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
5818 {
5819         if (!dev->delay_drop.dir_debugfs)
5820                 return;
5821         debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
5822         dev->delay_drop.dir_debugfs = NULL;
5823 }
5824
5825 static void cancel_delay_drop(struct mlx5_ib_dev *dev)
5826 {
5827         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5828                 return;
5829
5830         cancel_work_sync(&dev->delay_drop.delay_drop_work);
5831         delay_drop_debugfs_cleanup(dev);
5832 }
5833
5834 static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
5835                                        size_t count, loff_t *pos)
5836 {
5837         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5838         char lbuf[20];
5839         int len;
5840
5841         len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
5842         return simple_read_from_buffer(buf, count, pos, lbuf, len);
5843 }
5844
5845 static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
5846                                         size_t count, loff_t *pos)
5847 {
5848         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5849         u32 timeout;
5850         u32 var;
5851
5852         if (kstrtouint_from_user(buf, count, 0, &var))
5853                 return -EFAULT;
5854
5855         timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
5856                         1000);
5857         if (timeout != var)
5858                 mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
5859                             timeout);
5860
5861         delay_drop->timeout = timeout;
5862
5863         return count;
5864 }
5865
5866 static const struct file_operations fops_delay_drop_timeout = {
5867         .owner  = THIS_MODULE,
5868         .open   = simple_open,
5869         .write  = delay_drop_timeout_write,
5870         .read   = delay_drop_timeout_read,
5871 };
5872
5873 static void delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
5874 {
5875         struct dentry *root;
5876
5877         if (!mlx5_debugfs_root)
5878                 return;
5879
5880         root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
5881         dev->delay_drop.dir_debugfs = root;
5882
5883         debugfs_create_atomic_t("num_timeout_events", 0400, root,
5884                                 &dev->delay_drop.events_cnt);
5885         debugfs_create_atomic_t("num_rqs", 0400, root,
5886                                 &dev->delay_drop.rqs_cnt);
5887         debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
5888                             &fops_delay_drop_timeout);
5889 }
5890
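/*
 * Usage note: delay_drop_debugfs_init() above exposes the knobs under the
 * mlx5 core debugfs directory for this device, typically something like
 * /sys/kernel/debug/mlx5/<pci-bdf>/delay_drop/ (the exact path is an
 * assumption and depends on where debugfs is mounted):
 *
 *	num_timeout_events   read-only count of delay-drop timeout events
 *	num_rqs              read-only count of RQs using delay drop
 *	timeout              read/write timeout in usec, rounded up to a
 *	                     multiple of 100 and capped by the write handler
 *
 * e.g. "echo 100000 > .../delay_drop/timeout" requests a 100 ms timeout.
 */
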
5891 static void init_delay_drop(struct mlx5_ib_dev *dev)
5892 {
5893         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5894                 return;
5895
5896         mutex_init(&dev->delay_drop.lock);
5897         dev->delay_drop.dev = dev;
5898         dev->delay_drop.activate = false;
5899         dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
5900         INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
5901         atomic_set(&dev->delay_drop.rqs_cnt, 0);
5902         atomic_set(&dev->delay_drop.events_cnt, 0);
5903
5904         delay_drop_debugfs_init(dev);
5905 }
5906
5907 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
5908                                       struct mlx5_ib_multiport_info *mpi)
5909 {
5910         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5911         struct mlx5_ib_port *port = &ibdev->port[port_num];
5912         int comps;
5913         int err;
5914         int i;
5915
5916         lockdep_assert_held(&mlx5_ib_multiport_mutex);
5917
5918         mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
5919
5920         spin_lock(&port->mp.mpi_lock);
5921         if (!mpi->ibdev) {
5922                 spin_unlock(&port->mp.mpi_lock);
5923                 return;
5924         }
5925
5926         mpi->ibdev = NULL;
5927
5928         spin_unlock(&port->mp.mpi_lock);
5929         if (mpi->mdev_events.notifier_call)
5930                 mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
5931         mpi->mdev_events.notifier_call = NULL;
5932         mlx5_remove_netdev_notifier(ibdev, port_num);
5933         spin_lock(&port->mp.mpi_lock);
5934
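        /*
         * Each outstanding reference taken via mlx5_ib_get_native_port_mdev()
         * signals unref_comp on its matching put once "unaffiliate" is set,
         * so wait for exactly mdev_refcnt completions before tearing down
         * port->mp.mpi.
         */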
5935         comps = mpi->mdev_refcnt;
5936         if (comps) {
5937                 mpi->unaffiliate = true;
5938                 init_completion(&mpi->unref_comp);
5939                 spin_unlock(&port->mp.mpi_lock);
5940
5941                 for (i = 0; i < comps; i++)
5942                         wait_for_completion(&mpi->unref_comp);
5943
5944                 spin_lock(&port->mp.mpi_lock);
5945                 mpi->unaffiliate = false;
5946         }
5947
5948         port->mp.mpi = NULL;
5949
5950         list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
5951
5952         spin_unlock(&port->mp.mpi_lock);
5953
5954         err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
5955
5956         mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
5957         /* If unaffiliation fails, just log an error; the pointers were
5958          * already cleaned up and the mpi re-added to the list above.
5959          */
5960         if (err)
5961                 mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
5962                             port_num + 1);
5963
5964         ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
5965 }
5966
5967 static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
5968                                     struct mlx5_ib_multiport_info *mpi)
5969 {
5970         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5971         int err;
5972
5973         lockdep_assert_held(&mlx5_ib_multiport_mutex);
5974
5975         spin_lock(&ibdev->port[port_num].mp.mpi_lock);
5976         if (ibdev->port[port_num].mp.mpi) {
5977                 mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
5978                             port_num + 1);
5979                 spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5980                 return false;
5981         }
5982
5983         ibdev->port[port_num].mp.mpi = mpi;
5984         mpi->ibdev = ibdev;
5985         mpi->mdev_events.notifier_call = NULL;
5986         spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5987
5988         err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
5989         if (err)
5990                 goto unbind;
5991
5992         err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev));
5993         if (err)
5994                 goto unbind;
5995
5996         err = mlx5_add_netdev_notifier(ibdev, port_num);
5997         if (err) {
5998                 mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
5999                             port_num + 1);
6000                 goto unbind;
6001         }
6002
6003         mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
6004         mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
6005
6006         mlx5_ib_init_cong_debugfs(ibdev, port_num);
6007
6008         return true;
6009
6010 unbind:
6011         mlx5_ib_unbind_slave_port(ibdev, mpi);
6012         return false;
6013 }
6014
6015 static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
6016 {
6017         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6018         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
6019                                                           port_num + 1);
6020         struct mlx5_ib_multiport_info *mpi;
6021         int err;
6022         int i;
6023
6024         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
6025                 return 0;
6026
6027         err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
6028                                                      &dev->sys_image_guid);
6029         if (err)
6030                 return err;
6031
6032         err = mlx5_nic_vport_enable_roce(dev->mdev);
6033         if (err)
6034                 return err;
6035
6036         mutex_lock(&mlx5_ib_multiport_mutex);
6037         for (i = 0; i < dev->num_ports; i++) {
6038                 bool bound = false;
6039
6040                 /* build a stub multiport info struct for the native port. */
6041                 if (i == port_num) {
6042                         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
6043                         if (!mpi) {
6044                                 mutex_unlock(&mlx5_ib_multiport_mutex);
6045                                 mlx5_nic_vport_disable_roce(dev->mdev);
6046                                 return -ENOMEM;
6047                         }
6048
6049                         mpi->is_master = true;
6050                         mpi->mdev = dev->mdev;
6051                         mpi->sys_image_guid = dev->sys_image_guid;
6052                         dev->port[i].mp.mpi = mpi;
6053                         mpi->ibdev = dev;
6054                         mpi = NULL;
6055                         continue;
6056                 }
6057
6058                 list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
6059                                     list) {
6060                         if (dev->sys_image_guid == mpi->sys_image_guid &&
6061                             (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
6062                                 bound = mlx5_ib_bind_slave_port(dev, mpi);
6063                         }
6064
6065                         if (bound) {
6066                                 dev_dbg(mpi->mdev->device,
6067                                         "removing port from unaffiliated list.\n");
6068                                 mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
6069                                 list_del(&mpi->list);
6070                                 break;
6071                         }
6072                 }
6073                 if (!bound) {
6074                         get_port_caps(dev, i + 1);
6075                         mlx5_ib_dbg(dev, "no free port found for port %d\n",
6076                                     i + 1);
6077                 }
6078         }
6079
6080         list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
6081         mutex_unlock(&mlx5_ib_multiport_mutex);
6082         return err;
6083 }
6084
6085 static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
6086 {
6087         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6088         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
6089                                                           port_num + 1);
6090         int i;
6091
6092         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
6093                 return;
6094
6095         mutex_lock(&mlx5_ib_multiport_mutex);
6096         for (i = 0; i < dev->num_ports; i++) {
6097                 if (dev->port[i].mp.mpi) {
6098                         /* Destroy the native port stub */
6099                         if (i == port_num) {
6100                                 kfree(dev->port[i].mp.mpi);
6101                                 dev->port[i].mp.mpi = NULL;
6102                         } else {
6103                                 mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
6104                                 mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
6105                         }
6106                 }
6107         }
6108
6109         mlx5_ib_dbg(dev, "removing from devlist\n");
6110         list_del(&dev->ib_dev_list);
6111         mutex_unlock(&mlx5_ib_multiport_mutex);
6112
6113         mlx5_nic_vport_disable_roce(dev->mdev);
6114 }
6115
6116 static int mmap_obj_cleanup(struct ib_uobject *uobject,
6117                             enum rdma_remove_reason why,
6118                             struct uverbs_attr_bundle *attrs)
6119 {
6120         struct mlx5_user_mmap_entry *obj = uobject->object;
6121
6122         rdma_user_mmap_entry_remove(&obj->rdma_entry);
6123         return 0;
6124 }
6125
6126 static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
6127                                             struct mlx5_user_mmap_entry *entry,
6128                                             size_t length)
6129 {
6130         return rdma_user_mmap_entry_insert_range(
6131                 &c->ibucontext, &entry->rdma_entry, length,
6132                 (MLX5_IB_MMAP_OFFSET_START << 16),
6133                 ((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
6134 }
6135
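/*
 * Note: the helper above reserves each entry's offsets from the dedicated
 * pgoff window spanned by MLX5_IB_MMAP_OFFSET_START..END (each command slot
 * being 1 << 16 pages wide), so the offsets handed back for VAR and UAR
 * objects cannot collide with the legacy command-encoded mmap offsets that
 * mlx5_ib_mmap() also accepts.
 */
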
6136 static struct mlx5_user_mmap_entry *
6137 alloc_var_entry(struct mlx5_ib_ucontext *c)
6138 {
6139         struct mlx5_user_mmap_entry *entry;
6140         struct mlx5_var_table *var_table;
6141         u32 page_idx;
6142         int err;
6143
6144         var_table = &to_mdev(c->ibucontext.device)->var_table;
6145         entry = kzalloc(sizeof(*entry), GFP_KERNEL);
6146         if (!entry)
6147                 return ERR_PTR(-ENOMEM);
6148
6149         mutex_lock(&var_table->bitmap_lock);
6150         page_idx = find_first_zero_bit(var_table->bitmap,
6151                                        var_table->num_var_hw_entries);
6152         if (page_idx >= var_table->num_var_hw_entries) {
6153                 err = -ENOSPC;
6154                 mutex_unlock(&var_table->bitmap_lock);
6155                 goto end;
6156         }
6157
6158         set_bit(page_idx, var_table->bitmap);
6159         mutex_unlock(&var_table->bitmap_lock);
6160
6161         entry->address = var_table->hw_start_addr +
6162                                 (page_idx * var_table->stride_size);
6163         entry->page_idx = page_idx;
6164         entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
6165
6166         err = mlx5_rdma_user_mmap_entry_insert(c, entry,
6167                                                var_table->stride_size);
6168         if (err)
6169                 goto err_insert;
6170
6171         return entry;
6172
6173 err_insert:
6174         mutex_lock(&var_table->bitmap_lock);
6175         clear_bit(page_idx, var_table->bitmap);
6176         mutex_unlock(&var_table->bitmap_lock);
6177 end:
6178         kfree(entry);
6179         return ERR_PTR(err);
6180 }
6181
6182 static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
6183         struct uverbs_attr_bundle *attrs)
6184 {
6185         struct ib_uobject *uobj = uverbs_attr_get_uobject(
6186                 attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
6187         struct mlx5_ib_ucontext *c;
6188         struct mlx5_user_mmap_entry *entry;
6189         u64 mmap_offset;
6190         u32 length;
6191         int err;
6192
6193         c = to_mucontext(ib_uverbs_get_ucontext(attrs));
6194         if (IS_ERR(c))
6195                 return PTR_ERR(c);
6196
6197         entry = alloc_var_entry(c);
6198         if (IS_ERR(entry))
6199                 return PTR_ERR(entry);
6200
6201         mmap_offset = mlx5_entry_to_mmap_offset(entry);
6202         length = entry->rdma_entry.npages * PAGE_SIZE;
6203         uobj->object = entry;
6204
6205         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
6206                              &mmap_offset, sizeof(mmap_offset));
6207         if (err)
6208                 goto err;
6209
6210         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
6211                              &entry->page_idx, sizeof(entry->page_idx));
6212         if (err)
6213                 goto err;
6214
6215         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
6216                              &length, sizeof(length));
6217         if (err)
6218                 goto err;
6219
6220         return 0;
6221
6222 err:
6223         rdma_user_mmap_entry_remove(&entry->rdma_entry);
6224         return err;
6225 }
6226
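/*
 * A rough sketch of the userspace side (assumptions, not derived from this
 * file): the VAR_OBJ_ALLOC response carries a page id, an mmap length and an
 * mmap offset, and userspace then maps the doorbell area through the uverbs
 * device fd, roughly:
 *
 *	void *var = mmap(NULL, resp.mmap_length, PROT_WRITE, MAP_SHARED,
 *			 cmd_fd, resp.mmap_offset);
 *
 * In rdma-core this flow is believed to be wrapped by mlx5dv_alloc_var().
 */
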
6227 DECLARE_UVERBS_NAMED_METHOD(
6228         MLX5_IB_METHOD_VAR_OBJ_ALLOC,
6229         UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
6230                         MLX5_IB_OBJECT_VAR,
6231                         UVERBS_ACCESS_NEW,
6232                         UA_MANDATORY),
6233         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
6234                            UVERBS_ATTR_TYPE(u32),
6235                            UA_MANDATORY),
6236         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
6237                            UVERBS_ATTR_TYPE(u32),
6238                            UA_MANDATORY),
6239         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
6240                             UVERBS_ATTR_TYPE(u64),
6241                             UA_MANDATORY));
6242
6243 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
6244         MLX5_IB_METHOD_VAR_OBJ_DESTROY,
6245         UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
6246                         MLX5_IB_OBJECT_VAR,
6247                         UVERBS_ACCESS_DESTROY,
6248                         UA_MANDATORY));
6249
6250 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
6251                             UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
6252                             &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
6253                             &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
6254
6255 static bool var_is_supported(struct ib_device *device)
6256 {
6257         struct mlx5_ib_dev *dev = to_mdev(device);
6258
6259         return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6260                         MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
6261 }
6262
6263 static struct mlx5_user_mmap_entry *
6264 alloc_uar_entry(struct mlx5_ib_ucontext *c,
6265                 enum mlx5_ib_uapi_uar_alloc_type alloc_type)
6266 {
6267         struct mlx5_user_mmap_entry *entry;
6268         struct mlx5_ib_dev *dev;
6269         u32 uar_index;
6270         int err;
6271
6272         entry = kzalloc(sizeof(*entry), GFP_KERNEL);
6273         if (!entry)
6274                 return ERR_PTR(-ENOMEM);
6275
6276         dev = to_mdev(c->ibucontext.device);
6277         err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
6278         if (err)
6279                 goto end;
6280
6281         entry->page_idx = uar_index;
6282         entry->address = uar_index2paddress(dev, uar_index);
6283         if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
6284                 entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
6285         else
6286                 entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
6287
6288         err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
6289         if (err)
6290                 goto err_insert;
6291
6292         return entry;
6293
6294 err_insert:
6295         mlx5_cmd_free_uar(dev->mdev, uar_index);
6296 end:
6297         kfree(entry);
6298         return ERR_PTR(err);
6299 }
6300
6301 static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
6302         struct uverbs_attr_bundle *attrs)
6303 {
6304         struct ib_uobject *uobj = uverbs_attr_get_uobject(
6305                 attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
6306         enum mlx5_ib_uapi_uar_alloc_type alloc_type;
6307         struct mlx5_ib_ucontext *c;
6308         struct mlx5_user_mmap_entry *entry;
6309         u64 mmap_offset;
6310         u32 length;
6311         int err;
6312
6313         c = to_mucontext(ib_uverbs_get_ucontext(attrs));
6314         if (IS_ERR(c))
6315                 return PTR_ERR(c);
6316
6317         err = uverbs_get_const(&alloc_type, attrs,
6318                                MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
6319         if (err)
6320                 return err;
6321
6322         if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
6323             alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
6324                 return -EOPNOTSUPP;
6325
6326         if (!to_mdev(c->ibucontext.device)->wc_support &&
6327             alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
6328                 return -EOPNOTSUPP;
6329
6330         entry = alloc_uar_entry(c, alloc_type);
6331         if (IS_ERR(entry))
6332                 return PTR_ERR(entry);
6333
6334         mmap_offset = mlx5_entry_to_mmap_offset(entry);
6335         length = entry->rdma_entry.npages * PAGE_SIZE;
6336         uobj->object = entry;
6337
6338         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
6339                              &mmap_offset, sizeof(mmap_offset));
6340         if (err)
6341                 goto err;
6342
6343         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
6344                              &entry->page_idx, sizeof(entry->page_idx));
6345         if (err)
6346                 goto err;
6347
6348         err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
6349                              &length, sizeof(length));
6350         if (err)
6351                 goto err;
6352
6353         return 0;
6354
6355 err:
6356         rdma_user_mmap_entry_remove(&entry->rdma_entry);
6357         return err;
6358 }
6359
6360 DECLARE_UVERBS_NAMED_METHOD(
6361         MLX5_IB_METHOD_UAR_OBJ_ALLOC,
6362         UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
6363                         MLX5_IB_OBJECT_UAR,
6364                         UVERBS_ACCESS_NEW,
6365                         UA_MANDATORY),
6366         UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
6367                              enum mlx5_ib_uapi_uar_alloc_type,
6368                              UA_MANDATORY),
6369         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
6370                            UVERBS_ATTR_TYPE(u32),
6371                            UA_MANDATORY),
6372         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
6373                            UVERBS_ATTR_TYPE(u32),
6374                            UA_MANDATORY),
6375         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
6376                             UVERBS_ATTR_TYPE(u64),
6377                             UA_MANDATORY));
6378
6379 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
6380         MLX5_IB_METHOD_UAR_OBJ_DESTROY,
6381         UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
6382                         MLX5_IB_OBJECT_UAR,
6383                         UVERBS_ACCESS_DESTROY,
6384                         UA_MANDATORY));
6385
6386 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
6387                             UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
6388                             &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
6389                             &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
6390
6391 ADD_UVERBS_ATTRIBUTES_SIMPLE(
6392         mlx5_ib_dm,
6393         UVERBS_OBJECT_DM,
6394         UVERBS_METHOD_DM_ALLOC,
6395         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
6396                             UVERBS_ATTR_TYPE(u64),
6397                             UA_MANDATORY),
6398         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
6399                             UVERBS_ATTR_TYPE(u16),
6400                             UA_OPTIONAL),
6401         UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
6402                              enum mlx5_ib_uapi_dm_type,
6403                              UA_OPTIONAL));
6404
6405 ADD_UVERBS_ATTRIBUTES_SIMPLE(
6406         mlx5_ib_flow_action,
6407         UVERBS_OBJECT_FLOW_ACTION,
6408         UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
6409         UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
6410                              enum mlx5_ib_uapi_flow_action_flags));
6411
6412 static const struct uapi_definition mlx5_ib_defs[] = {
6413         UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
6414         UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
6415         UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
6416
6417         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
6418                                 &mlx5_ib_flow_action),
6419         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
6420         UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
6421                                 UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
6422         UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
6423         {}
6424 };
6425
6426 static int mlx5_ib_read_counters(struct ib_counters *counters,
6427                                  struct ib_counters_read_attr *read_attr,
6428                                  struct uverbs_attr_bundle *attrs)
6429 {
6430         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
6431         struct mlx5_read_counters_attr mread_attr = {};
6432         struct mlx5_ib_flow_counters_desc *desc;
6433         int ret, i;
6434
6435         mutex_lock(&mcounters->mcntrs_mutex);
6436         if (mcounters->cntrs_max_index > read_attr->ncounters) {
6437                 ret = -EINVAL;
6438                 goto err_bound;
6439         }
6440
6441         mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
6442                                  GFP_KERNEL);
6443         if (!mread_attr.out) {
6444                 ret = -ENOMEM;
6445                 goto err_bound;
6446         }
6447
6448         mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
6449         mread_attr.flags = read_attr->flags;
6450         ret = mcounters->read_counters(counters->device, &mread_attr);
6451         if (ret)
6452                 goto err_read;
6453
6454         /* Walk the counters data array: desc[i].description indexes the HW
6455          * output, desc[i].index selects (and may aggregate into) the user slot
6456          */
6457         desc = mcounters->counters_data;
6458         for (i = 0; i < mcounters->ncounters; i++)
6459                 read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
6460
6461 err_read:
6462         kfree(mread_attr.out);
6463 err_bound:
6464         mutex_unlock(&mcounters->mcntrs_mutex);
6465         return ret;
6466 }
6467
6468 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
6469 {
6470         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
6471
6472         counters_clear_description(counters);
6473         if (mcounters->hw_cntrs_hndl)
6474                 mlx5_fc_destroy(to_mdev(counters->device)->mdev,
6475                                 mcounters->hw_cntrs_hndl);
6476
6477         kfree(mcounters);
6478
6479         return 0;
6480 }
6481
6482 static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
6483                                                    struct uverbs_attr_bundle *attrs)
6484 {
6485         struct mlx5_ib_mcounters *mcounters;
6486
6487         mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
6488         if (!mcounters)
6489                 return ERR_PTR(-ENOMEM);
6490
6491         mutex_init(&mcounters->mcntrs_mutex);
6492
6493         return &mcounters->ibcntrs;
6494 }
6495
6496 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
6497 {
6498         mlx5_ib_cleanup_multiport_master(dev);
6499         WARN_ON(!xa_empty(&dev->odp_mkeys));
6500         cleanup_srcu_struct(&dev->odp_srcu);
6501
6502         WARN_ON(!xa_empty(&dev->sig_mrs));
6503         WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
6504 }
6505
6506 static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
6507 {
6508         struct mlx5_core_dev *mdev = dev->mdev;
6509         int err;
6510         int i;
6511
6512         for (i = 0; i < dev->num_ports; i++) {
6513                 spin_lock_init(&dev->port[i].mp.mpi_lock);
6514                 rwlock_init(&dev->port[i].roce.netdev_lock);
6515                 dev->port[i].roce.dev = dev;
6516                 dev->port[i].roce.native_port_num = i + 1;
6517                 dev->port[i].roce.last_port_state = IB_PORT_DOWN;
6518         }
6519
6520         mlx5_ib_internal_fill_odp_caps(dev);
6521
6522         err = mlx5_ib_init_multiport_master(dev);
6523         if (err)
6524                 return err;
6525
6526         err = set_has_smi_cap(dev);
6527         if (err)
6528                 return err;
6529
6530         if (!mlx5_core_mp_enabled(mdev)) {
6531                 for (i = 1; i <= dev->num_ports; i++) {
6532                         err = get_port_caps(dev, i);
6533                         if (err)
6534                                 break;
6535                 }
6536         } else {
6537                 err = get_port_caps(dev, mlx5_core_native_port_num(mdev));
6538         }
6539         if (err)
6540                 goto err_mp;
6541
6542         if (mlx5_use_mad_ifc(dev))
6543                 get_ext_port_caps(dev);
6544
6545         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
6546         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
6547         dev->ib_dev.phys_port_cnt       = dev->num_ports;
6548         dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
6549         dev->ib_dev.dev.parent          = mdev->device;
6550
6551         mutex_init(&dev->cap_mask_mutex);
6552         INIT_LIST_HEAD(&dev->qp_list);
6553         spin_lock_init(&dev->reset_flow_resource_lock);
6554         xa_init(&dev->odp_mkeys);
6555         xa_init(&dev->sig_mrs);
6556         atomic_set(&dev->mkey_var, 0);
6557
6558         spin_lock_init(&dev->dm.lock);
6559         dev->dm.dev = mdev;
6560
6561         err = init_srcu_struct(&dev->odp_srcu);
6562         if (err)
6563                 goto err_mp;
6564
6565         return 0;
6566
6567 err_mp:
6568         mlx5_ib_cleanup_multiport_master(dev);
6569
6570         return err;
6571 }
6572
6573 static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
6574 {
6575         dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
6576
6577         if (!dev->flow_db)
6578                 return -ENOMEM;
6579
6580         mutex_init(&dev->flow_db->lock);
6581
6582         return 0;
6583 }
6584
6585 static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
6586 {
6587         kfree(dev->flow_db);
6588 }
6589
6590 static const struct ib_device_ops mlx5_ib_dev_ops = {
6591         .owner = THIS_MODULE,
6592         .driver_id = RDMA_DRIVER_MLX5,
6593         .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
6594
6595         .add_gid = mlx5_ib_add_gid,
6596         .alloc_mr = mlx5_ib_alloc_mr,
6597         .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
6598         .alloc_pd = mlx5_ib_alloc_pd,
6599         .alloc_ucontext = mlx5_ib_alloc_ucontext,
6600         .attach_mcast = mlx5_ib_mcg_attach,
6601         .check_mr_status = mlx5_ib_check_mr_status,
6602         .create_ah = mlx5_ib_create_ah,
6603         .create_counters = mlx5_ib_create_counters,
6604         .create_cq = mlx5_ib_create_cq,
6605         .create_flow = mlx5_ib_create_flow,
6606         .create_qp = mlx5_ib_create_qp,
6607         .create_srq = mlx5_ib_create_srq,
6608         .dealloc_pd = mlx5_ib_dealloc_pd,
6609         .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
6610         .del_gid = mlx5_ib_del_gid,
6611         .dereg_mr = mlx5_ib_dereg_mr,
6612         .destroy_ah = mlx5_ib_destroy_ah,
6613         .destroy_counters = mlx5_ib_destroy_counters,
6614         .destroy_cq = mlx5_ib_destroy_cq,
6615         .destroy_flow = mlx5_ib_destroy_flow,
6616         .destroy_flow_action = mlx5_ib_destroy_flow_action,
6617         .destroy_qp = mlx5_ib_destroy_qp,
6618         .destroy_srq = mlx5_ib_destroy_srq,
6619         .detach_mcast = mlx5_ib_mcg_detach,
6620         .disassociate_ucontext = mlx5_ib_disassociate_ucontext,
6621         .drain_rq = mlx5_ib_drain_rq,
6622         .drain_sq = mlx5_ib_drain_sq,
6623         .enable_driver = mlx5_ib_enable_driver,
6624         .fill_res_entry = mlx5_ib_fill_res_entry,
6625         .fill_stat_entry = mlx5_ib_fill_stat_entry,
6626         .get_dev_fw_str = get_dev_fw_str,
6627         .get_dma_mr = mlx5_ib_get_dma_mr,
6628         .get_link_layer = mlx5_ib_port_link_layer,
6629         .map_mr_sg = mlx5_ib_map_mr_sg,
6630         .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
6631         .mmap = mlx5_ib_mmap,
6632         .mmap_free = mlx5_ib_mmap_free,
6633         .modify_cq = mlx5_ib_modify_cq,
6634         .modify_device = mlx5_ib_modify_device,
6635         .modify_port = mlx5_ib_modify_port,
6636         .modify_qp = mlx5_ib_modify_qp,
6637         .modify_srq = mlx5_ib_modify_srq,
6638         .poll_cq = mlx5_ib_poll_cq,
6639         .post_recv = mlx5_ib_post_recv,
6640         .post_send = mlx5_ib_post_send,
6641         .post_srq_recv = mlx5_ib_post_srq_recv,
6642         .process_mad = mlx5_ib_process_mad,
6643         .query_ah = mlx5_ib_query_ah,
6644         .query_device = mlx5_ib_query_device,
6645         .query_gid = mlx5_ib_query_gid,
6646         .query_pkey = mlx5_ib_query_pkey,
6647         .query_qp = mlx5_ib_query_qp,
6648         .query_srq = mlx5_ib_query_srq,
6649         .read_counters = mlx5_ib_read_counters,
6650         .reg_user_mr = mlx5_ib_reg_user_mr,
6651         .req_notify_cq = mlx5_ib_arm_cq,
6652         .rereg_user_mr = mlx5_ib_rereg_user_mr,
6653         .resize_cq = mlx5_ib_resize_cq,
6654
6655         INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
6656         INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
6657         INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
6658         INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
6659         INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
6660 };
6661
6662 static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
6663         .create_flow_action_esp = mlx5_ib_create_flow_action_esp,
6664         .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
6665 };
6666
6667 static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
6668         .rdma_netdev_get_params = mlx5_ib_rn_get_params,
6669 };
6670
6671 static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
6672         .get_vf_config = mlx5_ib_get_vf_config,
6673         .get_vf_guid = mlx5_ib_get_vf_guid,
6674         .get_vf_stats = mlx5_ib_get_vf_stats,
6675         .set_vf_guid = mlx5_ib_set_vf_guid,
6676         .set_vf_link_state = mlx5_ib_set_vf_link_state,
6677 };
6678
6679 static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
6680         .alloc_mw = mlx5_ib_alloc_mw,
6681         .dealloc_mw = mlx5_ib_dealloc_mw,
6682 };
6683
6684 static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
6685         .alloc_xrcd = mlx5_ib_alloc_xrcd,
6686         .dealloc_xrcd = mlx5_ib_dealloc_xrcd,
6687 };
6688
6689 static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
6690         .alloc_dm = mlx5_ib_alloc_dm,
6691         .dealloc_dm = mlx5_ib_dealloc_dm,
6692         .reg_dm_mr = mlx5_ib_reg_dm_mr,
6693 };
6694
6695 static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
6696 {
6697         struct mlx5_core_dev *mdev = dev->mdev;
6698         struct mlx5_var_table *var_table = &dev->var_table;
6699         u8 log_doorbell_bar_size;
6700         u8 log_doorbell_stride;
6701         u64 bar_size;
6702
6703         log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
6704                                         log_doorbell_bar_size);
6705         log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
6706                                         log_doorbell_stride);
6707         var_table->hw_start_addr = dev->mdev->bar_addr +
6708                                 MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
6709                                         doorbell_bar_offset);
6710         bar_size = (1ULL << log_doorbell_bar_size) * 4096;
6711         var_table->stride_size = 1ULL << log_doorbell_stride;
6712         var_table->num_var_hw_entries = div_u64(bar_size,
6713                                                 var_table->stride_size);
6714         mutex_init(&var_table->bitmap_lock);
6715         var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
6716                                           GFP_KERNEL);
6717         return (var_table->bitmap) ? 0 : -ENOMEM;
6718 }
6719
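/*
 * Worked example for mlx5_ib_init_var_table() above, using made-up capability
 * values: log_doorbell_bar_size = 2 and log_doorbell_stride = 12 give
 * bar_size = (1 << 2) * 4096 = 16384 bytes and stride_size = 1 << 12 = 4096
 * bytes, hence num_var_hw_entries = 16384 / 4096 = 4 doorbell entries tracked
 * by the allocation bitmap.
 */
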
6720 static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
6721 {
6722         bitmap_free(dev->var_table.bitmap);
6723 }
6724
6725 static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
6726 {
6727         struct mlx5_core_dev *mdev = dev->mdev;
6728         int err;
6729
6730         dev->ib_dev.uverbs_cmd_mask     =
6731                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
6732                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
6733                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
6734                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
6735                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
6736                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
6737                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
6738                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
6739                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
6740                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
6741                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
6742                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
6743                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
6744                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
6745                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
6746                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
6747                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
6748                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
6749                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
6750                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
6751                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
6752                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
6753                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
6754                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
6755                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
6756                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
6757         dev->ib_dev.uverbs_ex_cmd_mask =
6758                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
6759                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
6760                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
6761                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP)        |
6762                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ)        |
6763                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW)      |
6764                 (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
6765
6766         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
6767             IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
6768                 ib_set_device_ops(&dev->ib_dev,
6769                                   &mlx5_ib_dev_ipoib_enhanced_ops);
6770
6771         if (mlx5_core_is_pf(mdev))
6772                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
6773
6774         dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
6775
6776         if (MLX5_CAP_GEN(mdev, imaicl)) {
6777                 dev->ib_dev.uverbs_cmd_mask |=
6778                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
6779                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
6780                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
6781         }
6782
6783         if (MLX5_CAP_GEN(mdev, xrc)) {
6784                 dev->ib_dev.uverbs_cmd_mask |=
6785                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
6786                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
6787                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
6788         }
6789
6790         if (MLX5_CAP_DEV_MEM(mdev, memic) ||
6791             MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6792             MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
6793                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
6794
6795         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
6796             MLX5_ACCEL_IPSEC_CAP_DEVICE)
6797                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
6798         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
6799
6800         if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
6801                 dev->ib_dev.driver_def = mlx5_ib_defs;
6802
6803         err = init_node_data(dev);
6804         if (err)
6805                 return err;
6806
6807         if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
6808             (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
6809              MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
6810                 mutex_init(&dev->lb.mutex);
6811
6812         if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6813                         MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
6814                 err = mlx5_ib_init_var_table(dev);
6815                 if (err)
6816                         return err;
6817         }
6818
6819         dev->ib_dev.use_cq_dim = true;
6820
6821         return 0;
6822 }
6823
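/*
 * Op tables are merged into dev->ib_dev with ib_set_device_ops(), which
 * only fills in callbacks that the source table provides and that have
 * not already been set on the device.  Some tables are gated by
 * capability bits checked in the CAPS stage above (imaicl for memory
 * windows, xrc, memic/SW_ICM for device memory, IPsec offload caps),
 * while the per-port tables below are chosen by the profile's
 * NON_DEFAULT_CB stage (native vs. representor ports).
 */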
6824 static const struct ib_device_ops mlx5_ib_dev_port_ops = {
6825         .get_port_immutable = mlx5_port_immutable,
6826         .query_port = mlx5_ib_query_port,
6827 };
6828
6829 static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
6830 {
6831         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
6832         return 0;
6833 }
6834
6835 static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
6836         .get_port_immutable = mlx5_port_rep_immutable,
6837         .query_port = mlx5_ib_rep_query_port,
6838 };
6839
6840 static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
6841 {
6842         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
6843         return 0;
6844 }
6845
6846 static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
6847         .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
6848         .create_wq = mlx5_ib_create_wq,
6849         .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
6850         .destroy_wq = mlx5_ib_destroy_wq,
6851         .get_netdev = mlx5_ib_get_netdev,
6852         .modify_wq = mlx5_ib_modify_wq,
6853 };
6854
6855 static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
6856 {
6857         u8 port_num;
6858
6859         dev->ib_dev.uverbs_ex_cmd_mask |=
6860                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
6861                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
6862                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
6863                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
6864                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
6865         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
6866
6867         port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6868
6869         /* Register only for native ports */
6870         return mlx5_add_netdev_notifier(dev, port_num);
6871 }
6872
6873 static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
6874 {
6875         u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6876
6877         mlx5_remove_netdev_notifier(dev, port_num);
6878 }
6879
6880 static int mlx5_ib_stage_raw_eth_roce_init(struct mlx5_ib_dev *dev)
6881 {
6882         struct mlx5_core_dev *mdev = dev->mdev;
6883         enum rdma_link_layer ll;
6884         int port_type_cap;
6885         int err = 0;
6886
6887         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6888         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6889
6890         if (ll == IB_LINK_LAYER_ETHERNET)
6891                 err = mlx5_ib_stage_common_roce_init(dev);
6892
6893         return err;
6894 }
6895
6896 static void mlx5_ib_stage_raw_eth_roce_cleanup(struct mlx5_ib_dev *dev)
6897 {
6898         mlx5_ib_stage_common_roce_cleanup(dev);
6899 }
6900
6901 static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
6902 {
6903         struct mlx5_core_dev *mdev = dev->mdev;
6904         enum rdma_link_layer ll;
6905         int port_type_cap;
6906         int err;
6907
6908         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6909         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6910
6911         if (ll == IB_LINK_LAYER_ETHERNET) {
6912                 err = mlx5_ib_stage_common_roce_init(dev);
6913                 if (err)
6914                         return err;
6915
6916                 err = mlx5_enable_eth(dev);
6917                 if (err)
6918                         goto cleanup;
6919         }
6920
6921         return 0;
6922 cleanup:
6923         mlx5_ib_stage_common_roce_cleanup(dev);
6924
6925         return err;
6926 }
6927
6928 static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
6929 {
6930         struct mlx5_core_dev *mdev = dev->mdev;
6931         enum rdma_link_layer ll;
6932         int port_type_cap;
6933
6934         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6935         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6936
6937         if (ll == IB_LINK_LAYER_ETHERNET) {
6938                 mlx5_disable_eth(dev);
6939                 mlx5_ib_stage_common_roce_cleanup(dev);
6940         }
6941 }
6942
6943 static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
6944 {
6945         return create_dev_resources(&dev->devr);
6946 }
6947
6948 static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
6949 {
6950         destroy_dev_resources(&dev->devr);
6951 }
6952
6953 static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
6954 {
6955         return mlx5_ib_odp_init_one(dev);
6956 }
6957
6958 static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
6959 {
6960         mlx5_ib_odp_cleanup_one(dev);
6961 }
6962
6963 static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
6964         .alloc_hw_stats = mlx5_ib_alloc_hw_stats,
6965         .get_hw_stats = mlx5_ib_get_hw_stats,
6966         .counter_bind_qp = mlx5_ib_counter_bind_qp,
6967         .counter_unbind_qp = mlx5_ib_counter_unbind_qp,
6968         .counter_dealloc = mlx5_ib_counter_dealloc,
6969         .counter_alloc_stats = mlx5_ib_counter_alloc_stats,
6970         .counter_update_stats = mlx5_ib_counter_update_stats,
6971 };
6972
6973 static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
6974 {
6975         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
6976                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
6977
6978                 return mlx5_ib_alloc_counters(dev);
6979         }
6980
6981         return 0;
6982 }
6983
6984 static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
6985 {
6986         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
6987                 mlx5_ib_dealloc_counters(dev);
6988 }
6989
6990 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
6991 {
6992         mlx5_ib_init_cong_debugfs(dev,
6993                                   mlx5_core_native_port_num(dev->mdev) - 1);
6994         return 0;
6995 }
6996
6997 static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
6998 {
6999         mlx5_ib_cleanup_cong_debugfs(dev,
7000                                      mlx5_core_native_port_num(dev->mdev) - 1);
7001 }
7002
7003 static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
7004 {
7005         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
7006         return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
7007 }
7008
7009 static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
7010 {
7011         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
7012 }
7013
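/*
 * Reserve the driver's own blue-flame registers (doorbell slots in the
 * UAR space): a regular bfreg and a fast-path one.  Going by the two
 * boolean arguments at the call sites below, mlx5_alloc_bfreg() selects
 * write-combining mapping and fast-path behaviour respectively.
 */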
7014 static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
7015 {
7016         int err;
7017
7018         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
7019         if (err)
7020                 return err;
7021
7022         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
7023         if (err)
7024                 mlx5_free_bfreg(dev->mdev, &dev->bfreg);
7025
7026         return err;
7027 }
7028
7029 static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
7030 {
7031         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
7032         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
7033 }
7034
7035 static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
7036 {
7037         const char *name;
7038
7039         rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
7040         if (!mlx5_lag_is_roce(dev->mdev))
7041                 name = "mlx5_%d";
7042         else
7043                 name = "mlx5_bond_%d";
7044         return ib_register_device(&dev->ib_dev, name);
7045 }
7046
7047 static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
7048 {
7049         destroy_umrc_res(dev);
7050 }
7051
7052 static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
7053 {
7054         ib_unregister_device(&dev->ib_dev);
7055 }
7056
7057 static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
7058 {
7059         return create_umr_res(dev);
7060 }
7061
7062 static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
7063 {
7064         init_delay_drop(dev);
7065
7066         return 0;
7067 }
7068
7069 static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
7070 {
7071         cancel_delay_drop(dev);
7072 }
7073
7074 static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
7075 {
7076         dev->mdev_events.notifier_call = mlx5_ib_event;
7077         mlx5_notifier_register(dev->mdev, &dev->mdev_events);
7078         return 0;
7079 }
7080
7081 static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
7082 {
7083         mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
7084 }
7085
7086 static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
7087 {
7088         int uid;
7089
7090         uid = mlx5_ib_devx_create(dev, false);
7091         if (uid > 0) {
7092                 dev->devx_whitelist_uid = uid;
7093                 mlx5_ib_devx_init_event_table(dev);
7094         }
7095
7096         return 0;
7097 }

7098 static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
7099 {
7100         if (dev->devx_whitelist_uid) {
7101                 mlx5_ib_devx_cleanup_event_table(dev);
7102                 mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
7103         }
7104 }
7105
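/*
 * Invoked by the IB core during device registration via the
 * enable_driver op.  The write-combining probe records its result in
 * mdev->wc_support and logs it; only the probe's own error code is
 * propagated to the caller.
 */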
7106 int mlx5_ib_enable_driver(struct ib_device *dev)
7107 {
7108         struct mlx5_ib_dev *mdev = to_mdev(dev);
7109         int ret;
7110
7111         ret = mlx5_ib_test_wc(mdev);
7112         mlx5_ib_dbg(mdev, "Write-Combining %s",
7113                     mdev->wc_support ? "supported" : "not supported");
7114
7115         return ret;
7116 }
7117
7118 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
7119                       const struct mlx5_ib_profile *profile,
7120                       int stage)
7121 {
7122         dev->ib_active = false;
7123
7124         /* Unwind the stages that completed, in reverse order */
7125         while (stage) {
7126                 stage--;
7127                 if (profile->stage[stage].cleanup)
7128                         profile->stage[stage].cleanup(dev);
7129         }
7130
7131         kfree(dev->port);
7132         ib_dealloc_device(&dev->ib_dev);
7133 }
7134
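/*
 * Device bring-up is table driven: __mlx5_ib_add() walks the profile's
 * stage array in order and calls each non-NULL init callback.  If stage
 * i fails, __mlx5_ib_remove(dev, profile, i) above runs the cleanup
 * callbacks of stages [0, i) in reverse order, so each stage only has
 * to undo its own work.
 */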
7135 void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
7136                     const struct mlx5_ib_profile *profile)
7137 {
7138         int err;
7139         int i;
7140
7141         for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
7142                 if (profile->stage[i].init) {
7143                         err = profile->stage[i].init(dev);
7144                         if (err)
7145                                 goto err_out;
7146                 }
7147         }
7148
7149         dev->profile = profile;
7150         dev->ib_active = true;
7151
7152         return dev;
7153
7154 err_out:
7155         __mlx5_ib_remove(dev, profile, i);
7156
7157         return NULL;
7158 }
7159
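/*
 * The profiles below are built from STAGE_CREATE() entries.  In
 * mlx5_ib.h the macro expands to roughly
 *
 *	#define STAGE_CREATE(_stage, _init, _cleanup) \
 *		.stage[_stage] = { .init = _init, .cleanup = _cleanup }
 *
 * so a NULL init (e.g. PRE_IB_REG_UMR) or a NULL cleanup (e.g.
 * POST_IB_REG_UMR) simply means there is nothing to do in that
 * direction for the stage.
 */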
7160 static const struct mlx5_ib_profile pf_profile = {
7161         STAGE_CREATE(MLX5_IB_STAGE_INIT,
7162                      mlx5_ib_stage_init_init,
7163                      mlx5_ib_stage_init_cleanup),
7164         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
7165                      mlx5_ib_stage_flow_db_init,
7166                      mlx5_ib_stage_flow_db_cleanup),
7167         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
7168                      mlx5_ib_stage_caps_init,
7169                      mlx5_ib_stage_caps_cleanup),
7170         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
7171                      mlx5_ib_stage_non_default_cb,
7172                      NULL),
7173         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
7174                      mlx5_ib_stage_roce_init,
7175                      mlx5_ib_stage_roce_cleanup),
7176         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
7177                      mlx5_init_srq_table,
7178                      mlx5_cleanup_srq_table),
7179         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
7180                      mlx5_ib_stage_dev_res_init,
7181                      mlx5_ib_stage_dev_res_cleanup),
7182         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
7183                      mlx5_ib_stage_dev_notifier_init,
7184                      mlx5_ib_stage_dev_notifier_cleanup),
7185         STAGE_CREATE(MLX5_IB_STAGE_ODP,
7186                      mlx5_ib_stage_odp_init,
7187                      mlx5_ib_stage_odp_cleanup),
7188         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
7189                      mlx5_ib_stage_counters_init,
7190                      mlx5_ib_stage_counters_cleanup),
7191         STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
7192                      mlx5_ib_stage_cong_debugfs_init,
7193                      mlx5_ib_stage_cong_debugfs_cleanup),
7194         STAGE_CREATE(MLX5_IB_STAGE_UAR,
7195                      mlx5_ib_stage_uar_init,
7196                      mlx5_ib_stage_uar_cleanup),
7197         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
7198                      mlx5_ib_stage_bfrag_init,
7199                      mlx5_ib_stage_bfrag_cleanup),
7200         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
7201                      NULL,
7202                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
7203         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
7204                      mlx5_ib_stage_devx_init,
7205                      mlx5_ib_stage_devx_cleanup),
7206         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
7207                      mlx5_ib_stage_ib_reg_init,
7208                      mlx5_ib_stage_ib_reg_cleanup),
7209         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
7210                      mlx5_ib_stage_post_ib_reg_umr_init,
7211                      NULL),
7212         STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
7213                      mlx5_ib_stage_delay_drop_init,
7214                      mlx5_ib_stage_delay_drop_cleanup),
7215 };
7216
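/*
 * The raw Ethernet profile is selected in mlx5_ib_add() when the port
 * is Ethernet and RoCE is disabled.  Compared to pf_profile it uses the
 * representor port callbacks (raw_eth_non_default_cb / raw_eth_roce)
 * and omits the ODP and delay-drop stages.
 */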
7217 const struct mlx5_ib_profile raw_eth_profile = {
7218         STAGE_CREATE(MLX5_IB_STAGE_INIT,
7219                      mlx5_ib_stage_init_init,
7220                      mlx5_ib_stage_init_cleanup),
7221         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
7222                      mlx5_ib_stage_flow_db_init,
7223                      mlx5_ib_stage_flow_db_cleanup),
7224         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
7225                      mlx5_ib_stage_caps_init,
7226                      mlx5_ib_stage_caps_cleanup),
7227         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
7228                      mlx5_ib_stage_raw_eth_non_default_cb,
7229                      NULL),
7230         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
7231                      mlx5_ib_stage_raw_eth_roce_init,
7232                      mlx5_ib_stage_raw_eth_roce_cleanup),
7233         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
7234                      mlx5_init_srq_table,
7235                      mlx5_cleanup_srq_table),
7236         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
7237                      mlx5_ib_stage_dev_res_init,
7238                      mlx5_ib_stage_dev_res_cleanup),
7239         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
7240                      mlx5_ib_stage_dev_notifier_init,
7241                      mlx5_ib_stage_dev_notifier_cleanup),
7242         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
7243                      mlx5_ib_stage_counters_init,
7244                      mlx5_ib_stage_counters_cleanup),
7245         STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
7246                      mlx5_ib_stage_cong_debugfs_init,
7247                      mlx5_ib_stage_cong_debugfs_cleanup),
7248         STAGE_CREATE(MLX5_IB_STAGE_UAR,
7249                      mlx5_ib_stage_uar_init,
7250                      mlx5_ib_stage_uar_cleanup),
7251         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
7252                      mlx5_ib_stage_bfrag_init,
7253                      mlx5_ib_stage_bfrag_cleanup),
7254         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
7255                      NULL,
7256                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
7257         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
7258                      mlx5_ib_stage_devx_init,
7259                      mlx5_ib_stage_devx_cleanup),
7260         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
7261                      mlx5_ib_stage_ib_reg_init,
7262                      mlx5_ib_stage_ib_reg_cleanup),
7263         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
7264                      mlx5_ib_stage_post_ib_reg_umr_init,
7265                      NULL),
7266 };
7267
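/*
 * Multiport (dual-port RoCE) slave devices are not registered as IB
 * devices of their own.  A slave is either bound to an already
 * registered mlx5_ib device with the same system image GUID, or left on
 * the unaffiliated list to be picked up later.
 */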
7268 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
7269 {
7270         struct mlx5_ib_multiport_info *mpi;
7271         struct mlx5_ib_dev *dev;
7272         bool bound = false;
7273         int err;
7274
7275         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
7276         if (!mpi)
7277                 return NULL;
7278
7279         mpi->mdev = mdev;
7280
7281         err = mlx5_query_nic_vport_system_image_guid(mdev,
7282                                                      &mpi->sys_image_guid);
7283         if (err) {
7284                 kfree(mpi);
7285                 return NULL;
7286         }
7287
7288         mutex_lock(&mlx5_ib_multiport_mutex);
7289         list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
7290                 if (dev->sys_image_guid == mpi->sys_image_guid)
7291                         bound = mlx5_ib_bind_slave_port(dev, mpi);
7292
7293                 if (bound) {
7294                         rdma_roce_rescan_device(&dev->ib_dev);
7295                         break;
7296                 }
7297         }
7298
7299         if (!bound) {
7300                 list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
7301                 dev_dbg(mdev->device,
7302                         "no suitable IB device found to bind to, added to unaffiliated list.\n");
7303         }
7304         mutex_unlock(&mlx5_ib_multiport_mutex);
7305
7306         return mpi;
7307 }
7308
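/*
 * .add entry point from mlx5_core.  Three cases are handled below:
 * switchdev/offloads mode registers vport representors instead of a
 * regular IB device, a multiport Ethernet slave goes through
 * mlx5_ib_add_slave_port(), and everything else allocates an ib_device
 * and runs one of the profiles above.
 */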
7309 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
7310 {
7311         const struct mlx5_ib_profile *profile;
7312         enum rdma_link_layer ll;
7313         struct mlx5_ib_dev *dev;
7314         int port_type_cap;
7315         int num_ports;
7316
7317         printk_once(KERN_INFO "%s", mlx5_version);
7318
7319         if (MLX5_ESWITCH_MANAGER(mdev) &&
7320             mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
7321                 if (!mlx5_core_mp_enabled(mdev))
7322                         mlx5_ib_register_vport_reps(mdev);
7323                 return mdev;
7324         }
7325
7326         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
7327         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
7328
7329         if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
7330                 return mlx5_ib_add_slave_port(mdev);
7331
7332         num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
7333                         MLX5_CAP_GEN(mdev, num_vhca_ports));
7334         dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
7335         if (!dev)
7336                 return NULL;
7337         dev->port = kcalloc(num_ports, sizeof(*dev->port),
7338                              GFP_KERNEL);
7339         if (!dev->port) {
7340                 ib_dealloc_device(&dev->ib_dev);
7341                 return NULL;
7342         }
7343
7344         dev->mdev = mdev;
7345         dev->num_ports = num_ports;
7346
7347         if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
7348                 profile = &raw_eth_profile;
7349         else
7350                 profile = &pf_profile;
7351
7352         return __mlx5_ib_add(dev, profile);
7353 }
7354
7355 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
7356 {
7357         struct mlx5_ib_multiport_info *mpi;
7358         struct mlx5_ib_dev *dev;
7359
7360         if (MLX5_ESWITCH_MANAGER(mdev) && context == mdev) {
7361                 mlx5_ib_unregister_vport_reps(mdev);
7362                 return;
7363         }
7364
7365         if (mlx5_core_is_mp_slave(mdev)) {
7366                 mpi = context;
7367                 mutex_lock(&mlx5_ib_multiport_mutex);
7368                 if (mpi->ibdev)
7369                         mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
7370                 list_del(&mpi->list);
7371                 mutex_unlock(&mlx5_ib_multiport_mutex);
7372                 kfree(mpi);
7373                 return;
7374         }
7375
7376         dev = context;
7377         __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
7378 }
7379
7380 static struct mlx5_interface mlx5_ib_interface = {
7381         .add            = mlx5_ib_add,
7382         .remove         = mlx5_ib_remove,
7383         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
7384 };
7385
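/*
 * A single pre-allocated page, serialized by a mutex, used as a
 * last-resort XLT update buffer when a larger allocation fails (see the
 * MR code).  get() takes the mutex and hands out the page; put()
 * releases the mutex.
 */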
7386 unsigned long mlx5_ib_get_xlt_emergency_page(void)
7387 {
7388         mutex_lock(&xlt_emergency_page_mutex);
7389         return xlt_emergency_page;
7390 }
7391
7392 void mlx5_ib_put_xlt_emergency_page(void)
7393 {
7394         mutex_unlock(&xlt_emergency_page_mutex);
7395 }
7396
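/*
 * Module init: allocate the XLT emergency page and the ordered event
 * workqueue, initialize ODP, then register with mlx5_core.  Registering
 * the interface is what triggers mlx5_ib_add() for existing and future
 * mlx5 devices.
 */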
7397 static int __init mlx5_ib_init(void)
7398 {
7399         int err;
7400
7401         xlt_emergency_page = __get_free_page(GFP_KERNEL);
7402         if (!xlt_emergency_page)
7403                 return -ENOMEM;
7404
7405         mutex_init(&xlt_emergency_page_mutex);
7406
7407         mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
7408         if (!mlx5_ib_event_wq) {
7409                 free_page(xlt_emergency_page);
7410                 return -ENOMEM;
7411         }
7412
7413         mlx5_ib_odp_init();
7414
7415         err = mlx5_register_interface(&mlx5_ib_interface);
7416
7417         return err;
7418 }
7419
7420 static void __exit mlx5_ib_cleanup(void)
7421 {
7422         mlx5_unregister_interface(&mlx5_ib_interface);
7423         destroy_workqueue(mlx5_ib_event_wq);
7424         mutex_destroy(&xlt_emergency_page_mutex);
7425         free_page(xlt_emergency_page);
7426 }
7427
7428 module_init(mlx5_ib_init);
7429 module_exit(mlx5_ib_cleanup);