Git Repo - linux.git/commitdiff
Merge branch 'mlx5-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mellanox...
author Saeed Mahameed <[email protected]>
Tue, 2 Apr 2019 22:43:45 +0000 (15:43 -0700)
committer Saeed Mahameed <[email protected]>
Fri, 5 Apr 2019 21:10:16 +0000 (14:10 -0700)
This merge commit includes miscellaneous shared code updates from the mlx5-next
branch that are needed for net-next.

1) From Maxim: remove unused macros and a spinlock from mlx5 code.

2) From Aya: expose the management PCIe info register layout and add
rate-limited print macros.

3) From Tariq: a compilation warning fix in fs_core.c.

4) From Vu, Huy and Saeed: improve the mlx5 initialization flow.
The goal is a cleaner logical separation of the mlx5 core device
initialization flow, which will help to seamlessly support creating
different mlx5 device types, such as PF, VF and SF (sub-function)
virtual devices.

The mlx5_core driver needs to separate HCA resources from PCI resources.
Its initialize/load/unload flow is broken into stages:
1. Initialize common data structures
2. Set up the function, which initializes PCI resources (for PF/VF)
   or other specific resources for a virtual device
3. Initialize software objects according to hardware capabilities
4. Load all mlx5_core components

It is also necessary to detach the mlx5_core mdev name/message prefix from
the PCI device (mdev->pdev) name/message prefix, for clearer reporting and
debugging of the different mlx5 device types.
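
A minimal sketch of how such a staged probe path could look is shown below.
This is illustrative only; all example_* names are hypothetical and do not
correspond to the actual mlx5_core symbols.

/*
 * Hypothetical sketch of the staged probe flow described above
 * (kernel context assumed; example_* helpers are made up).
 */
static int example_probe(struct pci_dev *pdev)
{
	struct mlx5_core_dev *dev;
	int err;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	/* Stage 1: initialize common, function-type independent data */
	err = example_mdev_init(dev);
	if (err)
		goto err_free;

	/* Stage 2: set up function resources; PCI for PF/VF, or other
	 * resources for a virtual (sub-function) device
	 */
	err = example_function_setup(dev, pdev);
	if (err)
		goto err_mdev;

	/* Stages 3-4: init software objects from HCA capabilities and
	 * load all mlx5_core components
	 */
	err = example_load_one(dev);
	if (err)
		goto err_function;

	return 0;

err_function:
	example_function_teardown(dev);
err_mdev:
	example_mdev_cleanup(dev);
err_free:
	kfree(dev);
	return err;
}

The unload path would unwind the same stages in reverse order, which is what
allows stages 1, 3 and 4 to be reused for device types whose "function setup"
is not PCI-based.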

Signed-off-by: Saeed Mahameed <[email protected]>
12 files changed:
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/net/ethernet/mellanox/mlx5/core/cmd.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/health.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
drivers/net/ethernet/mellanox/mlx5/core/uar.c
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h

index 531ff20b32ade6ccb4d0b3533bc1f8ceceed1b26,a5333db0a4c7f41ab2790630e37ede14d0447683..0845e95d2d11e8cc73442d3219c383fc695221ce
@@@ -415,17 -415,10 +415,17 @@@ static int translate_eth_ext_proto_oper
                *active_speed = IB_SPEED_EDR;
                break;
        case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
 +              *active_width = IB_WIDTH_2X;
 +              *active_speed = IB_SPEED_EDR;
 +              break;
        case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
                *active_width = IB_WIDTH_1X;
                *active_speed = IB_SPEED_HDR;
                break;
 +      case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
 +              *active_width = IB_WIDTH_4X;
 +              *active_speed = IB_SPEED_EDR;
 +              break;
        case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
                *active_width = IB_WIDTH_2X;
                *active_speed = IB_SPEED_HDR;
@@@ -542,51 -535,24 +542,51 @@@ out
        return err;
  }
  
 +struct mlx5_ib_vlan_info {
 +      u16 vlan_id;
 +      bool vlan;
 +};
 +
 +static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
 +{
 +      struct mlx5_ib_vlan_info *vlan_info = data;
 +
 +      if (is_vlan_dev(lower_dev)) {
 +              vlan_info->vlan = true;
 +              vlan_info->vlan_id = vlan_dev_vlan_id(lower_dev);
 +      }
 +      /* We are interested only in the first-level vlan device, so
 +       * always return 1 to stop iterating over next-level devices.
 +       */
 +      return 1;
 +}
 +
  static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
                         unsigned int index, const union ib_gid *gid,
                         const struct ib_gid_attr *attr)
  {
        enum ib_gid_type gid_type = IB_GID_TYPE_IB;
 +      struct mlx5_ib_vlan_info vlan_info = { };
        u8 roce_version = 0;
        u8 roce_l3_type = 0;
 -      bool vlan = false;
        u8 mac[ETH_ALEN];
 -      u16 vlan_id = 0;
  
        if (gid) {
                gid_type = attr->gid_type;
                ether_addr_copy(mac, attr->ndev->dev_addr);
  
                if (is_vlan_dev(attr->ndev)) {
 -                      vlan = true;
 -                      vlan_id = vlan_dev_vlan_id(attr->ndev);
 +                      vlan_info.vlan = true;
 +                      vlan_info.vlan_id = vlan_dev_vlan_id(attr->ndev);
 +              } else {
 +                      /* If the netdev is an upper device and its lower
 +                       * device is a vlan device, consider the vlan id of
 +                       * the lower vlan device for this gid entry.
 +                       */
 +                      rcu_read_lock();
 +                      netdev_walk_all_lower_dev_rcu(attr->ndev,
 +                                      get_lower_dev_vlan, &vlan_info);
 +                      rcu_read_unlock();
                }
        }
  
        }
  
        return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
 -                                    roce_l3_type, gid->raw, mac, vlan,
 -                                    vlan_id, port_num);
 +                                    roce_l3_type, gid->raw, mac,
 +                                    vlan_info.vlan, vlan_info.vlan_id,
 +                                    port_num);
  }
  
  static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
@@@ -1017,11 -982,11 +1017,11 @@@ static int mlx5_ib_query_device(struct 
        props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
        props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      if (MLX5_CAP_GEN(mdev, pg))
 -              props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
 -      props->odp_caps = dev->odp_caps;
 -#endif
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 +              if (MLX5_CAP_GEN(mdev, pg))
 +                      props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
 +              props->odp_caps = dev->odp_caps;
 +      }
  
        if (MLX5_CAP_GEN(mdev, cd))
                props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
@@@ -1752,15 -1717,14 +1752,15 @@@ static void mlx5_ib_dealloc_transport_d
        mlx5_ib_disable_lb(dev, true, false);
  }
  
 -static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 -                                                struct ib_udata *udata)
 +static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 +                                struct ib_udata *udata)
  {
 +      struct ib_device *ibdev = uctx->device;
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_alloc_ucontext_req_v2 req = {};
        struct mlx5_ib_alloc_ucontext_resp resp = {};
        struct mlx5_core_dev *mdev = dev->mdev;
 -      struct mlx5_ib_ucontext *context;
 +      struct mlx5_ib_ucontext *context = to_mucontext(uctx);
        struct mlx5_bfreg_info *bfregi;
        int ver;
        int err;
        bool lib_uar_4k;
  
        if (!dev->ib_active)
 -              return ERR_PTR(-EAGAIN);
 +              return -EAGAIN;
  
        if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
                ver = 0;
        else if (udata->inlen >= min_req_v2)
                ver = 2;
        else
 -              return ERR_PTR(-EINVAL);
 +              return -EINVAL;
  
        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
        if (err)
 -              return ERR_PTR(err);
 +              return err;
  
        if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
 -              return ERR_PTR(-EOPNOTSUPP);
 +              return -EOPNOTSUPP;
  
        if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
 -              return ERR_PTR(-EOPNOTSUPP);
 +              return -EOPNOTSUPP;
  
        req.total_num_bfregs = ALIGN(req.total_num_bfregs,
                                    MLX5_NON_FP_BFREGS_PER_UAR);
        if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
 -              return ERR_PTR(-EINVAL);
 +              return -EINVAL;
  
        resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
        if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
                /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
        }
  
 -      context = kzalloc(sizeof(*context), GFP_KERNEL);
 -      if (!context)
 -              return ERR_PTR(-ENOMEM);
 -
        lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
        bfregi = &context->bfregi;
  
        if (err)
                goto out_sys_pages;
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
 -#endif
 +      if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
 +              context->ibucontext.invalidate_range =
 +                      &mlx5_ib_invalidate_range;
  
        if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
                err = mlx5_ib_devx_create(dev, true);
                                   1, &dev->roce[port].tx_port_affinity));
        }
  
 -      return &context->ibucontext;
 +      return 0;
  
  out_mdev:
        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@@ -1977,19 -1945,23 +1977,19 @@@ out_count
        kfree(bfregi->count);
  
  out_ctx:
 -      kfree(context);
 -
 -      return ERR_PTR(err);
 +      return err;
  }
  
 -static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 +static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
  {
        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
        struct mlx5_bfreg_info *bfregi;
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        /* All umem's must be destroyed before destroying the ucontext. */
        mutex_lock(&ibcontext->per_mm_list_lock);
        WARN_ON(!list_empty(&ibcontext->per_mm_list));
        mutex_unlock(&ibcontext->per_mm_list_lock);
 -#endif
  
        bfregi = &context->bfregi;
        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
        deallocate_uars(dev, context);
        kfree(bfregi->sys_pages);
        kfree(bfregi->count);
 -      kfree(context);
 -
 -      return 0;
  }
  
  static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
  
        fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
  
-       return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
+       return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
  }
  
  static int get_command(unsigned long offset)
@@@ -2199,7 -2174,7 +2199,7 @@@ static int dm_mmap(struct ib_ucontext *
            page_idx + npages)
                return -EINVAL;
  
-       pfn = ((pci_resource_start(dev->mdev->pdev, 0) +
+       pfn = ((dev->mdev->bar_addr +
              MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
              PAGE_SHIFT) +
              page_idx;
@@@ -2283,7 -2258,7 +2283,7 @@@ struct ib_dm *mlx5_ib_alloc_dm(struct i
                goto err_free;
  
        start_offset = memic_addr & ~PAGE_MASK;
-       page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) -
+       page_idx = (memic_addr - memic->dev->bar_addr -
                    MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
                    PAGE_SHIFT;
  
@@@ -2326,7 -2301,7 +2326,7 @@@ int mlx5_ib_dealloc_dm(struct ib_dm *ib
        if (ret)
                return ret;
  
-       page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) -
+       page_idx = (dm->dev_addr - memic->dev->bar_addr -
                    MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
                    PAGE_SHIFT;
        bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages,
        return 0;
  }
  
 -static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
 -                                    struct ib_ucontext *context,
 -                                    struct ib_udata *udata)
 +static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
 +                          struct ib_udata *udata)
  {
 +      struct mlx5_ib_pd *pd = to_mpd(ibpd);
 +      struct ib_device *ibdev = ibpd->device;
        struct mlx5_ib_alloc_pd_resp resp;
 -      struct mlx5_ib_pd *pd;
        int err;
        u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
        u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
        u16 uid = 0;
  
 -      pd = kmalloc(sizeof(*pd), GFP_KERNEL);
 -      if (!pd)
 -              return ERR_PTR(-ENOMEM);
 -
        uid = context ? to_mucontext(context)->devx_uid : 0;
        MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
        MLX5_SET(alloc_pd_in, in, uid, uid);
        err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
                            out, sizeof(out));
 -      if (err) {
 -              kfree(pd);
 -              return ERR_PTR(err);
 -      }
 +      if (err)
 +              return err;
  
        pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
        pd->uid = uid;
                resp.pdn = pd->pdn;
                if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
                        mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
 -                      kfree(pd);
 -                      return ERR_PTR(-EFAULT);
 +                      return -EFAULT;
                }
        }
  
 -      return &pd->ibpd;
 +      return 0;
  }
  
 -static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
 +static void mlx5_ib_dealloc_pd(struct ib_pd *pd)
  {
        struct mlx5_ib_dev *mdev = to_mdev(pd->device);
        struct mlx5_ib_pd *mpd = to_mpd(pd);
  
        mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
 -      kfree(mpd);
 -
 -      return 0;
  }
  
  enum {
@@@ -2409,29 -2394,10 +2409,29 @@@ static u8 get_match_criteria_enable(u3
        return match_criteria_enable;
  }
  
 -static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
 +static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
  {
 -      MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
 -      MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
 +      u8 entry_mask;
 +      u8 entry_val;
 +      int err = 0;
 +
 +      if (!mask)
 +              goto out;
 +
 +      entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
 +                            ip_protocol);
 +      entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
 +                           ip_protocol);
 +      if (!entry_mask) {
 +              MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
 +              MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
 +              goto out;
 +      }
 +      /* Don't override existing ip protocol */
 +      if (mask != entry_mask || val != entry_val)
 +              err = -EINVAL;
 +out:
 +      return err;
  }
  
  static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
@@@ -2665,10 -2631,8 +2665,10 @@@ static int parse_flow_attr(struct mlx5_
                set_tos(headers_c, headers_v,
                        ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
  
 -              set_proto(headers_c, headers_v,
 -                        ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
 +              if (set_proto(headers_c, headers_v,
 +                            ib_spec->ipv4.mask.proto,
 +                            ib_spec->ipv4.val.proto))
 +                      return -EINVAL;
                break;
        case IB_FLOW_SPEC_IPV6:
                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
                        ib_spec->ipv6.mask.traffic_class,
                        ib_spec->ipv6.val.traffic_class);
  
 -              set_proto(headers_c, headers_v,
 -                        ib_spec->ipv6.mask.next_hdr,
 -                        ib_spec->ipv6.val.next_hdr);
 +              if (set_proto(headers_c, headers_v,
 +                            ib_spec->ipv6.mask.next_hdr,
 +                            ib_spec->ipv6.val.next_hdr))
 +                      return -EINVAL;
  
                set_flow_label(misc_params_c, misc_params_v,
                               ntohl(ib_spec->ipv6.mask.flow_label),
                                         LAST_TCP_UDP_FIELD))
                        return -EOPNOTSUPP;
  
 -              MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
 -                       0xff);
 -              MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
 -                       IPPROTO_TCP);
 +              if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
 +                      return -EINVAL;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
                         ntohs(ib_spec->tcp_udp.mask.src_port));
                                         LAST_TCP_UDP_FIELD))
                        return -EOPNOTSUPP;
  
 -              MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
 -                       0xff);
 -              MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
 -                       IPPROTO_UDP);
 +              if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
 +                      return -EINVAL;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
                         ntohs(ib_spec->tcp_udp.mask.src_port));
                if (ib_spec->gre.mask.c_ks_res0_ver)
                        return -EOPNOTSUPP;
  
 +              if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
 +                      return -EINVAL;
 +
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
                         0xff);
                MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
@@@ -3920,7 -3884,7 +3920,7 @@@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_
        if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
                return ERR_PTR(-ENOMEM);
  
 -      dst = kzalloc(sizeof(*dst) * 2, GFP_KERNEL);
 +      dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
        if (!dst)
                return ERR_PTR(-ENOMEM);
  
@@@ -4201,7 -4165,7 +4201,7 @@@ static ssize_t fw_pages_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
 -              container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 +              rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
  
        return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
  }
@@@ -4211,7 -4175,7 +4211,7 @@@ static ssize_t reg_pages_show(struct de
                              struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
 -              container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 +              rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
  
        return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
  }
@@@ -4221,8 -4185,7 +4221,8 @@@ static ssize_t hca_type_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
 -              container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 +              rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 +
        return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
  }
  static DEVICE_ATTR_RO(hca_type);
@@@ -4231,8 -4194,7 +4231,8 @@@ static ssize_t hw_rev_show(struct devic
                           struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
 -              container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 +              rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 +
        return sprintf(buf, "%x\n", dev->mdev->rev_id);
  }
  static DEVICE_ATTR_RO(hw_rev);
@@@ -4241,8 -4203,7 +4241,8 @@@ static ssize_t board_id_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
 -              container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 +              rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
 +
        return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
                       dev->mdev->board_id);
  }
@@@ -4728,28 -4689,23 +4728,28 @@@ static int create_dev_resources(struct 
  {
        struct ib_srq_init_attr attr;
        struct mlx5_ib_dev *dev;
 +      struct ib_device *ibdev;
        struct ib_cq_init_attr cq_attr = {.cqe = 1};
        int port;
        int ret = 0;
  
        dev = container_of(devr, struct mlx5_ib_dev, devr);
 +      ibdev = &dev->ib_dev;
  
        mutex_init(&devr->mutex);
  
 -      devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
 -      if (IS_ERR(devr->p0)) {
 -              ret = PTR_ERR(devr->p0);
 -              goto error0;
 -      }
 -      devr->p0->device  = &dev->ib_dev;
 +      devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
 +      if (!devr->p0)
 +              return -ENOMEM;
 +
 +      devr->p0->device  = ibdev;
        devr->p0->uobject = NULL;
        atomic_set(&devr->p0->usecnt, 0);
  
 +      ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL);
 +      if (ret)
 +              goto error0;
 +
        devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
        if (IS_ERR(devr->c0)) {
                ret = PTR_ERR(devr->c0);
@@@ -4847,7 -4803,6 +4847,7 @@@ error2
  error1:
        mlx5_ib_dealloc_pd(devr->p0);
  error0:
 +      kfree(devr->p0);
        return ret;
  }
  
@@@ -4863,7 -4818,6 +4863,7 @@@ static void destroy_dev_resources(struc
        mlx5_ib_dealloc_xrcd(devr->x1);
        mlx5_ib_destroy_cq(devr->c0);
        mlx5_ib_dealloc_pd(devr->p0);
 +      kfree(devr->p0);
  
        /* Make sure no change P_Key work items are still executing */
        for (port = 0; port < dev->num_ports; ++port)
@@@ -5613,7 -5567,9 +5613,7 @@@ static bool mlx5_ib_bind_slave_port(str
        mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
        mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
  
 -      err = mlx5_ib_init_cong_debugfs(ibdev, port_num);
 -      if (err)
 -              goto unbind;
 +      mlx5_ib_init_cong_debugfs(ibdev, port_num);
  
        return true;
  
@@@ -5825,10 -5781,11 +5825,10 @@@ static struct ib_counters *mlx5_ib_crea
  void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
  {
        mlx5_ib_cleanup_multiport_master(dev);
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      cleanup_srcu_struct(&dev->mr_srcu);
 -      drain_workqueue(dev->advise_mr_wq);
 -      destroy_workqueue(dev->advise_mr_wq);
 -#endif
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 +              srcu_barrier(&dev->mr_srcu);
 +              cleanup_srcu_struct(&dev->mr_srcu);
 +      }
        kfree(dev->port);
  }
  
@@@ -5881,11 -5838,19 +5881,11 @@@ int mlx5_ib_stage_init_init(struct mlx5
        spin_lock_init(&dev->memic.memic_lock);
        dev->memic.dev = mdev;
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
 -      if (!dev->advise_mr_wq) {
 -              err = -ENOMEM;
 -              goto err_mp;
 -      }
 -
 -      err = init_srcu_struct(&dev->mr_srcu);
 -      if (err) {
 -              destroy_workqueue(dev->advise_mr_wq);
 -              goto err_mp;
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 +              err = init_srcu_struct(&dev->mr_srcu);
 +              if (err)
 +                      goto err_mp;
        }
 -#endif
  
        return 0;
  err_mp:
@@@ -5982,8 -5947,6 +5982,8 @@@ static const struct ib_device_ops mlx5_
        .req_notify_cq = mlx5_ib_arm_cq,
        .rereg_user_mr = mlx5_ib_rereg_user_mr,
        .resize_cq = mlx5_ib_resize_cq,
 +      INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
 +      INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
  };
  
  static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
@@@ -6250,7 -6213,7 +6250,7 @@@ static int mlx5_ib_stage_odp_init(struc
        return mlx5_ib_odp_init_one(dev);
  }
  
 -void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
 +static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
  {
        mlx5_ib_odp_cleanup_one(dev);
  }
@@@ -6279,9 -6242,8 +6279,9 @@@ void mlx5_ib_stage_counters_cleanup(str
  
  static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
  {
 -      return mlx5_ib_init_cong_debugfs(dev,
 -                                       mlx5_core_native_port_num(dev->mdev) - 1);
 +      mlx5_ib_init_cong_debugfs(dev,
 +                                mlx5_core_native_port_num(dev->mdev) - 1);
 +      return 0;
  }
  
  static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@@ -6331,7 -6293,7 +6331,7 @@@ int mlx5_ib_stage_ib_reg_init(struct ml
                name = "mlx5_%d";
        else
                name = "mlx5_bond_%d";
 -      return ib_register_device(&dev->ib_dev, name, NULL);
 +      return ib_register_device(&dev->ib_dev, name);
  }
  
  void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
@@@ -6588,7 -6550,7 +6588,7 @@@ static void *mlx5_ib_add(struct mlx5_co
        if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
                return mlx5_ib_add_slave_port(mdev);
  
 -      dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
 +      dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
        if (!dev)
                return NULL;
  
index c85f002558843ff50402bf14ba14ede892beb911,2b90d8dc70cdb57f4f8d6dab1737e9ea12312673..ca921fd4049963b84ad4f0541505ade7714eabbc
@@@ -71,9 -71,10 +71,9 @@@ static int destroy_mkey(struct mlx5_ib_
  {
        int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      /* Wait until all page fault handlers using the mr complete. */
 -      synchronize_srcu(&dev->mr_srcu);
 -#endif
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
 +              /* Wait until all page fault handlers using the mr complete. */
 +              synchronize_srcu(&dev->mr_srcu);
  
        return err;
  }
@@@ -94,9 -95,10 +94,9 @@@ static bool use_umr_mtt_update(struct m
                length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
  }
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
  static void update_odp_mr(struct mlx5_ib_mr *mr)
  {
 -      if (mr->umem->is_odp) {
 +      if (is_odp_mr(mr)) {
                /*
                 * This barrier prevents the compiler from moving the
                 * setting of umem->odp_data->private to point to our
                smp_wmb();
        }
  }
 -#endif
  
  static void reg_mr_callback(int status, struct mlx5_async_work *context)
  {
@@@ -254,8 -257,9 +254,8 @@@ static void remove_keys(struct mlx5_ib_
                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
        }
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      synchronize_srcu(&dev->mr_srcu);
 -#endif
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
 +              synchronize_srcu(&dev->mr_srcu);
  
        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
                list_del(&mr->list);
@@@ -607,27 -611,52 +607,27 @@@ static void mlx5_mr_cache_debugfs_clean
        dev->cache.root = NULL;
  }
  
 -static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 +static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
  {
        struct mlx5_mr_cache *cache = &dev->cache;
        struct mlx5_cache_ent *ent;
 +      struct dentry *dir;
        int i;
  
        if (!mlx5_debugfs_root || dev->rep)
 -              return 0;
 +              return;
  
        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
 -      if (!cache->root)
 -              return -ENOMEM;
  
        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
                ent = &cache->ent[i];
                sprintf(ent->name, "%d", ent->order);
 -              ent->dir = debugfs_create_dir(ent->name,  cache->root);
 -              if (!ent->dir)
 -                      goto err;
 -
 -              ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
 -                                               &size_fops);
 -              if (!ent->fsize)
 -                      goto err;
 -
 -              ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
 -                                                &limit_fops);
 -              if (!ent->flimit)
 -                      goto err;
 -
 -              ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
 -                                             &ent->cur);
 -              if (!ent->fcur)
 -                      goto err;
 -
 -              ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
 -                                              &ent->miss);
 -              if (!ent->fmiss)
 -                      goto err;
 +              dir = debugfs_create_dir(ent->name, cache->root);
 +              debugfs_create_file("size", 0600, dir, ent, &size_fops);
 +              debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 +              debugfs_create_u32("cur", 0400, dir, &ent->cur);
 +              debugfs_create_u32("miss", 0600, dir, &ent->miss);
        }
 -
 -      return 0;
 -err:
 -      mlx5_mr_cache_debugfs_cleanup(dev);
 -
 -      return -ENOMEM;
  }
  
  static void delay_time_func(struct timer_list *t)
@@@ -641,6 -670,7 +641,6 @@@ int mlx5_mr_cache_init(struct mlx5_ib_d
  {
        struct mlx5_mr_cache *cache = &dev->cache;
        struct mlx5_cache_ent *ent;
 -      int err;
        int i;
  
        mutex_init(&dev->slow_path_mutex);
                queue_work(cache->wq, &ent->work);
        }
  
 -      err = mlx5_mr_cache_debugfs_init(dev);
 -      if (err)
 -              mlx5_ib_warn(dev, "cache debugfs failure\n");
 -
 -      /*
 -       * We don't want to fail driver if debugfs failed to initialize,
 -       * so we are not forwarding error to the user.
 -       */
 +      mlx5_mr_cache_debugfs_init(dev);
  
        return 0;
  }
@@@ -785,17 -822,18 +785,17 @@@ static int mr_cache_max_order(struct ml
        return MLX5_MAX_UMR_SHIFT;
  }
  
 -static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
 -                     int access_flags, struct ib_umem **umem,
 -                     int *npages, int *page_shift, int *ncont,
 -                     int *order)
 +static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 +                     u64 start, u64 length, int access_flags,
 +                     struct ib_umem **umem, int *npages, int *page_shift,
 +                     int *ncont, int *order)
  {
 -      struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct ib_umem *u;
        int err;
  
        *umem = NULL;
  
 -      u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
 +      u = ib_umem_get(udata, start, length, access_flags, 0);
        err = PTR_ERR_OR_ZERO(u);
        if (err) {
                mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
@@@ -1194,8 -1232,7 +1194,7 @@@ static struct ib_mr *mlx5_ib_get_memic_
        MLX5_SET64(mkc, mkc, len, length);
        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
        MLX5_SET(mkc, mkc, qpn, 0xffffff);
-       MLX5_SET64(mkc, mkc, start_addr,
-                  memic_addr - pci_resource_start(dev->mdev->pdev, 0));
+       MLX5_SET64(mkc, mkc, start_addr, memic_addr - dev->mdev->bar_addr);
  
        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
        if (err)
@@@ -1268,20 -1305,21 +1267,20 @@@ struct ib_mr *mlx5_ib_reg_user_mr(struc
        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
                    start, virt_addr, length, access_flags);
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      if (!start && length == U64_MAX) {
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
 +          length == U64_MAX) {
                if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
                    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
                        return ERR_PTR(-EINVAL);
  
 -              mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
 +              mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
                if (IS_ERR(mr))
                        return ERR_CAST(mr);
                return &mr->ibmr;
        }
 -#endif
  
 -      err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
 -                         &page_shift, &ncont, &order);
 +      err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
 +                        &npages, &page_shift, &ncont, &order);
  
        if (err < 0)
                return ERR_PTR(err);
        mr->umem = umem;
        set_mr_fields(dev, mr, npages, length, access_flags);
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        update_odp_mr(mr);
 -#endif
  
        if (!populate_mtts) {
                int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
                }
        }
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      mr->live = 1;
 -#endif
 +      if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
 +              mr->live = 1;
 +              atomic_set(&mr->num_pending_prefetch, 0);
 +      }
 +
        return &mr->ibmr;
  error:
        ib_umem_release(umem);
@@@ -1431,9 -1469,8 +1430,9 @@@ int mlx5_ib_rereg_user_mr(struct ib_mr 
                flags |= IB_MR_REREG_TRANS;
                ib_umem_release(mr->umem);
                mr->umem = NULL;
 -              err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
 -                                &npages, &page_shift, &ncont, &order);
 +              err = mr_umem_get(dev, udata, addr, len, access_flags,
 +                                &mr->umem, &npages, &page_shift, &ncont,
 +                                &order);
                if (err)
                        goto err;
        }
                }
  
                mr->allocated_from_cache = 0;
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -              mr->live = 1;
 -#endif
 +              if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
 +                      mr->live = 1;
        } else {
                /*
                 * Send a UMR WQE
  
        set_mr_fields(dev, mr, npages, len, access_flags);
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        update_odp_mr(mr);
 -#endif
        return 0;
  
  err:
@@@ -1575,19 -1615,12 +1574,19 @@@ static void dereg_mr(struct mlx5_ib_de
        int npages = mr->npages;
        struct ib_umem *umem = mr->umem;
  
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -      if (umem && umem->is_odp) {
 +      if (is_odp_mr(mr)) {
                struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
  
 -              /* Prevent new page faults from succeeding */
 +              /* Prevent new page faults and
 +               * prefetch requests from succeeding
 +               */
                mr->live = 0;
 +
 +              /* dequeue pending prefetch requests for the mr */
 +              if (atomic_read(&mr->num_pending_prefetch))
 +                      flush_workqueue(system_unbound_wq);
 +              WARN_ON(atomic_read(&mr->num_pending_prefetch));
 +
                /* Wait for all running page-fault handlers to finish. */
                synchronize_srcu(&dev->mr_srcu);
                /* Destroy all page mappings */
                /* Avoid double-freeing the umem. */
                umem = NULL;
        }
 -#endif
 +
        clean_mr(dev, mr);
  
        /*
index 7cd006da1daef05cd335dc77cda8281e179630c4,816c34ee91cfb782f1017e83c55ca15459942ba9..ef7d69269a88de4fbb8ab42a853840c10c26263d
@@@ -109,173 -109,75 +109,173 @@@ static int is_sqp(enum ib_qp_type qp_ty
  }
  
  /**
 - * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
 + * mlx5_ib_read_user_wqe_common() - Copy a WQE (or part of it) from a user WQ
 + * to a kernel buffer
   *
 - * @qp: QP to copy from.
 - * @send: copy from the send queue when non-zero, use the receive queue
 - *      otherwise.
 - * @wqe_index:  index to start copying from. For send work queues, the
 - *            wqe_index is in units of MLX5_SEND_WQE_BB.
 - *            For receive work queue, it is the number of work queue
 - *            element in the queue.
 - * @buffer: destination buffer.
 - * @length: maximum number of bytes to copy.
 + * @umem: User space memory where the WQ is
 + * @buffer: buffer to copy to
 + * @buflen: buffer length
 + * @wqe_index: index of WQE to copy from
 + * @wq_offset: offset to start of WQ
 + * @wq_wqe_cnt: number of WQEs in WQ
 + * @wq_wqe_shift: log2 of WQE size
 + * @bcnt: number of bytes to copy
 + * @bytes_copied: number of bytes actually copied (return value)
   *
 - * Copies at least a single WQE, but may copy more data.
 + * Copies at most bcnt bytes from the start of the WQE.
 + * Does not guarantee copying the entire WQE.
   *
 - * Return: the number of bytes copied, or an error code.
 + * Return: zero on success, or an error code.
   */
 -int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
 -                        void *buffer, u32 length,
 -                        struct mlx5_ib_qp_base *base)
 +static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
 +                                      void *buffer,
 +                                      u32 buflen,
 +                                      int wqe_index,
 +                                      int wq_offset,
 +                                      int wq_wqe_cnt,
 +                                      int wq_wqe_shift,
 +                                      int bcnt,
 +                                      size_t *bytes_copied)
 +{
 +      size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
 +      size_t wq_end = wq_offset + (wq_wqe_cnt << wq_wqe_shift);
 +      size_t copy_length;
 +      int ret;
 +
 +      /* don't copy more than requested, more than buffer length or
 +       * beyond WQ end
 +       */
 +      copy_length = min_t(u32, buflen, wq_end - offset);
 +      copy_length = min_t(u32, copy_length, bcnt);
 +
 +      ret = ib_umem_copy_from(buffer, umem, offset, copy_length);
 +      if (ret)
 +              return ret;
 +
 +      if (!ret && bytes_copied)
 +              *bytes_copied = copy_length;
 +
 +      return 0;
 +}
 +
 +int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
 +                           int wqe_index,
 +                           void *buffer,
 +                           int buflen,
 +                           size_t *bc)
  {
 -      struct ib_device *ibdev = qp->ibqp.device;
 -      struct mlx5_ib_dev *dev = to_mdev(ibdev);
 -      struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
 -      size_t offset;
 -      size_t wq_end;
 +      struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        struct ib_umem *umem = base->ubuffer.umem;
 -      u32 first_copy_length;
 -      int wqe_length;
 +      struct mlx5_ib_wq *wq = &qp->sq;
 +      struct mlx5_wqe_ctrl_seg *ctrl;
 +      size_t bytes_copied;
 +      size_t bytes_copied2;
 +      size_t wqe_length;
        int ret;
 +      int ds;
  
 -      if (wq->wqe_cnt == 0) {
 -              mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
 -                          qp->ibqp.qp_type);
 +      if (buflen < sizeof(*ctrl))
                return -EINVAL;
 -      }
  
 -      offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
 -      wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
 +      /* at first read as much as possible */
 +      ret = mlx5_ib_read_user_wqe_common(umem,
 +                                         buffer,
 +                                         buflen,
 +                                         wqe_index,
 +                                         wq->offset,
 +                                         wq->wqe_cnt,
 +                                         wq->wqe_shift,
 +                                         buflen,
 +                                         &bytes_copied);
 +      if (ret)
 +              return ret;
  
 -      if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
 +      /* we need at least control segment size to proceed */
 +      if (bytes_copied < sizeof(*ctrl))
                return -EINVAL;
  
 -      if (offset > umem->length ||
 -          (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
 -              return -EINVAL;
 +      ctrl = buffer;
 +      ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
 +      wqe_length = ds * MLX5_WQE_DS_UNITS;
 +
 +      /* if we copied enough then we are done */
 +      if (bytes_copied >= wqe_length) {
 +              *bc = bytes_copied;
 +              return 0;
 +      }
 +
 +      /* otherwise this is a wrapped-around wqe,
 +       * so read the remaining bytes starting
 +       * from wqe_index 0
 +       */
 +      ret = mlx5_ib_read_user_wqe_common(umem,
 +                                         buffer + bytes_copied,
 +                                         buflen - bytes_copied,
 +                                         0,
 +                                         wq->offset,
 +                                         wq->wqe_cnt,
 +                                         wq->wqe_shift,
 +                                         wqe_length - bytes_copied,
 +                                         &bytes_copied2);
  
 -      first_copy_length = min_t(u32, offset + length, wq_end) - offset;
 -      ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
        if (ret)
                return ret;
 +      *bc = bytes_copied + bytes_copied2;
 +      return 0;
 +}
  
 -      if (send) {
 -              struct mlx5_wqe_ctrl_seg *ctrl = buffer;
 -              int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
 -
 -              wqe_length = ds * MLX5_WQE_DS_UNITS;
 -      } else {
 -              wqe_length = 1 << wq->wqe_shift;
 -      }
 +int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
 +                           int wqe_index,
 +                           void *buffer,
 +                           int buflen,
 +                           size_t *bc)
 +{
 +      struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
 +      struct ib_umem *umem = base->ubuffer.umem;
 +      struct mlx5_ib_wq *wq = &qp->rq;
 +      size_t bytes_copied;
 +      int ret;
  
 -      if (wqe_length <= first_copy_length)
 -              return first_copy_length;
 +      ret = mlx5_ib_read_user_wqe_common(umem,
 +                                         buffer,
 +                                         buflen,
 +                                         wqe_index,
 +                                         wq->offset,
 +                                         wq->wqe_cnt,
 +                                         wq->wqe_shift,
 +                                         buflen,
 +                                         &bytes_copied);
  
 -      ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
 -                              wqe_length - first_copy_length);
        if (ret)
                return ret;
 +      *bc = bytes_copied;
 +      return 0;
 +}
  
 -      return wqe_length;
 +int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
 +                            int wqe_index,
 +                            void *buffer,
 +                            int buflen,
 +                            size_t *bc)
 +{
 +      struct ib_umem *umem = srq->umem;
 +      size_t bytes_copied;
 +      int ret;
 +
 +      ret = mlx5_ib_read_user_wqe_common(umem,
 +                                         buffer,
 +                                         buflen,
 +                                         wqe_index,
 +                                         0,
 +                                         srq->msrq.max,
 +                                         srq->msrq.wqe_shift,
 +                                         buflen,
 +                                         &bytes_copied);
 +
 +      if (ret)
 +              return ret;
 +      *bc = bytes_copied;
 +      return 0;
  }
  
  static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
@@@ -533,9 -435,9 +533,9 @@@ static int set_user_buf_size(struct mlx
                return -EINVAL;
        }
  
 -      if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
 -              mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n",
 -                           ucmd->sq_wqe_count, ucmd->sq_wqe_count);
 +      if (ucmd->sq_wqe_count && !is_power_of_2(ucmd->sq_wqe_count)) {
 +              mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
 +                           ucmd->sq_wqe_count);
                return -EINVAL;
        }
  
@@@ -743,14 -645,16 +743,14 @@@ int bfregn_to_uar_index(struct mlx5_ib_
        return bfregi->sys_pages[index_of_sys_page] + offset;
  }
  
 -static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
 -                          struct ib_pd *pd,
 +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
                            unsigned long addr, size_t size,
 -                          struct ib_umem **umem,
 -                          int *npages, int *page_shift, int *ncont,
 -                          u32 *offset)
 +                          struct ib_umem **umem, int *npages, int *page_shift,
 +                          int *ncont, u32 *offset)
  {
        int err;
  
 -      *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
 +      *umem = ib_umem_get(udata, addr, size, 0, 0);
        if (IS_ERR(*umem)) {
                mlx5_ib_dbg(dev, "umem_get failed\n");
                return PTR_ERR(*umem);
@@@ -791,11 -695,10 +791,11 @@@ static void destroy_user_rq(struct mlx5
  }
  
  static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 -                        struct mlx5_ib_rwq *rwq,
 +                        struct ib_udata *udata, struct mlx5_ib_rwq *rwq,
                          struct mlx5_ib_create_wq *ucmd)
  {
 -      struct mlx5_ib_ucontext *context;
 +      struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
        int page_shift = 0;
        int npages;
        u32 offset = 0;
        if (!ucmd->buf_addr)
                return -EINVAL;
  
 -      context = to_mucontext(pd->uobject->context);
 -      rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
 -                             rwq->buf_size, 0, 0);
 +      rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
        if (IS_ERR(rwq->umem)) {
                mlx5_ib_dbg(dev, "umem_get failed\n");
                err = PTR_ERR(rwq->umem);
                    (unsigned long long)ucmd->buf_addr, rwq->buf_size,
                    npages, page_shift, ncont, offset);
  
 -      err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
 +      err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db);
        if (err) {
                mlx5_ib_dbg(dev, "map failed\n");
                goto err_umem;
@@@ -878,8 -783,7 +878,8 @@@ static int create_user_qp(struct mlx5_i
                return err;
        }
  
 -      context = to_mucontext(pd->uobject->context);
 +      context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
 +                                          ibucontext);
        if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
                uar_index = bfregn_to_uar_index(dev, &context->bfregi,
                                                ucmd.bfreg_index, true);
  
        if (ucmd.buf_addr && ubuffer->buf_size) {
                ubuffer->buf_addr = ucmd.buf_addr;
 -              err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
 -                                     ubuffer->buf_size,
 -                                     &ubuffer->umem, &npages, &page_shift,
 -                                     &ncont, &offset);
 +              err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
 +                                     ubuffer->buf_size, &ubuffer->umem,
 +                                     &npages, &page_shift, &ncont, &offset);
                if (err)
                        goto err_bfreg;
        } else {
                resp->bfreg_index = MLX5_IB_INVALID_BFREG;
        qp->bfregn = bfregn;
  
 -      err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
 +      err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db);
        if (err) {
                mlx5_ib_dbg(dev, "map failed\n");
                goto err_free;
@@@ -1214,7 -1119,6 +1214,7 @@@ static void destroy_flow_rule_vport_sq(
  }
  
  static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
 +                                 struct ib_udata *udata,
                                   struct mlx5_ib_sq *sq, void *qpin,
                                   struct ib_pd *pd)
  {
        int ncont = 0;
        u32 offset = 0;
  
 -      err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
 -                             &sq->ubuffer.umem, &npages, &page_shift,
 -                             &ncont, &offset);
 +      err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size,
 +                             &sq->ubuffer.umem, &npages, &page_shift, &ncont,
 +                             &offset);
        if (err)
                return err;
  
@@@ -1458,8 -1362,9 +1458,8 @@@ static int create_raw_packet_qp(struct 
        struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
 -      struct ib_uobject *uobj = pd->uobject;
 -      struct ib_ucontext *ucontext = uobj->context;
 -      struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
 +      struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
        int err;
        u32 tdn = mucontext->tdn;
        u16 uid = to_mpd(pd)->uid;
                if (err)
                        return err;
  
 -              err = create_raw_packet_qp_sq(dev, sq, in, pd);
 +              err = create_raw_packet_qp_sq(dev, udata, sq, in, pd);
                if (err)
                        goto err_destroy_tis;
  
@@@ -1573,8 -1478,9 +1573,8 @@@ static int create_rss_raw_qp_tir(struc
                                 struct ib_qp_init_attr *init_attr,
                                 struct ib_udata *udata)
  {
 -      struct ib_uobject *uobj = pd->uobject;
 -      struct ib_ucontext *ucontext = uobj->context;
 -      struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
 +      struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_create_qp_resp resp = {};
        int inlen;
        int err;
@@@ -1916,8 -1822,6 +1916,8 @@@ static int create_qp_common(struct mlx5
        int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
        struct mlx5_core_dev *mdev = dev->mdev;
        struct mlx5_ib_create_qp_resp resp = {};
 +      struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_cq *send_cq;
        struct mlx5_ib_cq *recv_cq;
        unsigned long flags;
                }
  
                if (!check_flags_mask(ucmd.flags,
 +                                    MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
 +                                    MLX5_QP_FLAG_BFREG_INDEX |
 +                                    MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE |
 +                                    MLX5_QP_FLAG_SCATTER_CQE |
                                      MLX5_QP_FLAG_SIGNATURE |
 -                                            MLX5_QP_FLAG_SCATTER_CQE |
 -                                            MLX5_QP_FLAG_TUNNEL_OFFLOADS |
 -                                            MLX5_QP_FLAG_BFREG_INDEX |
 -                                            MLX5_QP_FLAG_TYPE_DCT |
 -                                            MLX5_QP_FLAG_TYPE_DCI |
 -                                            MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
 -                                            MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE))
 +                                    MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
 +                                    MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
 +                                    MLX5_QP_FLAG_TUNNEL_OFFLOADS |
 +                                    MLX5_QP_FLAG_TYPE_DCI |
 +                                    MLX5_QP_FLAG_TYPE_DCT))
                        return -EINVAL;
  
 -              err = get_qp_user_index(to_mucontext(pd->uobject->context),
 -                                      &ucmd, udata->inlen, &uidx);
 +              err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx);
                if (err)
                        return err;
  
@@@ -2504,11 -2407,8 +2504,11 @@@ static const char *ib_qp_type_str(enum 
  
  static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
                                        struct ib_qp_init_attr *attr,
 -                                      struct mlx5_ib_create_qp *ucmd)
 +                                      struct mlx5_ib_create_qp *ucmd,
 +                                      struct ib_udata *udata)
  {
 +      struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_qp *qp;
        int err = 0;
        u32 uidx = MLX5_IB_DEFAULT_UIDX;
        if (!attr->srq || !attr->recv_cq)
                return ERR_PTR(-EINVAL);
  
 -      err = get_qp_user_index(to_mucontext(pd->uobject->context),
 -                              ucmd, sizeof(*ucmd), &uidx);
 +      err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx);
        if (err)
                return ERR_PTR(err);
  
@@@ -2599,17 -2500,15 +2599,17 @@@ struct ib_qp *mlx5_ib_create_qp(struct 
        int err;
        struct ib_qp_init_attr mlx_init_attr;
        struct ib_qp_init_attr *init_attr = verbs_init_attr;
 +      struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
  
        if (pd) {
                dev = to_mdev(pd->device);
  
                if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
 -                      if (!udata) {
 +                      if (!ucontext) {
                                mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
                                return ERR_PTR(-EINVAL);
 -                      } else if (!to_mucontext(pd->uobject->context)->cqe_version) {
 +                      } else if (!ucontext->cqe_version) {
                                mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
                                return ERR_PTR(-EINVAL);
                        }
                                return ERR_PTR(-EINVAL);
                        }
                } else {
 -                      return mlx5_ib_create_dct(pd, init_attr, &ucmd);
 +                      return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata);
                }
        }
  
@@@ -2752,10 -2651,10 +2752,10 @@@ int mlx5_ib_destroy_qp(struct ib_qp *qp
  
  static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
                                const struct ib_qp_attr *attr,
 -                              int attr_mask, __be32 *hw_access_flags)
 +                              int attr_mask, __be32 *hw_access_flags_be)
  {
        u8 dest_rd_atomic;
 -      u32 access_flags;
 +      u32 access_flags, hw_access_flags = 0;
  
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
  
                access_flags &= IB_ACCESS_REMOTE_WRITE;
  
        if (access_flags & IB_ACCESS_REMOTE_READ)
 -              *hw_access_flags |= MLX5_QP_BIT_RRE;
 +              hw_access_flags |= MLX5_QP_BIT_RRE;
        if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
                int atomic_mode;
  
                if (atomic_mode < 0)
                        return -EOPNOTSUPP;
  
 -              *hw_access_flags |= MLX5_QP_BIT_RAE;
 -              *hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
 +              hw_access_flags |= MLX5_QP_BIT_RAE;
 +              hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
        }
  
        if (access_flags & IB_ACCESS_REMOTE_WRITE)
 -              *hw_access_flags |= MLX5_QP_BIT_RWE;
 +              hw_access_flags |= MLX5_QP_BIT_RWE;
  
 -      *hw_access_flags = cpu_to_be32(*hw_access_flags);
 +      *hw_access_flags_be = cpu_to_be32(hw_access_flags);
  
        return 0;
  }
@@@ -3279,12 -3178,14 +3279,12 @@@ static int modify_raw_packet_qp(struct 
  static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
                                    struct mlx5_ib_pd *pd,
                                    struct mlx5_ib_qp_base *qp_base,
 -                                  u8 port_num)
 +                                  u8 port_num, struct ib_udata *udata)
  {
 -      struct mlx5_ib_ucontext *ucontext = NULL;
 +      struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 +              udata, struct mlx5_ib_ucontext, ibucontext);
        unsigned int tx_port_affinity;
  
 -      if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context)
 -              ucontext = to_mucontext(pd->ibpd.uobject->context);
 -
        if (ucontext) {
                tx_port_affinity = (unsigned int)atomic_add_return(
                                           1, &ucontext->tx_port_affinity) %
  
  static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                               const struct ib_qp_attr *attr, int attr_mask,
 -                             enum ib_qp_state cur_state, enum ib_qp_state new_state,
 -                             const struct mlx5_ib_modify_qp *ucmd)
 +                             enum ib_qp_state cur_state,
 +                             enum ib_qp_state new_state,
 +                             const struct mlx5_ib_modify_qp *ucmd,
 +                             struct ib_udata *udata)
  {
        static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
                [MLX5_QP_STATE_RST] = {
                    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
                        if (dev->lag_active) {
                                u8 p = mlx5_core_native_port_num(dev->mdev);
 -                              tx_affinity = get_tx_affinity(dev, pd, base, p);
 +                              tx_affinity = get_tx_affinity(dev, pd, base, p,
 +                                                            udata);
                                context->flags |= cpu_to_be32(tx_affinity << 24);
                        }
                }
        }
  
        if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
 -              __be32 access_flags = 0;
 +              __be32 access_flags;
  
                err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags);
                if (err)
@@@ -3729,7 -3627,6 +3729,7 @@@ static int mlx5_ib_modify_dct(struct ib
  
        } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
                struct mlx5_ib_modify_qp_resp resp = {};
 +              u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0};
                u32 min_resp_len = offsetof(typeof(resp), dctn) +
                                   sizeof(resp.dctn);
  
                MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
  
                err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in,
 -                                         MLX5_ST_SZ_BYTES(create_dct_in));
 +                                         MLX5_ST_SZ_BYTES(create_dct_in), out,
 +                                         sizeof(out));
                if (err)
                        return err;
                resp.dctn = qp->dct.mdct.mqp.qpn;
@@@ -3887,7 -3783,7 +3887,7 @@@ int mlx5_ib_modify_qp(struct ib_qp *ibq
        }
  
        err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state,
 -                                new_state, &ucmd);
 +                                new_state, &ucmd, udata);
  
  out:
        mutex_unlock(&qp->mutex);
@@@ -5119,7 -5015,7 +5119,7 @@@ out
                wmb();
  
                /* currently we support only regular doorbells */
-               mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL);
+               mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
                /* Make sure doorbells don't leak out of SQ spinlock
                 * and reach the HCA out of order.
                 */
@@@ -5897,7 -5793,7 +5897,7 @@@ static int prepare_user_rq(struct ib_p
                return err;
        }
  
 -      err = create_user_rq(dev, pd, rwq, &ucmd);
 +      err = create_user_rq(dev, pd, udata, rwq, &ucmd);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                return err;
index be48c6440251fb7426ab77d90346e440b9b833c4,7f1a2afca22ac48f3dbd64d949fb1c589bfb0d03..0a2ffe794a547170bbce5935dd596d0e2ca3b034
@@@ -1347,7 -1347,7 +1347,7 @@@ static void set_wqname(struct mlx5_core
        struct mlx5_cmd *cmd = &dev->cmd;
  
        snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s",
-                dev_name(&dev->pdev->dev));
+                dev->priv.name);
  }
  
  static void clean_debug_files(struct mlx5_core_dev *dev)
@@@ -1585,24 -1585,6 +1585,24 @@@ no_trig
        spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
  }
  
 +void mlx5_cmd_flush(struct mlx5_core_dev *dev)
 +{
 +      struct mlx5_cmd *cmd = &dev->cmd;
 +      int i;
 +
 +      for (i = 0; i < cmd->max_reg_cmds; i++)
 +              while (down_trylock(&cmd->sem))
 +                      mlx5_cmd_trigger_completions(dev);
 +
 +      while (down_trylock(&cmd->pages_sem))
 +              mlx5_cmd_trigger_completions(dev);
 +
 +      /* Unlock cmdif */
 +      up(&cmd->pages_sem);
 +      for (i = 0; i < cmd->max_reg_cmds; i++)
 +              up(&cmd->sem);
 +}
 +
  static int status_to_err(u8 status)
  {
        return status ? -1 : 0; /* TBD more meaningful codes */
@@@ -1902,9 -1884,9 +1902,9 @@@ int mlx5_cmd_init(struct mlx5_core_dev 
        memset(cmd, 0, sizeof(*cmd));
        cmd_if_rev = cmdif_rev(dev);
        if (cmd_if_rev != CMD_IF_REV) {
-               dev_err(&dev->pdev->dev,
-                       "Driver cmdif rev(%d) differs from firmware's(%d)\n",
-                       CMD_IF_REV, cmd_if_rev);
+               mlx5_core_err(dev,
+                             "Driver cmdif rev(%d) differs from firmware's(%d)\n",
+                             CMD_IF_REV, cmd_if_rev);
                return -EINVAL;
        }
  
        cmd->log_sz = cmd_l >> 4 & 0xf;
        cmd->log_stride = cmd_l & 0xf;
        if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) {
-               dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n",
-                       1 << cmd->log_sz);
+               mlx5_core_err(dev, "firmware reports too many outstanding commands %d\n",
+                             1 << cmd->log_sz);
                err = -EINVAL;
                goto err_free_page;
        }
  
        if (cmd->log_sz + cmd->log_stride > MLX5_ADAPTER_PAGE_SHIFT) {
-               dev_err(&dev->pdev->dev, "command queue size overflow\n");
+               mlx5_core_err(dev, "command queue size overflow\n");
                err = -EINVAL;
                goto err_free_page;
        }
  
        cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
        if (cmd->cmdif_rev > CMD_IF_REV) {
-               dev_err(&dev->pdev->dev, "driver does not support command interface version. driver %d, firmware %d\n",
-                       CMD_IF_REV, cmd->cmdif_rev);
+               mlx5_core_err(dev, "driver does not support command interface version. driver %d, firmware %d\n",
+                             CMD_IF_REV, cmd->cmdif_rev);
                err = -EOPNOTSUPP;
                goto err_free_page;
        }
        cmd_h = (u32)((u64)(cmd->dma) >> 32);
        cmd_l = (u32)(cmd->dma);
        if (cmd_l & 0xfff) {
-               dev_err(&dev->pdev->dev, "invalid command queue address\n");
+               mlx5_core_err(dev, "invalid command queue address\n");
                err = -ENOMEM;
                goto err_free_page;
        }
        set_wqname(dev);
        cmd->wq = create_singlethread_workqueue(cmd->wq_name);
        if (!cmd->wq) {
-               dev_err(&dev->pdev->dev, "failed to create command workqueue\n");
+               mlx5_core_err(dev, "failed to create command workqueue\n");
                err = -ENOMEM;
                goto err_cache;
        }
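
As an aside on the mlx5_cmd_flush() hunk above: the drain works by acquiring every command-interface
credit with a trylock, forcing completions whenever a credit is still held by an in-flight command,
and only then releasing the credits so the command interface unblocks. Below is a rough userspace
sketch of that drain-then-release pattern using POSIX semaphores; the names (cmd_credits,
fake_complete_one, MAX_CMDS) are hypothetical and this is not the driver code, which also drains a
separate pages semaphore.

	/* toy illustration of the drain-then-release pattern; not driver code */
	#include <semaphore.h>
	#include <stdio.h>

	#define MAX_CMDS 4

	static sem_t cmd_credits;
	static int inflight;

	static void fake_complete_one(void)
	{
		if (inflight > 0) {
			inflight--;
			sem_post(&cmd_credits);	/* a completion returns its credit */
		}
	}

	int main(void)
	{
		int i;

		sem_init(&cmd_credits, 0, MAX_CMDS);

		/* pretend two commands are outstanding and hold credits */
		inflight = 2;
		sem_wait(&cmd_credits);
		sem_wait(&cmd_credits);

		/* drain: keep forcing completions until every credit is ours */
		for (i = 0; i < MAX_CMDS; i++)
			while (sem_trywait(&cmd_credits) != 0)
				fake_complete_one();

		/* hand the credits back so new commands can run again */
		for (i = 0; i < MAX_CMDS; i++)
			sem_post(&cmd_credits);

		printf("command interface drained and reopened\n");
		sem_destroy(&cmd_credits);
		return 0;
	}
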
index 3e1ea8b42c772d2d1da240eb12963034004196c5,2623d3fb6b963d65c215be20c4081e6dc3ef8a17..b0b68dde30172cf42dc629c1c64826cde4c8c038
@@@ -76,14 -76,15 +76,14 @@@ struct page_pool
  #define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) +  \
                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
  
 +#define MLX5E_RX_MAX_HEAD (256)
 +
  #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \
        (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */
  #define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \
        max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
 -#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
 -#define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
 -#define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \
 -      (cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \
 -      MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev))
 +#define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev) \
 +      MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, order_base_2(MLX5E_RX_MAX_HEAD))
  
  #define MLX5_MPWRQ_LOG_WQE_SZ                 18
  #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
  
  #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x2
  
 -#define MLX5E_RX_MAX_HEAD (256)
 -
  #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
  #define MLX5E_DEFAULT_LRO_TIMEOUT                       32
  #define MLX5E_LRO_TIMEOUT_ARR_SIZE                      4
@@@ -306,18 -309,16 +306,18 @@@ struct mlx5e_cq 
        struct mlx5_core_cq        mcq;
        struct mlx5e_channel      *channel;
  
 +      /* control */
 +      struct mlx5_core_dev      *mdev;
 +      struct mlx5_wq_ctrl        wq_ctrl;
 +} ____cacheline_aligned_in_smp;
 +
 +struct mlx5e_cq_decomp {
        /* cqe decompression */
        struct mlx5_cqe64          title;
        struct mlx5_mini_cqe8      mini_arr[MLX5_MINI_CQE_ARRAY_SIZE];
        u8                         mini_arr_idx;
 -      u16                        decmprs_left;
 -      u16                        decmprs_wqe_counter;
 -
 -      /* control */
 -      struct mlx5_core_dev      *mdev;
 -      struct mlx5_wq_ctrl        wq_ctrl;
 +      u16                        left;
 +      u16                        wqe_counter;
  } ____cacheline_aligned_in_smp;
  
  struct mlx5e_tx_wqe_info {
@@@ -387,7 -388,10 +387,7 @@@ struct mlx5e_txqsq 
        struct mlx5e_channel      *channel;
        int                        txq_ix;
        u32                        rate_limit;
 -      struct mlx5e_txqsq_recover {
 -              struct work_struct         recover_work;
 -              u64                        last_recover;
 -      } recover;
 +      struct work_struct         recover_work;
  } ____cacheline_aligned_in_smp;
  
  struct mlx5e_dma_info {
@@@ -577,7 -581,6 +577,7 @@@ struct mlx5e_rq 
        struct net_device     *netdev;
        struct mlx5e_rq_stats *stats;
        struct mlx5e_cq        cq;
 +      struct mlx5e_cq_decomp cqd;
        struct mlx5e_page_cache page_cache;
        struct hwtstamp_config *tstamp;
        struct mlx5_clock      *clock;
@@@ -635,7 -638,6 +635,7 @@@ struct mlx5e_channel 
        struct hwtstamp_config    *tstamp;
        int                        ix;
        int                        cpu;
 +      cpumask_var_t              xps_cpumask;
  };
  
  struct mlx5e_channels {
@@@ -655,7 -657,6 +655,7 @@@ struct mlx5e_channel_stats 
  enum {
        MLX5E_STATE_OPENED,
        MLX5E_STATE_DESTROYING,
 +      MLX5E_STATE_XDP_TX_ENABLED,
  };
  
  struct mlx5e_rqt {
@@@ -681,13 -682,6 +681,13 @@@ struct mlx5e_rss_params 
        u8      hfunc;
  };
  
 +struct mlx5e_modify_sq_param {
 +      int curr_state;
 +      int next_state;
 +      int rl_update;
 +      int rl_index;
 +};
 +
  struct mlx5e_priv {
        /* priv data path fields - start */
        struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
  #ifdef CONFIG_MLX5_EN_TLS
        struct mlx5e_tls          *tls;
  #endif
 +      struct devlink_health_reporter *tx_reporter;
  };
  
  struct mlx5e_profile {
  void mlx5e_build_ptys2ethtool_map(void);
  
  u16 mlx5e_select_queue(struct net_device *dev, struct sk_buff *skb,
 -                     struct net_device *sb_dev,
 -                     select_queue_fallback_t fallback);
 +                     struct net_device *sb_dev);
  netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev);
  netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb,
 -                        struct mlx5e_tx_wqe *wqe, u16 pi);
 +                        struct mlx5e_tx_wqe *wqe, u16 pi, bool xmit_more);
  
  void mlx5e_completion_event(struct mlx5_core_cq *mcq);
  void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event);
@@@ -809,7 -803,6 +809,7 @@@ mlx5e_skb_from_cqe_nonlinear(struct mlx
  
  void mlx5e_update_stats(struct mlx5e_priv *priv);
  void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats);
 +void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s);
  
  void mlx5e_init_l2_addr(struct mlx5e_priv *priv);
  int mlx5e_self_test_num(struct mlx5e_priv *priv);
@@@ -857,9 -850,9 +857,9 @@@ void mlx5e_close_channels(struct mlx5e_
   * switching channels
   */
  typedef int (*mlx5e_fp_hw_modify)(struct mlx5e_priv *priv);
 -void mlx5e_switch_priv_channels(struct mlx5e_priv *priv,
 -                              struct mlx5e_channels *new_chs,
 -                              mlx5e_fp_hw_modify hw_modify);
 +int mlx5e_safe_switch_channels(struct mlx5e_priv *priv,
 +                             struct mlx5e_channels *new_chs,
 +                             mlx5e_fp_hw_modify hw_modify);
  void mlx5e_activate_priv_channels(struct mlx5e_priv *priv);
  void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv);
  
@@@ -873,64 -866,12 +873,64 @@@ void mlx5e_set_rq_type(struct mlx5_core
  void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
                               struct mlx5e_params *params);
  
 +int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
 +                  struct mlx5e_modify_sq_param *p);
 +void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
 +void mlx5e_tx_disable_queue(struct netdev_queue *txq);
 +
  static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
  {
        return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
                MLX5_CAP_FLOWTABLE_NIC_RX(mdev, ft_field_support.inner_ip_version));
  }
  
 +static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 +{
 +      return MLX5_CAP_ETH(mdev, swp) &&
 +              MLX5_CAP_ETH(mdev, swp_csum) && MLX5_CAP_ETH(mdev, swp_lso);
 +}
 +
 +struct mlx5e_swp_spec {
 +      __be16 l3_proto;
 +      u8 l4_proto;
 +      u8 is_tun;
 +      __be16 tun_l3_proto;
 +      u8 tun_l4_proto;
 +};
 +
 +static inline void
 +mlx5e_set_eseg_swp(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg,
 +                 struct mlx5e_swp_spec *swp_spec)
 +{
 +      /* SWP offsets are in 2-byte words */
 +      eseg->swp_outer_l3_offset = skb_network_offset(skb) / 2;
 +      if (swp_spec->l3_proto == htons(ETH_P_IPV6))
 +              eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L3_IPV6;
 +      if (swp_spec->l4_proto) {
 +              eseg->swp_outer_l4_offset = skb_transport_offset(skb) / 2;
 +              if (swp_spec->l4_proto == IPPROTO_UDP)
 +                      eseg->swp_flags |= MLX5_ETH_WQE_SWP_OUTER_L4_UDP;
 +      }
 +
 +      if (swp_spec->is_tun) {
 +              eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2;
 +              if (swp_spec->tun_l3_proto == htons(ETH_P_IPV6))
 +                      eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6;
 +      } else { /* typically for ipsec when xfrm mode != XFRM_MODE_TUNNEL */
 +              eseg->swp_inner_l3_offset = skb_network_offset(skb) / 2;
 +              if (swp_spec->l3_proto == htons(ETH_P_IPV6))
 +                      eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6;
 +      }
 +      switch (swp_spec->tun_l4_proto) {
 +      case IPPROTO_UDP:
 +              eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L4_UDP;
 +              /* fall through */
 +      case IPPROTO_TCP:
 +              eseg->swp_inner_l4_offset = skb_inner_transport_offset(skb) / 2;
 +              break;
 +      }
 +}
 +
  static inline void mlx5e_sq_fetch_wqe(struct mlx5e_txqsq *sq,
                                      struct mlx5e_tx_wqe **wqe,
                                      u16 *pi)
@@@ -975,7 -916,7 +975,7 @@@ void mlx5e_notify_hw(struct mlx5_wq_cy
         */
        wmb();
  
-       mlx5_write64((__be32 *)ctrl, uar_map, NULL);
+       mlx5_write64((__be32 *)ctrl, uar_map);
  }
  
  static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
index 0be3eb86dd84e7abc0d7aac32d19b4d7643bc91d,8d199c5e7c81528d92a1628813d5faa5630eb49f..78e073243f40bf0e452cd59111738858923551fa
@@@ -263,11 -263,10 +263,11 @@@ static void nested_down_write_ref_node(
        }
  }
  
 -static void down_write_ref_node(struct fs_node *node)
 +static void down_write_ref_node(struct fs_node *node, bool locked)
  {
        if (node) {
 -              down_write(&node->lock);
 +              if (!locked)
 +                      down_write(&node->lock);
                refcount_inc(&node->refcount);
        }
  }
@@@ -278,14 -277,13 +278,14 @@@ static void up_read_ref_node(struct fs_
        up_read(&node->lock);
  }
  
 -static void up_write_ref_node(struct fs_node *node)
 +static void up_write_ref_node(struct fs_node *node, bool locked)
  {
        refcount_dec(&node->refcount);
 -      up_write(&node->lock);
 +      if (!locked)
 +              up_write(&node->lock);
  }
  
 -static void tree_put_node(struct fs_node *node)
 +static void tree_put_node(struct fs_node *node, bool locked)
  {
        struct fs_node *parent_node = node->parent;
  
                        /* Only root namespace doesn't have parent and we just
                         * need to free its node.
                         */
 -                      down_write_ref_node(parent_node);
 +                      down_write_ref_node(parent_node, locked);
                        list_del_init(&node->list);
                        if (node->del_sw_func)
                                node->del_sw_func(node);
 -                      up_write_ref_node(parent_node);
 +                      up_write_ref_node(parent_node, locked);
                } else {
                        kfree(node);
                }
                node = NULL;
        }
        if (!node && parent_node)
 -              tree_put_node(parent_node);
 +              tree_put_node(parent_node, locked);
  }
  
 -static int tree_remove_node(struct fs_node *node)
 +static int tree_remove_node(struct fs_node *node, bool locked)
  {
        if (refcount_read(&node->refcount) > 1) {
                refcount_dec(&node->refcount);
                return -EEXIST;
        }
 -      tree_put_node(node);
 +      tree_put_node(node, locked);
        return 0;
  }
  
@@@ -400,7 -398,6 +400,7 @@@ static void del_hw_flow_table(struct fs
        fs_get_obj(ft, node);
        dev = get_dev(&ft->node);
        root = find_root(&ft->node);
 +      trace_mlx5_fs_del_ft(ft);
  
        if (node->active) {
                err = root->cmds->destroy_flow_table(dev, ft);
@@@ -422,34 -419,22 +422,34 @@@ static void del_sw_flow_table(struct fs
        kfree(ft);
  }
  
 -static void del_sw_hw_rule(struct fs_node *node)
 +static void modify_fte(struct fs_fte *fte)
  {
        struct mlx5_flow_root_namespace *root;
 -      struct mlx5_flow_rule *rule;
        struct mlx5_flow_table *ft;
        struct mlx5_flow_group *fg;
 -      struct fs_fte *fte;
 -      int modify_mask;
 -      struct mlx5_core_dev *dev = get_dev(node);
 +      struct mlx5_core_dev *dev;
        int err;
 -      bool update_fte = false;
  
 -      fs_get_obj(rule, node);
 -      fs_get_obj(fte, rule->node.parent);
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
 +      dev = get_dev(&fte->node);
 +
 +      root = find_root(&ft->node);
 +      err = root->cmds->update_fte(dev, ft, fg->id, fte->modify_mask, fte);
 +      if (err)
 +              mlx5_core_warn(dev,
 +                             "%s can't del rule fg id=%d fte_index=%d\n",
 +                             __func__, fg->id, fte->index);
 +      fte->modify_mask = 0;
 +}
 +
 +static void del_sw_hw_rule(struct fs_node *node)
 +{
 +      struct mlx5_flow_rule *rule;
 +      struct fs_fte *fte;
 +
 +      fs_get_obj(rule, node);
 +      fs_get_obj(fte, rule->node.parent);
        trace_mlx5_fs_del_rule(rule);
        if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
                mutex_lock(&rule->dest_attr.ft->lock);
  
        if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER  &&
            --fte->dests_size) {
 -              modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
 -                            BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
 +              fte->modify_mask |=
 +                      BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
 +                      BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
                fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT;
 -              update_fte = true;
                goto out;
        }
  
        if ((fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
            --fte->dests_size) {
 -              modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
 -              update_fte = true;
 +              fte->modify_mask |=
 +                      BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
        }
  out:
 -      root = find_root(&ft->node);
 -      if (update_fte && fte->dests_size) {
 -              err = root->cmds->update_fte(dev, ft, fg->id, modify_mask, fte);
 -              if (err)
 -                      mlx5_core_warn(dev,
 -                                     "%s can't del rule fg id=%d fte_index=%d\n",
 -                                     __func__, fg->id, fte->index);
 -      }
        kfree(rule);
  }
  
@@@ -497,7 -490,6 +497,7 @@@ static void del_hw_fte(struct fs_node *
                        mlx5_core_warn(dev,
                                       "flow steering can't delete fte in index %d of flow group id %d\n",
                                       fte->index, fg->id);
 +              node->active = 0;
        }
  }
  
@@@ -598,7 -590,7 +598,7 @@@ static struct fs_fte *alloc_fte(struct 
        fte->node.type =  FS_TYPE_FLOW_ENTRY;
        fte->action = *flow_act;
  
 -      tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
 +      tree_init_node(&fte->node, NULL, del_sw_fte);
  
        return fte;
  }
@@@ -627,8 -619,7 +627,8 @@@ static struct mlx5_flow_group *alloc_fl
        if (ret) {
                kmem_cache_free(steering->fgs_cache, fg);
                return ERR_PTR(ret);
 -}
 +      }
 +
        ida_init(&fg->fte_allocator);
        fg->mask.match_criteria_enable = match_criteria_enable;
        memcpy(&fg->mask.match_criteria, match_criteria,
@@@ -819,7 -810,7 +819,7 @@@ static int update_root_ft_create(struc
        struct mlx5_flow_root_namespace *root = find_root(&prio->node);
        struct mlx5_ft_underlay_qp *uqp;
        int min_level = INT_MAX;
-       int err;
+       int err = 0;
        u32 qpn;
  
        if (root->root_ft)
@@@ -865,7 -856,7 +865,7 @@@ static int _mlx5_modify_rule_destinatio
        fs_get_obj(fte, rule->node.parent);
        if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
                return -EINVAL;
 -      down_write_ref_node(&fte->node);
 +      down_write_ref_node(&fte->node, false);
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
  
        root = find_root(&ft->node);
        err = root->cmds->update_fte(get_dev(&ft->node), ft, fg->id,
                                     modify_mask, fte);
 -      up_write_ref_node(&fte->node);
 +      up_write_ref_node(&fte->node, false);
  
        return err;
  }
@@@ -1023,13 -1014,12 +1023,13 @@@ static struct mlx5_flow_table *__mlx5_c
        if (err)
                goto destroy_ft;
        ft->node.active = true;
 -      down_write_ref_node(&fs_prio->node);
 +      down_write_ref_node(&fs_prio->node, false);
        tree_add_node(&ft->node, &fs_prio->node);
        list_add_flow_table(ft, fs_prio);
        fs_prio->num_ft++;
 -      up_write_ref_node(&fs_prio->node);
 +      up_write_ref_node(&fs_prio->node, false);
        mutex_unlock(&root->chain_lock);
 +      trace_mlx5_fs_add_ft(ft);
        return ft;
  destroy_ft:
        root->cmds->destroy_flow_table(root->dev, ft);
@@@ -1121,17 -1111,17 +1121,17 @@@ struct mlx5_flow_group *mlx5_create_flo
        if (ft->autogroup.active)
                return ERR_PTR(-EPERM);
  
 -      down_write_ref_node(&ft->node);
 +      down_write_ref_node(&ft->node, false);
        fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
                                     start_index, end_index,
                                     ft->node.children.prev);
 -      up_write_ref_node(&ft->node);
 +      up_write_ref_node(&ft->node, false);
        if (IS_ERR(fg))
                return fg;
  
        err = root->cmds->create_flow_group(dev, ft, fg_in, &fg->id);
        if (err) {
 -              tree_put_node(&fg->node);
 +              tree_put_node(&fg->node, false);
                return ERR_PTR(err);
        }
        trace_mlx5_fs_add_fg(fg);
@@@ -1528,10 -1518,10 +1528,10 @@@ static void free_match_list(struct matc
                struct match_list *iter, *match_tmp;
  
                list_del(&head->first.list);
 -              tree_put_node(&head->first.g->node);
 +              tree_put_node(&head->first.g->node, false);
                list_for_each_entry_safe(iter, match_tmp, &head->list,
                                         list) {
 -                      tree_put_node(&iter->g->node);
 +                      tree_put_node(&iter->g->node, false);
                        list_del(&iter->list);
                        kfree(iter);
                }
@@@ -1608,16 -1598,11 +1608,16 @@@ lookup_fte_locked(struct mlx5_flow_grou
                fte_tmp = NULL;
                goto out;
        }
 +      if (!fte_tmp->node.active) {
 +              tree_put_node(&fte_tmp->node, false);
 +              fte_tmp = NULL;
 +              goto out;
 +      }
  
        nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
  out:
        if (take_write)
 -              up_write_ref_node(&g->node);
 +              up_write_ref_node(&g->node, false);
        else
                up_read_ref_node(&g->node);
        return fte_tmp;
@@@ -1659,8 -1644,8 +1659,8 @@@ search_again_locked
                        continue;
                rule = add_rule_fg(g, spec->match_value,
                                   flow_act, dest, dest_num, fte_tmp);
 -              up_write_ref_node(&fte_tmp->node);
 -              tree_put_node(&fte_tmp->node);
 +              up_write_ref_node(&fte_tmp->node, false);
 +              tree_put_node(&fte_tmp->node, false);
                kmem_cache_free(steering->ftes_cache, fte);
                return rule;
        }
@@@ -1696,7 -1681,7 +1696,7 @@@ skip_search
  
                err = insert_fte(g, fte);
                if (err) {
 -                      up_write_ref_node(&g->node);
 +                      up_write_ref_node(&g->node, false);
                        if (err == -ENOSPC)
                                continue;
                        kmem_cache_free(steering->ftes_cache, fte);
                }
  
                nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 -              up_write_ref_node(&g->node);
 +              up_write_ref_node(&g->node, false);
                rule = add_rule_fg(g, spec->match_value,
                                   flow_act, dest, dest_num, fte);
 -              up_write_ref_node(&fte->node);
 -              tree_put_node(&fte->node);
 +              up_write_ref_node(&fte->node, false);
 +              tree_put_node(&fte->node, false);
                return rule;
        }
        rule = ERR_PTR(-ENOENT);
@@@ -1750,7 -1735,7 +1750,7 @@@ search_again_locked
        err = build_match_list(&match_head, ft, spec);
        if (err) {
                if (take_write)
 -                      up_write_ref_node(&ft->node);
 +                      up_write_ref_node(&ft->node, false);
                else
                        up_read_ref_node(&ft->node);
                return ERR_PTR(err);
        if (!IS_ERR(rule) ||
            (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) {
                if (take_write)
 -                      up_write_ref_node(&ft->node);
 +                      up_write_ref_node(&ft->node, false);
                return rule;
        }
  
        g = alloc_auto_flow_group(ft, spec);
        if (IS_ERR(g)) {
                rule = ERR_CAST(g);
 -              up_write_ref_node(&ft->node);
 +              up_write_ref_node(&ft->node, false);
                return rule;
        }
  
        nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
 -      up_write_ref_node(&ft->node);
 +      up_write_ref_node(&ft->node, false);
  
        err = create_auto_flow_group(ft, g);
        if (err)
        }
  
        nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 -      up_write_ref_node(&g->node);
 +      up_write_ref_node(&g->node, false);
        rule = add_rule_fg(g, spec->match_value, flow_act, dest,
                           dest_num, fte);
 -      up_write_ref_node(&fte->node);
 -      tree_put_node(&fte->node);
 -      tree_put_node(&g->node);
 +      up_write_ref_node(&fte->node, false);
 +      tree_put_node(&fte->node, false);
 +      tree_put_node(&g->node, false);
        return rule;
  
  err_release_fg:
 -      up_write_ref_node(&g->node);
 -      tree_put_node(&g->node);
 +      up_write_ref_node(&g->node, false);
 +      tree_put_node(&g->node, false);
        return ERR_PTR(err);
  }
  
@@@ -1878,33 -1863,10 +1878,33 @@@ EXPORT_SYMBOL(mlx5_add_flow_rules)
  
  void mlx5_del_flow_rules(struct mlx5_flow_handle *handle)
  {
 +      struct fs_fte *fte;
        int i;
  
 +      /* In order to consolidate the HW changes we lock the FTE for other
 +       * changes and increase its refcount, so that its "del" functions are
 +       * not invoked; they are handled here instead.
 +       * The removal of the rules is done under the locked FTE.
 +       * After removing all of the handle's rules: if rules remain, we only
 +       * need to modify the FTE in FW and then unlock/decrease the refcount
 +       * we increased before.
 +       * Otherwise the FTE should be deleted: first delete it in FW, then
 +       * unlock the FTE and proceed to tree_put_node() of the FTE, which
 +       * handles the final refcount decrease as well as the required
 +       * handling of its parent.
 +       */
 +      fs_get_obj(fte, handle->rule[0]->node.parent);
 +      down_write_ref_node(&fte->node, false);
        for (i = handle->num_rules - 1; i >= 0; i--)
 -              tree_remove_node(&handle->rule[i]->node);
 +              tree_remove_node(&handle->rule[i]->node, true);
 +      if (fte->modify_mask && fte->dests_size) {
 +              modify_fte(fte);
 +              up_write_ref_node(&fte->node, false);
 +      } else {
 +              del_hw_fte(&fte->node);
 +              up_write(&fte->node.lock);
 +              tree_put_node(&fte->node, false);
 +      }
        kfree(handle);
  }
  EXPORT_SYMBOL(mlx5_del_flow_rules);
@@@ -2007,7 -1969,7 +2007,7 @@@ int mlx5_destroy_flow_table(struct mlx5
                mutex_unlock(&root->chain_lock);
                return err;
        }
 -      if (tree_remove_node(&ft->node))
 +      if (tree_remove_node(&ft->node, false))
                mlx5_core_warn(get_dev(&ft->node), "Flow table %d wasn't destroyed, refcount > 1\n",
                               ft->id);
        mutex_unlock(&root->chain_lock);
@@@ -2018,7 -1980,7 +2018,7 @@@ EXPORT_SYMBOL(mlx5_destroy_flow_table)
  
  void mlx5_destroy_flow_group(struct mlx5_flow_group *fg)
  {
 -      if (tree_remove_node(&fg->node))
 +      if (tree_remove_node(&fg->node, false))
                mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n",
                               fg->id);
  }
@@@ -2402,8 -2364,8 +2402,8 @@@ static void clean_tree(struct fs_node *
                tree_get_node(node);
                list_for_each_entry_safe(iter, temp, &node->children, list)
                        clean_tree(iter);
 -              tree_put_node(node);
 -              tree_remove_node(node);
 +              tree_put_node(node, false);
 +              tree_remove_node(node, false);
        }
  }
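
The mlx5_del_flow_rules() rework above batches per-rule changes: each removal only accumulates bits
in fte->modify_mask under a single FTE write lock, and one update_fte (or a delete of the whole FTE)
is issued afterwards instead of one firmware command per rule. A compact sketch of that
accumulate-then-flush idea follows; the names (toy_fte, remove_rule, flush_fte, MASK_DEST_LIST) are
hypothetical and independent of the flow-steering code.

	/* toy illustration of batching changes and flushing once; not driver code */
	#include <stdio.h>

	#define MASK_DEST_LIST	(1u << 1)

	struct toy_fte {
		unsigned int modify_mask;	/* pending HW changes */
		int dests_size;			/* remaining destinations */
	};

	static void remove_rule(struct toy_fte *fte)
	{
		/* software-only bookkeeping; no HW command here */
		if (--fte->dests_size)
			fte->modify_mask |= MASK_DEST_LIST;
	}

	static void flush_fte(struct toy_fte *fte)
	{
		if (fte->modify_mask && fte->dests_size)
			printf("single update_fte, mask=0x%x\n", fte->modify_mask);
		else
			printf("delete the whole FTE in HW\n");
		fte->modify_mask = 0;
	}

	int main(void)
	{
		struct toy_fte fte = { .modify_mask = 0, .dests_size = 3 };
		int i;

		for (i = 0; i < 2; i++)		/* drop two of three rules */
			remove_rule(&fte);
		flush_fte(&fte);		/* one HW command for the batch */
		return 0;
	}
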
  
index cb9fa3430c5358678a6a7b719ed0d22e8d2eb15e,00c288b6a6d195352f7be87cacfb2d73e01a539b..3b98fcdd7d0e4c9911668f857b5ec55b5737de8e
@@@ -103,7 -103,7 +103,7 @@@ void mlx5_enter_error_state(struct mlx5
        mlx5_core_err(dev, "start\n");
        if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
                dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 -              mlx5_cmd_trigger_completions(dev);
 +              mlx5_cmd_flush(dev);
        }
  
        mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
@@@ -152,11 -152,11 +152,11 @@@ static void health_recover(struct work_
  
        nic_state = mlx5_get_nic_state(dev);
        if (nic_state == MLX5_NIC_IFC_INVALID) {
-               dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
+               mlx5_core_err(dev, "health recovery flow aborted since the nic state is invalid\n");
                return;
        }
  
-       dev_err(&dev->pdev->dev, "starting health recovery flow\n");
+       mlx5_core_err(dev, "starting health recovery flow\n");
        mlx5_recover_device(dev);
  }
  
@@@ -180,8 -180,8 +180,8 @@@ static void health_care(struct work_str
        if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
                schedule_delayed_work(&health->recover_work, recover_delay);
        else
-               dev_err(&dev->pdev->dev,
-                       "new health works are not permitted at this stage\n");
+               mlx5_core_err(dev,
+                             "new health works are not permitted at this stage\n");
        spin_unlock_irqrestore(&health->wq_lock, flags);
  }
  
@@@ -228,18 -228,22 +228,22 @@@ static void print_health_info(struct ml
                return;
  
        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
-               dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
+               mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i,
+                             ioread32be(h->assert_var + i));
  
-       dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
-       dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
+       mlx5_core_err(dev, "assert_exit_ptr 0x%08x\n",
+                     ioread32be(&h->assert_exit_ptr));
+       mlx5_core_err(dev, "assert_callra 0x%08x\n",
+                     ioread32be(&h->assert_callra));
        sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
-       dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str);
-       dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
-       dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index));
-       dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
-       dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
+       mlx5_core_err(dev, "fw_ver %s\n", fw_str);
+       mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
+       mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
+       mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd),
+                     hsynd_str(ioread8(&h->synd)));
+       mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
        fw = ioread32be(&h->fw_ver);
-       dev_err(&dev->pdev->dev, "raw fw_ver 0x%08x\n", fw);
+       mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw);
  }
  
  static unsigned long get_next_poll_jiffies(void)
@@@ -262,8 -266,7 +266,7 @@@ void mlx5_trigger_health_work(struct ml
        if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
                queue_work(health->wq, &health->work);
        else
-               dev_err(&dev->pdev->dev,
-                       "new health works are not permitted at this stage\n");
+               mlx5_core_err(dev, "new health works are not permitted at this stage\n");
        spin_unlock_irqrestore(&health->wq_lock, flags);
  }
  
@@@ -284,7 -287,7 +287,7 @@@ static void poll_health(struct timer_li
  
        health->prev = count;
        if (health->miss_counter == MAX_MISSES) {
-               dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
+               mlx5_core_err(dev, "device's health compromised - reached miss count\n");
                print_health_info(dev);
        }
  
@@@ -352,6 -355,13 +355,13 @@@ void mlx5_drain_health_recovery(struct 
        cancel_delayed_work_sync(&dev->priv.health.recover_work);
  }
  
+ void mlx5_health_flush(struct mlx5_core_dev *dev)
+ {
+       struct mlx5_core_health *health = &dev->priv.health;
+
+       flush_workqueue(health->wq);
+ }
+
  void mlx5_health_cleanup(struct mlx5_core_dev *dev)
  {
        struct mlx5_core_health *health = &dev->priv.health;
@@@ -370,7 -380,7 +380,7 @@@ int mlx5_health_init(struct mlx5_core_d
                return -ENOMEM;
  
        strcpy(name, "mlx5_health");
-       strcat(name, dev_name(&dev->pdev->dev));
+       strcat(name, dev->priv.name);
        health->wq = create_singlethread_workqueue(name);
        kfree(name);
        if (!health->wq)
index 70cc906a102b2dde87d161385126f43da4948266,bda9c4bd17e6a97f8a0a2e0f1f4946692836c9a8..b200a29d142057cd4e407b61fdccb1df074a01e2
@@@ -465,7 -465,6 +465,7 @@@ static int handle_hca_cap_odp(struct ml
        void *set_hca_cap;
        void *set_ctx;
        int set_sz;
 +      bool do_set = false;
        int err;
  
        if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) ||
        if (err)
                return err;
  
 -      if (!(MLX5_CAP_ODP_MAX(dev, ud_odp_caps.srq_receive) ||
 -            MLX5_CAP_ODP_MAX(dev, rc_odp_caps.srq_receive) ||
 -            MLX5_CAP_ODP_MAX(dev, xrc_odp_caps.srq_receive)))
 -              return 0;
 -
        set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
        set_ctx = kzalloc(set_sz, GFP_KERNEL);
        if (!set_ctx)
        memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ODP],
               MLX5_ST_SZ_BYTES(odp_cap));
  
 -      /* set ODP SRQ support for RC/UD and XRC transports */
 -      MLX5_SET(odp_cap, set_hca_cap, ud_odp_caps.srq_receive,
 -               MLX5_CAP_ODP_MAX(dev, ud_odp_caps.srq_receive));
 -
 -      MLX5_SET(odp_cap, set_hca_cap, rc_odp_caps.srq_receive,
 -               MLX5_CAP_ODP_MAX(dev, rc_odp_caps.srq_receive));
 -
 -      MLX5_SET(odp_cap, set_hca_cap, xrc_odp_caps.srq_receive,
 -               MLX5_CAP_ODP_MAX(dev, xrc_odp_caps.srq_receive));
 -
 -      err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ODP);
 +#define ODP_CAP_SET_MAX(dev, field)                                            \
 +      do {                                                                   \
 +              u32 _res = MLX5_CAP_ODP_MAX(dev, field);                       \
 +              if (_res) {                                                    \
 +                      do_set = true;                                         \
 +                      MLX5_SET(odp_cap, set_hca_cap, field, _res);           \
 +              }                                                              \
 +      } while (0)
 +
 +      ODP_CAP_SET_MAX(dev, ud_odp_caps.srq_receive);
 +      ODP_CAP_SET_MAX(dev, rc_odp_caps.srq_receive);
 +      ODP_CAP_SET_MAX(dev, xrc_odp_caps.srq_receive);
 +      ODP_CAP_SET_MAX(dev, xrc_odp_caps.send);
 +      ODP_CAP_SET_MAX(dev, xrc_odp_caps.receive);
 +      ODP_CAP_SET_MAX(dev, xrc_odp_caps.write);
 +      ODP_CAP_SET_MAX(dev, xrc_odp_caps.read);
 +      ODP_CAP_SET_MAX(dev, xrc_odp_caps.atomic);
 +
 +      if (do_set)
 +              err = set_caps(dev, set_ctx, set_sz,
 +                             MLX5_SET_HCA_CAP_OP_MOD_ODP);
  
        kfree(set_ctx);
 +
        return err;
  }
  
@@@ -587,24 -580,23 +587,23 @@@ query_ex
  
  static int set_hca_cap(struct mlx5_core_dev *dev)
  {
-       struct pci_dev *pdev = dev->pdev;
        int err;
  
        err = handle_hca_cap(dev);
        if (err) {
-               dev_err(&pdev->dev, "handle_hca_cap failed\n");
+               mlx5_core_err(dev, "handle_hca_cap failed\n");
                goto out;
        }
  
        err = handle_hca_cap_atomic(dev);
        if (err) {
-               dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n");
+               mlx5_core_err(dev, "handle_hca_cap_atomic failed\n");
                goto out;
        }
  
        err = handle_hca_cap_odp(dev);
        if (err) {
-               dev_err(&pdev->dev, "handle_hca_cap_odp failed\n");
+               mlx5_core_err(dev, "handle_hca_cap_odp failed\n");
                goto out;
        }
  
@@@ -736,36 -728,29 +735,29 @@@ static int mlx5_core_set_issi(struct ml
        return -EOPNOTSUPP;
  }
  
- static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
+ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev,
+                        const struct pci_device_id *id)
  {
-       struct pci_dev *pdev = dev->pdev;
+       struct mlx5_priv *priv = &dev->priv;
        int err = 0;
  
-       pci_set_drvdata(dev->pdev, dev);
-       strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN);
-       priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
-       mutex_init(&priv->pgdir_mutex);
-       INIT_LIST_HEAD(&priv->pgdir_list);
-       spin_lock_init(&priv->mkey_lock);
+       dev->pdev = pdev;
+       priv->pci_dev_data = id->driver_data;
  
-       mutex_init(&priv->alloc_mutex);
+       pci_set_drvdata(dev->pdev, dev);
  
+       dev->bar_addr = pci_resource_start(pdev, 0);
        priv->numa_node = dev_to_node(&dev->pdev->dev);
  
-       if (mlx5_debugfs_root)
-               priv->dbg_root =
-                       debugfs_create_dir(pci_name(pdev), mlx5_debugfs_root);
        err = mlx5_pci_enable_device(dev);
        if (err) {
-               dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
-               goto err_dbg;
+               mlx5_core_err(dev, "Cannot enable PCI device, aborting\n");
+               return err;
        }
  
        err = request_bar(pdev);
        if (err) {
-               dev_err(&pdev->dev, "error requesting BARs, aborting\n");
+               mlx5_core_err(dev, "error requesting BARs, aborting\n");
                goto err_disable;
        }
  
  
        err = set_dma_caps(pdev);
        if (err) {
-               dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n");
+               mlx5_core_err(dev, "Failed setting DMA capabilities mask, aborting\n");
                goto err_clr_master;
        }
  
            pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128))
                mlx5_core_dbg(dev, "Enabling pci atomics failed\n");
  
-       dev->iseg_base = pci_resource_start(dev->pdev, 0);
+       dev->iseg_base = dev->bar_addr;
        dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg));
        if (!dev->iseg) {
                err = -ENOMEM;
-               dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n");
+               mlx5_core_err(dev, "Failed mapping initialization segment, aborting\n");
                goto err_clr_master;
        }
  
@@@ -797,52 -782,47 +789,47 @@@ err_clr_master
        release_bar(dev->pdev);
  err_disable:
        mlx5_pci_disable_device(dev);
- err_dbg:
-       debugfs_remove(priv->dbg_root);
        return err;
  }
  
- static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
+ static void mlx5_pci_close(struct mlx5_core_dev *dev)
  {
        iounmap(dev->iseg);
        pci_clear_master(dev->pdev);
        release_bar(dev->pdev);
        mlx5_pci_disable_device(dev);
-       debugfs_remove_recursive(priv->dbg_root);
  }
  
- static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
+ static int mlx5_init_once(struct mlx5_core_dev *dev)
  {
-       struct pci_dev *pdev = dev->pdev;
        int err;
  
-       priv->devcom = mlx5_devcom_register_device(dev);
-       if (IS_ERR(priv->devcom))
-               dev_err(&pdev->dev, "failed to register with devcom (0x%p)\n",
-                       priv->devcom);
+       dev->priv.devcom = mlx5_devcom_register_device(dev);
+       if (IS_ERR(dev->priv.devcom))
+               mlx5_core_err(dev, "failed to register with devcom (0x%p)\n",
+                             dev->priv.devcom);
  
        err = mlx5_query_board_id(dev);
        if (err) {
-               dev_err(&pdev->dev, "query board id failed\n");
+               mlx5_core_err(dev, "query board id failed\n");
                goto err_devcom;
        }
  
        err = mlx5_eq_table_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "failed to initialize eq\n");
+               mlx5_core_err(dev, "failed to initialize eq\n");
                goto err_devcom;
        }
  
        err = mlx5_events_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "failed to initialize events\n");
+               mlx5_core_err(dev, "failed to initialize events\n");
                goto err_eq_cleanup;
        }
  
        err = mlx5_cq_debugfs_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "failed to initialize cq debugfs\n");
+               mlx5_core_err(dev, "failed to initialize cq debugfs\n");
                goto err_events_cleanup;
        }
  
  
        err = mlx5_init_rl_table(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init rate limiting\n");
+               mlx5_core_err(dev, "Failed to init rate limiting\n");
                goto err_tables_cleanup;
        }
  
        err = mlx5_mpfs_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init l2 table %d\n", err);
+               mlx5_core_err(dev, "Failed to init l2 table %d\n", err);
                goto err_rl_cleanup;
        }
  
        err = mlx5_eswitch_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init eswitch %d\n", err);
+               mlx5_core_err(dev, "Failed to init eswitch %d\n", err);
                goto err_mpfs_cleanup;
        }
  
        err = mlx5_sriov_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init sriov %d\n", err);
+               mlx5_core_err(dev, "Failed to init sriov %d\n", err);
                goto err_eswitch_cleanup;
        }
  
        err = mlx5_fpga_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init fpga device %d\n", err);
+               mlx5_core_err(dev, "Failed to init fpga device %d\n", err);
                goto err_sriov_cleanup;
        }
  
@@@ -932,93 -912,78 +919,78 @@@ static void mlx5_cleanup_once(struct ml
        mlx5_devcom_unregister_device(dev->priv.devcom);
  }
  
- static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
-                        bool boot)
+ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
  {
-       struct pci_dev *pdev = dev->pdev;
        int err;
  
-       dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev);
-       mutex_lock(&dev->intf_state_mutex);
-       if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
-               dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
-                        __func__);
-               goto out;
-       }
-       dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
-                fw_rev_min(dev), fw_rev_sub(dev));
+       mlx5_core_info(dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
+                      fw_rev_min(dev), fw_rev_sub(dev));
  
        /* Only PFs hold the relevant PCIe information for this query */
        if (mlx5_core_is_pf(dev))
                pcie_print_link_status(dev->pdev);
  
-       /* on load removing any previous indication of internal error, device is
-        * up
-        */
-       dev->state = MLX5_DEVICE_STATE_UP;
        /* wait for firmware to accept initialization segments configurations
         */
        err = wait_fw_init(dev, FW_PRE_INIT_TIMEOUT_MILI);
        if (err) {
-               dev_err(&dev->pdev->dev, "Firmware over %d MS in pre-initializing state, aborting\n",
-                       FW_PRE_INIT_TIMEOUT_MILI);
-               goto out_err;
+               mlx5_core_err(dev, "Firmware over %d MS in pre-initializing state, aborting\n",
+                             FW_PRE_INIT_TIMEOUT_MILI);
+               return err;
        }
  
        err = mlx5_cmd_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
-               goto out_err;
+               mlx5_core_err(dev, "Failed initializing command interface, aborting\n");
+               return err;
        }
  
        err = wait_fw_init(dev, FW_INIT_TIMEOUT_MILI);
        if (err) {
-               dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
-                       FW_INIT_TIMEOUT_MILI);
+               mlx5_core_err(dev, "Firmware over %d MS in initializing state, aborting\n",
+                             FW_INIT_TIMEOUT_MILI);
                goto err_cmd_cleanup;
        }
  
        err = mlx5_core_enable_hca(dev, 0);
        if (err) {
-               dev_err(&pdev->dev, "enable hca failed\n");
+               mlx5_core_err(dev, "enable hca failed\n");
                goto err_cmd_cleanup;
        }
  
        err = mlx5_core_set_issi(dev);
        if (err) {
-               dev_err(&pdev->dev, "failed to set issi\n");
+               mlx5_core_err(dev, "failed to set issi\n");
                goto err_disable_hca;
        }
  
        err = mlx5_satisfy_startup_pages(dev, 1);
        if (err) {
-               dev_err(&pdev->dev, "failed to allocate boot pages\n");
+               mlx5_core_err(dev, "failed to allocate boot pages\n");
                goto err_disable_hca;
        }
  
        err = set_hca_ctrl(dev);
        if (err) {
-               dev_err(&pdev->dev, "set_hca_ctrl failed\n");
+               mlx5_core_err(dev, "set_hca_ctrl failed\n");
                goto reclaim_boot_pages;
        }
  
        err = set_hca_cap(dev);
        if (err) {
-               dev_err(&pdev->dev, "set_hca_cap failed\n");
+               mlx5_core_err(dev, "set_hca_cap failed\n");
                goto reclaim_boot_pages;
        }
  
        err = mlx5_satisfy_startup_pages(dev, 0);
        if (err) {
-               dev_err(&pdev->dev, "failed to allocate init pages\n");
+               mlx5_core_err(dev, "failed to allocate init pages\n");
                goto reclaim_boot_pages;
        }
  
        err = mlx5_cmd_init_hca(dev, sw_owner_id);
        if (err) {
-               dev_err(&pdev->dev, "init hca failed\n");
+               mlx5_core_err(dev, "init hca failed\n");
                goto reclaim_boot_pages;
        }
  
  
        err = mlx5_query_hca_caps(dev);
        if (err) {
-               dev_err(&pdev->dev, "query hca failed\n");
-               goto err_stop_poll;
+               mlx5_core_err(dev, "query hca failed\n");
+               goto stop_health;
        }
  
-       if (boot) {
-               err = mlx5_init_once(dev, priv);
-               if (err) {
-                       dev_err(&pdev->dev, "sw objs init failed\n");
-                       goto err_stop_poll;
-               }
+       return 0;
+ stop_health:
+       mlx5_stop_health_poll(dev, boot);
+ reclaim_boot_pages:
+       mlx5_reclaim_startup_pages(dev);
+ err_disable_hca:
+       mlx5_core_disable_hca(dev, 0);
+ err_cmd_cleanup:
+       mlx5_cmd_cleanup(dev);
+       return err;
+ }
+
+ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
+ {
+       int err;
+
+       mlx5_stop_health_poll(dev, boot);
+       err = mlx5_cmd_teardown_hca(dev);
+       if (err) {
+               mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n");
+               return err;
        }
+       mlx5_reclaim_startup_pages(dev);
+       mlx5_core_disable_hca(dev, 0);
+       mlx5_cmd_cleanup(dev);
+       return 0;
+ }
+
+ static int mlx5_load(struct mlx5_core_dev *dev)
+ {
+       int err;
  
        dev->priv.uar = mlx5_get_uars_page(dev);
        if (IS_ERR(dev->priv.uar)) {
-               dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
+               mlx5_core_err(dev, "Failed allocating uar, aborting\n");
                err = PTR_ERR(dev->priv.uar);
-               goto err_get_uars;
+               return err;
        }
  
        mlx5_events_start(dev);
  
        err = mlx5_eq_table_create(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to create EQs\n");
+               mlx5_core_err(dev, "Failed to create EQs\n");
                goto err_eq_table;
        }
  
        err = mlx5_fw_tracer_init(dev->tracer);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init FW tracer\n");
+               mlx5_core_err(dev, "Failed to init FW tracer\n");
                goto err_fw_tracer;
        }
  
        err = mlx5_fpga_device_start(dev);
        if (err) {
-               dev_err(&pdev->dev, "fpga device start failed %d\n", err);
+               mlx5_core_err(dev, "fpga device start failed %d\n", err);
                goto err_fpga_start;
        }
  
        err = mlx5_accel_ipsec_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "IPSec device start failed %d\n", err);
+               mlx5_core_err(dev, "IPSec device start failed %d\n", err);
                goto err_ipsec_start;
        }
  
        err = mlx5_accel_tls_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "TLS device start failed %d\n", err);
+               mlx5_core_err(dev, "TLS device start failed %d\n", err);
                goto err_tls_start;
        }
  
        err = mlx5_init_fs(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init flow steering\n");
+               mlx5_core_err(dev, "Failed to init flow steering\n");
                goto err_fs;
        }
  
        err = mlx5_core_set_hca_defaults(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to set hca defaults\n");
+               mlx5_core_err(dev, "Failed to set hca defaults\n");
                goto err_fs;
        }
  
        err = mlx5_sriov_attach(dev);
        if (err) {
-               dev_err(&pdev->dev, "sriov init failed %d\n", err);
+               mlx5_core_err(dev, "sriov init failed %d\n", err);
                goto err_sriov;
        }
  
        err = mlx5_ec_init(dev);
        if (err) {
-               dev_err(&pdev->dev, "Failed to init embedded CPU\n");
+               mlx5_core_err(dev, "Failed to init embedded CPU\n");
                goto err_ec;
        }
  
-       if (mlx5_device_registered(dev)) {
-               mlx5_attach_device(dev);
-       } else {
-               err = mlx5_register_device(dev);
-               if (err) {
-                       dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err);
-                       goto err_reg_dev;
-               }
-       }
-       set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
- out:
-       mutex_unlock(&dev->intf_state_mutex);
        return 0;
  
- err_reg_dev:
-       mlx5_ec_cleanup(dev);
  err_ec:
        mlx5_sriov_detach(dev);
  err_sriov:
        mlx5_cleanup_fs(dev);
  err_fs:
        mlx5_accel_tls_cleanup(dev);
  err_tls_start:
        mlx5_accel_ipsec_cleanup(dev);
  err_ipsec_start:
        mlx5_fpga_device_stop(dev);
  err_fpga_start:
        mlx5_fw_tracer_cleanup(dev->tracer);
  err_fw_tracer:
        mlx5_eq_table_destroy(dev);
  err_eq_table:
        mlx5_pagealloc_stop(dev);
        mlx5_events_stop(dev);
-       mlx5_put_uars_page(dev, priv->uar);
+       mlx5_put_uars_page(dev, dev->priv.uar);
+       return err;
+ }
  
- err_get_uars:
-       if (boot)
-               mlx5_cleanup_once(dev);
+ static void mlx5_unload(struct mlx5_core_dev *dev)
+ {
+       mlx5_ec_cleanup(dev);
+       mlx5_sriov_detach(dev);
+       mlx5_cleanup_fs(dev);
+       mlx5_accel_ipsec_cleanup(dev);
+       mlx5_accel_tls_cleanup(dev);
+       mlx5_fpga_device_stop(dev);
+       mlx5_fw_tracer_cleanup(dev->tracer);
+       mlx5_eq_table_destroy(dev);
+       mlx5_pagealloc_stop(dev);
+       mlx5_events_stop(dev);
+       mlx5_put_uars_page(dev, dev->priv.uar);
+ }
  
- err_stop_poll:
-       mlx5_stop_health_poll(dev, boot);
-       if (mlx5_cmd_teardown_hca(dev)) {
-               dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
-               goto out_err;
+ static int mlx5_load_one(struct mlx5_core_dev *dev, bool boot)
+ {
+       int err = 0;
+       dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev);
+       mutex_lock(&dev->intf_state_mutex);
+       if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
+               mlx5_core_warn(dev, "interface is up, NOP\n");
+               goto out;
        }
+       /* remove any previous indication of internal error */
+       dev->state = MLX5_DEVICE_STATE_UP;
  
- reclaim_boot_pages:
-       mlx5_reclaim_startup_pages(dev);
+       err = mlx5_function_setup(dev, boot);
+       if (err)
+               goto out;
  
- err_disable_hca:
-       mlx5_core_disable_hca(dev, 0);
+       if (boot) {
+               err = mlx5_init_once(dev);
+               if (err) {
+                       mlx5_core_err(dev, "sw objs init failed\n");
+                       goto function_teardown;
+               }
+       }
  
- err_cmd_cleanup:
-       mlx5_cmd_cleanup(dev);
+       err = mlx5_load(dev);
+       if (err)
+               goto err_load;
  
- out_err:
+       if (mlx5_device_registered(dev)) {
+               mlx5_attach_device(dev);
+       } else {
+               err = mlx5_register_device(dev);
+               if (err) {
+                       mlx5_core_err(dev, "register device failed %d\n", err);
+                       goto err_reg_dev;
+               }
+       }
+       set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
+ out:
+       mutex_unlock(&dev->intf_state_mutex);
+       return err;
+ err_reg_dev:
+       mlx5_unload(dev);
+ err_load:
+       if (boot)
+               mlx5_cleanup_once(dev);
+ function_teardown:
+       mlx5_function_teardown(dev, boot);
        dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
        mutex_unlock(&dev->intf_state_mutex);
  
        return err;
  }
  
- static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
-                          bool cleanup)
+ static int mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup)
  {
        int err = 0;
  
  
        mutex_lock(&dev->intf_state_mutex);
        if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
-               dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
-                        __func__);
+               mlx5_core_warn(dev, "%s: interface is down, NOP\n",
+                              __func__);
                if (cleanup)
                        mlx5_cleanup_once(dev);
                goto out;
        if (mlx5_device_registered(dev))
                mlx5_detach_device(dev);
  
-       mlx5_ec_cleanup(dev);
-       mlx5_sriov_detach(dev);
-       mlx5_cleanup_fs(dev);
-       mlx5_accel_ipsec_cleanup(dev);
-       mlx5_accel_tls_cleanup(dev);
-       mlx5_fpga_device_stop(dev);
-       mlx5_fw_tracer_cleanup(dev->tracer);
-       mlx5_eq_table_destroy(dev);
-       mlx5_pagealloc_stop(dev);
-       mlx5_events_stop(dev);
-       mlx5_put_uars_page(dev, priv->uar);
+       mlx5_unload(dev);
        if (cleanup)
                mlx5_cleanup_once(dev);
-       mlx5_stop_health_poll(dev, cleanup);
-       err = mlx5_cmd_teardown_hca(dev);
-       if (err) {
-               dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
-               goto out;
-       }
-       mlx5_reclaim_startup_pages(dev);
-       mlx5_core_disable_hca(dev, 0);
-       mlx5_cmd_cleanup(dev);
  
+       mlx5_function_teardown(dev, cleanup);
  out:
        mutex_unlock(&dev->intf_state_mutex);
        return err;
@@@ -1238,29 -1235,15 +1242,15 @@@ static const struct devlink_ops mlx5_de
  #endif
  };
  
- #define MLX5_IB_MOD "mlx5_ib"
- static int init_one(struct pci_dev *pdev,
-                   const struct pci_device_id *id)
+ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx, const char *name)
  {
-       struct mlx5_core_dev *dev;
-       struct devlink *devlink;
-       struct mlx5_priv *priv;
+       struct mlx5_priv *priv = &dev->priv;
        int err;
  
-       devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev));
-       if (!devlink) {
-               dev_err(&pdev->dev, "kzalloc failed\n");
-               return -ENOMEM;
-       }
-       dev = devlink_priv(devlink);
-       priv = &dev->priv;
-       priv->pci_dev_data = id->driver_data;
-       pci_set_drvdata(pdev, dev);
+       strncpy(priv->name, name, MLX5_MAX_NAME_LEN);
+       priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
  
-       dev->pdev = pdev;
-       dev->profile = &profile[prof_sel];
+       dev->profile = &profile[profile_idx];
  
        INIT_LIST_HEAD(&priv->ctx_list);
        spin_lock_init(&priv->ctx_lock);
        INIT_LIST_HEAD(&priv->bfregs.reg_head.list);
        INIT_LIST_HEAD(&priv->bfregs.wc_head.list);
  
-       err = mlx5_pci_init(dev, priv);
-       if (err) {
-               dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
-               goto clean_dev;
+       mutex_init(&priv->alloc_mutex);
+       mutex_init(&priv->pgdir_mutex);
+       INIT_LIST_HEAD(&priv->pgdir_list);
+       spin_lock_init(&priv->mkey_lock);
+       priv->dbg_root = debugfs_create_dir(name, mlx5_debugfs_root);
+       if (!priv->dbg_root) {
+               pr_err("mlx5_core: %s error, Cannot create debugfs dir, aborting\n", name);
+               return -ENOMEM;
        }
  
        err = mlx5_health_init(dev);
-       if (err) {
-               dev_err(&pdev->dev, "mlx5_health_init failed with error code %d\n", err);
-               goto close_pci;
-       }
+       if (err)
+               goto err_health_init;
  
        err = mlx5_pagealloc_init(dev);
        if (err)
                goto err_pagealloc_init;
  
-       err = mlx5_load_one(dev, priv, true);
+       return 0;
+ err_pagealloc_init:
+       mlx5_health_cleanup(dev);
+ err_health_init:
+       debugfs_remove(dev->priv.dbg_root);
+       return err;
+ }
+ static void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
+ {
+       mlx5_pagealloc_cleanup(dev);
+       mlx5_health_cleanup(dev);
+       debugfs_remove_recursive(dev->priv.dbg_root);
+ }
+ #define MLX5_IB_MOD "mlx5_ib"
+ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+ {
+       struct mlx5_core_dev *dev;
+       struct devlink *devlink;
+       int err;
+       devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev));
+       if (!devlink) {
+               dev_err(&pdev->dev, "kzalloc failed\n");
+               return -ENOMEM;
+       }
+       dev = devlink_priv(devlink);
+       err = mlx5_mdev_init(dev, prof_sel, dev_name(&pdev->dev));
+       if (err)
+               goto mdev_init_err;
+       err = mlx5_pci_init(dev, pdev, id);
+       if (err) {
+               mlx5_core_err(dev, "mlx5_pci_init failed with error code %d\n",
+                             err);
+               goto pci_init_err;
+       }
+       err = mlx5_load_one(dev, true);
        if (err) {
-               dev_err(&pdev->dev, "mlx5_load_one failed with error code %d\n", err);
+               mlx5_core_err(dev, "mlx5_load_one failed with error code %d\n",
+                             err);
                goto err_load_one;
        }
  
        return 0;
  
  clean_load:
-       mlx5_unload_one(dev, priv, true);
+       mlx5_unload_one(dev, true);
  err_load_one:
-       mlx5_pagealloc_cleanup(dev);
- err_pagealloc_init:
-       mlx5_health_cleanup(dev);
- close_pci:
-       mlx5_pci_close(dev, priv);
- clean_dev:
+       mlx5_pci_close(dev);
+ pci_init_err:
+       mlx5_mdev_uninit(dev);
+ mdev_init_err:
        devlink_free(devlink);
  
        return err;
@@@ -1321,20 -1350,18 +1357,18 @@@ static void remove_one(struct pci_dev *
  {
        struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
        struct devlink *devlink = priv_to_devlink(dev);
-       struct mlx5_priv *priv = &dev->priv;
  
        devlink_unregister(devlink);
        mlx5_unregister_device(dev);
  
-       if (mlx5_unload_one(dev, priv, true)) {
-               dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n");
-               mlx5_health_cleanup(dev);
+       if (mlx5_unload_one(dev, true)) {
+               mlx5_core_err(dev, "mlx5_unload_one failed\n");
+               mlx5_health_flush(dev);
                return;
        }
  
-       mlx5_pagealloc_cleanup(dev);
-       mlx5_health_cleanup(dev);
-       mlx5_pci_close(dev, priv);
+       mlx5_pci_close(dev);
+       mlx5_mdev_uninit(dev);
        devlink_free(devlink);
  }
  
@@@ -1342,12 -1369,11 +1376,11 @@@ static pci_ers_result_t mlx5_pci_err_de
                                              pci_channel_state_t state)
  {
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
-       struct mlx5_priv *priv = &dev->priv;
  
-       dev_info(&pdev->dev, "%s was called\n", __func__);
+       mlx5_core_info(dev, "%s was called\n", __func__);
  
        mlx5_enter_error_state(dev, false);
-       mlx5_unload_one(dev, priv, false);
+       mlx5_unload_one(dev, false);
        /* In case of kernel call drain the health wq */
        if (state) {
                mlx5_drain_health_wq(dev);
@@@ -1374,7 -1400,9 +1407,9 @@@ static int wait_vital(struct pci_dev *p
                count = ioread32be(health->health_counter);
                if (count && count != 0xffffffff) {
                        if (last_count && last_count != count) {
-                               dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i);
+                               mlx5_core_info(dev,
+                                              "wait vital counter value 0x%x after %d iterations\n",
+                                              count, i);
                                return 0;
                        }
                        last_count = count;
@@@ -1390,12 -1418,12 +1425,12 @@@ static pci_ers_result_t mlx5_pci_slot_r
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
        int err;
  
-       dev_info(&pdev->dev, "%s was called\n", __func__);
+       mlx5_core_info(dev, "%s was called\n", __func__);
  
        err = mlx5_pci_enable_device(dev);
        if (err) {
-               dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n"
-                       , __func__, err);
+               mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n",
+                             __func__, err);
                return PCI_ERS_RESULT_DISCONNECT;
        }
  
        pci_save_state(pdev);
  
        if (wait_vital(pdev)) {
-               dev_err(&pdev->dev, "%s: wait_vital timed out\n", __func__);
+               mlx5_core_err(dev, "%s: wait_vital timed out\n", __func__);
                return PCI_ERS_RESULT_DISCONNECT;
        }
  
  static void mlx5_pci_resume(struct pci_dev *pdev)
  {
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
-       struct mlx5_priv *priv = &dev->priv;
        int err;
  
-       dev_info(&pdev->dev, "%s was called\n", __func__);
+       mlx5_core_info(dev, "%s was called\n", __func__);
  
-       err = mlx5_load_one(dev, priv, false);
+       err = mlx5_load_one(dev, false);
        if (err)
-               dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n"
-                       , __func__, err);
+               mlx5_core_err(dev, "%s: mlx5_load_one failed with error code: %d\n",
+                             __func__, err);
        else
-               dev_info(&pdev->dev, "%s: device recovered\n", __func__);
+               mlx5_core_info(dev, "%s: device recovered\n", __func__);
  }
  
  static const struct pci_error_handlers mlx5_err_handler = {
@@@ -1486,13 -1513,12 +1520,12 @@@ succeed
  static void shutdown(struct pci_dev *pdev)
  {
        struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
-       struct mlx5_priv *priv = &dev->priv;
        int err;
  
-       dev_info(&pdev->dev, "Shutdown was called\n");
+       mlx5_core_info(dev, "Shutdown was called\n");
        err = mlx5_try_fast_unload(dev);
        if (err)
-               mlx5_unload_one(dev, priv, false);
+               mlx5_unload_one(dev, false);
        mlx5_pci_disable_device(dev);
  }
  
@@@ -1509,8 -1535,6 +1542,8 @@@ static const struct pci_device_id mlx5_
        { PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF},   /* ConnectX-5 Ex VF */
        { PCI_VDEVICE(MELLANOX, 0x101b) },                      /* ConnectX-6 */
        { PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF},   /* ConnectX-6 VF */
 +      { PCI_VDEVICE(MELLANOX, 0x101d) },                      /* ConnectX-6 Dx */
 +      { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF},   /* ConnectX Family mlx5Gen Virtual Function */
        { PCI_VDEVICE(MELLANOX, 0xa2d2) },                      /* BlueField integrated ConnectX-5 network controller */
        { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF},   /* BlueField integrated ConnectX-5 network controller VF */
        { 0, }
index 6fb99be6058471a4f35c26fa50fc7b4f111ff723,a67d3d5f651ed6402e7448046c27e200bb7ff0e2..8213c994e205fdba527590201e879f1e5395630a
  extern uint mlx5_core_debug_mask;
  
  #define mlx5_core_dbg(__dev, format, ...)                             \
-       dev_dbg(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,         \
+       pr_debug("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,      \
                 __func__, __LINE__, current->pid,                      \
                 ##__VA_ARGS__)
  
  #define mlx5_core_dbg_once(__dev, format, ...)                                \
-       dev_dbg_once(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,    \
+       pr_debug_once("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
                     __func__, __LINE__, current->pid,                  \
                     ##__VA_ARGS__)
  
@@@ -64,28 -64,37 +64,37 @@@ do {                                                                       
  } while (0)
  
  #define mlx5_core_err(__dev, format, ...)                             \
-       dev_err(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \
+       pr_err("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,        \
                __func__, __LINE__, current->pid,       \
               ##__VA_ARGS__)
  
- #define mlx5_core_err_rl(__dev, format, ...)                          \
-       dev_err_ratelimited(&(__dev)->pdev->dev,                        \
-                          "%s:%d:(pid %d): " format,                   \
-                          __func__, __LINE__, current->pid,            \
+ #define mlx5_core_err_rl(__dev, format, ...)                               \
+       pr_err_ratelimited("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
+                          __func__, __LINE__, current->pid,                 \
                           ##__VA_ARGS__)
  
  #define mlx5_core_warn(__dev, format, ...)                            \
-       dev_warn(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,        \
+       pr_warn("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,       \
                 __func__, __LINE__, current->pid,                      \
                ##__VA_ARGS__)
  
  #define mlx5_core_warn_once(__dev, format, ...)                               \
-       dev_warn_once(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,   \
+       pr_warn_once("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,  \
                      __func__, __LINE__, current->pid,                 \
                      ##__VA_ARGS__)
  
+ #define mlx5_core_warn_rl(__dev, format, ...)                               \
+       pr_warn_ratelimited("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
+                          __func__, __LINE__, current->pid,                  \
+                          ##__VA_ARGS__)
  #define mlx5_core_info(__dev, format, ...)                            \
-       dev_info(&(__dev)->pdev->dev, format, ##__VA_ARGS__)
+       pr_info("%s " format, (__dev)->priv.name, ##__VA_ARGS__)
+ #define mlx5_core_info_rl(__dev, format, ...)                               \
+       pr_info_ratelimited("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
+                          __func__, __LINE__, current->pid,                  \
+                          ##__VA_ARGS__)
  
  enum {
        MLX5_CMD_DATA, /* print command payload only */
@@@ -111,6 -120,7 +120,6 @@@ void mlx5_sriov_cleanup(struct mlx5_cor
  int mlx5_sriov_attach(struct mlx5_core_dev *dev);
  void mlx5_sriov_detach(struct mlx5_core_dev *dev);
  int mlx5_core_sriov_configure(struct pci_dev *dev, int num_vfs);
 -bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev);
  int mlx5_core_enable_hca(struct mlx5_core_dev *dev, u16 func_id);
  int mlx5_core_disable_hca(struct mlx5_core_dev *dev, u16 func_id);
  int mlx5_create_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy,
@@@ -125,7 -135,6 +134,7 @@@ u64 mlx5_read_internal_timer(struct mlx
                             struct ptp_system_timestamp *sts);
  
  void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev);
 +void mlx5_cmd_flush(struct mlx5_core_dev *dev);
  int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
  void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
  
@@@ -175,11 -184,6 +184,11 @@@ int mlx5_firmware_flash(struct mlx5_cor
  void mlx5e_init(void);
  void mlx5e_cleanup(void);
  
 +static inline bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev)
 +{
 +      return pci_num_vf(dev->pdev) ? true : false;
 +}
 +
  static inline int mlx5_lag_is_lacp_owner(struct mlx5_core_dev *dev)
  {
        /* LACP owner conditions:
index 94464723ff77de3c112fde55e61490675572c7e1,b7d52709b8b1f4ea3dbfbf7e139f4904f24cdb0c..0d006224d7b057ecfdb9115f0835259ca1165494
@@@ -79,7 -79,7 +79,7 @@@ static u64 uar2pfn(struct mlx5_core_de
        else
                system_page_index = index;
  
-       return (pci_resource_start(mdev->pdev, 0) >> PAGE_SHIFT) + system_page_index;
+       return (mdev->bar_addr >> PAGE_SHIFT) + system_page_index;
  }
  
  static void up_rel_func(struct kref *kref)
@@@ -90,8 -90,8 +90,8 @@@
        iounmap(up->map);
        if (mlx5_cmd_free_uar(up->mdev, up->index))
                mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index);
 -      kfree(up->reg_bitmap);
 -      kfree(up->fp_bitmap);
 +      bitmap_free(up->reg_bitmap);
 +      bitmap_free(up->fp_bitmap);
        kfree(up);
  }
  
@@@ -110,11 -110,11 +110,11 @@@ static struct mlx5_uars_page *alloc_uar
                return ERR_PTR(err);
  
        up->mdev = mdev;
 -      up->reg_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
 +      up->reg_bitmap = bitmap_zalloc(bfregs, GFP_KERNEL);
        if (!up->reg_bitmap)
                goto error1;
  
 -      up->fp_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
 +      up->fp_bitmap = bitmap_zalloc(bfregs, GFP_KERNEL);
        if (!up->fp_bitmap)
                goto error1;
  
@@@ -157,8 -157,8 +157,8 @@@ error2
        if (mlx5_cmd_free_uar(mdev, up->index))
                mlx5_core_warn(mdev, "failed to free uar index %d\n", up->index);
  error1:
 -      kfree(up->fp_bitmap);
 -      kfree(up->reg_bitmap);
 +      bitmap_free(up->fp_bitmap);
 +      bitmap_free(up->reg_bitmap);
        kfree(up);
        return ERR_PTR(err);
  }
index 022541dc5dbfd7b12a54601c1d1a59e30eed8a37,0bfb95e30e4723d2039e16cb0b21ce527e9b1212..6c43191c0186c6cf595a7a2b1620831a1f2ebeca
@@@ -133,6 -133,7 +133,7 @@@ enum 
        MLX5_REG_MTRC_CONF       = 0x9041,
        MLX5_REG_MTRC_STDB       = 0x9042,
        MLX5_REG_MTRC_CTRL       = 0x9043,
+       MLX5_REG_MPEIN           = 0x9050,
        MLX5_REG_MPCNT           = 0x9051,
        MLX5_REG_MTPPS           = 0x9053,
        MLX5_REG_MTPPSE          = 0x9054,
@@@ -195,7 -196,6 +196,7 @@@ struct mlx5_rsc_debug 
  
  enum mlx5_dev_event {
        MLX5_DEV_EVENT_SYS_ERROR = 128, /* 0 - 127 are FW events */
 +      MLX5_DEV_EVENT_PORT_AFFINITY = 129,
  };
  
  enum mlx5_port_status {
@@@ -365,7 -365,6 +366,7 @@@ struct mlx5_core_sig_ctx 
  enum {
        MLX5_MKEY_MR = 1,
        MLX5_MKEY_MW,
 +      MLX5_MKEY_INDIRECT_DEVX,
  };
  
  struct mlx5_core_mkey {
@@@ -660,6 -659,7 +661,7 @@@ struct mlx5_core_dev 
        u64                     sys_image_guid;
        phys_addr_t             iseg_base;
        struct mlx5_init_seg __iomem *iseg;
+       phys_addr_t             bar_addr;
        enum mlx5_device_state  state;
        /* sync interface state */
        struct mutex            intf_state_mutex;
@@@ -885,6 -885,7 +887,7 @@@ void mlx5_cmd_mbox_status(void *out, u
  int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
  int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
  int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
+ void mlx5_health_flush(struct mlx5_core_dev *dev);
  void mlx5_health_cleanup(struct mlx5_core_dev *dev);
  int mlx5_health_init(struct mlx5_core_dev *dev);
  void mlx5_start_health_poll(struct mlx5_core_dev *dev);
@@@ -961,6 -962,10 +964,6 @@@ int mlx5_query_odp_caps(struct mlx5_cor
                        struct mlx5_odp_caps *odp_caps);
  int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
                             u8 port_num, void *out, size_t sz);
 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 -int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
 -                              u32 wq_num, u8 type, int error);
 -#endif
  
  int mlx5_init_rl_table(struct mlx5_core_dev *dev);
  void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@@ -1039,7 -1044,6 +1042,7 @@@ int mlx5_cmd_create_vport_lag(struct ml
  int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
  bool mlx5_lag_is_roce(struct mlx5_core_dev *dev);
  bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev);
 +bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev);
  bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
  struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
  int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
@@@ -1087,11 -1091,6 +1090,11 @@@ static inline bool mlx5_core_is_ecpf_es
        return dev->caps.embedded_cpu && MLX5_CAP_GEN(dev, eswitch_manager);
  }
  
 +static inline bool mlx5_ecpf_vport_exists(struct mlx5_core_dev *dev)
 +{
 +      return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
 +}
 +
  #define MLX5_HOST_PF_MAX_VFS  (127u)
  static inline u16 mlx5_core_max_vfs(struct mlx5_core_dev *dev)
  {
index b0e17c94566c130978ca03f2a8118752f3c90ff9,d31712af5a7bb2e9c4eadf19a7085f5a6979a761..0e0e63d4d7aa4fb9e57139e30cb871a37c22ecf1
@@@ -631,8 -631,7 +631,8 @@@ struct mlx5_ifc_e_switch_cap_bits 
        u8         vport_svlan_insert[0x1];
        u8         vport_cvlan_insert_if_not_exist[0x1];
        u8         vport_cvlan_insert_overwrite[0x1];
 -      u8         reserved_at_5[0x17];
 +      u8         reserved_at_5[0x16];
 +      u8         ecpf_vport_exists[0x1];
        u8         counter_eswitch_affinity[0x1];
        u8         merged_eswitch[0x1];
        u8         nic_vport_node_guid_modify[0x1];
@@@ -5110,7 -5109,6 +5110,7 @@@ enum 
        MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0    = 0x14,
        MLX5_ACTION_IN_FIELD_OUT_SIPV4         = 0x15,
        MLX5_ACTION_IN_FIELD_OUT_DIPV4         = 0x16,
 +      MLX5_ACTION_IN_FIELD_OUT_FIRST_VID     = 0x17,
        MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47,
  };
  
@@@ -8027,6 -8025,52 +8027,52 @@@ struct mlx5_ifc_ppcnt_reg_bits 
        union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
  };
  
+ struct mlx5_ifc_mpein_reg_bits {
+       u8         reserved_at_0[0x2];
+       u8         depth[0x6];
+       u8         pcie_index[0x8];
+       u8         node[0x8];
+       u8         reserved_at_18[0x8];
+       u8         capability_mask[0x20];
+       u8         reserved_at_40[0x8];
+       u8         link_width_enabled[0x8];
+       u8         link_speed_enabled[0x10];
+       u8         lane0_physical_position[0x8];
+       u8         link_width_active[0x8];
+       u8         link_speed_active[0x10];
+       u8         num_of_pfs[0x10];
+       u8         num_of_vfs[0x10];
+       u8         bdf0[0x10];
+       u8         reserved_at_b0[0x10];
+       u8         max_read_request_size[0x4];
+       u8         max_payload_size[0x4];
+       u8         reserved_at_c8[0x5];
+       u8         pwr_status[0x3];
+       u8         port_type[0x4];
+       u8         reserved_at_d4[0xb];
+       u8         lane_reversal[0x1];
+       u8         reserved_at_e0[0x14];
+       u8         pci_power[0xc];
+       u8         reserved_at_100[0x20];
+       u8         device_status[0x10];
+       u8         port_state[0x8];
+       u8         reserved_at_138[0x8];
+       u8         reserved_at_140[0x10];
+       u8         receiver_detect_result[0x10];
+       u8         reserved_at_160[0x20];
+ };
  struct mlx5_ifc_mpcnt_reg_bits {
        u8         reserved_at_0[0x8];
        u8         pcie_index[0x8];
@@@ -8346,7 -8390,9 +8392,9 @@@ struct mlx5_ifc_pcam_reg_bits 
  };
  
  struct mlx5_ifc_mcam_enhanced_features_bits {
-       u8         reserved_at_0[0x74];
+       u8         reserved_at_0[0x6e];
+       u8         pci_status_and_power[0x1];
+       u8         reserved_at_6f[0x5];
        u8         mark_tx_action_cnp[0x1];
        u8         mark_tx_action_cqe[0x1];
        u8         dynamic_tx_overflow[0x1];
@@@ -8474,17 -8520,9 +8522,17 @@@ struct mlx5_ifc_pamp_reg_bits 
  struct mlx5_ifc_pcmr_reg_bits {
        u8         reserved_at_0[0x8];
        u8         local_port[0x8];
 -      u8         reserved_at_10[0x2e];
 +      u8         reserved_at_10[0x10];
 +      u8         entropy_force_cap[0x1];
 +      u8         entropy_calc_cap[0x1];
 +      u8         entropy_gre_calc_cap[0x1];
 +      u8         reserved_at_23[0x1b];
        u8         fcs_cap[0x1];
 -      u8         reserved_at_3f[0x1f];
 +      u8         reserved_at_3f[0x1];
 +      u8         entropy_force[0x1];
 +      u8         entropy_calc[0x1];
 +      u8         entropy_gre_calc[0x1];
 +      u8         reserved_at_43[0x1b];
        u8         fcs_chk[0x1];
        u8         reserved_at_5f[0x1];
  };
@@@ -8954,6 -8992,7 +9002,7 @@@ union mlx5_ifc_ports_control_registers_
        struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
        struct mlx5_ifc_ppad_reg_bits ppad_reg;
        struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
+       struct mlx5_ifc_mpein_reg_bits mpein_reg;
        struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
        struct mlx5_ifc_pplm_reg_bits pplm_reg;
        struct mlx5_ifc_pplr_reg_bits pplr_reg;