Git Repo - linux.git/commitdiff
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
author     Linus Torvalds <[email protected]>
           Thu, 2 Apr 2020 01:18:18 +0000 (18:18 -0700)
committer  Linus Torvalds <[email protected]>
           Thu, 2 Apr 2020 01:18:18 +0000 (18:18 -0700)
Pull rdma updates from Jason Gunthorpe:
 "The majority of the patches are cleanups, refactorings and clarity
  improvements.

  This cycle saw some more activity from Syzkaller; I think we are now
   clean on all but one of those bugs, including the long-standing and
   obnoxious rdma_cm locking design defect. Many drivers continue to get
   cleanups, and there are a few new user-visible features.

  Summary:

   - Various driver updates for siw, bnxt_re, rxe, efa, mlx5, hfi1

   - Lots of cleanup patches for hns

   - Convert more places to use refcount

   - Aggressively lock the RDMA CM code that syzkaller says isn't
     working

   - Work to clarify ib_cm

   - Use the new ib_device lifecycle model in bnxt_re

   - Fix mlx5's MR cache which seems to be failing more often with the
     new ODP code

   - mlx5 'dynamic uar' and 'tx steering' user interfaces"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (144 commits)
  RDMA/bnxt_re: make bnxt_re_ib_init static
  IB/qib: Delete struct qib_ivdev.qp_rnd
  RDMA/hns: Fix uninitialized variable bug
  RDMA/hns: Modify the mask of QP number for CQE of hip08
  RDMA/hns: Reduce the maximum number of extend SGE per WQE
  RDMA/hns: Reduce PFC frames in congestion scenarios
  RDMA/mlx5: Add support for RDMA TX flow table
  net/mlx5: Add support for RDMA TX steering
  IB/hfi1: Call kobject_put() when kobject_init_and_add() fails
  IB/hfi1: Fix memory leaks in sysfs registration and unregistration
  IB/mlx5: Move to fully dynamic UAR mode once user space supports it
  IB/mlx5: Limit the scope of struct mlx5_bfreg_info to mlx5_ib
  IB/mlx5: Extend QP creation to get uar page index from user space
  IB/mlx5: Extend CQ creation to get uar page index from user space
  IB/mlx5: Expose UAR object and its alloc/destroy commands
  IB/hfi1: Get rid of a warning
  RDMA/hns: Remove redundant judgment of qp_type
  RDMA/hns: Remove redundant assignment of wc->smac when polling cq
  RDMA/hns: Remove redundant qpc setup operations
  RDMA/hns: Remove meaningless prints
  ...

drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
include/linux/mlx5/driver.h
include/linux/mlx5/fs.h
include/linux/mlx5/mlx5_ifc.h

diff --combined drivers/infiniband/hw/mlx5/cq.c
index 3dec3de903b751054c68bd2f407d3c44bd1c4ef6,f1c7fa561b1664158431207e0865a52e89a64825..146ba29667441eadcc3f4aac7443181cb9707142
@@@ -330,22 -330,6 +330,22 @@@ static void mlx5_handle_error_cqe(struc
                dump_cqe(dev, cqe);
  }
  
 +static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
 +                         u16 tail, u16 head)
 +{
 +      u16 idx;
 +
 +      do {
 +              idx = tail & (qp->sq.wqe_cnt - 1);
 +              if (idx == head)
 +                      break;
 +
 +              tail = qp->sq.w_list[idx].next;
 +      } while (1);
 +      tail = qp->sq.w_list[idx].next;
 +      qp->sq.last_poll = tail;
 +}
 +
  static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
  {
        mlx5_frag_buf_free(dev->mdev, &buf->frag_buf);
@@@ -384,7 -368,7 +384,7 @@@ static void get_sig_err_item(struct mlx
  }
  
  static void sw_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc,
 -                  int *npolled, int is_send)
 +                  int *npolled, bool is_send)
  {
        struct mlx5_ib_wq *wq;
        unsigned int cur;
                return;
  
        for (i = 0;  i < cur && np < num_entries; i++) {
 -              wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 +              unsigned int idx;
 +
 +              idx = (is_send) ? wq->last_poll : wq->tail;
 +              idx &= (wq->wqe_cnt - 1);
 +              wc->wr_id = wq->wrid[idx];
                wc->status = IB_WC_WR_FLUSH_ERR;
                wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
                wq->tail++;
 +              if (is_send)
 +                      wq->last_poll = wq->w_list[idx].next;
                np++;
                wc->qp = &qp->ibqp;
                wc++;
@@@ -495,7 -473,6 +495,7 @@@ repoll
                wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
                idx = wqe_ctr & (wq->wqe_cnt - 1);
                handle_good_req(wc, cqe64, wq, idx);
 +              handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
                wc->wr_id = wq->wrid[idx];
                wq->tail = wq->wqe_head[idx] + 1;
                wc->status = IB_WC_SUCCESS;
@@@ -715,17 -692,19 +715,19 @@@ static int create_cq_user(struct mlx5_i
        struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
                udata, struct mlx5_ib_ucontext, ibucontext);
  
-       ucmdlen = udata->inlen < sizeof(ucmd) ?
-                 (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd);
+       ucmdlen = min(udata->inlen, sizeof(ucmd));
+       if (ucmdlen < offsetof(struct mlx5_ib_create_cq, flags))
+               return -EINVAL;
  
        if (ib_copy_from_udata(&ucmd, udata, ucmdlen))
                return -EFAULT;
  
-       if (ucmdlen == sizeof(ucmd) &&
-           (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD)))
+       if ((ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD |
+                           MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)))
                return -EINVAL;
  
-       if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
+       if ((ucmd.cqe_size != 64 && ucmd.cqe_size != 128) ||
+           ucmd.reserved0 || ucmd.reserved1)
                return -EINVAL;
  
        *cqe_size = ucmd.cqe_size;
        MLX5_SET(cqc, cqc, log_page_size,
                 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
  
-       *index = context->bfregi.sys_pages[0];
+       if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
+               *index = ucmd.uar_page_index;
+       } else if (context->bfregi.lib_uar_dyn) {
+               err = -EINVAL;
+               goto err_cqb;
+       } else {
+               *index = context->bfregi.sys_pages[0];
+       }
  
        if (ucmd.cqe_comp_en == 1) {
                int mini_cqe_format;
diff --combined drivers/infiniband/hw/mlx5/main.c
index 3efa7493456bd29aff8cb954e98ab632b518fa35,bc9d7a99ef4b24847de81831e512d1c803bcfc5c..6679756506e605d115c41f2b25b86b7caeae2d25
@@@ -39,9 -39,6 +39,6 @@@
  #include <linux/dma-mapping.h>
  #include <linux/slab.h>
  #include <linux/bitmap.h>
- #if defined(CONFIG_X86)
- #include <asm/memtype.h>
- #endif
  #include <linux/sched.h>
  #include <linux/sched/mm.h>
  #include <linux/sched/task.h>
@@@ -898,7 -895,7 +895,7 @@@ static int mlx5_ib_query_device(struct 
                        props->raw_packet_caps |=
                                IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
  
-               if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
+               if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
                        max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
                        if (max_tso) {
                                resp.tso_caps.max_tso = 1 << max_tso;
                        }
                }
  
-               if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
+               if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
                        resp.rss_caps.rx_hash_function =
                                                MLX5_RX_HASH_FUNC_TOEPLITZ;
                        resp.rss_caps.rx_hash_fields_mask =
                        resp.response_length += sizeof(resp.rss_caps);
                }
        } else {
-               if (field_avail(typeof(resp), tso_caps, uhw_outlen))
+               if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
                        resp.response_length += sizeof(resp.tso_caps);
-               if (field_avail(typeof(resp), rss_caps, uhw_outlen))
+               if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
                        resp.response_length += sizeof(resp.rss_caps);
        }
  
                                                MLX5_MAX_CQ_PERIOD;
        }
  
-       if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
+       if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
                resp.response_length += sizeof(resp.cqe_comp_caps);
  
                if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
                }
        }
  
-       if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
+       if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
            raw_support) {
                if (MLX5_CAP_QOS(mdev, packet_pacing) &&
                    MLX5_CAP_GEN(mdev, qos)) {
                resp.response_length += sizeof(resp.packet_pacing_caps);
        }
  
-       if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
-                       uhw_outlen)) {
+       if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
+           uhw_outlen) {
                if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
                        resp.mlx5_ib_support_multi_pkt_send_wqes =
                                MLX5_IB_ALLOW_MPW;
                        sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
        }
  
-       if (field_avail(typeof(resp), flags, uhw_outlen)) {
+       if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
                resp.response_length += sizeof(resp.flags);
  
                if (MLX5_CAP_GEN(mdev, cqe_compression_128))
                resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
        }
  
-       if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
+       if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
                resp.response_length += sizeof(resp.sw_parsing_caps);
                if (MLX5_CAP_ETH(mdev, swp)) {
                        resp.sw_parsing_caps.sw_parsing_offloads |=
                }
        }
  
-       if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
+       if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
            raw_support) {
                resp.response_length += sizeof(resp.striding_rq_caps);
                if (MLX5_CAP_GEN(mdev, striding_rq)) {
                }
        }
  
-       if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
+       if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
                resp.response_length += sizeof(resp.tunnel_offloads_caps);
                if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
                        resp.tunnel_offloads_caps |=
                if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
                        resp.tunnel_offloads_caps |=
                                MLX5_IB_TUNNELED_OFFLOADS_GRE;
-               if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
-                   MLX5_FLEX_PROTO_CW_MPLS_GRE)
+               if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
                        resp.tunnel_offloads_caps |=
                                MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
-               if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
-                   MLX5_FLEX_PROTO_CW_MPLS_UDP)
+               if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
                        resp.tunnel_offloads_caps |=
                                MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
        }
@@@ -1791,6 -1786,7 +1786,7 @@@ static int mlx5_ib_alloc_ucontext(struc
                                     max_cqe_version);
        u32 dump_fill_mkey;
        bool lib_uar_4k;
+       bool lib_uar_dyn;
  
        if (!dev->ib_active)
                return -EAGAIN;
        }
  
        lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
+       lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
        bfregi = &context->bfregi;
  
+       if (lib_uar_dyn) {
+               bfregi->lib_uar_dyn = lib_uar_dyn;
+               goto uar_done;
+       }
        /* updates req->total_num_bfregs */
        err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
        if (err)
        if (err)
                goto out_sys_pages;
  
+ uar_done:
        if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
                err = mlx5_ib_devx_create(dev, true);
                if (err < 0)
        INIT_LIST_HEAD(&context->db_page_list);
        mutex_init(&context->db_page_mutex);
  
-       resp.tot_bfregs = req.total_num_bfregs;
+       resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs;
        resp.num_ports = dev->num_ports;
  
-       if (field_avail(typeof(resp), cqe_version, udata->outlen))
+       if (offsetofend(typeof(resp), cqe_version) <= udata->outlen)
                resp.response_length += sizeof(resp.cqe_version);
  
-       if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
+       if (offsetofend(typeof(resp), cmds_supp_uhw) <= udata->outlen) {
                resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
                                      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
                resp.response_length += sizeof(resp.cmds_supp_uhw);
        }
  
-       if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
+       if (offsetofend(typeof(resp), eth_min_inline) <= udata->outlen) {
                if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
                        mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
                        resp.eth_min_inline++;
                resp.response_length += sizeof(resp.eth_min_inline);
        }
  
-       if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
+       if (offsetofend(typeof(resp), clock_info_versions) <= udata->outlen) {
                if (mdev->clock_info)
                        resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
                resp.response_length += sizeof(resp.clock_info_versions);
         * pretend we don't support reading the HCA's core clock. This is also
         * forced by mmap function.
         */
-       if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
+       if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) {
                if (PAGE_SIZE <= 4096) {
                        resp.comp_mask |=
                                MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
                resp.response_length += sizeof(resp.hca_core_clock_offset);
        }
  
-       if (field_avail(typeof(resp), log_uar_size, udata->outlen))
+       if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen)
                resp.response_length += sizeof(resp.log_uar_size);
  
-       if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
+       if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen)
                resp.response_length += sizeof(resp.num_uars_per_page);
  
-       if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
+       if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) {
                resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
                resp.response_length += sizeof(resp.num_dyn_bfregs);
        }
  
-       if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) {
+       if (offsetofend(typeof(resp), dump_fill_mkey) <= udata->outlen) {
                if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
                        resp.dump_fill_mkey = dump_fill_mkey;
                        resp.comp_mask |=
@@@ -2026,6 -2029,17 +2029,17 @@@ static phys_addr_t uar_index2pfn(struc
        return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
  }
  
+ static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
+                                int uar_idx)
+ {
+       unsigned int fw_uars_per_page;
+       fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+                               MLX5_UARS_IN_PAGE : 1;
+       return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
+ }
  static int get_command(unsigned long offset)
  {
        return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
@@@ -2110,6 -2124,11 +2124,11 @@@ static void mlx5_ib_mmap_free(struct rd
                mutex_unlock(&var_table->bitmap_lock);
                kfree(mentry);
                break;
+       case MLX5_IB_MMAP_TYPE_UAR_WC:
+       case MLX5_IB_MMAP_TYPE_UAR_NC:
+               mlx5_cmd_free_uar(dev->mdev, mentry->page_idx);
+               kfree(mentry);
+               break;
        default:
                WARN_ON(true);
        }
@@@ -2130,6 -2149,9 +2149,9 @@@ static int uar_mmap(struct mlx5_ib_dev 
        int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
                                bfregi->num_static_sys_pages;
  
+       if (bfregi->lib_uar_dyn)
+               return -EINVAL;
        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
                return -EINVAL;
  
        switch (cmd) {
        case MLX5_IB_MMAP_WC_PAGE:
        case MLX5_IB_MMAP_ALLOC_WC:
- /* Some architectures don't support WC memory */
- #if defined(CONFIG_X86)
-               if (!pat_enabled())
-                       return -EPERM;
- #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
-                       return -EPERM;
- #endif
-       /* fall through */
        case MLX5_IB_MMAP_REGULAR_PAGE:
                /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
                prot = pgprot_writecombine(vma->vm_page_prot);
@@@ -2269,7 -2283,8 +2283,8 @@@ static int mlx5_ib_mmap_offset(struct m
  
        mentry = to_mmmap(entry);
        pfn = (mentry->address >> PAGE_SHIFT);
-       if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR)
+       if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
+           mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
                prot = pgprot_noncached(vma->vm_page_prot);
        else
                prot = pgprot_writecombine(vma->vm_page_prot);
@@@ -2300,9 -2315,12 +2315,12 @@@ static int mlx5_ib_mmap(struct ib_ucont
        command = get_command(vma->vm_pgoff);
        switch (command) {
        case MLX5_IB_MMAP_WC_PAGE:
+       case MLX5_IB_MMAP_ALLOC_WC:
+               if (!dev->wc_support)
+                       return -EPERM;
+               fallthrough;
        case MLX5_IB_MMAP_NC_PAGE:
        case MLX5_IB_MMAP_REGULAR_PAGE:
-       case MLX5_IB_MMAP_ALLOC_WC:
                return uar_mmap(dev, command, vma, context);
  
        case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
@@@ -3570,8 -3588,7 +3588,8 @@@ static void mlx5_ib_set_rule_source_por
                misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
                                    misc_parameters_2);
  
 -              MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0);
 +              MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
 +                       mlx5_eswitch_get_vport_metadata_mask());
        } else {
                misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
                                    misc_parameters);
@@@ -4046,6 -4063,11 +4064,11 @@@ _get_flow_table(struct mlx5_ib_dev *dev
                        BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
                                                       log_max_ft_size));
                priority = fs_matcher->priority;
+       } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+               max_table_size =
+                       BIT(MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
+                                                      log_max_ft_size));
+               priority = fs_matcher->priority;
        }
  
        max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
                prio = &dev->flow_db->fdb;
        else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
                prio = &dev->flow_db->rdma_rx[priority];
+       else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)
+               prio = &dev->flow_db->rdma_tx[priority];
  
        if (!prio)
                return ERR_PTR(-EINVAL);
@@@ -5723,10 -5747,9 +5748,10 @@@ mlx5_ib_counter_alloc_stats(struct rdma
        const struct mlx5_ib_counters *cnts =
                get_counters(dev, counter->port - 1);
  
 -      /* Q counters are in the beginning of all counters */
        return rdma_alloc_hw_stats_struct(cnts->names,
 -                                        cnts->num_q_counters,
 +                                        cnts->num_q_counters +
 +                                        cnts->num_cong_counters +
 +                                        cnts->num_ext_ppcnt_counters,
                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
  }
  
@@@ -6090,9 -6113,9 +6115,9 @@@ static void mlx5_ib_cleanup_multiport_m
        mlx5_nic_vport_disable_roce(dev->mdev);
  }
  
- static int var_obj_cleanup(struct ib_uobject *uobject,
-                          enum rdma_remove_reason why,
-                          struct uverbs_attr_bundle *attrs)
+ static int mmap_obj_cleanup(struct ib_uobject *uobject,
+                           enum rdma_remove_reason why,
+                           struct uverbs_attr_bundle *attrs)
  {
        struct mlx5_user_mmap_entry *obj = uobject->object;
  
        return 0;
  }
  
+ static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
+                                           struct mlx5_user_mmap_entry *entry,
+                                           size_t length)
+ {
+       return rdma_user_mmap_entry_insert_range(
+               &c->ibucontext, &entry->rdma_entry, length,
+               (MLX5_IB_MMAP_OFFSET_START << 16),
+               ((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
+ }
  static struct mlx5_user_mmap_entry *
  alloc_var_entry(struct mlx5_ib_ucontext *c)
  {
        entry->page_idx = page_idx;
        entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
  
-       err = rdma_user_mmap_entry_insert_range(
-               &c->ibucontext, &entry->rdma_entry, var_table->stride_size,
-               MLX5_IB_MMAP_OFFSET_START << 16,
-               (MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1);
+       err = mlx5_rdma_user_mmap_entry_insert(c, entry,
+                                              var_table->stride_size);
        if (err)
                goto err_insert;
  
@@@ -6217,7 -6248,7 +6250,7 @@@ DECLARE_UVERBS_NAMED_METHOD_DESTROY
                        UA_MANDATORY));
  
  DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
-                           UVERBS_TYPE_ALLOC_IDR(var_obj_cleanup),
+                           UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
                            &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
                            &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
  
@@@ -6229,6 -6260,134 +6262,134 @@@ static bool var_is_supported(struct ib_
                        MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
  }
  
+ static struct mlx5_user_mmap_entry *
+ alloc_uar_entry(struct mlx5_ib_ucontext *c,
+               enum mlx5_ib_uapi_uar_alloc_type alloc_type)
+ {
+       struct mlx5_user_mmap_entry *entry;
+       struct mlx5_ib_dev *dev;
+       u32 uar_index;
+       int err;
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return ERR_PTR(-ENOMEM);
+       dev = to_mdev(c->ibucontext.device);
+       err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
+       if (err)
+               goto end;
+       entry->page_idx = uar_index;
+       entry->address = uar_index2paddress(dev, uar_index);
+       if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
+               entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
+       else
+               entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
+       err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
+       if (err)
+               goto err_insert;
+       return entry;
+ err_insert:
+       mlx5_cmd_free_uar(dev->mdev, uar_index);
+ end:
+       kfree(entry);
+       return ERR_PTR(err);
+ }
+ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
+       struct uverbs_attr_bundle *attrs)
+ {
+       struct ib_uobject *uobj = uverbs_attr_get_uobject(
+               attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
+       enum mlx5_ib_uapi_uar_alloc_type alloc_type;
+       struct mlx5_ib_ucontext *c;
+       struct mlx5_user_mmap_entry *entry;
+       u64 mmap_offset;
+       u32 length;
+       int err;
+       c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+       if (IS_ERR(c))
+               return PTR_ERR(c);
+       err = uverbs_get_const(&alloc_type, attrs,
+                              MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
+       if (err)
+               return err;
+       if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
+           alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
+               return -EOPNOTSUPP;
+       if (!to_mdev(c->ibucontext.device)->wc_support &&
+           alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
+               return -EOPNOTSUPP;
+       entry = alloc_uar_entry(c, alloc_type);
+       if (IS_ERR(entry))
+               return PTR_ERR(entry);
+       mmap_offset = mlx5_entry_to_mmap_offset(entry);
+       length = entry->rdma_entry.npages * PAGE_SIZE;
+       uobj->object = entry;
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+                            &mmap_offset, sizeof(mmap_offset));
+       if (err)
+               goto err;
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+                            &entry->page_idx, sizeof(entry->page_idx));
+       if (err)
+               goto err;
+       err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+                            &length, sizeof(length));
+       if (err)
+               goto err;
+       return 0;
+ err:
+       rdma_user_mmap_entry_remove(&entry->rdma_entry);
+       return err;
+ }
+ DECLARE_UVERBS_NAMED_METHOD(
+       MLX5_IB_METHOD_UAR_OBJ_ALLOC,
+       UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
+                       MLX5_IB_OBJECT_UAR,
+                       UVERBS_ACCESS_NEW,
+                       UA_MANDATORY),
+       UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
+                            enum mlx5_ib_uapi_uar_alloc_type,
+                            UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+                          UVERBS_ATTR_TYPE(u32),
+                          UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+                          UVERBS_ATTR_TYPE(u32),
+                          UA_MANDATORY),
+       UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+                           UVERBS_ATTR_TYPE(u64),
+                           UA_MANDATORY));
+ DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+       MLX5_IB_METHOD_UAR_OBJ_DESTROY,
+       UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
+                       MLX5_IB_OBJECT_UAR,
+                       UVERBS_ACCESS_DESTROY,
+                       UA_MANDATORY));
+ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
+                           UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
+                           &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
+                           &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
  ADD_UVERBS_ATTRIBUTES_SIMPLE(
        mlx5_ib_dm,
        UVERBS_OBJECT_DM,
@@@ -6253,12 -6412,14 +6414,14 @@@ ADD_UVERBS_ATTRIBUTES_SIMPLE
  static const struct uapi_definition mlx5_ib_defs[] = {
        UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
        UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
+       UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
  
        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
                                &mlx5_ib_flow_action),
        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
                                UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
+       UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
        {}
  };
  
@@@ -6392,7 -6553,7 +6555,7 @@@ static int mlx5_ib_stage_init_init(stru
        spin_lock_init(&dev->reset_flow_resource_lock);
        xa_init(&dev->odp_mkeys);
        xa_init(&dev->sig_mrs);
-       spin_lock_init(&dev->mkey_lock);
+       atomic_set(&dev->mkey_var, 0);
  
        spin_lock_init(&dev->dm.lock);
        dev->dm.dev = mdev;
@@@ -6548,7 -6709,8 +6711,8 @@@ static int mlx5_ib_init_var_table(struc
                                        doorbell_bar_offset);
        bar_size = (1ULL << log_doorbell_bar_size) * 4096;
        var_table->stride_size = 1ULL << log_doorbell_stride;
-       var_table->num_var_hw_entries = div64_u64(bar_size, var_table->stride_size);
+       var_table->num_var_hw_entries = div_u64(bar_size,
+                                               var_table->stride_size);
        mutex_init(&var_table->bitmap_lock);
        var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
                                          GFP_KERNEL);
@@@ -7080,6 -7242,9 +7244,9 @@@ const struct mlx5_ib_profile raw_eth_pr
        STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
                     mlx5_ib_stage_counters_init,
                     mlx5_ib_stage_counters_cleanup),
+       STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
+                    mlx5_ib_stage_cong_debugfs_init,
+                    mlx5_ib_stage_cong_debugfs_cleanup),
        STAGE_CREATE(MLX5_IB_STAGE_UAR,
                     mlx5_ib_stage_uar_init,
                     mlx5_ib_stage_uar_cleanup),
diff --combined drivers/infiniband/hw/mlx5/mlx5_ib.h
index fc19dc1cf2e12dff3248c1b90f0942a7ab64c513,544b6392359e90a132c72e2b1b7e8f1abaf520ea..a4e522385de05413b8e49421b8f847b0466cd9d6
@@@ -64,8 -64,6 +64,6 @@@
        dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__,     \
                 __LINE__, current->pid, ##arg)
  
- #define field_avail(type, fld, sz) (offsetof(type, fld) +             \
-                                   sizeof(((type *)0)->fld) <= (sz))
  #define MLX5_IB_DEFAULT_UIDX 0xffffff
  #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
  
@@@ -126,11 -124,27 +124,27 @@@ enum 
  enum mlx5_ib_mmap_type {
        MLX5_IB_MMAP_TYPE_MEMIC = 1,
        MLX5_IB_MMAP_TYPE_VAR = 2,
+       MLX5_IB_MMAP_TYPE_UAR_WC = 3,
+       MLX5_IB_MMAP_TYPE_UAR_NC = 4,
  };
  
- #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)                                        \
-       (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
- #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
+ struct mlx5_bfreg_info {
+       u32 *sys_pages;
+       int num_low_latency_bfregs;
+       unsigned int *count;
+       /*
+        * protect bfreg allocation data structs
+        */
+       struct mutex lock;
+       u32 ver;
+       u8 lib_uar_4k : 1;
+       u8 lib_uar_dyn : 1;
+       u32 num_sys_pages;
+       u32 num_static_sys_pages;
+       u32 total_num_bfregs;
+       u32 num_dyn_bfregs;
+ };
  
  struct mlx5_ib_ucontext {
        struct ib_ucontext      ibucontext;
@@@ -203,6 -217,11 +217,11 @@@ struct mlx5_ib_flow_matcher 
        u8                      match_criteria_enable;
  };
  
+ struct mlx5_ib_pp {
+       u16 index;
+       struct mlx5_core_dev *mdev;
+ };
  struct mlx5_ib_flow_db {
        struct mlx5_ib_flow_prio        prios[MLX5_IB_NUM_FLOW_FT];
        struct mlx5_ib_flow_prio        egress_prios[MLX5_IB_NUM_FLOW_FT];
        struct mlx5_ib_flow_prio        egress[MLX5_IB_NUM_EGRESS_FTS];
        struct mlx5_ib_flow_prio        fdb;
        struct mlx5_ib_flow_prio        rdma_rx[MLX5_IB_NUM_FLOW_FT];
+       struct mlx5_ib_flow_prio        rdma_tx[MLX5_IB_NUM_FLOW_FT];
        struct mlx5_flow_table          *lag_demux_ft;
        /* Protect flow steering bypass flow tables
         * when add/del flow rules.
@@@ -288,7 -308,6 +308,7 @@@ struct mlx5_ib_wq 
        unsigned                head;
        unsigned                tail;
        u16                     cur_post;
 +      u16                     last_poll;
        void                    *cur_edge;
  };
  
@@@ -618,8 -637,8 +638,8 @@@ struct mlx5_ib_mr 
        struct ib_umem         *umem;
        struct mlx5_shared_mr_info      *smr_info;
        struct list_head        list;
-       int                     order;
-       bool                    allocated_from_cache;
+       unsigned int            order;
+       struct mlx5_cache_ent  *cache_ent;
        int                     npages;
        struct mlx5_ib_dev     *dev;
        u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
@@@ -701,22 -720,34 +721,34 @@@ struct mlx5_cache_ent 
        u32                     access_mode;
        u32                     page;
  
-       u32                     size;
-       u32                     cur;
+       u8 disabled:1;
+       u8 fill_to_high_water:1;
+       /*
+        * - available_mrs is the length of list head, ie the number of MRs
+        *   available for immediate allocation.
+        * - total_mrs is available_mrs plus all in use MRs that could be
+        *   returned to the cache.
+        * - limit is the low water mark for available_mrs, 2* limit is the
+        *   upper water mark.
+        * - pending is the number of MRs currently being created
+        */
+       u32 total_mrs;
+       u32 available_mrs;
+       u32 limit;
+       u32 pending;
+       /* Statistics */
        u32                     miss;
-       u32                     limit;
  
        struct mlx5_ib_dev     *dev;
        struct work_struct      work;
        struct delayed_work     dwork;
-       int                     pending;
-       struct completion       compl;
  };
  
  struct mlx5_mr_cache {
        struct workqueue_struct *wq;
        struct mlx5_cache_ent   ent[MAX_MR_CACHE_ENTRIES];
-       int                     stopped;
        struct dentry           *root;
        unsigned long           last_add;
  };
@@@ -794,6 -825,7 +826,7 @@@ enum mlx5_ib_dbg_cc_types 
        MLX5_IB_DBG_CC_RP_BYTE_RESET,
        MLX5_IB_DBG_CC_RP_THRESHOLD,
        MLX5_IB_DBG_CC_RP_AI_RATE,
+       MLX5_IB_DBG_CC_RP_MAX_RATE,
        MLX5_IB_DBG_CC_RP_HAI_RATE,
        MLX5_IB_DBG_CC_RP_MIN_DEC_FAC,
        MLX5_IB_DBG_CC_RP_MIN_RATE,
        MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD,
        MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE,
        MLX5_IB_DBG_CC_RP_GD,
+       MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS,
        MLX5_IB_DBG_CC_NP_CNP_DSCP,
        MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE,
        MLX5_IB_DBG_CC_NP_CNP_PRIO,
@@@ -986,19 -1019,16 +1020,16 @@@ struct mlx5_ib_dev 
         */
        struct mutex                    cap_mask_mutex;
        u8                              ib_active:1;
-       u8                              fill_delay:1;
        u8                              is_rep:1;
        u8                              lag_active:1;
        u8                              wc_support:1;
+       u8                              fill_delay;
        struct umr_common               umrc;
        /* sync used page count stats
         */
        struct mlx5_ib_resources        devr;
  
-       /* protect mkey key part */
-       spinlock_t                      mkey_lock;
-       u8                              mkey_key;
+       atomic_t                        mkey_var;
        struct mlx5_mr_cache            cache;
        struct timer_list               delay_timer;
        /* Prevents soft lock on massive reg MRs */
@@@ -1268,7 -1298,8 +1299,8 @@@ int mlx5_ib_get_cqe_size(struct ib_cq *
  int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
  int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
  
- struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
+ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+                                      unsigned int entry);
  void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
  int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);
  
@@@ -1388,6 -1419,7 +1420,7 @@@ int mlx5_ib_fill_stat_entry(struct sk_b
  
  extern const struct uapi_definition mlx5_ib_devx_defs[];
  extern const struct uapi_definition mlx5_ib_flow_defs[];
+ extern const struct uapi_definition mlx5_ib_qos_defs[];
  
  #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
  int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
@@@ -1477,12 -1509,11 +1510,11 @@@ static inline int get_qp_user_index(str
  {
        u8 cqe_version = ucontext->cqe_version;
  
-       if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) &&
-           !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+       if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version &&
+           (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
                return 0;
  
-       if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) !=
-              !!cqe_version))
+       if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version)
                return -EINVAL;
  
        return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
@@@ -1495,12 -1526,11 +1527,11 @@@ static inline int get_srq_user_index(st
  {
        u8 cqe_version = ucontext->cqe_version;
  
-       if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) &&
-           !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+       if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version &&
+           (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
                return 0;
  
-       if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) !=
-              !!cqe_version))
+       if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version)
                return -EINVAL;
  
        return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
@@@ -1539,7 -1569,9 +1570,9 @@@ static inline bool mlx5_ib_can_use_umr(
            MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
                return false;
  
-       if (access_flags & IB_ACCESS_RELAXED_ORDERING)
+       if (access_flags & IB_ACCESS_RELAXED_ORDERING &&
+           (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) ||
+            MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)))
                return false;
  
        return true;
diff --combined drivers/infiniband/hw/mlx5/qp.c
index 8fe149e808af1f3ed32dc4b9fcf74899758fe84b,aa7834d804937852af3655ffc6d8d185b8def0e5..1456db4b6295907c47e0b7f2d083819e58b49dc3
@@@ -697,6 -697,9 +697,9 @@@ static int alloc_bfreg(struct mlx5_ib_d
  {
        int bfregn = -ENOMEM;
  
+       if (bfregi->lib_uar_dyn)
+               return -EINVAL;
        mutex_lock(&bfregi->lock);
        if (bfregi->ver >= 2) {
                bfregn = alloc_high_class_bfreg(dev, bfregi);
@@@ -768,6 -771,9 +771,9 @@@ int bfregn_to_uar_index(struct mlx5_ib_
        u32 index_of_sys_page;
        u32 offset;
  
+       if (bfregi->lib_uar_dyn)
+               return -EINVAL;
        bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) *
                                MLX5_NON_FP_BFREGS_PER_UAR;
        index_of_sys_page = bfregn / bfregs_per_sys_page;
@@@ -919,6 -925,7 +925,7 @@@ static int create_user_qp(struct mlx5_i
        void *qpc;
        int err;
        u16 uid;
+       u32 uar_flags;
  
        err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
        if (err) {
  
        context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
                                            ibucontext);
-       if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
+       uar_flags = ucmd.flags & (MLX5_QP_FLAG_UAR_PAGE_INDEX |
+                                 MLX5_QP_FLAG_BFREG_INDEX);
+       switch (uar_flags) {
+       case MLX5_QP_FLAG_UAR_PAGE_INDEX:
+               uar_index = ucmd.bfreg_index;
+               bfregn = MLX5_IB_INVALID_BFREG;
+               break;
+       case MLX5_QP_FLAG_BFREG_INDEX:
                uar_index = bfregn_to_uar_index(dev, &context->bfregi,
                                                ucmd.bfreg_index, true);
                if (uar_index < 0)
                        return uar_index;
                bfregn = MLX5_IB_INVALID_BFREG;
-       } else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) {
-               /*
-                * TBD: should come from the verbs when we have the API
-                */
-               /* In CROSS_CHANNEL CQ and QP must use the same UAR */
-               bfregn = MLX5_CROSS_CHANNEL_BFREG;
-       }
-       else {
+               break;
+       case 0:
+               if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+                       return -EINVAL;
                bfregn = alloc_bfreg(dev, &context->bfregi);
                if (bfregn < 0)
                        return bfregn;
+               break;
+       default:
+               return -EINVAL;
        }
  
        mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
@@@ -2100,6 -2112,7 +2112,7 @@@ static int create_qp_common(struct mlx5
                                      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
                                      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
                                      MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+                                     MLX5_QP_FLAG_UAR_PAGE_INDEX |
                                      MLX5_QP_FLAG_TYPE_DCI |
                                      MLX5_QP_FLAG_TYPE_DCT))
                        return -EINVAL;
@@@ -2789,7 -2802,7 +2802,7 @@@ struct ib_qp *mlx5_ib_create_qp(struct 
                mlx5_ib_dbg(dev, "unsupported qp type %d\n",
                            init_attr->qp_type);
                /* Don't support raw QPs */
-               return ERR_PTR(-EINVAL);
+               return ERR_PTR(-EOPNOTSUPP);
        }
  
        if (verbs_init_attr->qp_type == IB_QPT_DRIVER)
@@@ -3775,7 -3788,6 +3788,7 @@@ static int __mlx5_ib_modify_qp(struct i
                qp->sq.cur_post = 0;
                if (qp->sq.wqe_cnt)
                        qp->sq.cur_edge = get_sq_edge(&qp->sq, 0);
 +              qp->sq.last_poll = 0;
                qp->db.db[MLX5_RCV_DBR] = 0;
                qp->db.db[MLX5_SND_DBR] = 0;
        }
@@@ -6205,10 -6217,6 +6218,10 @@@ struct ib_wq *mlx5_ib_create_wq(struct 
        if (udata->outlen && udata->outlen < min_resp_len)
                return ERR_PTR(-EINVAL);
  
 +      if (!capable(CAP_SYS_RAWIO) &&
 +          init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP)
 +              return ERR_PTR(-EPERM);
 +
        dev = to_mdev(pd->device);
        switch (init_attr->wq_type) {
        case IB_WQT_RQ:
diff --combined drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index a10a0c2ca2da10c143cf1d63e789ee37d6773390,a47097d4577c7d547a152fdfd7e038d5c62931cd..67a21fdf5367f82a9dd80c2211657a0ca094c059
@@@ -68,9 -68,6 +68,6 @@@ static void ipoib_get_drvinfo(struct ne
        strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent),
                sizeof(drvinfo->bus_info));
  
-       strlcpy(drvinfo->version, ipoib_driver_version,
-               sizeof(drvinfo->version));
        strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));
  }
  
@@@ -213,8 -210,6 +210,8 @@@ static int ipoib_get_link_ksettings(str
  }
  
  static const struct ethtool_ops ipoib_ethtool_ops = {
 +      .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS |
 +                                   ETHTOOL_COALESCE_RX_MAX_FRAMES,
        .get_link_ksettings     = ipoib_get_link_ksettings,
        .get_drvinfo            = ipoib_get_drvinfo,
        .get_coalesce           = ipoib_get_coalesce,
diff --combined drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 62ce2b9417abca4e8b5aaf3f25609b629978fc6b,98c74a867ef4e4a53f47f807afe1fde4a52fd82e..d5defe09339a8a4e406027062b0e64f855f4acd2
                               .identified_miss_table_mode),                   \
                FS_CAP(flow_table_properties_nic_transmit.flow_table_modify))
  
+ #define FS_CHAINING_CAPS_RDMA_TX                                                \
+       FS_REQUIRED_CAPS(                                                       \
+               FS_CAP(flow_table_properties_nic_transmit_rdma.flow_modify_en), \
+               FS_CAP(flow_table_properties_nic_transmit_rdma.modify_root),    \
+               FS_CAP(flow_table_properties_nic_transmit_rdma                  \
+                              .identified_miss_table_mode),                    \
+               FS_CAP(flow_table_properties_nic_transmit_rdma                  \
+                              .flow_table_modify))
  #define LEFTOVERS_NUM_LEVELS 1
  #define LEFTOVERS_NUM_PRIOS 1
  
  #define ANCHOR_NUM_PRIOS 1
  #define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1)
  
 -#define OFFLOADS_MAX_FT 1
 -#define OFFLOADS_NUM_PRIOS 1
 -#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + 1)
 +#define OFFLOADS_MAX_FT 2
 +#define OFFLOADS_NUM_PRIOS 2
 +#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + OFFLOADS_NUM_PRIOS)
  
  #define LAG_PRIO_NUM_LEVELS 1
  #define LAG_NUM_PRIOS 1
@@@ -145,7 -154,7 +154,7 @@@ static struct init_tree_node 
                           ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
                                  ADD_MULTIPLE_PRIO(LAG_NUM_PRIOS,
                                                    LAG_PRIO_NUM_LEVELS))),
 -                ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, {},
 +                ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, FS_CHAINING_CAPS,
                           ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
                                  ADD_MULTIPLE_PRIO(OFFLOADS_NUM_PRIOS,
                                                    OFFLOADS_MAX_FT))),
@@@ -202,6 -211,18 +211,18 @@@ static struct init_tree_node rdma_rx_ro
        }
  };
  
+ static struct init_tree_node rdma_tx_root_fs = {
+       .type = FS_TYPE_NAMESPACE,
+       .ar_size = 1,
+       .children = (struct init_tree_node[]) {
+               ADD_PRIO(0, MLX5_BY_PASS_NUM_PRIOS, 0,
+                        FS_CHAINING_CAPS_RDMA_TX,
+                        ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
+                               ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS,
+                                                 BY_PASS_PRIO_NUM_LEVELS))),
+       }
+ };
  enum fs_i_lock_class {
        FS_LOCK_GRANDPARENT,
        FS_LOCK_PARENT,
@@@ -1322,7 -1343,7 +1343,7 @@@ add_rule_fte(struct fs_fte *fte
  
        fte->node.active = true;
        fte->status |= FS_FTE_STATUS_EXISTING;
 -      atomic_inc(&fte->node.version);
 +      atomic_inc(&fg->node.version);
  
  out:
        return handle;
@@@ -1577,19 -1598,28 +1598,19 @@@ struct match_list 
        struct mlx5_flow_group *g;
  };
  
 -struct match_list_head {
 -      struct list_head  list;
 -      struct match_list first;
 -};
 -
 -static void free_match_list(struct match_list_head *head, bool ft_locked)
 +static void free_match_list(struct match_list *head, bool ft_locked)
  {
 -      if (!list_empty(&head->list)) {
 -              struct match_list *iter, *match_tmp;
 +      struct match_list *iter, *match_tmp;
  
 -              list_del(&head->first.list);
 -              tree_put_node(&head->first.g->node, ft_locked);
 -              list_for_each_entry_safe(iter, match_tmp, &head->list,
 -                                       list) {
 -                      tree_put_node(&iter->g->node, ft_locked);
 -                      list_del(&iter->list);
 -                      kfree(iter);
 -              }
 +      list_for_each_entry_safe(iter, match_tmp, &head->list,
 +                               list) {
 +              tree_put_node(&iter->g->node, ft_locked);
 +              list_del(&iter->list);
 +              kfree(iter);
        }
  }
  
 -static int build_match_list(struct match_list_head *match_head,
 +static int build_match_list(struct match_list *match_head,
                            struct mlx5_flow_table *ft,
                            const struct mlx5_flow_spec *spec,
                            bool ft_locked)
        rhl_for_each_entry_rcu(g, tmp, list, hash) {
                struct match_list *curr_match;
  
 -              if (likely(list_empty(&match_head->list))) {
 -                      if (!tree_get_node(&g->node))
 -                              continue;
 -                      match_head->first.g = g;
 -                      list_add_tail(&match_head->first.list,
 -                                    &match_head->list);
 +              if (unlikely(!tree_get_node(&g->node)))
                        continue;
 -              }
  
                curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
                if (!curr_match) {
                        err = -ENOMEM;
                        goto out;
                }
 -              if (!tree_get_node(&g->node)) {
 -                      kfree(curr_match);
 -                      continue;
 -              }
                curr_match->g = g;
                list_add_tail(&curr_match->list, &match_head->list);
        }
@@@ -1680,7 -1720,7 +1701,7 @@@ try_add_to_existing_fg(struct mlx5_flow
        struct match_list *iter;
        bool take_write = false;
        struct fs_fte *fte;
 -      u64  version;
 +      u64  version = 0;
        int err;
  
        fte = alloc_fte(ft, spec, flow_act);
                return  ERR_PTR(-ENOMEM);
  
  search_again_locked:
 -      version = matched_fgs_get_version(match_head);
        if (flow_act->flags & FLOW_ACT_NO_APPEND)
                goto skip_search;
 -      /* Try to find a fg that already contains a matching fte */
 +      version = matched_fgs_get_version(match_head);
 +      /* Try to find an fte with identical match value and attempt update its
 +       * action.
 +       */
        list_for_each_entry(iter, match_head, list) {
                struct fs_fte *fte_tmp;
  
@@@ -1721,12 -1759,10 +1742,12 @@@ skip_search
                goto out;
        }
  
 -      /* Check the fgs version, for case the new FTE with the
 -       * same values was added while the fgs weren't locked
 +      /* Check the fgs version. If version have changed it could be that an
 +       * FTE with the same match value was added while the fgs weren't
 +       * locked.
         */
 -      if (version != matched_fgs_get_version(match_head)) {
 +      if (!(flow_act->flags & FLOW_ACT_NO_APPEND) &&
 +          version != matched_fgs_get_version(match_head)) {
                take_write = true;
                goto search_again_locked;
        }
@@@ -1770,9 -1806,9 +1791,9 @@@ _mlx5_add_flow_rules(struct mlx5_flow_t
  
  {
        struct mlx5_flow_steering *steering = get_steering(&ft->node);
 -      struct mlx5_flow_group *g;
        struct mlx5_flow_handle *rule;
 -      struct match_list_head match_head;
 +      struct match_list match_head;
 +      struct mlx5_flow_group *g;
        bool take_write = false;
        struct fs_fte *fte;
        int version;
@@@ -1877,16 -1913,12 +1898,16 @@@ mlx5_add_flow_rules(struct mlx5_flow_ta
                    int num_dest)
  {
        struct mlx5_flow_root_namespace *root = find_root(&ft->node);
 +      static const struct mlx5_flow_spec zero_spec = {};
        struct mlx5_flow_destination gen_dest = {};
        struct mlx5_flow_table *next_ft = NULL;
        struct mlx5_flow_handle *handle = NULL;
        u32 sw_action = flow_act->action;
        struct fs_prio *prio;
  
 +      if (!spec)
 +              spec = &zero_spec;
 +
        fs_get_obj(prio, ft->node.parent);
        if (flow_act->action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
                if (!fwd_next_prio_supported(ft))
@@@ -2121,6 -2153,8 +2142,8 @@@ struct mlx5_flow_namespace *mlx5_get_fl
        } else if (type == MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL) {
                root_ns = steering->rdma_rx_root_ns;
                prio = RDMA_RX_KERNEL_PRIO;
+       } else if (type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+               root_ns = steering->rdma_tx_root_ns;
        } else { /* Must be NIC RX */
                root_ns = steering->root_ns;
                prio = type;
@@@ -2524,6 -2558,7 +2547,7 @@@ void mlx5_cleanup_fs(struct mlx5_core_d
        cleanup_root_ns(steering->sniffer_rx_root_ns);
        cleanup_root_ns(steering->sniffer_tx_root_ns);
        cleanup_root_ns(steering->rdma_rx_root_ns);
+       cleanup_root_ns(steering->rdma_tx_root_ns);
        cleanup_root_ns(steering->egress_root_ns);
        mlx5_cleanup_fc_stats(dev);
        kmem_cache_destroy(steering->ftes_cache);
@@@ -2580,6 -2615,29 +2604,29 @@@ out_err
        return err;
  }
  
+ static int init_rdma_tx_root_ns(struct mlx5_flow_steering *steering)
+ {
+       int err;
+       steering->rdma_tx_root_ns = create_root_ns(steering, FS_FT_RDMA_TX);
+       if (!steering->rdma_tx_root_ns)
+               return -ENOMEM;
+       err = init_root_tree(steering, &rdma_tx_root_fs,
+                            &steering->rdma_tx_root_ns->ns.node);
+       if (err)
+               goto out_err;
+       set_prio_attrs(steering->rdma_tx_root_ns);
+       return 0;
+ out_err:
+       cleanup_root_ns(steering->rdma_tx_root_ns);
+       steering->rdma_tx_root_ns = NULL;
+       return err;
+ }
  /* FT and tc chains are stored in the same array so we can re-use the
   * mlx5_get_fdb_sub_ns() and tc api for FT chains.
   * When creating a new ns for each chain store it in the first available slot.
@@@ -2689,17 -2747,6 +2736,17 @@@ static int init_fdb_root_ns(struct mlx5
                goto out_err;
        }
  
 +      /* We put this priority last, knowing that nothing will get here
 +       * unless explicitly forwarded to. This is possible because the
 +       * slow path tables have catch all rules and nothing gets passed
 +       * those tables.
 +       */
 +      maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_PER_VPORT, 1);
 +      if (IS_ERR(maj_prio)) {
 +              err = PTR_ERR(maj_prio);
 +              goto out_err;
 +      }
 +
        set_prio_attrs(steering->fdb_root_ns);
        return 0;
  
@@@ -2890,6 -2937,12 +2937,12 @@@ int mlx5_init_fs(struct mlx5_core_dev *
                        goto err;
        }
  
+       if (MLX5_CAP_FLOWTABLE_RDMA_TX(dev, ft_support)) {
+               err = init_rdma_tx_root_ns(steering);
+               if (err)
+                       goto err;
+       }
        if (MLX5_IPSEC_DEV(dev) || MLX5_CAP_FLOWTABLE_NIC_TX(dev, ft_support)) {
                err = init_egress_root_ns(steering);
                if (err)
diff --combined include/linux/mlx5/driver.h
index d143b8bd55c9012f186b55b551d10d66d90c5959,a30d834fdf7e507e1427e00537e4664a57bfcf95..6f8f79ef829b1829b9c19eba43857c4467d8fa9b
@@@ -213,23 -213,6 +213,6 @@@ enum mlx5_port_status 
        MLX5_PORT_DOWN      = 2,
  };
  
- struct mlx5_bfreg_info {
-       u32                    *sys_pages;
-       int                     num_low_latency_bfregs;
-       unsigned int           *count;
-       /*
-        * protect bfreg allocation data structs
-        */
-       struct mutex            lock;
-       u32                     ver;
-       bool                    lib_uar_4k;
-       u32                     num_sys_pages;
-       u32                     num_static_sys_pages;
-       u32                     total_num_bfregs;
-       u32                     num_dyn_bfregs;
- };
  struct mlx5_cmd_first {
        __be32          data[4];
  };
@@@ -720,7 -703,6 +703,7 @@@ struct mlx5_core_dev 
        struct mlx5_clock        clock;
        struct mlx5_ib_clock_info  *clock_info;
        struct mlx5_fw_tracer   *tracer;
 +      struct mlx5_rsc_dump    *rsc_dump;
        u32                      vsc_addr;
        struct mlx5_hv_vhca     *hv_vhca;
  };
diff --combined include/linux/mlx5/fs.h
index a5cf5c76f348e5d1429bc04b5c63ebeb458ef742,44c9fe792fc45432e6369b5f2366bf5040c54c3e..e2d13e0740670985035aaebe4f542b98fa39ebe7
@@@ -77,6 -77,7 +77,7 @@@ enum mlx5_flow_namespace_type 
        MLX5_FLOW_NAMESPACE_EGRESS,
        MLX5_FLOW_NAMESPACE_RDMA_RX,
        MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL,
+       MLX5_FLOW_NAMESPACE_RDMA_TX,
  };
  
  enum {
@@@ -84,7 -85,6 +85,7 @@@
        FDB_TC_OFFLOAD,
        FDB_FT_OFFLOAD,
        FDB_SLOW_PATH,
 +      FDB_PER_VPORT,
  };
  
  struct mlx5_pkt_reformat;
diff --combined include/linux/mlx5/mlx5_ifc.h
index cc55cee3b53cea151dd7931ccfeaf786e7566dd5,a67ed87a68cd8f105e8031139c08ba402143c206..69b27c7dfc3e27d6eb4349613686b03f0718959a
@@@ -416,8 -416,7 +416,8 @@@ struct mlx5_ifc_flow_table_prop_layout_
        u8         termination_table[0x1];
        u8         reformat_and_fwd_to_table[0x1];
        u8         reserved_at_1a[0x6];
 -      u8         reserved_at_20[0x2];
 +      u8         termination_table_raw_traffic[0x1];
 +      u8         reserved_at_21[0x1];
        u8         log_max_ft_size[0x6];
        u8         log_max_modify_header_context[0x8];
        u8         max_modify_header_actions[0x8];
@@@ -709,7 -708,7 +709,7 @@@ struct mlx5_ifc_flow_table_nic_cap_bit
  
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit;
  
-       u8         reserved_at_a00[0x200];
+       struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_rdma;
  
        struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
  
@@@ -879,7 -878,11 +879,11 @@@ struct mlx5_ifc_per_protocol_networking
        u8         swp_csum[0x1];
        u8         swp_lso[0x1];
        u8         cqe_checksum_full[0x1];
-       u8         reserved_at_24[0x5];
+       u8         tunnel_stateless_geneve_tx[0x1];
+       u8         tunnel_stateless_mpls_over_udp[0x1];
+       u8         tunnel_stateless_mpls_over_gre[0x1];
+       u8         tunnel_stateless_vxlan_gpe[0x1];
+       u8         tunnel_stateless_ipv4_over_vxlan[0x1];
        u8         tunnel_stateless_ip_over_ip[0x1];
        u8         reserved_at_2a[0x6];
        u8         max_vxlan_udp_ports[0x8];