drivers/net/ethernet/intel/ice/ice_txrx.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* Copyright (c) 2018, Intel Corporation. */
   3
   4 /* The driver transmit and receive code */
   5
   6 #include <linux/mm.h>
   7 #include <linux/netdevice.h>
   8 #include <linux/prefetch.h>
   9 #include <linux/bpf_trace.h>
  10 #include <net/dsfield.h>
  11 #include <net/xdp.h>
  12 #include "ice_txrx_lib.h"
  13 #include "ice_lib.h"
  14 #include "ice.h"
  15 #include "ice_trace.h"
  16 #include "ice_dcb_lib.h"
  17 #include "ice_xsk.h"
  18 #include "ice_eswitch.h"
  19
  20 #define ICE_RX_HDR_SIZE         256
  21
  22 #define FDIR_DESC_RXDID 0x40
  23 #define ICE_FDIR_CLEAN_DELAY 10
  24
  25 /**
  26  * ice_prgm_fdir_fltr - Program a Flow Director filter
  27  * @vsi: VSI to send dummy packet
  28  * @fdir_desc: flow director descriptor
  29  * @raw_packet: allocated buffer for flow director
  30  */
  31 int
  32 ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc,
  33                    u8 *raw_packet)
  34 {
  35         struct ice_tx_buf *tx_buf, *first;
  36         struct ice_fltr_desc *f_desc;
  37         struct ice_tx_desc *tx_desc;
  38         struct ice_tx_ring *tx_ring;
  39         struct device *dev;
  40         dma_addr_t dma;
  41         u32 td_cmd;
  42         u16 i;
  43
  44         /* VSI and Tx ring */
  45         if (!vsi)
  46                 return -ENOENT;
  47         tx_ring = vsi->tx_rings[0];
  48         if (!tx_ring || !tx_ring->desc)
  49                 return -ENOENT;
  50         dev = tx_ring->dev;
  51
  52         /* we are using two descriptors to add/del a filter and we can wait */
  53         for (i = ICE_FDIR_CLEAN_DELAY; ICE_DESC_UNUSED(tx_ring) < 2; i--) {
  54                 if (!i)
  55                         return -EAGAIN;
  56                 msleep_interruptible(1);
  57         }
  58
  59         dma = dma_map_single(dev, raw_packet, ICE_FDIR_MAX_RAW_PKT_SIZE,
  60                              DMA_TO_DEVICE);
  61
  62         if (dma_mapping_error(dev, dma))
  63                 return -EINVAL;
  64
  65         /* grab the next descriptor */
  66         i = tx_ring->next_to_use;
  67         first = &tx_ring->tx_buf[i];
  68         f_desc = ICE_TX_FDIRDESC(tx_ring, i);
  69         memcpy(f_desc, fdir_desc, sizeof(*f_desc));
  70
  71         i++;
  72         i = (i < tx_ring->count) ? i : 0;
  73         tx_desc = ICE_TX_DESC(tx_ring, i);
  74         tx_buf = &tx_ring->tx_buf[i];
  75
  76         i++;
  77         tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
  78
  79         memset(tx_buf, 0, sizeof(*tx_buf));
  80         dma_unmap_len_set(tx_buf, len, ICE_FDIR_MAX_RAW_PKT_SIZE);
  81         dma_unmap_addr_set(tx_buf, dma, dma);
  82
  83         tx_desc->buf_addr = cpu_to_le64(dma);
  84         td_cmd = ICE_TXD_LAST_DESC_CMD | ICE_TX_DESC_CMD_DUMMY |
  85                  ICE_TX_DESC_CMD_RE;
  86
  87         tx_buf->tx_flags = ICE_TX_FLAGS_DUMMY_PKT;
  88         tx_buf->raw_buf = raw_packet;
  89
  90         tx_desc->cmd_type_offset_bsz =
  91                 ice_build_ctob(td_cmd, 0, ICE_FDIR_MAX_RAW_PKT_SIZE, 0);
  92
  93         /* Force memory write to complete before letting h/w know
  94          * there are new descriptors to fetch.
  95          */
  96         wmb();
  97
  98         /* mark the data descriptor to be watched */
  99         first->next_to_watch = tx_desc;
 100
 101         writel(tx_ring->next_to_use, tx_ring->tail);
 102
 103         return 0;
 104 }
 105
 106 /**
 107  * ice_unmap_and_free_tx_buf - Release a Tx buffer
 108  * @ring: the ring that owns the buffer
 109  * @tx_buf: the buffer to free
 110  */
 111 static void
 112 ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf)
 113 {
 114         if (tx_buf->skb) {
 115                 if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
 116                         devm_kfree(ring->dev, tx_buf->raw_buf);
 117                 else if (ice_ring_is_xdp(ring))
 118                         page_frag_free(tx_buf->raw_buf);
 119                 else
 120                         dev_kfree_skb_any(tx_buf->skb);
 121                 if (dma_unmap_len(tx_buf, len))
 122                         dma_unmap_single(ring->dev,
 123                                          dma_unmap_addr(tx_buf, dma),
 124                                          dma_unmap_len(tx_buf, len),
 125                                          DMA_TO_DEVICE);
 126         } else if (dma_unmap_len(tx_buf, len)) {
 127                 dma_unmap_page(ring->dev,
 128                                dma_unmap_addr(tx_buf, dma),
 129                                dma_unmap_len(tx_buf, len),
 130                                DMA_TO_DEVICE);
 131         }
 132
 133         tx_buf->next_to_watch = NULL;
 134         tx_buf->skb = NULL;
 135         dma_unmap_len_set(tx_buf, len, 0);
 136         /* tx_buf must be completely set up in the transmit path */
 137 }
 138
 139 static struct netdev_queue *txring_txq(const struct ice_tx_ring *ring)
 140 {
 141         return netdev_get_tx_queue(ring->netdev, ring->q_index);
 142 }
 143
 144 /**
 145  * ice_clean_tx_ring - Free any empty Tx buffers
 146  * @tx_ring: ring to be cleaned
 147  */
 148 void ice_clean_tx_ring(struct ice_tx_ring *tx_ring)
 149 {
 150         u32 size;
 151         u16 i;
 152
 153         if (ice_ring_is_xdp(tx_ring) && tx_ring->xsk_pool) {
 154                 ice_xsk_clean_xdp_ring(tx_ring);
 155                 goto tx_skip_free;
 156         }
 157
 158         /* ring already cleared, nothing to do */
 159         if (!tx_ring->tx_buf)
 160                 return;
 161
 162         /* Free all the Tx ring sk_buffs */
 163         for (i = 0; i < tx_ring->count; i++)
 164                 ice_unmap_and_free_tx_buf(tx_ring, &tx_ring->tx_buf[i]);
 165
 166 tx_skip_free:
 167         memset(tx_ring->tx_buf, 0, sizeof(*tx_ring->tx_buf) * tx_ring->count);
 168
 169         size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
 170                      PAGE_SIZE);
 171         /* Zero out the descriptor ring */
 172         memset(tx_ring->desc, 0, size);
 173
 174         tx_ring->next_to_use = 0;
 175         tx_ring->next_to_clean = 0;
 176
 177         if (!tx_ring->netdev)
 178                 return;
 179
 180         /* cleanup Tx queue statistics */
 181         netdev_tx_reset_queue(txring_txq(tx_ring));
 182 }
 183
 184 /**
 185  * ice_free_tx_ring - Free Tx resources per queue
 186  * @tx_ring: Tx descriptor ring for a specific queue
 187  *
 188  * Free all transmit software resources
 189  */
 190 void ice_free_tx_ring(struct ice_tx_ring *tx_ring)
 191 {
 192         u32 size;
 193
 194         ice_clean_tx_ring(tx_ring);
 195         devm_kfree(tx_ring->dev, tx_ring->tx_buf);
 196         tx_ring->tx_buf = NULL;
 197
 198         if (tx_ring->desc) {
 199                 size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
 200                              PAGE_SIZE);
 201                 dmam_free_coherent(tx_ring->dev, size,
 202                                    tx_ring->desc, tx_ring->dma);
 203                 tx_ring->desc = NULL;
 204         }
 205 }
 206
 207 /**
 208  * ice_clean_tx_irq - Reclaim resources after transmit completes
 209  * @tx_ring: Tx ring to clean
 210  * @napi_budget: Used to determine if we are in netpoll
 211  *
 212  * Returns true if there's any budget left (e.g. the clean is finished)
 213  */
 214 static bool ice_clean_tx_irq(struct ice_tx_ring *tx_ring, int napi_budget)
 215 {
 216         unsigned int total_bytes = 0, total_pkts = 0;
 217         unsigned int budget = ICE_DFLT_IRQ_WORK;
 218         struct ice_vsi *vsi = tx_ring->vsi;
 219         s16 i = tx_ring->next_to_clean;
 220         struct ice_tx_desc *tx_desc;
 221         struct ice_tx_buf *tx_buf;
 222
 223         /* get the bql data ready */
 224         if (!ice_ring_is_xdp(tx_ring))
 225                 netdev_txq_bql_complete_prefetchw(txring_txq(tx_ring));
 226
 227         tx_buf = &tx_ring->tx_buf[i];
 228         tx_desc = ICE_TX_DESC(tx_ring, i);
 229         i -= tx_ring->count;
 230
 231         prefetch(&vsi->state);
 232
 233         do {
 234                 struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;
 235
 236                 /* if next_to_watch is not set then there is no work pending */
 237                 if (!eop_desc)
 238                         break;
 239
 240                 /* follow the guidelines of other drivers */
 241                 prefetchw(&tx_buf->skb->users);
 242
 243                 smp_rmb();      /* prevent any other reads prior to eop_desc */
 244
 245                 ice_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf);
 246                 /* if the descriptor isn't done, no work yet to do */
 247                 if (!(eop_desc->cmd_type_offset_bsz &
 248                       cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
 249                         break;
 250
 251                 /* clear next_to_watch to prevent false hangs */
 252                 tx_buf->next_to_watch = NULL;
 253
 254                 /* update the statistics for this packet */
 255                 total_bytes += tx_buf->bytecount;
 256                 total_pkts += tx_buf->gso_segs;
 257
 258                 /* free the skb */
 259                 napi_consume_skb(tx_buf->skb, napi_budget);
 260
 261                 /* unmap skb header data */
 262                 dma_unmap_single(tx_ring->dev,
 263                                  dma_unmap_addr(tx_buf, dma),
 264                                  dma_unmap_len(tx_buf, len),
 265                                  DMA_TO_DEVICE);
 266
 267                 /* clear tx_buf data */
 268                 tx_buf->skb = NULL;
 269                 dma_unmap_len_set(tx_buf, len, 0);
 270
 271                 /* unmap remaining buffers */
 272                 while (tx_desc != eop_desc) {
 273                         ice_trace(clean_tx_irq_unmap, tx_ring, tx_desc, tx_buf);
 274                         tx_buf++;
 275                         tx_desc++;
 276                         i++;
 277                         if (unlikely(!i)) {
 278                                 i -= tx_ring->count;
 279                                 tx_buf = tx_ring->tx_buf;
 280                                 tx_desc = ICE_TX_DESC(tx_ring, 0);
 281                         }
 282
 283                         /* unmap any remaining paged data */
 284                         if (dma_unmap_len(tx_buf, len)) {
 285                                 dma_unmap_page(tx_ring->dev,
 286                                                dma_unmap_addr(tx_buf, dma),
 287                                                dma_unmap_len(tx_buf, len),
 288                                                DMA_TO_DEVICE);
 289                                 dma_unmap_len_set(tx_buf, len, 0);
 290                         }
 291                 }
 292                 ice_trace(clean_tx_irq_unmap_eop, tx_ring, tx_desc, tx_buf);
 293
 294                 /* move us one more past the eop_desc for start of next pkt */
 295                 tx_buf++;
 296                 tx_desc++;
 297                 i++;
 298                 if (unlikely(!i)) {
 299                         i -= tx_ring->count;
 300                         tx_buf = tx_ring->tx_buf;
 301                         tx_desc = ICE_TX_DESC(tx_ring, 0);
 302                 }
 303
 304                 prefetch(tx_desc);
 305
 306                 /* update budget accounting */
 307                 budget--;
 308         } while (likely(budget));
 309
 310         i += tx_ring->count;
 311         tx_ring->next_to_clean = i;
 312
 313         ice_update_tx_ring_stats(tx_ring, total_pkts, total_bytes);
 314
 315         if (ice_ring_is_xdp(tx_ring))
 316                 return !!budget;
 317
 318         netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts, total_bytes);
 319
 320 #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
 321         if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) &&
 322                      (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
 323                 /* Make sure that anybody stopping the queue after this
 324                  * sees the new next_to_clean.
 325                  */
 326                 smp_mb();
 327                 if (netif_tx_queue_stopped(txring_txq(tx_ring)) &&
 328                     !test_bit(ICE_VSI_DOWN, vsi->state)) {
 329                         netif_tx_wake_queue(txring_txq(tx_ring));
 330                         ++tx_ring->tx_stats.restart_q;
 331                 }
 332         }
 333
 334         return !!budget;
 335 }
 336
 337 /**
 338  * ice_setup_tx_ring - Allocate the Tx descriptors
 339  * @tx_ring: the Tx ring to set up
 340  *
 341  * Return 0 on success, negative on error
 342  */
 343 int ice_setup_tx_ring(struct ice_tx_ring *tx_ring)
 344 {
 345         struct device *dev = tx_ring->dev;
 346         u32 size;
 347
 348         if (!dev)
 349                 return -ENOMEM;
 350
 351         /* warn if we are about to overwrite the pointer */
 352         WARN_ON(tx_ring->tx_buf);
 353         tx_ring->tx_buf =
 354                 devm_kcalloc(dev, sizeof(*tx_ring->tx_buf), tx_ring->count,
 355                              GFP_KERNEL);
 356         if (!tx_ring->tx_buf)
 357                 return -ENOMEM;
 358
 359         /* round up to nearest page */
 360         size = ALIGN(tx_ring->count * sizeof(struct ice_tx_desc),
 361                      PAGE_SIZE);
 362         tx_ring->desc = dmam_alloc_coherent(dev, size, &tx_ring->dma,
 363                                             GFP_KERNEL);
 364         if (!tx_ring->desc) {
 365                 dev_err(dev, "Unable to allocate memory for the Tx descriptor ring, size=%d\n",
 366                         size);
 367                 goto err;
 368         }
 369
 370         tx_ring->next_to_use = 0;
 371         tx_ring->next_to_clean = 0;
 372         tx_ring->tx_stats.prev_pkt = -1;
 373         return 0;
 374
 375 err:
 376         devm_kfree(dev, tx_ring->tx_buf);
 377         tx_ring->tx_buf = NULL;
 378         return -ENOMEM;
 379 }
 380
 381 /**
 382  * ice_clean_rx_ring - Free Rx buffers
 383  * @rx_ring: ring to be cleaned
 384  */
 385 void ice_clean_rx_ring(struct ice_rx_ring *rx_ring)
 386 {
 387         struct device *dev = rx_ring->dev;
 388         u32 size;
 389         u16 i;
 390
 391         /* ring already cleared, nothing to do */
 392         if (!rx_ring->rx_buf)
 393                 return;
 394
 395         if (rx_ring->skb) {
 396                 dev_kfree_skb(rx_ring->skb);
 397                 rx_ring->skb = NULL;
 398         }
 399
 400         if (rx_ring->xsk_pool) {
 401                 ice_xsk_clean_rx_ring(rx_ring);
 402                 goto rx_skip_free;
 403         }
 404
 405         /* Free all the Rx ring sk_buffs */
 406         for (i = 0; i < rx_ring->count; i++) {
 407                 struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
 408
 409                 if (!rx_buf->page)
 410                         continue;
 411
 412                 /* Invalidate cache lines that may have been written to by
 413                  * device so that we avoid corrupting memory.
 414                  */
 415                 dma_sync_single_range_for_cpu(dev, rx_buf->dma,
 416                                               rx_buf->page_offset,
 417                                               rx_ring->rx_buf_len,
 418                                               DMA_FROM_DEVICE);
 419
 420                 /* free resources associated with mapping */
 421                 dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring),
 422                                      DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
 423                 __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
 424
 425                 rx_buf->page = NULL;
 426                 rx_buf->page_offset = 0;
 427         }
 428
 429 rx_skip_free:
 430         if (rx_ring->xsk_pool)
 431                 memset(rx_ring->xdp_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->xdp_buf)));
 432         else
 433                 memset(rx_ring->rx_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->rx_buf)));
 434
 435         /* Zero out the descriptor ring */
 436         size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
 437                      PAGE_SIZE);
 438         memset(rx_ring->desc, 0, size);
 439
 440         rx_ring->next_to_alloc = 0;
 441         rx_ring->next_to_clean = 0;
 442         rx_ring->next_to_use = 0;
 443 }
 444
 445 /**
 446  * ice_free_rx_ring - Free Rx resources
 447  * @rx_ring: ring to clean the resources from
 448  *
 449  * Free all receive software resources
 450  */
 451 void ice_free_rx_ring(struct ice_rx_ring *rx_ring)
 452 {
 453         u32 size;
 454
 455         ice_clean_rx_ring(rx_ring);
 456         if (rx_ring->vsi->type == ICE_VSI_PF)
 457                 if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
 458                         xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
 459         rx_ring->xdp_prog = NULL;
 460         if (rx_ring->xsk_pool) {
 461                 kfree(rx_ring->xdp_buf);
 462                 rx_ring->xdp_buf = NULL;
 463         } else {
 464                 kfree(rx_ring->rx_buf);
 465                 rx_ring->rx_buf = NULL;
 466         }
 467
 468         if (rx_ring->desc) {
 469                 size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
 470                              PAGE_SIZE);
 471                 dmam_free_coherent(rx_ring->dev, size,
 472                                    rx_ring->desc, rx_ring->dma);
 473                 rx_ring->desc = NULL;
 474         }
 475 }
 476
 477 /**
 478  * ice_setup_rx_ring - Allocate the Rx descriptors
 479  * @rx_ring: the Rx ring to set up
 480  *
 481  * Return 0 on success, negative on error
 482  */
 483 int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
 484 {
 485         struct device *dev = rx_ring->dev;
 486         u32 size;
 487
 488         if (!dev)
 489                 return -ENOMEM;
 490
 491         /* warn if we are about to overwrite the pointer */
 492         WARN_ON(rx_ring->rx_buf);
 493         rx_ring->rx_buf =
 494                 kcalloc(rx_ring->count, sizeof(*rx_ring->rx_buf), GFP_KERNEL);
 495         if (!rx_ring->rx_buf)
 496                 return -ENOMEM;
 497
 498         /* round up to nearest page */
 499         size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc),
 500                      PAGE_SIZE);
 501         rx_ring->desc = dmam_alloc_coherent(dev, size, &rx_ring->dma,
 502                                             GFP_KERNEL);
 503         if (!rx_ring->desc) {
 504                 dev_err(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
 505                         size);
 506                 goto err;
 507         }
 508
 509         rx_ring->next_to_use = 0;
 510         rx_ring->next_to_clean = 0;
 511
 512         if (ice_is_xdp_ena_vsi(rx_ring->vsi))
 513                 WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
 514
 515         if (rx_ring->vsi->type == ICE_VSI_PF &&
 516             !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
 517                 if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
 518                                      rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
 519                         goto err;
 520         return 0;
 521
 522 err:
 523         kfree(rx_ring->rx_buf);
 524         rx_ring->rx_buf = NULL;
 525         return -ENOMEM;
 526 }
 527
 528 static unsigned int
 529 ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, unsigned int __maybe_unused size)
 530 {
 531         unsigned int truesize;
 532
 533 #if (PAGE_SIZE < 8192)
 534         truesize = ice_rx_pg_size(rx_ring) / 2; /* Must be power-of-2 */
 535 #else
 536         truesize = rx_ring->rx_offset ?
 537                 SKB_DATA_ALIGN(rx_ring->rx_offset + size) +
 538                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
 539                 SKB_DATA_ALIGN(size);
 540 #endif
 541         return truesize;
 542 }
 543
 544 /**
 545  * ice_run_xdp - Executes an XDP program on initialized xdp_buff
 546  * @rx_ring: Rx ring
 547  * @xdp: xdp_buff used as input to the XDP program
 548  * @xdp_prog: XDP program to run
 549  * @xdp_ring: ring to be used for XDP_TX action
 550  *
 551  * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
 552  */
 553 static int
 554 ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
 555             struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring)
 556 {
 557         int err;
 558         u32 act;
 559
 560         act = bpf_prog_run_xdp(xdp_prog, xdp);
 561         switch (act) {
 562         case XDP_PASS:
 563                 return ICE_XDP_PASS;
 564         case XDP_TX:
 565                 if (static_branch_unlikely(&ice_xdp_locking_key))
 566                         spin_lock(&xdp_ring->tx_lock);
 567                 err = ice_xmit_xdp_ring(xdp->data, xdp->data_end - xdp->data, xdp_ring);
 568                 if (static_branch_unlikely(&ice_xdp_locking_key))
 569                         spin_unlock(&xdp_ring->tx_lock);
 570                 if (err == ICE_XDP_CONSUMED)
 571                         goto out_failure;
 572                 return err;
 573         case XDP_REDIRECT:
 574                 err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
 575                 if (err)
 576                         goto out_failure;
 577                 return ICE_XDP_REDIR;
 578         default:
 579                 bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act);
 580                 fallthrough;
 581         case XDP_ABORTED:
 582 out_failure:
 583                 trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
 584                 fallthrough;
 585         case XDP_DROP:
 586                 return ICE_XDP_CONSUMED;
 587         }
 588 }
 589
 590 /**
 591  * ice_xdp_xmit - submit packets to XDP ring for transmission
 592  * @dev: netdev
 593  * @n: number of XDP frames to be transmitted
 594  * @frames: XDP frames to be transmitted
 595  * @flags: transmit flags
 596  *
 597  * Returns number of frames successfully sent. Failed frames
 598  * will be free'ed by XDP core.
 599  * For error cases, a negative errno code is returned and no-frames
 600  * are transmitted (caller must handle freeing frames).
 601  */
 602 int
 603 ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 604              u32 flags)
 605 {
 606         struct ice_netdev_priv *np = netdev_priv(dev);
 607         unsigned int queue_index = smp_processor_id();
 608         struct ice_vsi *vsi = np->vsi;
 609         struct ice_tx_ring *xdp_ring;
 610         int nxmit = 0, i;
 611
 612         if (test_bit(ICE_VSI_DOWN, vsi->state))
 613                 return -ENETDOWN;
 614
 615         if (!ice_is_xdp_ena_vsi(vsi) || queue_index >= vsi->num_xdp_txq)
 616                 return -ENXIO;
 617
 618         if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 619                 return -EINVAL;
 620
 621         if (static_branch_unlikely(&ice_xdp_locking_key)) {
 622                 queue_index %= vsi->num_xdp_txq;
 623                 xdp_ring = vsi->xdp_rings[queue_index];
 624                 spin_lock(&xdp_ring->tx_lock);
 625         } else {
 626                 xdp_ring = vsi->xdp_rings[queue_index];
 627         }
 628
 629         for (i = 0; i < n; i++) {
 630                 struct xdp_frame *xdpf = frames[i];
 631                 int err;
 632
 633                 err = ice_xmit_xdp_ring(xdpf->data, xdpf->len, xdp_ring);
 634                 if (err != ICE_XDP_TX)
 635                         break;
 636                 nxmit++;
 637         }
 638
 639         if (unlikely(flags & XDP_XMIT_FLUSH))
 640                 ice_xdp_ring_update_tail(xdp_ring);
 641
 642         if (static_branch_unlikely(&ice_xdp_locking_key))
 643                 spin_unlock(&xdp_ring->tx_lock);
 644
 645         return nxmit;
 646 }
 647
 648 /**
 649  * ice_alloc_mapped_page - recycle or make a new page
 650  * @rx_ring: ring to use
 651  * @bi: rx_buf struct to modify
 652  *
 653  * Returns true if the page was successfully allocated or
 654  * reused.
 655  */
 656 static bool
 657 ice_alloc_mapped_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *bi)
 658 {
 659         struct page *page = bi->page;
 660         dma_addr_t dma;
 661
 662         /* since we are recycling buffers we should seldom need to alloc */
 663         if (likely(page))
 664                 return true;
 665
 666         /* alloc new page for storage */
 667         page = dev_alloc_pages(ice_rx_pg_order(rx_ring));
 668         if (unlikely(!page)) {
 669                 rx_ring->rx_stats.alloc_page_failed++;
 670                 return false;
 671         }
 672
 673         /* map page for use */
 674         dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring),
 675                                  DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
 676
 677         /* if mapping failed free memory back to system since
 678          * there isn't much point in holding memory we can't use
 679          */
 680         if (dma_mapping_error(rx_ring->dev, dma)) {
 681                 __free_pages(page, ice_rx_pg_order(rx_ring));
 682                 rx_ring->rx_stats.alloc_page_failed++;
 683                 return false;
 684         }
 685
 686         bi->dma = dma;
 687         bi->page = page;
 688         bi->page_offset = rx_ring->rx_offset;
 689         page_ref_add(page, USHRT_MAX - 1);
 690         bi->pagecnt_bias = USHRT_MAX;
 691
 692         return true;
 693 }
 694
 695 /**
 696  * ice_alloc_rx_bufs - Replace used receive buffers
 697  * @rx_ring: ring to place buffers on
 698  * @cleaned_count: number of buffers to replace
 699  *
 700  * Returns false if all allocations were successful, true if any fail. Returning
 701  * true signals to the caller that we didn't replace cleaned_count buffers and
 702  * there is more work to do.
 703  *
 704  * First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx
 705  * buffers. Then bump tail at most one time. Grouping like this lets us avoid
 706  * multiple tail writes per call.
 707  */
 708 bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, u16 cleaned_count)
 709 {
 710         union ice_32b_rx_flex_desc *rx_desc;
 711         u16 ntu = rx_ring->next_to_use;
 712         struct ice_rx_buf *bi;
 713
 714         /* do nothing if no valid netdev defined */
 715         if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) ||
 716             !cleaned_count)
 717                 return false;
 718
 719         /* get the Rx descriptor and buffer based on next_to_use */
 720         rx_desc = ICE_RX_DESC(rx_ring, ntu);
 721         bi = &rx_ring->rx_buf[ntu];
 722
 723         do {
 724                 /* if we fail here, we have work remaining */
 725                 if (!ice_alloc_mapped_page(rx_ring, bi))
 726                         break;
 727
 728                 /* sync the buffer for use by the device */
 729                 dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
 730                                                  bi->page_offset,
 731                                                  rx_ring->rx_buf_len,
 732                                                  DMA_FROM_DEVICE);
 733
 734                 /* Refresh the desc even if buffer_addrs didn't change
 735                  * because each write-back erases this info.
 736                  */
 737                 rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
 738
 739                 rx_desc++;
 740                 bi++;
 741                 ntu++;
 742                 if (unlikely(ntu == rx_ring->count)) {
 743                         rx_desc = ICE_RX_DESC(rx_ring, 0);
 744                         bi = rx_ring->rx_buf;
 745                         ntu = 0;
 746                 }
 747
 748                 /* clear the status bits for the next_to_use descriptor */
 749                 rx_desc->wb.status_error0 = 0;
 750
 751                 cleaned_count--;
 752         } while (cleaned_count);
 753
 754         if (rx_ring->next_to_use != ntu)
 755                 ice_release_rx_desc(rx_ring, ntu);
 756
 757         return !!cleaned_count;
 758 }
 759
 760 /**
 761  * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse
 762  * @rx_buf: Rx buffer to adjust
 763  * @size: Size of adjustment
 764  *
 765  * Update the offset within page so that Rx buf will be ready to be reused.
 766  * For systems with PAGE_SIZE < 8192 this function will flip the page offset
 767  * so the second half of page assigned to Rx buffer will be used, otherwise
 768  * the offset is moved by "size" bytes
 769  */
 770 static void
 771 ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
 772 {
 773 #if (PAGE_SIZE < 8192)
 774         /* flip page offset to other buffer */
 775         rx_buf->page_offset ^= size;
 776 #else
 777         /* move offset up to the next cache line */
 778         rx_buf->page_offset += size;
 779 #endif
 780 }
 781
 782 /**
 783  * ice_can_reuse_rx_page - Determine if page can be reused for another Rx
 784  * @rx_buf: buffer containing the page
 785  * @rx_buf_pgcnt: rx_buf page refcount pre xdp_do_redirect() call
 786  *
 787  * If page is reusable, we have a green light for calling ice_reuse_rx_page,
 788  * which will assign the current buffer to the buffer that next_to_alloc is
 789  * pointing to; otherwise, the DMA mapping needs to be destroyed and
 790  * page freed
 791  */
 792 static bool
 793 ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf, int rx_buf_pgcnt)
 794 {
 795         unsigned int pagecnt_bias = rx_buf->pagecnt_bias;
 796         struct page *page = rx_buf->page;
 797
 798         /* avoid re-using remote and pfmemalloc pages */
 799         if (!dev_page_is_reusable(page))
 800                 return false;
 801
 802 #if (PAGE_SIZE < 8192)
 803         /* if we are only owner of page we can reuse it */
 804         if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
 805                 return false;
 806 #else
 807 #define ICE_LAST_OFFSET \
 808         (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048)
 809         if (rx_buf->page_offset > ICE_LAST_OFFSET)
 810                 return false;
 811 #endif /* PAGE_SIZE < 8192) */
 812
 813         /* If we have drained the page fragment pool we need to update
 814          * the pagecnt_bias and page count so that we fully restock the
 815          * number of references the driver holds.
 816          */
 817         if (unlikely(pagecnt_bias == 1)) {
 818                 page_ref_add(page, USHRT_MAX - 1);
 819                 rx_buf->pagecnt_bias = USHRT_MAX;
 820         }
 821
 822         return true;
 823 }
 824
 825 /**
 826  * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag
 827  * @rx_ring: Rx descriptor ring to transact packets on
 828  * @rx_buf: buffer containing page to add
 829  * @skb: sk_buff to place the data into
 830  * @size: packet length from rx_desc
 831  *
 832  * This function will add the data contained in rx_buf->page to the skb.
 833  * It will just attach the page as a frag to the skb.
 834  * The function will then update the page offset.
 835  */
 836 static void
 837 ice_add_rx_frag(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
 838                 struct sk_buff *skb, unsigned int size)
 839 {
 840 #if (PAGE_SIZE >= 8192)
 841         unsigned int truesize = SKB_DATA_ALIGN(size + rx_ring->rx_offset);
 842 #else
 843         unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
 844 #endif
 845
 846         if (!size)
 847                 return;
 848         skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page,
 849                         rx_buf->page_offset, size, truesize);
 850
 851         /* page is being used so we must update the page offset */
 852         ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
 853 }
 854
 855 /**
 856  * ice_reuse_rx_page - page flip buffer and store it back on the ring
 857  * @rx_ring: Rx descriptor ring to store buffers on
 858  * @old_buf: donor buffer to have page reused
 859  *
 860  * Synchronizes page for reuse by the adapter
 861  */
 862 static void
 863 ice_reuse_rx_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *old_buf)
 864 {
 865         u16 nta = rx_ring->next_to_alloc;
 866         struct ice_rx_buf *new_buf;
 867
 868         new_buf = &rx_ring->rx_buf[nta];
 869
 870         /* update, and store next to alloc */
 871         nta++;
 872         rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 873
 874         /* Transfer page from old buffer to new buffer.
 875          * Move each member individually to avoid possible store
 876          * forwarding stalls and unnecessary copy of skb.
 877          */
 878         new_buf->dma = old_buf->dma;
 879         new_buf->page = old_buf->page;
 880         new_buf->page_offset = old_buf->page_offset;
 881         new_buf->pagecnt_bias = old_buf->pagecnt_bias;
 882 }
 883
 884 /**
 885  * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use
 886  * @rx_ring: Rx descriptor ring to transact packets on
 887  * @size: size of buffer to add to skb
 888  * @rx_buf_pgcnt: rx_buf page refcount
 889  *
 890  * This function will pull an Rx buffer from the ring and synchronize it
 891  * for use by the CPU.
 892  */
 893 static struct ice_rx_buf *
 894 ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
 895                int *rx_buf_pgcnt)
 896 {
 897         struct ice_rx_buf *rx_buf;
 898
 899         rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
 900         *rx_buf_pgcnt =
 901 #if (PAGE_SIZE < 8192)
 902                 page_count(rx_buf->page);
 903 #else
 904                 0;
 905 #endif
 906         prefetchw(rx_buf->page);
 907
 908         if (!size)
 909                 return rx_buf;
 910         /* we are reusing so sync this buffer for CPU use */
 911         dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
 912                                       rx_buf->page_offset, size,
 913                                       DMA_FROM_DEVICE);
 914
 915         /* We have pulled a buffer for use, so decrement pagecnt_bias */
 916         rx_buf->pagecnt_bias--;
 917
 918         return rx_buf;
 919 }
 920
 921 /**
 922  * ice_build_skb - Build skb around an existing buffer
 923  * @rx_ring: Rx descriptor ring to transact packets on
 924  * @rx_buf: Rx buffer to pull data from
 925  * @xdp: xdp_buff pointing to the data
 926  *
 927  * This function builds an skb around an existing Rx buffer, taking care
 928  * to set up the skb correctly and avoid any memcpy overhead.
 929  */
 930 static struct sk_buff *
 931 ice_build_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
 932               struct xdp_buff *xdp)
 933 {
 934         u8 metasize = xdp->data - xdp->data_meta;
 935 #if (PAGE_SIZE < 8192)
 936         unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
 937 #else
 938         unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
 939                                 SKB_DATA_ALIGN(xdp->data_end -
 940                                                xdp->data_hard_start);
 941 #endif
 942         struct sk_buff *skb;
 943
 944         /* Prefetch first cache line of first page. If xdp->data_meta
 945          * is unused, this points exactly as xdp->data, otherwise we
 946          * likely have a consumer accessing first few bytes of meta
 947          * data, and then actual data.
 948          */
 949         net_prefetch(xdp->data_meta);
 950         /* build an skb around the page buffer */
 951         skb = napi_build_skb(xdp->data_hard_start, truesize);
 952         if (unlikely(!skb))
 953                 return NULL;
 954
 955         /* must to record Rx queue, otherwise OS features such as
 956          * symmetric queue won't work
 957          */
 958         skb_record_rx_queue(skb, rx_ring->q_index);
 959
 960         /* update pointers within the skb to store the data */
 961         skb_reserve(skb, xdp->data - xdp->data_hard_start);
 962         __skb_put(skb, xdp->data_end - xdp->data);
 963         if (metasize)
 964                 skb_metadata_set(skb, metasize);
 965
 966         /* buffer is used by skb, update page_offset */
 967         ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
 968
 969         return skb;
 970 }
 971
 972 /**
 973  * ice_construct_skb - Allocate skb and populate it
 974  * @rx_ring: Rx descriptor ring to transact packets on
 975  * @rx_buf: Rx buffer to pull data from
 976  * @xdp: xdp_buff pointing to the data
 977  *
 978  * This function allocates an skb. It then populates it with the page
 979  * data from the current receive descriptor, taking care to set up the
 980  * skb correctly.
 981  */
 982 static struct sk_buff *
 983 ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
 984                   struct xdp_buff *xdp)
 985 {
 986         unsigned int size = xdp->data_end - xdp->data;
 987         unsigned int headlen;
 988         struct sk_buff *skb;
 989
 990         /* prefetch first cache line of first page */
 991         net_prefetch(xdp->data);
 992
 993         /* allocate a skb to store the frags */
 994         skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
 995                                GFP_ATOMIC | __GFP_NOWARN);
 996         if (unlikely(!skb))
 997                 return NULL;
 998
 999         skb_record_rx_queue(skb, rx_ring->q_index);
1000         /* Determine available headroom for copy */
1001         headlen = size;
1002         if (headlen > ICE_RX_HDR_SIZE)
1003                 headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);
1004
1005         /* align pull length to size of long to optimize memcpy performance */
1006         memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen,
1007                                                          sizeof(long)));
1008
1009         /* if we exhaust the linear part then add what is left as a frag */
1010         size -= headlen;
1011         if (size) {
1012 #if (PAGE_SIZE >= 8192)
1013                 unsigned int truesize = SKB_DATA_ALIGN(size);
1014 #else
1015                 unsigned int truesize = ice_rx_pg_size(rx_ring) / 2;
1016 #endif
1017                 skb_add_rx_frag(skb, 0, rx_buf->page,
1018                                 rx_buf->page_offset + headlen, size, truesize);
1019                 /* buffer is used by skb, update page_offset */
1020                 ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
1021         } else {
1022                 /* buffer is unused, reset bias back to rx_buf; data was copied
1023                  * onto skb's linear part so there's no need for adjusting
1024                  * page offset and we can reuse this buffer as-is
1025                  */
1026                 rx_buf->pagecnt_bias++;
1027         }
1028
1029         return skb;
1030 }
1031
1032 /**
1033  * ice_put_rx_buf - Clean up used buffer and either recycle or free
1034  * @rx_ring: Rx descriptor ring to transact packets on
1035  * @rx_buf: Rx buffer to pull data from
1036  * @rx_buf_pgcnt: Rx buffer page count pre xdp_do_redirect()
1037  *
1038  * This function will update next_to_clean and then clean up the contents
1039  * of the rx_buf. It will either recycle the buffer or unmap it and free
1040  * the associated resources.
1041  */
1042 static void
1043 ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
1044                int rx_buf_pgcnt)
1045 {
1046         u16 ntc = rx_ring->next_to_clean + 1;
1047
1048         /* fetch, update, and store next to clean */
1049         ntc = (ntc < rx_ring->count) ? ntc : 0;
1050         rx_ring->next_to_clean = ntc;
1051
1052         if (!rx_buf)
1053                 return;
1054
1055         if (ice_can_reuse_rx_page(rx_buf, rx_buf_pgcnt)) {
1056                 /* hand second half of page back to the ring */
1057                 ice_reuse_rx_page(rx_ring, rx_buf);
1058         } else {
1059                 /* we are not reusing the buffer so unmap it */
1060                 dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma,
1061                                      ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE,
1062                                      ICE_RX_DMA_ATTR);
1063                 __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
1064         }
1065
1066         /* clear contents of buffer_info */
1067         rx_buf->page = NULL;
1068 }
1069
1070 /**
1071  * ice_is_non_eop - process handling of non-EOP buffers
1072  * @rx_ring: Rx ring being processed
1073  * @rx_desc: Rx descriptor for current buffer
1074  *
1075  * If the buffer is an EOP buffer, this function exits returning false,
1076  * otherwise return true indicating that this is in fact a non-EOP buffer.
1077  */
1078 static bool
1079 ice_is_non_eop(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc)
1080 {
1081         /* if we are the last buffer then there is nothing else to do */
1082 #define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
1083         if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF)))
1084                 return false;
1085
1086         rx_ring->rx_stats.non_eop_descs++;
1087
1088         return true;
1089 }
1090
1091 /**
1092  * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
1093  * @rx_ring: Rx descriptor ring to transact packets on
1094  * @budget: Total limit on number of packets to process
1095  *
1096  * This function provides a "bounce buffer" approach to Rx interrupt
1097  * processing. The advantage to this is that on systems that have
1098  * expensive overhead for IOMMU access this provides a means of avoiding
1099  * it by maintaining the mapping of the page to the system.
1100  *
1101  * Returns amount of work completed
1102  */
1103 int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
1104 {
1105         unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0;
1106         u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
1107         unsigned int offset = rx_ring->rx_offset;
1108         struct ice_tx_ring *xdp_ring = NULL;
1109         unsigned int xdp_res, xdp_xmit = 0;
1110         struct sk_buff *skb = rx_ring->skb;
1111         struct bpf_prog *xdp_prog = NULL;
1112         struct xdp_buff xdp;
1113         bool failure;
1114
1115         /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
1116 #if (PAGE_SIZE < 8192)
1117         frame_sz = ice_rx_frame_truesize(rx_ring, 0);
1118 #endif
1119         xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq);
1120
1121         xdp_prog = READ_ONCE(rx_ring->xdp_prog);
1122         if (xdp_prog)
1123                 xdp_ring = rx_ring->xdp_ring;
1124
1125         /* start the loop to process Rx packets bounded by 'budget' */
1126         while (likely(total_rx_pkts < (unsigned int)budget)) {
1127                 union ice_32b_rx_flex_desc *rx_desc;
1128                 struct ice_rx_buf *rx_buf;
1129                 unsigned char *hard_start;
1130                 unsigned int size;
1131                 u16 stat_err_bits;
1132                 int rx_buf_pgcnt;
1133                 u16 vlan_tag = 0;
1134                 u16 rx_ptype;
1135
1136                 /* get the Rx desc from Rx ring based on 'next_to_clean' */
1137                 rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
1138
1139                 /* status_error_len will always be zero for unused descriptors
1140                  * because it's cleared in cleanup, and overlaps with hdr_addr
1141                  * which is always zero because packet split isn't used, if the
1142                  * hardware wrote DD then it will be non-zero
1143                  */
1144                 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
1145                 if (!ice_test_staterr(rx_desc, stat_err_bits))
1146                         break;
1147
1148                 /* This memory barrier is needed to keep us from reading
1149                  * any other fields out of the rx_desc until we know the
1150                  * DD bit is set.
1151                  */
1152                 dma_rmb();
1153
1154                 ice_trace(clean_rx_irq, rx_ring, rx_desc);
1155                 if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) {
1156                         struct ice_vsi *ctrl_vsi = rx_ring->vsi;
1157
1158                         if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
1159                             ctrl_vsi->vf_id != ICE_INVAL_VFID)
1160                                 ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
1161                         ice_put_rx_buf(rx_ring, NULL, 0);
1162                         cleaned_count++;
1163                         continue;
1164                 }
1165
1166                 size = le16_to_cpu(rx_desc->wb.pkt_len) &
1167                         ICE_RX_FLX_DESC_PKT_LEN_M;
1168
1169                 /* retrieve a buffer from the ring */
1170                 rx_buf = ice_get_rx_buf(rx_ring, size, &rx_buf_pgcnt);
1171
1172                 if (!size) {
1173                         xdp.data = NULL;
1174                         xdp.data_end = NULL;
1175                         xdp.data_hard_start = NULL;
1176                         xdp.data_meta = NULL;
1177                         goto construct_skb;
1178                 }
1179
1180                 hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
1181                              offset;
1182                 xdp_prepare_buff(&xdp, hard_start, offset, size, true);
1183 #if (PAGE_SIZE > 4096)
1184                 /* At larger PAGE_SIZE, frame_sz depend on len size */
1185                 xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size);
1186 #endif
1187
1188                 if (!xdp_prog)
1189                         goto construct_skb;
1190
1191                 xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring);
1192                 if (!xdp_res)
1193                         goto construct_skb;
1194                 if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
1195                         xdp_xmit |= xdp_res;
1196                         ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
1197                 } else {
1198                         rx_buf->pagecnt_bias++;
1199                 }
1200                 total_rx_bytes += size;
1201                 total_rx_pkts++;
1202
1203                 cleaned_count++;
1204                 ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt);
1205                 continue;
1206 construct_skb:
1207                 if (skb) {
1208                         ice_add_rx_frag(rx_ring, rx_buf, skb, size);
1209                 } else if (likely(xdp.data)) {
1210                         if (ice_ring_uses_build_skb(rx_ring))
1211                                 skb = ice_build_skb(rx_ring, rx_buf, &xdp);
1212                         else
1213                                 skb = ice_construct_skb(rx_ring, rx_buf, &xdp);
1214                 }
1215                 /* exit if we failed to retrieve a buffer */
1216                 if (!skb) {
1217                         rx_ring->rx_stats.alloc_buf_failed++;
1218                         if (rx_buf)
1219                                 rx_buf->pagecnt_bias++;
1220                         break;
1221                 }
1222
1223                 ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt);
1224                 cleaned_count++;
1225
1226                 /* skip if it is NOP desc */
1227                 if (ice_is_non_eop(rx_ring, rx_desc))
1228                         continue;
1229
1230                 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
1231                 if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) {
1232                         dev_kfree_skb_any(skb);
1233                         continue;
1234                 }
1235
1236                 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
1237                 if (ice_test_staterr(rx_desc, stat_err_bits))
1238                         vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1);
1239
1240                 /* pad the skb if needed, to make a valid ethernet frame */
1241                 if (eth_skb_pad(skb)) {
1242                         skb = NULL;
1243                         continue;
1244                 }
1245
1246                 /* probably a little skewed due to removing CRC */
1247                 total_rx_bytes += skb->len;
1248
1249                 /* populate checksum, VLAN, and protocol */
1250                 rx_ptype = le16_to_cpu(rx_desc->wb.ptype_flex_flags0) &
1251                         ICE_RX_FLEX_DESC_PTYPE_M;
1252
1253                 ice_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
1254
1255                 ice_trace(clean_rx_irq_indicate, rx_ring, rx_desc, skb);
1256                 /* send completed skb up the stack */
1257                 ice_receive_skb(rx_ring, skb, vlan_tag);
1258                 skb = NULL;
1259
1260                 /* update budget accounting */
1261                 total_rx_pkts++;
1262         }
1263
1264         /* return up to cleaned_count buffers to hardware */
1265         failure = ice_alloc_rx_bufs(rx_ring, cleaned_count);
1266
1267         if (xdp_prog)
1268                 ice_finalize_xdp_rx(xdp_ring, xdp_xmit);
1269         rx_ring->skb = skb;
1270
1271         ice_update_rx_ring_stats(rx_ring, total_rx_pkts, total_rx_bytes);
1272
1273         /* guarantee a trip back through this routine if there was a failure */
1274         return failure ? budget : (int)total_rx_pkts;
1275 }
1276
1277 static void __ice_update_sample(struct ice_q_vector *q_vector,
1278                                 struct ice_ring_container *rc,
1279                                 struct dim_sample *sample,
1280                                 bool is_tx)
1281 {
1282         u64 packets = 0, bytes = 0;
1283
1284         if (is_tx) {
1285                 struct ice_tx_ring *tx_ring;
1286
1287                 ice_for_each_tx_ring(tx_ring, *rc) {
1288                         packets += tx_ring->stats.pkts;
1289                         bytes += tx_ring->stats.bytes;
1290                 }
1291         } else {
1292                 struct ice_rx_ring *rx_ring;
1293
1294                 ice_for_each_rx_ring(rx_ring, *rc) {
1295                         packets += rx_ring->stats.pkts;
1296                         bytes += rx_ring->stats.bytes;
1297                 }
1298         }
1299
1300         dim_update_sample(q_vector->total_events, packets, bytes, sample);
1301         sample->comp_ctr = 0;
1302
1303         /* if dim settings get stale, like when not updated for 1
1304          * second or longer, force it to start again. This addresses the
1305          * frequent case of an idle queue being switched to by the
1306          * scheduler. The 1,000 here means 1,000 milliseconds.
1307          */
1308         if (ktime_ms_delta(sample->time, rc->dim.start_sample.time) >= 1000)
1309                 rc->dim.state = DIM_START_MEASURE;
1310 }
1311
1312 /**
1313  * ice_net_dim - Update net DIM algorithm
1314  * @q_vector: the vector associated with the interrupt
1315  *
1316  * Create a DIM sample and notify net_dim() so that it can possibly decide
1317  * a new ITR value based on incoming packets, bytes, and interrupts.
1318  *
1319  * This function is a no-op if the ring is not configured to dynamic ITR.
1320  */
1321 static void ice_net_dim(struct ice_q_vector *q_vector)
1322 {
1323         struct ice_ring_container *tx = &q_vector->tx;
1324         struct ice_ring_container *rx = &q_vector->rx;
1325
1326         if (ITR_IS_DYNAMIC(tx)) {
1327                 struct dim_sample dim_sample;
1328
1329                 __ice_update_sample(q_vector, tx, &dim_sample, true);
1330                 net_dim(&tx->dim, dim_sample);
1331         }
1332
1333         if (ITR_IS_DYNAMIC(rx)) {
1334                 struct dim_sample dim_sample;
1335
1336                 __ice_update_sample(q_vector, rx, &dim_sample, false);
1337                 net_dim(&rx->dim, dim_sample);
1338         }
1339 }
1340
1341 /**
1342  * ice_buildreg_itr - build value for writing to the GLINT_DYN_CTL register
1343  * @itr_idx: interrupt throttling index
1344  * @itr: interrupt throttling value in usecs
1345  */
1346 static u32 ice_buildreg_itr(u16 itr_idx, u16 itr)
1347 {
1348         /* The ITR value is reported in microseconds, and the register value is
1349          * recorded in 2 microsecond units. For this reason we only need to
1350          * shift by the GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S to apply this
1351          * granularity as a shift instead of division. The mask makes sure the
1352          * ITR value is never odd so we don't accidentally write into the field
1353          * prior to the ITR field.
1354          */
1355         itr &= ICE_ITR_MASK;
1356
1357         return GLINT_DYN_CTL_INTENA_M | GLINT_DYN_CTL_CLEARPBA_M |
1358                 (itr_idx << GLINT_DYN_CTL_ITR_INDX_S) |
1359                 (itr << (GLINT_DYN_CTL_INTERVAL_S - ICE_ITR_GRAN_S));
1360 }
1361
1362 /**
1363  * ice_enable_interrupt - re-enable MSI-X interrupt
1364  * @q_vector: the vector associated with the interrupt to enable
1365  *
1366  * If the VSI is down, the interrupt will not be re-enabled. Also,
1367  * when enabling the interrupt always reset the wb_on_itr to false
1368  * and trigger a software interrupt to clean out internal state.
1369  */
1370 static void ice_enable_interrupt(struct ice_q_vector *q_vector)
1371 {
1372         struct ice_vsi *vsi = q_vector->vsi;
1373         bool wb_en = q_vector->wb_on_itr;
1374         u32 itr_val;
1375
1376         if (test_bit(ICE_DOWN, vsi->state))
1377                 return;
1378
1379         /* trigger an ITR delayed software interrupt when exiting busy poll, to
1380          * make sure to catch any pending cleanups that might have been missed
1381          * due to interrupt state transition. If busy poll or poll isn't
1382          * enabled, then don't update ITR, and just enable the interrupt.
1383          */
1384         if (!wb_en) {
1385                 itr_val = ice_buildreg_itr(ICE_ITR_NONE, 0);
1386         } else {
1387                 q_vector->wb_on_itr = false;
1388
1389                 /* do two things here with a single write. Set up the third ITR
1390                  * index to be used for software interrupt moderation, and then
1391                  * trigger a software interrupt with a rate limit of 20K on
1392                  * software interrupts, this will help avoid high interrupt
1393                  * loads due to frequently polling and exiting polling.
1394                  */
1395                 itr_val = ice_buildreg_itr(ICE_IDX_ITR2, ICE_ITR_20K);
1396                 itr_val |= GLINT_DYN_CTL_SWINT_TRIG_M |
1397                            ICE_IDX_ITR2 << GLINT_DYN_CTL_SW_ITR_INDX_S |
1398                            GLINT_DYN_CTL_SW_ITR_INDX_ENA_M;
1399         }
1400         wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx), itr_val);
1401 }
1402
1403 /**
1404  * ice_set_wb_on_itr - set WB_ON_ITR for this q_vector
1405  * @q_vector: q_vector to set WB_ON_ITR on
1406  *
1407  * We need to tell hardware to write-back completed descriptors even when
1408  * interrupts are disabled. Descriptors will be written back on cache line
1409  * boundaries without WB_ON_ITR enabled, but if we don't enable WB_ON_ITR
1410  * descriptors may not be written back if they don't fill a cache line until
1411  * the next interrupt.
1412  *
1413  * This sets the write-back frequency to whatever was set previously for the
1414  * ITR indices. Also, set the INTENA_MSK bit to make sure hardware knows we
1415  * aren't meddling with the INTENA_M bit.
1416  */
1417 static void ice_set_wb_on_itr(struct ice_q_vector *q_vector)
1418 {
1419         struct ice_vsi *vsi = q_vector->vsi;
1420
1421         /* already in wb_on_itr mode no need to change it */
1422         if (q_vector->wb_on_itr)
1423                 return;
1424
1425         /* use previously set ITR values for all of the ITR indices by
1426          * specifying ICE_ITR_NONE, which will vary in adaptive (AIM) mode and
1427          * be static in non-adaptive mode (user configured)
1428          */
1429         wr32(&vsi->back->hw, GLINT_DYN_CTL(q_vector->reg_idx),
1430              ((ICE_ITR_NONE << GLINT_DYN_CTL_ITR_INDX_S) &
1431               GLINT_DYN_CTL_ITR_INDX_M) | GLINT_DYN_CTL_INTENA_MSK_M |
1432              GLINT_DYN_CTL_WB_ON_ITR_M);
1433
1434         q_vector->wb_on_itr = true;
1435 }
1436
1437 /**
1438  * ice_napi_poll - NAPI polling Rx/Tx cleanup routine
1439  * @napi: napi struct with our devices info in it
1440  * @budget: amount of work driver is allowed to do this pass, in packets
1441  *
1442  * This function will clean all queues associated with a q_vector.
1443  *
1444  * Returns the amount of work done
1445  */
1446 int ice_napi_poll(struct napi_struct *napi, int budget)
1447 {
1448         struct ice_q_vector *q_vector =
1449                                 container_of(napi, struct ice_q_vector, napi);
1450         struct ice_tx_ring *tx_ring;
1451         struct ice_rx_ring *rx_ring;
1452         bool clean_complete = true;
1453         int budget_per_ring;
1454         int work_done = 0;
1455
1456         /* Since the actual Tx work is minimal, we can give the Tx a larger
1457          * budget and be more aggressive about cleaning up the Tx descriptors.
1458          */
1459         ice_for_each_tx_ring(tx_ring, q_vector->tx) {
1460                 bool wd;
1461
1462                 if (tx_ring->xsk_pool)
1463                         wd = ice_clean_tx_irq_zc(tx_ring, budget);
1464                 else if (ice_ring_is_xdp(tx_ring))
1465                         wd = true;
1466                 else
1467                         wd = ice_clean_tx_irq(tx_ring, budget);
1468
1469                 if (!wd)
1470                         clean_complete = false;
1471         }
1472
1473         /* Handle case where we are called by netpoll with a budget of 0 */
1474         if (unlikely(budget <= 0))
1475                 return budget;
1476
1477         /* normally we have 1 Rx ring per q_vector */
1478         if (unlikely(q_vector->num_ring_rx > 1))
1479                 /* We attempt to distribute budget to each Rx queue fairly, but
1480                  * don't allow the budget to go below 1 because that would exit
1481                  * polling early.
1482                  */
1483                 budget_per_ring = max_t(int, budget / q_vector->num_ring_rx, 1);
1484         else
1485                 /* Max of 1 Rx ring in this q_vector so give it the budget */
1486                 budget_per_ring = budget;
1487
1488         ice_for_each_rx_ring(rx_ring, q_vector->rx) {
1489                 int cleaned;
1490
1491                 /* A dedicated path for zero-copy allows making a single
1492                  * comparison in the irq context instead of many inside the
1493                  * ice_clean_rx_irq function and makes the codebase cleaner.
1494                  */
1495                 cleaned = rx_ring->xsk_pool ?
1496                           ice_clean_rx_irq_zc(rx_ring, budget_per_ring) :
1497                           ice_clean_rx_irq(rx_ring, budget_per_ring);
1498                 work_done += cleaned;
1499                 /* if we clean as many as budgeted, we must not be done */
1500                 if (cleaned >= budget_per_ring)
1501                         clean_complete = false;
1502         }
1503
1504         /* If work not completed, return budget and polling will return */
1505         if (!clean_complete) {
1506                 /* Set the writeback on ITR so partial completions of
1507                  * cache-lines will still continue even if we're polling.
1508                  */
1509                 ice_set_wb_on_itr(q_vector);
1510                 return budget;
1511         }
1512
1513         /* Exit the polling mode, but don't re-enable interrupts if stack might
1514          * poll us due to busy-polling
1515          */
1516         if (likely(napi_complete_done(napi, work_done))) {
1517                 ice_net_dim(q_vector);
1518                 ice_enable_interrupt(q_vector);
1519         } else {
1520                 ice_set_wb_on_itr(q_vector);
1521         }
1522
1523         return min_t(int, work_done, budget - 1);
1524 }
1525
1526 /**
1527  * __ice_maybe_stop_tx - 2nd level check for Tx stop conditions
1528  * @tx_ring: the ring to be checked
1529  * @size: the size buffer we want to assure is available
1530  *
1531  * Returns -EBUSY if a stop is needed, else 0
1532  */
1533 static int __ice_maybe_stop_tx(struct ice_tx_ring *tx_ring, unsigned int size)
1534 {
1535         netif_tx_stop_queue(txring_txq(tx_ring));
1536         /* Memory barrier before checking head and tail */
1537         smp_mb();
1538
1539         /* Check again in a case another CPU has just made room available. */
1540         if (likely(ICE_DESC_UNUSED(tx_ring) < size))
1541                 return -EBUSY;
1542
1543         /* A reprieve! - use start_queue because it doesn't call schedule */
1544         netif_tx_start_queue(txring_txq(tx_ring));
1545         ++tx_ring->tx_stats.restart_q;
1546         return 0;
1547 }
1548
/**
 * ice_maybe_stop_tx - 1st level check for Tx stop conditions
 * @tx_ring: the ring to be checked
 * @size:    the size buffer we want to assure is available
 *
 * Only falls through to the second level check, __ice_maybe_stop_tx(),
 * when fewer than @size descriptors are unused.
 *
 * Returns 0 if stop is not needed
 */
static int ice_maybe_stop_tx(struct ice_tx_ring *tx_ring, unsigned int size)
{
        if (unlikely(ICE_DESC_UNUSED(tx_ring) < size))
                return __ice_maybe_stop_tx(tx_ring, size);

        return 0;
}
1563
1564 /**
1565  * ice_tx_map - Build the Tx descriptor
1566  * @tx_ring: ring to send buffer on
1567  * @first: first buffer info buffer to use
1568  * @off: pointer to struct that holds offload parameters
1569  *
1570  * This function loops over the skb data pointed to by *first
1571  * and gets a physical address for each memory location and programs
1572  * it and the length into the transmit descriptor.
1573  */
1574 static void
1575 ice_tx_map(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first,
1576            struct ice_tx_offload_params *off)
1577 {
1578         u64 td_offset, td_tag, td_cmd;
1579         u16 i = tx_ring->next_to_use;
1580         unsigned int data_len, size;
1581         struct ice_tx_desc *tx_desc;
1582         struct ice_tx_buf *tx_buf;
1583         struct sk_buff *skb;
1584         skb_frag_t *frag;
1585         dma_addr_t dma;
1586         bool kick;
1587
1588         td_tag = off->td_l2tag1;
1589         td_cmd = off->td_cmd;
1590         td_offset = off->td_offset;
1591         skb = first->skb;
1592
1593         data_len = skb->data_len;
1594         size = skb_headlen(skb);
1595
1596         tx_desc = ICE_TX_DESC(tx_ring, i);
1597
1598         if (first->tx_flags & ICE_TX_FLAGS_HW_VLAN) {
1599                 td_cmd |= (u64)ICE_TX_DESC_CMD_IL2TAG1;
1600                 td_tag = (first->tx_flags & ICE_TX_FLAGS_VLAN_M) >>
1601                           ICE_TX_FLAGS_VLAN_S;
1602         }
1603
1604         dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
1605
1606         tx_buf = first;
1607
1608         for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
1609                 unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
1610
1611                 if (dma_mapping_error(tx_ring->dev, dma))
1612                         goto dma_error;
1613
1614                 /* record length, and DMA address */
1615                 dma_unmap_len_set(tx_buf, len, size);
1616                 dma_unmap_addr_set(tx_buf, dma, dma);
1617
1618                 /* align size to end of page */
1619                 max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1);
1620                 tx_desc->buf_addr = cpu_to_le64(dma);
1621
1622                 /* account for data chunks larger than the hardware
1623                  * can handle
1624                  */
1625                 while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
1626                         tx_desc->cmd_type_offset_bsz =
1627                                 ice_build_ctob(td_cmd, td_offset, max_data,
1628                                                td_tag);
1629
1630                         tx_desc++;
1631                         i++;
1632
1633                         if (i == tx_ring->count) {
1634                                 tx_desc = ICE_TX_DESC(tx_ring, 0);
1635                                 i = 0;
1636                         }
1637
1638                         dma += max_data;
1639                         size -= max_data;
1640
1641                         max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
1642                         tx_desc->buf_addr = cpu_to_le64(dma);
1643                 }
1644
1645                 if (likely(!data_len))
1646                         break;
1647
1648                 tx_desc->cmd_type_offset_bsz = ice_build_ctob(td_cmd, td_offset,
1649                                                               size, td_tag);
1650
1651                 tx_desc++;
1652                 i++;
1653
1654                 if (i == tx_ring->count) {
1655                         tx_desc = ICE_TX_DESC(tx_ring, 0);
1656                         i = 0;
1657                 }
1658
1659                 size = skb_frag_size(frag);
1660                 data_len -= size;
1661
1662                 dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
1663                                        DMA_TO_DEVICE);
1664
1665                 tx_buf = &tx_ring->tx_buf[i];
1666         }
1667
1668         /* record SW timestamp if HW timestamp is not available */
1669         skb_tx_timestamp(first->skb);
1670
1671         i++;
1672         if (i == tx_ring->count)
1673                 i = 0;
1674
1675         /* write last descriptor with RS and EOP bits */
1676         td_cmd |= (u64)ICE_TXD_LAST_DESC_CMD;
1677         tx_desc->cmd_type_offset_bsz =
1678                         ice_build_ctob(td_cmd, td_offset, size, td_tag);
1679
1680         /* Force memory writes to complete before letting h/w know there
1681          * are new descriptors to fetch.
1682          *
1683          * We also use this memory barrier to make certain all of the
1684          * status bits have been updated before next_to_watch is written.
1685          */
1686         wmb();
1687
1688         /* set next_to_watch value indicating a packet is present */
1689         first->next_to_watch = tx_desc;
1690
1691         tx_ring->next_to_use = i;
1692
1693         ice_maybe_stop_tx(tx_ring, DESC_NEEDED);
1694
1695         /* notify HW of packet */
1696         kick = __netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount,
1697                                       netdev_xmit_more());
1698         if (kick)
1699                 /* notify HW of packet */
1700                 writel(i, tx_ring->tail);
1701
1702         return;
1703
1704 dma_error:
1705         /* clear DMA mappings for failed tx_buf map */
1706         for (;;) {
1707                 tx_buf = &tx_ring->tx_buf[i];
1708                 ice_unmap_and_free_tx_buf(tx_ring, tx_buf);
1709                 if (tx_buf == first)
1710                         break;
1711                 if (i == 0)
1712                         i = tx_ring->count;
1713                 i--;
1714         }
1715
1716         tx_ring->next_to_use = i;
1717 }
1718
1719 /**
1720  * ice_tx_csum - Enable Tx checksum offloads
1721  * @first: pointer to the first descriptor
1722  * @off: pointer to struct that holds offload parameters
1723  *
1724  * Returns 0 or error (negative) if checksum offload can't happen, 1 otherwise.
1725  */
1726 static
1727 int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
1728 {
1729         u32 l4_len = 0, l3_len = 0, l2_len = 0;
1730         struct sk_buff *skb = first->skb;
1731         union {
1732                 struct iphdr *v4;
1733                 struct ipv6hdr *v6;
1734                 unsigned char *hdr;
1735         } ip;
1736         union {
1737                 struct tcphdr *tcp;
1738                 unsigned char *hdr;
1739         } l4;
1740         __be16 frag_off, protocol;
1741         unsigned char *exthdr;
1742         u32 offset, cmd = 0;
1743         u8 l4_proto = 0;
1744
1745         if (skb->ip_summed != CHECKSUM_PARTIAL)
1746                 return 0;
1747
1748         ip.hdr = skb_network_header(skb);
1749         l4.hdr = skb_transport_header(skb);
1750
1751         /* compute outer L2 header size */
1752         l2_len = ip.hdr - skb->data;
1753         offset = (l2_len / 2) << ICE_TX_DESC_LEN_MACLEN_S;
1754
1755         protocol = vlan_get_protocol(skb);
1756
1757         if (protocol == htons(ETH_P_IP))
1758                 first->tx_flags |= ICE_TX_FLAGS_IPV4;
1759         else if (protocol == htons(ETH_P_IPV6))
1760                 first->tx_flags |= ICE_TX_FLAGS_IPV6;
1761
1762         if (skb->encapsulation) {
1763                 bool gso_ena = false;
1764                 u32 tunnel = 0;
1765
1766                 /* define outer network header type */
1767                 if (first->tx_flags & ICE_TX_FLAGS_IPV4) {
1768                         tunnel |= (first->tx_flags & ICE_TX_FLAGS_TSO) ?
1769                                   ICE_TX_CTX_EIPT_IPV4 :
1770                                   ICE_TX_CTX_EIPT_IPV4_NO_CSUM;
1771                         l4_proto = ip.v4->protocol;
1772                 } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
1773                         int ret;
1774
1775                         tunnel |= ICE_TX_CTX_EIPT_IPV6;
1776                         exthdr = ip.hdr + sizeof(*ip.v6);
1777                         l4_proto = ip.v6->nexthdr;
1778                         ret = ipv6_skip_exthdr(skb, exthdr - skb->data,
1779                                                &l4_proto, &frag_off);
1780                         if (ret < 0)
1781                                 return -1;
1782                 }
1783
1784                 /* define outer transport */
1785                 switch (l4_proto) {
1786                 case IPPROTO_UDP:
1787                         tunnel |= ICE_TXD_CTX_UDP_TUNNELING;
1788                         first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
1789                         break;
1790                 case IPPROTO_GRE:
1791                         tunnel |= ICE_TXD_CTX_GRE_TUNNELING;
1792                         first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
1793                         break;
1794                 case IPPROTO_IPIP:
1795                 case IPPROTO_IPV6:
1796                         first->tx_flags |= ICE_TX_FLAGS_TUNNEL;
1797                         l4.hdr = skb_inner_network_header(skb);
1798                         break;
1799                 default:
1800                         if (first->tx_flags & ICE_TX_FLAGS_TSO)
1801                                 return -1;
1802
1803                         skb_checksum_help(skb);
1804                         return 0;
1805                 }
1806
1807                 /* compute outer L3 header size */
1808                 tunnel |= ((l4.hdr - ip.hdr) / 4) <<
1809                           ICE_TXD_CTX_QW0_EIPLEN_S;
1810
1811                 /* switch IP header pointer from outer to inner header */
1812                 ip.hdr = skb_inner_network_header(skb);
1813
1814                 /* compute tunnel header size */
1815                 tunnel |= ((ip.hdr - l4.hdr) / 2) <<
1816                            ICE_TXD_CTX_QW0_NATLEN_S;
1817
1818                 gso_ena = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL;
1819                 /* indicate if we need to offload outer UDP header */
1820                 if ((first->tx_flags & ICE_TX_FLAGS_TSO) && !gso_ena &&
1821                     (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
1822                         tunnel |= ICE_TXD_CTX_QW0_L4T_CS_M;
1823
1824                 /* record tunnel offload values */
1825                 off->cd_tunnel_params |= tunnel;
1826
1827                 /* set DTYP=1 to indicate that it's an Tx context descriptor
1828                  * in IPsec tunnel mode with Tx offloads in Quad word 1
1829                  */
1830                 off->cd_qw1 |= (u64)ICE_TX_DESC_DTYPE_CTX;
1831
1832                 /* switch L4 header pointer from outer to inner */
1833                 l4.hdr = skb_inner_transport_header(skb);
1834                 l4_proto = 0;
1835
1836                 /* reset type as we transition from outer to inner headers */
1837                 first->tx_flags &= ~(ICE_TX_FLAGS_IPV4 | ICE_TX_FLAGS_IPV6);
1838                 if (ip.v4->version == 4)
1839                         first->tx_flags |= ICE_TX_FLAGS_IPV4;
1840                 if (ip.v6->version == 6)
1841                         first->tx_flags |= ICE_TX_FLAGS_IPV6;
1842         }
1843
1844         /* Enable IP checksum offloads */
1845         if (first->tx_flags & ICE_TX_FLAGS_IPV4) {
1846                 l4_proto = ip.v4->protocol;
1847                 /* the stack computes the IP header already, the only time we
1848                  * need the hardware to recompute it is in the case of TSO.
1849                  */
1850                 if (first->tx_flags & ICE_TX_FLAGS_TSO)
1851                         cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM;
1852                 else
1853                         cmd |= ICE_TX_DESC_CMD_IIPT_IPV4;
1854
1855         } else if (first->tx_flags & ICE_TX_FLAGS_IPV6) {
1856                 cmd |= ICE_TX_DESC_CMD_IIPT_IPV6;
1857                 exthdr = ip.hdr + sizeof(*ip.v6);
1858                 l4_proto = ip.v6->nexthdr;
1859                 if (l4.hdr != exthdr)
1860                         ipv6_skip_exthdr(skb, exthdr - skb->data, &l4_proto,
1861                                          &frag_off);
1862         } else {
1863                 return -1;
1864         }
1865
1866         /* compute inner L3 header size */
1867         l3_len = l4.hdr - ip.hdr;
1868         offset |= (l3_len / 4) << ICE_TX_DESC_LEN_IPLEN_S;
1869
1870         /* Enable L4 checksum offloads */
1871         switch (l4_proto) {
1872         case IPPROTO_TCP:
1873                 /* enable checksum offloads */
1874                 cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP;
1875                 l4_len = l4.tcp->doff;
1876                 offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
1877                 break;
1878         case IPPROTO_UDP:
1879                 /* enable UDP checksum offload */
1880                 cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP;
1881                 l4_len = (sizeof(struct udphdr) >> 2);
1882                 offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
1883                 break;
1884         case IPPROTO_SCTP:
1885                 /* enable SCTP checksum offload */
1886                 cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP;
1887                 l4_len = sizeof(struct sctphdr) >> 2;
1888                 offset |= l4_len << ICE_TX_DESC_LEN_L4_LEN_S;
1889                 break;
1890
1891         default:
1892                 if (first->tx_flags & ICE_TX_FLAGS_TSO)
1893                         return -1;
1894                 skb_checksum_help(skb);
1895                 return 0;
1896         }
1897
1898         off->td_cmd |= cmd;
1899         off->td_offset |= offset;
1900         return 1;
1901 }
1902
1903 /**
1904  * ice_tx_prepare_vlan_flags - prepare generic Tx VLAN tagging flags for HW
1905  * @tx_ring: ring to send buffer on
1906  * @first: pointer to struct ice_tx_buf
1907  *
1908  * Checks the skb and set up correspondingly several generic transmit flags
1909  * related to VLAN tagging for the HW, such as VLAN, DCB, etc.
1910  */
1911 static void
1912 ice_tx_prepare_vlan_flags(struct ice_tx_ring *tx_ring, struct ice_tx_buf *first)
1913 {
1914         struct sk_buff *skb = first->skb;
1915
1916         /* nothing left to do, software offloaded VLAN */
1917         if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol))
1918                 return;
1919
1920         /* currently, we always assume 802.1Q for VLAN insertion as VLAN
1921          * insertion for 802.1AD is not supported
1922          */
1923         if (skb_vlan_tag_present(skb)) {
1924                 first->tx_flags |= skb_vlan_tag_get(skb) << ICE_TX_FLAGS_VLAN_S;
1925                 first->tx_flags |= ICE_TX_FLAGS_HW_VLAN;
1926         }
1927
1928         ice_tx_prepare_vlan_flags_dcb(tx_ring, first);
1929 }
1930
1931 /**
1932  * ice_tso - computes mss and TSO length to prepare for TSO
1933  * @first: pointer to struct ice_tx_buf
1934  * @off: pointer to struct that holds offload parameters
1935  *
1936  * Returns 0 or error (negative) if TSO can't happen, 1 otherwise.
1937  */
1938 static
1939 int ice_tso(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
1940 {
1941         struct sk_buff *skb = first->skb;
1942         union {
1943                 struct iphdr *v4;
1944                 struct ipv6hdr *v6;
1945                 unsigned char *hdr;
1946         } ip;
1947         union {
1948                 struct tcphdr *tcp;
1949                 struct udphdr *udp;
1950                 unsigned char *hdr;
1951         } l4;
1952         u64 cd_mss, cd_tso_len;
1953         u32 paylen;
1954         u8 l4_start;
1955         int err;
1956
1957         if (skb->ip_summed != CHECKSUM_PARTIAL)
1958                 return 0;
1959
1960         if (!skb_is_gso(skb))
1961                 return 0;
1962
1963         err = skb_cow_head(skb, 0);
1964         if (err < 0)
1965                 return err;
1966
1967         /* cppcheck-suppress unreadVariable */
1968         ip.hdr = skb_network_header(skb);
1969         l4.hdr = skb_transport_header(skb);
1970
1971         /* initialize outer IP header fields */
1972         if (ip.v4->version == 4) {
1973                 ip.v4->tot_len = 0;
1974                 ip.v4->check = 0;
1975         } else {
1976                 ip.v6->payload_len = 0;
1977         }
1978
1979         if (skb_shinfo(skb)->gso_type & (SKB_GSO_GRE |
1980                                          SKB_GSO_GRE_CSUM |
1981                                          SKB_GSO_IPXIP4 |
1982                                          SKB_GSO_IPXIP6 |
1983                                          SKB_GSO_UDP_TUNNEL |
1984                                          SKB_GSO_UDP_TUNNEL_CSUM)) {
1985                 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
1986                     (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)) {
1987                         l4.udp->len = 0;
1988
1989                         /* determine offset of outer transport header */
1990                         l4_start = (u8)(l4.hdr - skb->data);
1991
1992                         /* remove payload length from outer checksum */
1993                         paylen = skb->len - l4_start;
1994                         csum_replace_by_diff(&l4.udp->check,
1995                                              (__force __wsum)htonl(paylen));
1996                 }
1997
1998                 /* reset pointers to inner headers */
1999
2000                 /* cppcheck-suppress unreadVariable */
2001                 ip.hdr = skb_inner_network_header(skb);
2002                 l4.hdr = skb_inner_transport_header(skb);
2003
2004                 /* initialize inner IP header fields */
2005                 if (ip.v4->version == 4) {
2006                         ip.v4->tot_len = 0;
2007                         ip.v4->check = 0;
2008                 } else {
2009                         ip.v6->payload_len = 0;
2010                 }
2011         }
2012
2013         /* determine offset of transport header */
2014         l4_start = (u8)(l4.hdr - skb->data);
2015
2016         /* remove payload length from checksum */
2017         paylen = skb->len - l4_start;
2018
2019         if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
2020                 csum_replace_by_diff(&l4.udp->check,
2021                                      (__force __wsum)htonl(paylen));
2022                 /* compute length of UDP segmentation header */
2023                 off->header_len = (u8)sizeof(l4.udp) + l4_start;
2024         } else {
2025                 csum_replace_by_diff(&l4.tcp->check,
2026                                      (__force __wsum)htonl(paylen));
2027                 /* compute length of TCP segmentation header */
2028                 off->header_len = (u8)((l4.tcp->doff * 4) + l4_start);
2029         }
2030
2031         /* update gso_segs and bytecount */
2032         first->gso_segs = skb_shinfo(skb)->gso_segs;
2033         first->bytecount += (first->gso_segs - 1) * off->header_len;
2034
2035         cd_tso_len = skb->len - off->header_len;
2036         cd_mss = skb_shinfo(skb)->gso_size;
2037
2038         /* record cdesc_qw1 with TSO parameters */
2039         off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2040                              (ICE_TX_CTX_DESC_TSO << ICE_TXD_CTX_QW1_CMD_S) |
2041                              (cd_tso_len << ICE_TXD_CTX_QW1_TSO_LEN_S) |
2042                              (cd_mss << ICE_TXD_CTX_QW1_MSS_S));
2043         first->tx_flags |= ICE_TX_FLAGS_TSO;
2044         return 1;
2045 }
2046
2047 /**
2048  * ice_txd_use_count  - estimate the number of descriptors needed for Tx
2049  * @size: transmit request size in bytes
2050  *
2051  * Due to hardware alignment restrictions (4K alignment), we need to
2052  * assume that we can have no more than 12K of data per descriptor, even
2053  * though each descriptor can take up to 16K - 1 bytes of aligned memory.
2054  * Thus, we need to divide by 12K. But division is slow! Instead,
2055  * we decompose the operation into shifts and one relatively cheap
2056  * multiply operation.
2057  *
2058  * To divide by 12K, we first divide by 4K, then divide by 3:
2059  *     To divide by 4K, shift right by 12 bits
2060  *     To divide by 3, multiply by 85, then divide by 256
2061  *     (Divide by 256 is done by shifting right by 8 bits)
2062  * Finally, we add one to round up. Because 256 isn't an exact multiple of
2063  * 3, we'll underestimate near each multiple of 12K. This is actually more
2064  * accurate as we have 4K - 1 of wiggle room that we can fit into the last
2065  * segment. For our purposes this is accurate out to 1M which is orders of
2066  * magnitude greater than our largest possible GSO size.
2067  *
2068  * This would then be implemented as:
2069  *     return (((size >> 12) * 85) >> 8) + ICE_DESCS_FOR_SKB_DATA_PTR;
2070  *
2071  * Since multiplication and division are commutative, we can reorder
2072  * operations into:
2073  *     return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
2074  */
2075 static unsigned int ice_txd_use_count(unsigned int size)
2076 {
2077         return ((size * 85) >> 20) + ICE_DESCS_FOR_SKB_DATA_PTR;
2078 }
2079
2080 /**
2081  * ice_xmit_desc_count - calculate number of Tx descriptors needed
2082  * @skb: send buffer
2083  *
2084  * Returns number of data descriptors needed for this skb.
2085  */
2086 static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
2087 {
2088         const skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
2089         unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
2090         unsigned int count = 0, size = skb_headlen(skb);
2091
2092         for (;;) {
2093                 count += ice_txd_use_count(size);
2094
2095                 if (!nr_frags--)
2096                         break;
2097
2098                 size = skb_frag_size(frag++);
2099         }
2100
2101         return count;
2102 }
2103
2104 /**
2105  * __ice_chk_linearize - Check if there are more than 8 buffers per packet
2106  * @skb: send buffer
2107  *
2108  * Note: This HW can't DMA more than 8 buffers to build a packet on the wire
2109  * and so we need to figure out the cases where we need to linearize the skb.
2110  *
2111  * For TSO we need to count the TSO header and segment payload separately.
2112  * As such we need to check cases where we have 7 fragments or more as we
2113  * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
2114  * the segment payload in the first descriptor, and another 7 for the
2115  * fragments.
2116  */
2117 static bool __ice_chk_linearize(struct sk_buff *skb)
2118 {
2119         const skb_frag_t *frag, *stale;
2120         int nr_frags, sum;
2121
2122         /* no need to check if number of frags is less than 7 */
2123         nr_frags = skb_shinfo(skb)->nr_frags;
2124         if (nr_frags < (ICE_MAX_BUF_TXD - 1))
2125                 return false;
2126
2127         /* We need to walk through the list and validate that each group
2128          * of 6 fragments totals at least gso_size.
2129          */
2130         nr_frags -= ICE_MAX_BUF_TXD - 2;
2131         frag = &skb_shinfo(skb)->frags[0];
2132
2133         /* Initialize size to the negative value of gso_size minus 1. We
2134          * use this as the worst case scenario in which the frag ahead
2135          * of us only provides one byte which is why we are limited to 6
2136          * descriptors for a single transmit as the header and previous
2137          * fragment are already consuming 2 descriptors.
2138          */
2139         sum = 1 - skb_shinfo(skb)->gso_size;
2140
2141         /* Add size of frags 0 through 4 to create our initial sum */
2142         sum += skb_frag_size(frag++);
2143         sum += skb_frag_size(frag++);
2144         sum += skb_frag_size(frag++);
2145         sum += skb_frag_size(frag++);
2146         sum += skb_frag_size(frag++);
2147
2148         /* Walk through fragments adding latest fragment, testing it, and
2149          * then removing stale fragments from the sum.
2150          */
2151         for (stale = &skb_shinfo(skb)->frags[0];; stale++) {
2152                 int stale_size = skb_frag_size(stale);
2153
2154                 sum += skb_frag_size(frag++);
2155
2156                 /* The stale fragment may present us with a smaller
2157                  * descriptor than the actual fragment size. To account
2158                  * for that we need to remove all the data on the front and
2159                  * figure out what the remainder would be in the last
2160                  * descriptor associated with the fragment.
2161                  */
2162                 if (stale_size > ICE_MAX_DATA_PER_TXD) {
2163                         int align_pad = -(skb_frag_off(stale)) &
2164                                         (ICE_MAX_READ_REQ_SIZE - 1);
2165
2166                         sum -= align_pad;
2167                         stale_size -= align_pad;
2168
2169                         do {
2170                                 sum -= ICE_MAX_DATA_PER_TXD_ALIGNED;
2171                                 stale_size -= ICE_MAX_DATA_PER_TXD_ALIGNED;
2172                         } while (stale_size > ICE_MAX_DATA_PER_TXD);
2173                 }
2174
2175                 /* if sum is negative we failed to make sufficient progress */
2176                 if (sum < 0)
2177                         return true;
2178
2179                 if (!nr_frags--)
2180                         break;
2181
2182                 sum -= stale_size;
2183         }
2184
2185         return false;
2186 }
2187
2188 /**
2189  * ice_chk_linearize - Check if there are more than 8 fragments per packet
2190  * @skb:      send buffer
2191  * @count:    number of buffers used
2192  *
2193  * Note: Our HW can't scatter-gather more than 8 fragments to build
2194  * a packet on the wire and so we need to figure out the cases where we
2195  * need to linearize the skb.
2196  */
2197 static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
2198 {
2199         /* Both TSO and single send will work if count is less than 8 */
2200         if (likely(count < ICE_MAX_BUF_TXD))
2201                 return false;
2202
2203         if (skb_is_gso(skb))
2204                 return __ice_chk_linearize(skb);
2205
2206         /* we can support up to 8 data buffers for a single send */
2207         return count != ICE_MAX_BUF_TXD;
2208 }
2209
2210 /**
2211  * ice_tstamp - set up context descriptor for hardware timestamp
2212  * @tx_ring: pointer to the Tx ring to send buffer on
2213  * @skb: pointer to the SKB we're sending
2214  * @first: Tx buffer
2215  * @off: Tx offload parameters
2216  */
2217 static void
2218 ice_tstamp(struct ice_tx_ring *tx_ring, struct sk_buff *skb,
2219            struct ice_tx_buf *first, struct ice_tx_offload_params *off)
2220 {
2221         s8 idx;
2222
2223         /* only timestamp the outbound packet if the user has requested it */
2224         if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)))
2225                 return;
2226
2227         if (!tx_ring->ptp_tx)
2228                 return;
2229
2230         /* Tx timestamps cannot be sampled when doing TSO */
2231         if (first->tx_flags & ICE_TX_FLAGS_TSO)
2232                 return;
2233
2234         /* Grab an open timestamp slot */
2235         idx = ice_ptp_request_ts(tx_ring->tx_tstamps, skb);
2236         if (idx < 0)
2237                 return;
2238
2239         off->cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2240                              (ICE_TX_CTX_DESC_TSYN << ICE_TXD_CTX_QW1_CMD_S) |
2241                              ((u64)idx << ICE_TXD_CTX_QW1_TSO_LEN_S));
2242         first->tx_flags |= ICE_TX_FLAGS_TSYN;
2243 }
2244
2245 /**
2246  * ice_xmit_frame_ring - Sends buffer on Tx ring
2247  * @skb: send buffer
2248  * @tx_ring: ring to send buffer on
2249  *
2250  * Returns NETDEV_TX_OK if sent, else an error code
2251  */
2252 static netdev_tx_t
2253 ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring)
2254 {
2255         struct ice_tx_offload_params offload = { 0 };
2256         struct ice_vsi *vsi = tx_ring->vsi;
2257         struct ice_tx_buf *first;
2258         struct ethhdr *eth;
2259         unsigned int count;
2260         int tso, csum;
2261
2262         ice_trace(xmit_frame_ring, tx_ring, skb);
2263
2264         count = ice_xmit_desc_count(skb);
2265         if (ice_chk_linearize(skb, count)) {
2266                 if (__skb_linearize(skb))
2267                         goto out_drop;
2268                 count = ice_txd_use_count(skb->len);
2269                 tx_ring->tx_stats.tx_linearize++;
2270         }
2271
2272         /* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD,
2273          *       + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD,
2274          *       + 4 desc gap to avoid the cache line where head is,
2275          *       + 1 desc for context descriptor,
2276          * otherwise try next time
2277          */
2278         if (ice_maybe_stop_tx(tx_ring, count + ICE_DESCS_PER_CACHE_LINE +
2279                               ICE_DESCS_FOR_CTX_DESC)) {
2280                 tx_ring->tx_stats.tx_busy++;
2281                 return NETDEV_TX_BUSY;
2282         }
2283
2284         /* prefetch for bql data which is infrequently used */
2285         netdev_txq_bql_enqueue_prefetchw(txring_txq(tx_ring));
2286
2287         offload.tx_ring = tx_ring;
2288
2289         /* record the location of the first descriptor for this packet */
2290         first = &tx_ring->tx_buf[tx_ring->next_to_use];
2291         first->skb = skb;
2292         first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
2293         first->gso_segs = 1;
2294         first->tx_flags = 0;
2295
2296         /* prepare the VLAN tagging flags for Tx */
2297         ice_tx_prepare_vlan_flags(tx_ring, first);
2298
2299         /* set up TSO offload */
2300         tso = ice_tso(first, &offload);
2301         if (tso < 0)
2302                 goto out_drop;
2303
2304         /* always set up Tx checksum offload */
2305         csum = ice_tx_csum(first, &offload);
2306         if (csum < 0)
2307                 goto out_drop;
2308
2309         /* allow CONTROL frames egress from main VSI if FW LLDP disabled */
2310         eth = (struct ethhdr *)skb_mac_header(skb);
2311         if (unlikely((skb->priority == TC_PRIO_CONTROL ||
2312                       eth->h_proto == htons(ETH_P_LLDP)) &&
2313                      vsi->type == ICE_VSI_PF &&
2314                      vsi->port_info->qos_cfg.is_sw_lldp))
2315                 offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX |
2316                                         ICE_TX_CTX_DESC_SWTCH_UPLINK <<
2317                                         ICE_TXD_CTX_QW1_CMD_S);
2318
2319         ice_tstamp(tx_ring, skb, first, &offload);
2320         if (ice_is_switchdev_running(vsi->back))
2321                 ice_eswitch_set_target_vsi(skb, &offload);
2322
2323         if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) {
2324                 struct ice_tx_ctx_desc *cdesc;
2325                 u16 i = tx_ring->next_to_use;
2326
2327                 /* grab the next descriptor */
2328                 cdesc = ICE_TX_CTX_DESC(tx_ring, i);
2329                 i++;
2330                 tx_ring->next_to_use = (i < tx_ring->count) ? i : 0;
2331
2332                 /* setup context descriptor */
2333                 cdesc->tunneling_params = cpu_to_le32(offload.cd_tunnel_params);
2334                 cdesc->l2tag2 = cpu_to_le16(offload.cd_l2tag2);
2335                 cdesc->rsvd = cpu_to_le16(0);
2336                 cdesc->qw1 = cpu_to_le64(offload.cd_qw1);
2337         }
2338
2339         ice_tx_map(tx_ring, first, &offload);
2340         return NETDEV_TX_OK;
2341
2342 out_drop:
2343         ice_trace(xmit_frame_ring_drop, tx_ring, skb);
2344         dev_kfree_skb_any(skb);
2345         return NETDEV_TX_OK;
2346 }
2347
2348 /**
2349  * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
2350  * @skb: send buffer
2351  * @netdev: network interface device structure
2352  *
2353  * Returns NETDEV_TX_OK if sent, else an error code
2354  */
2355 netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
2356 {
2357         struct ice_netdev_priv *np = netdev_priv(netdev);
2358         struct ice_vsi *vsi = np->vsi;
2359         struct ice_tx_ring *tx_ring;
2360
2361         tx_ring = vsi->tx_rings[skb->queue_mapping];
2362
2363         /* hardware can't handle really short frames, hardware padding works
2364          * beyond this point
2365          */
2366         if (skb_put_padto(skb, ICE_MIN_TX_LEN))
2367                 return NETDEV_TX_OK;
2368
2369         return ice_xmit_frame_ring(skb, tx_ring);
2370 }
2371
2372 /**
2373  * ice_get_dscp_up - return the UP/TC value for a SKB
2374  * @dcbcfg: DCB config that contains DSCP to UP/TC mapping
2375  * @skb: SKB to query for info to determine UP/TC
2376  *
2377  * This function is to only be called when the PF is in L3 DSCP PFC mode
2378  */
2379 static u8 ice_get_dscp_up(struct ice_dcbx_cfg *dcbcfg, struct sk_buff *skb)
2380 {
2381         u8 dscp = 0;
2382
2383         if (skb->protocol == htons(ETH_P_IP))
2384                 dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
2385         else if (skb->protocol == htons(ETH_P_IPV6))
2386                 dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
2387
2388         return dcbcfg->dscp_map[dscp];
2389 }
2390
2391 u16
2392 ice_select_queue(struct net_device *netdev, struct sk_buff *skb,
2393                  struct net_device *sb_dev)
2394 {
2395         struct ice_pf *pf = ice_netdev_to_pf(netdev);
2396         struct ice_dcbx_cfg *dcbcfg;
2397
2398         dcbcfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
2399         if (dcbcfg->pfc_mode == ICE_QOS_MODE_DSCP)
2400                 skb->priority = ice_get_dscp_up(dcbcfg, skb);
2401
2402         return netdev_pick_tx(netdev, skb, sb_dev);
2403 }
2404
2405 /**
2406  * ice_clean_ctrl_tx_irq - interrupt handler for flow director Tx queue
2407  * @tx_ring: tx_ring to clean
2408  */
2409 void ice_clean_ctrl_tx_irq(struct ice_tx_ring *tx_ring)
2410 {
2411         struct ice_vsi *vsi = tx_ring->vsi;
2412         s16 i = tx_ring->next_to_clean;
2413         int budget = ICE_DFLT_IRQ_WORK;
2414         struct ice_tx_desc *tx_desc;
2415         struct ice_tx_buf *tx_buf;
2416
2417         tx_buf = &tx_ring->tx_buf[i];
2418         tx_desc = ICE_TX_DESC(tx_ring, i);
2419         i -= tx_ring->count;
2420
2421         do {
2422                 struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;
2423
2424                 /* if next_to_watch is not set then there is no pending work */
2425                 if (!eop_desc)
2426                         break;
2427
2428                 /* prevent any other reads prior to eop_desc */
2429                 smp_rmb();
2430
2431                 /* if the descriptor isn't done, no work to do */
2432                 if (!(eop_desc->cmd_type_offset_bsz &
2433                       cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
2434                         break;
2435
2436                 /* clear next_to_watch to prevent false hangs */
2437                 tx_buf->next_to_watch = NULL;
2438                 tx_desc->buf_addr = 0;
2439                 tx_desc->cmd_type_offset_bsz = 0;
2440
2441                 /* move past filter desc */
2442                 tx_buf++;
2443                 tx_desc++;
2444                 i++;
2445                 if (unlikely(!i)) {
2446                         i -= tx_ring->count;
2447                         tx_buf = tx_ring->tx_buf;
2448                         tx_desc = ICE_TX_DESC(tx_ring, 0);
2449                 }
2450
2451                 /* unmap the data header */
2452                 if (dma_unmap_len(tx_buf, len))
2453                         dma_unmap_single(tx_ring->dev,
2454                                          dma_unmap_addr(tx_buf, dma),
2455                                          dma_unmap_len(tx_buf, len),
2456                                          DMA_TO_DEVICE);
2457                 if (tx_buf->tx_flags & ICE_TX_FLAGS_DUMMY_PKT)
2458                         devm_kfree(tx_ring->dev, tx_buf->raw_buf);
2459
2460                 /* clear next_to_watch to prevent false hangs */
2461                 tx_buf->raw_buf = NULL;
2462                 tx_buf->tx_flags = 0;
2463                 tx_buf->next_to_watch = NULL;
2464                 dma_unmap_len_set(tx_buf, len, 0);
2465                 tx_desc->buf_addr = 0;
2466                 tx_desc->cmd_type_offset_bsz = 0;
2467
2468                 /* move past eop_desc for start of next FD desc */
2469                 tx_buf++;
2470                 tx_desc++;
2471                 i++;
2472                 if (unlikely(!i)) {
2473                         i -= tx_ring->count;
2474                         tx_buf = tx_ring->tx_buf;
2475                         tx_desc = ICE_TX_DESC(tx_ring, 0);
2476                 }
2477
2478                 budget--;
2479         } while (likely(budget));
2480
2481         i += tx_ring->count;
2482         tx_ring->next_to_clean = i;
2483
2484         /* re-enable interrupt if needed */
2485         ice_irq_dynamic_ena(&vsi->back->hw, vsi, vsi->q_vectors[0]);
2486 }