drivers/net/virtio_net.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* A network driver using virtio.
   3  *
   4  * Copyright 2007 Rusty Russell <[email protected]> IBM Corporation
   5  */
   6 //#define DEBUG
   7 #include <linux/netdevice.h>
   8 #include <linux/etherdevice.h>
   9 #include <linux/ethtool.h>
  10 #include <linux/module.h>
  11 #include <linux/virtio.h>
  12 #include <linux/virtio_net.h>
  13 #include <linux/bpf.h>
  14 #include <linux/bpf_trace.h>
  15 #include <linux/scatterlist.h>
  16 #include <linux/if_vlan.h>
  17 #include <linux/slab.h>
  18 #include <linux/cpu.h>
  19 #include <linux/average.h>
  20 #include <linux/filter.h>
  21 #include <linux/kernel.h>
  22 #include <net/route.h>
  23 #include <net/xdp.h>
  24 #include <net/net_failover.h>
  25
  26 static int napi_weight = NAPI_POLL_WEIGHT;
  27 module_param(napi_weight, int, 0444);
  28
  29 static bool csum = true, gso = true, napi_tx = true;
  30 module_param(csum, bool, 0444);
  31 module_param(gso, bool, 0444);
  32 module_param(napi_tx, bool, 0644);
  33
  34 /* FIXME: MTU in config. */
  35 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
  36 #define GOOD_COPY_LEN   128
  37
  38 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
  39
  40 /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
  41 #define VIRTIO_XDP_HEADROOM 256
  42
  43 /* Separating two types of XDP xmit */
  44 #define VIRTIO_XDP_TX           BIT(0)
  45 #define VIRTIO_XDP_REDIR        BIT(1)
  46
  47 #define VIRTIO_XDP_FLAG BIT(0)
  48
  49 /* RX packet size EWMA. The average packet size is used to determine the packet
  50  * buffer size when refilling RX rings. As the entire RX ring may be refilled
  51  * at once, the weight is chosen so that the EWMA will be insensitive to short-
  52  * term, transient changes in packet size.
  53  */
  54 DECLARE_EWMA(pkt_len, 0, 64)
  55
  56 #define VIRTNET_DRIVER_VERSION "1.0.0"
  57
  58 static const unsigned long guest_offloads[] = {
  59         VIRTIO_NET_F_GUEST_TSO4,
  60         VIRTIO_NET_F_GUEST_TSO6,
  61         VIRTIO_NET_F_GUEST_ECN,
  62         VIRTIO_NET_F_GUEST_UFO,
  63         VIRTIO_NET_F_GUEST_CSUM,
  64         VIRTIO_NET_F_GUEST_USO4,
  65         VIRTIO_NET_F_GUEST_USO6
  66 };
  67
  68 #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
  69                                 (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
  70                                 (1ULL << VIRTIO_NET_F_GUEST_ECN)  | \
  71                                 (1ULL << VIRTIO_NET_F_GUEST_UFO)  | \
  72                                 (1ULL << VIRTIO_NET_F_GUEST_USO4) | \
  73                                 (1ULL << VIRTIO_NET_F_GUEST_USO6))
  74
  75 struct virtnet_stat_desc {
  76         char desc[ETH_GSTRING_LEN];
  77         size_t offset;
  78 };
  79
  80 struct virtnet_sq_stats {
  81         struct u64_stats_sync syncp;
  82         u64 packets;
  83         u64 bytes;
  84         u64 xdp_tx;
  85         u64 xdp_tx_drops;
  86         u64 kicks;
  87         u64 tx_timeouts;
  88 };
  89
  90 struct virtnet_rq_stats {
  91         struct u64_stats_sync syncp;
  92         u64 packets;
  93         u64 bytes;
  94         u64 drops;
  95         u64 xdp_packets;
  96         u64 xdp_tx;
  97         u64 xdp_redirects;
  98         u64 xdp_drops;
  99         u64 kicks;
 100 };
 101
 102 #define VIRTNET_SQ_STAT(m)      offsetof(struct virtnet_sq_stats, m)
 103 #define VIRTNET_RQ_STAT(m)      offsetof(struct virtnet_rq_stats, m)
 104
 105 static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
 106         { "packets",            VIRTNET_SQ_STAT(packets) },
 107         { "bytes",              VIRTNET_SQ_STAT(bytes) },
 108         { "xdp_tx",             VIRTNET_SQ_STAT(xdp_tx) },
 109         { "xdp_tx_drops",       VIRTNET_SQ_STAT(xdp_tx_drops) },
 110         { "kicks",              VIRTNET_SQ_STAT(kicks) },
 111         { "tx_timeouts",        VIRTNET_SQ_STAT(tx_timeouts) },
 112 };
 113
 114 static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
 115         { "packets",            VIRTNET_RQ_STAT(packets) },
 116         { "bytes",              VIRTNET_RQ_STAT(bytes) },
 117         { "drops",              VIRTNET_RQ_STAT(drops) },
 118         { "xdp_packets",        VIRTNET_RQ_STAT(xdp_packets) },
 119         { "xdp_tx",             VIRTNET_RQ_STAT(xdp_tx) },
 120         { "xdp_redirects",      VIRTNET_RQ_STAT(xdp_redirects) },
 121         { "xdp_drops",          VIRTNET_RQ_STAT(xdp_drops) },
 122         { "kicks",              VIRTNET_RQ_STAT(kicks) },
 123 };
 124
 125 #define VIRTNET_SQ_STATS_LEN    ARRAY_SIZE(virtnet_sq_stats_desc)
 126 #define VIRTNET_RQ_STATS_LEN    ARRAY_SIZE(virtnet_rq_stats_desc)
 127
 128 /* Internal representation of a send virtqueue */
 129 struct send_queue {
 130         /* Virtqueue associated with this send _queue */
 131         struct virtqueue *vq;
 132
 133         /* TX: fragments + linear part + virtio header */
 134         struct scatterlist sg[MAX_SKB_FRAGS + 2];
 135
 136         /* Name of the send queue: output.$index */
 137         char name[40];
 138
 139         struct virtnet_sq_stats stats;
 140
 141         struct napi_struct napi;
 142
 143         /* Record whether sq is in reset state. */
 144         bool reset;
 145 };
 146
 147 /* Internal representation of a receive virtqueue */
 148 struct receive_queue {
 149         /* Virtqueue associated with this receive_queue */
 150         struct virtqueue *vq;
 151
 152         struct napi_struct napi;
 153
 154         struct bpf_prog __rcu *xdp_prog;
 155
 156         struct virtnet_rq_stats stats;
 157
 158         /* Chain pages by the private ptr. */
 159         struct page *pages;
 160
 161         /* Average packet length for mergeable receive buffers. */
 162         struct ewma_pkt_len mrg_avg_pkt_len;
 163
 164         /* Page frag for packet buffer allocation. */
 165         struct page_frag alloc_frag;
 166
 167         /* RX: fragments + linear part + virtio header */
 168         struct scatterlist sg[MAX_SKB_FRAGS + 2];
 169
 170         /* Min single buffer size for mergeable buffers case. */
 171         unsigned int min_buf_len;
 172
 173         /* Name of this receive queue: input.$index */
 174         char name[40];
 175
 176         struct xdp_rxq_info xdp_rxq;
 177 };
 178
 179 /* This structure can contain rss message with maximum settings for indirection table and keysize
 180  * Note, that default structure that describes RSS configuration virtio_net_rss_config
 181  * contains same info but can't handle table values.
 182  * In any case, structure would be passed to virtio hw through sg_buf split by parts
 183  * because table sizes may be differ according to the device configuration.
 184  */
 185 #define VIRTIO_NET_RSS_MAX_KEY_SIZE     40
 186 #define VIRTIO_NET_RSS_MAX_TABLE_LEN    128
 187 struct virtio_net_ctrl_rss {
 188         u32 hash_types;
 189         u16 indirection_table_mask;
 190         u16 unclassified_queue;
 191         u16 indirection_table[VIRTIO_NET_RSS_MAX_TABLE_LEN];
 192         u16 max_tx_vq;
 193         u8 hash_key_length;
 194         u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
 195 };
 196
 197 /* Control VQ buffers: protected by the rtnl lock */
 198 struct control_buf {
 199         struct virtio_net_ctrl_hdr hdr;
 200         virtio_net_ctrl_ack status;
 201         struct virtio_net_ctrl_mq mq;
 202         u8 promisc;
 203         u8 allmulti;
 204         __virtio16 vid;
 205         __virtio64 offloads;
 206         struct virtio_net_ctrl_rss rss;
 207 };
 208
 209 struct virtnet_info {
 210         struct virtio_device *vdev;
 211         struct virtqueue *cvq;
 212         struct net_device *dev;
 213         struct send_queue *sq;
 214         struct receive_queue *rq;
 215         unsigned int status;
 216
 217         /* Max # of queue pairs supported by the device */
 218         u16 max_queue_pairs;
 219
 220         /* # of queue pairs currently used by the driver */
 221         u16 curr_queue_pairs;
 222
 223         /* # of XDP queue pairs currently used by the driver */
 224         u16 xdp_queue_pairs;
 225
 226         /* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
 227         bool xdp_enabled;
 228
 229         /* I like... big packets and I cannot lie! */
 230         bool big_packets;
 231
 232         /* number of sg entries allocated for big packets */
 233         unsigned int big_packets_num_skbfrags;
 234
 235         /* Host will merge rx buffers for big packets (shake it! shake it!) */
 236         bool mergeable_rx_bufs;
 237
 238         /* Host supports rss and/or hash report */
 239         bool has_rss;
 240         bool has_rss_hash_report;
 241         u8 rss_key_size;
 242         u16 rss_indir_table_size;
 243         u32 rss_hash_types_supported;
 244         u32 rss_hash_types_saved;
 245
 246         /* Has control virtqueue */
 247         bool has_cvq;
 248
 249         /* Host can handle any s/g split between our header and packet data */
 250         bool any_header_sg;
 251
 252         /* Packet virtio header size */
 253         u8 hdr_len;
 254
 255         /* Work struct for delayed refilling if we run low on memory. */
 256         struct delayed_work refill;
 257
 258         /* Is delayed refill enabled? */
 259         bool refill_enabled;
 260
 261         /* The lock to synchronize the access to refill_enabled */
 262         spinlock_t refill_lock;
 263
 264         /* Work struct for config space updates */
 265         struct work_struct config_work;
 266
 267         /* Does the affinity hint is set for virtqueues? */
 268         bool affinity_hint_set;
 269
 270         /* CPU hotplug instances for online & dead */
 271         struct hlist_node node;
 272         struct hlist_node node_dead;
 273
 274         struct control_buf *ctrl;
 275
 276         /* Ethtool settings */
 277         u8 duplex;
 278         u32 speed;
 279
 280         /* Interrupt coalescing settings */
 281         u32 tx_usecs;
 282         u32 rx_usecs;
 283         u32 tx_max_packets;
 284         u32 rx_max_packets;
 285
 286         unsigned long guest_offloads;
 287         unsigned long guest_offloads_capable;
 288
 289         /* failover when STANDBY feature enabled */
 290         struct failover *failover;
 291 };
 292
 293 struct padded_vnet_hdr {
 294         struct virtio_net_hdr_v1_hash hdr;
 295         /*
 296          * hdr is in a separate sg buffer, and data sg buffer shares same page
 297          * with this header sg. This padding makes next sg 16 byte aligned
 298          * after the header.
 299          */
 300         char padding[12];
 301 };
 302
 303 static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf);
 304 static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
 305
 306 static bool is_xdp_frame(void *ptr)
 307 {
 308         return (unsigned long)ptr & VIRTIO_XDP_FLAG;
 309 }
 310
 311 static void *xdp_to_ptr(struct xdp_frame *ptr)
 312 {
 313         return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
 314 }
 315
 316 static struct xdp_frame *ptr_to_xdp(void *ptr)
 317 {
 318         return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
 319 }
 320
 321 /* Converting between virtqueue no. and kernel tx/rx queue no.
 322  * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 323  */
 324 static int vq2txq(struct virtqueue *vq)
 325 {
 326         return (vq->index - 1) / 2;
 327 }
 328
 329 static int txq2vq(int txq)
 330 {
 331         return txq * 2 + 1;
 332 }
 333
 334 static int vq2rxq(struct virtqueue *vq)
 335 {
 336         return vq->index / 2;
 337 }
 338
 339 static int rxq2vq(int rxq)
 340 {
 341         return rxq * 2;
 342 }
 343
 344 static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
 345 {
 346         return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
 347 }
 348
 349 /*
 350  * private is used to chain pages for big packets, put the whole
 351  * most recent used list in the beginning for reuse
 352  */
 353 static void give_pages(struct receive_queue *rq, struct page *page)
 354 {
 355         struct page *end;
 356
 357         /* Find end of list, sew whole thing into vi->rq.pages. */
 358         for (end = page; end->private; end = (struct page *)end->private);
 359         end->private = (unsigned long)rq->pages;
 360         rq->pages = page;
 361 }
 362
 363 static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 364 {
 365         struct page *p = rq->pages;
 366
 367         if (p) {
 368                 rq->pages = (struct page *)p->private;
 369                 /* clear private here, it is used to chain pages */
 370                 p->private = 0;
 371         } else
 372                 p = alloc_page(gfp_mask);
 373         return p;
 374 }
 375
 376 static void enable_delayed_refill(struct virtnet_info *vi)
 377 {
 378         spin_lock_bh(&vi->refill_lock);
 379         vi->refill_enabled = true;
 380         spin_unlock_bh(&vi->refill_lock);
 381 }
 382
 383 static void disable_delayed_refill(struct virtnet_info *vi)
 384 {
 385         spin_lock_bh(&vi->refill_lock);
 386         vi->refill_enabled = false;
 387         spin_unlock_bh(&vi->refill_lock);
 388 }
 389
 390 static void virtqueue_napi_schedule(struct napi_struct *napi,
 391                                     struct virtqueue *vq)
 392 {
 393         if (napi_schedule_prep(napi)) {
 394                 virtqueue_disable_cb(vq);
 395                 __napi_schedule(napi);
 396         }
 397 }
 398
 399 static void virtqueue_napi_complete(struct napi_struct *napi,
 400                                     struct virtqueue *vq, int processed)
 401 {
 402         int opaque;
 403
 404         opaque = virtqueue_enable_cb_prepare(vq);
 405         if (napi_complete_done(napi, processed)) {
 406                 if (unlikely(virtqueue_poll(vq, opaque)))
 407                         virtqueue_napi_schedule(napi, vq);
 408         } else {
 409                 virtqueue_disable_cb(vq);
 410         }
 411 }
 412
 413 static void skb_xmit_done(struct virtqueue *vq)
 414 {
 415         struct virtnet_info *vi = vq->vdev->priv;
 416         struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
 417
 418         /* Suppress further interrupts. */
 419         virtqueue_disable_cb(vq);
 420
 421         if (napi->weight)
 422                 virtqueue_napi_schedule(napi, vq);
 423         else
 424                 /* We were probably waiting for more output buffers. */
 425                 netif_wake_subqueue(vi->dev, vq2txq(vq));
 426 }
 427
 428 #define MRG_CTX_HEADER_SHIFT 22
 429 static void *mergeable_len_to_ctx(unsigned int truesize,
 430                                   unsigned int headroom)
 431 {
 432         return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
 433 }
 434
 435 static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
 436 {
 437         return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
 438 }
 439
 440 static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
 441 {
 442         return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
 443 }
 444
 445 /* Called from bottom half context */
 446 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 447                                    struct receive_queue *rq,
 448                                    struct page *page, unsigned int offset,
 449                                    unsigned int len, unsigned int truesize,
 450                                    bool hdr_valid, unsigned int metasize,
 451                                    unsigned int headroom)
 452 {
 453         struct sk_buff *skb;
 454         struct virtio_net_hdr_mrg_rxbuf *hdr;
 455         unsigned int copy, hdr_len, hdr_padded_len;
 456         struct page *page_to_free = NULL;
 457         int tailroom, shinfo_size;
 458         char *p, *hdr_p, *buf;
 459
 460         p = page_address(page) + offset;
 461         hdr_p = p;
 462
 463         hdr_len = vi->hdr_len;
 464         if (vi->mergeable_rx_bufs)
 465                 hdr_padded_len = hdr_len;
 466         else
 467                 hdr_padded_len = sizeof(struct padded_vnet_hdr);
 468
 469         /* If headroom is not 0, there is an offset between the beginning of the
 470          * data and the allocated space, otherwise the data and the allocated
 471          * space are aligned.
 472          *
 473          * Buffers with headroom use PAGE_SIZE as alloc size, see
 474          * add_recvbuf_mergeable() + get_mergeable_buf_len()
 475          */
 476         truesize = headroom ? PAGE_SIZE : truesize;
 477         tailroom = truesize - headroom;
 478         buf = p - headroom;
 479
 480         len -= hdr_len;
 481         offset += hdr_padded_len;
 482         p += hdr_padded_len;
 483         tailroom -= hdr_padded_len + len;
 484
 485         shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 486
 487         /* copy small packet so we can reuse these pages */
 488         if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
 489                 skb = build_skb(buf, truesize);
 490                 if (unlikely(!skb))
 491                         return NULL;
 492
 493                 skb_reserve(skb, p - buf);
 494                 skb_put(skb, len);
 495
 496                 page = (struct page *)page->private;
 497                 if (page)
 498                         give_pages(rq, page);
 499                 goto ok;
 500         }
 501
 502         /* copy small packet so we can reuse these pages for small data */
 503         skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
 504         if (unlikely(!skb))
 505                 return NULL;
 506
 507         /* Copy all frame if it fits skb->head, otherwise
 508          * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
 509          */
 510         if (len <= skb_tailroom(skb))
 511                 copy = len;
 512         else
 513                 copy = ETH_HLEN + metasize;
 514         skb_put_data(skb, p, copy);
 515
 516         len -= copy;
 517         offset += copy;
 518
 519         if (vi->mergeable_rx_bufs) {
 520                 if (len)
 521                         skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 522                 else
 523                         page_to_free = page;
 524                 goto ok;
 525         }
 526
 527         /*
 528          * Verify that we can indeed put this data into a skb.
 529          * This is here to handle cases when the device erroneously
 530          * tries to receive more than is possible. This is usually
 531          * the case of a broken device.
 532          */
 533         if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
 534                 net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
 535                 dev_kfree_skb(skb);
 536                 return NULL;
 537         }
 538         BUG_ON(offset >= PAGE_SIZE);
 539         while (len) {
 540                 unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
 541                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 542                                 frag_size, truesize);
 543                 len -= frag_size;
 544                 page = (struct page *)page->private;
 545                 offset = 0;
 546         }
 547
 548         if (page)
 549                 give_pages(rq, page);
 550
 551 ok:
 552         /* hdr_valid means no XDP, so we can copy the vnet header */
 553         if (hdr_valid) {
 554                 hdr = skb_vnet_hdr(skb);
 555                 memcpy(hdr, hdr_p, hdr_len);
 556         }
 557         if (page_to_free)
 558                 put_page(page_to_free);
 559
 560         if (metasize) {
 561                 __skb_pull(skb, metasize);
 562                 skb_metadata_set(skb, metasize);
 563         }
 564
 565         return skb;
 566 }
 567
 568 static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
 569                                    struct send_queue *sq,
 570                                    struct xdp_frame *xdpf)
 571 {
 572         struct virtio_net_hdr_mrg_rxbuf *hdr;
 573         int err;
 574
 575         if (unlikely(xdpf->headroom < vi->hdr_len))
 576                 return -EOVERFLOW;
 577
 578         /* Make room for virtqueue hdr (also change xdpf->headroom?) */
 579         xdpf->data -= vi->hdr_len;
 580         /* Zero header and leave csum up to XDP layers */
 581         hdr = xdpf->data;
 582         memset(hdr, 0, vi->hdr_len);
 583         xdpf->len   += vi->hdr_len;
 584
 585         sg_init_one(sq->sg, xdpf->data, xdpf->len);
 586
 587         err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp_to_ptr(xdpf),
 588                                    GFP_ATOMIC);
 589         if (unlikely(err))
 590                 return -ENOSPC; /* Caller handle free/refcnt */
 591
 592         return 0;
 593 }
 594
 595 /* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on
 596  * the current cpu, so it does not need to be locked.
 597  *
 598  * Here we use marco instead of inline functions because we have to deal with
 599  * three issues at the same time: 1. the choice of sq. 2. judge and execute the
 600  * lock/unlock of txq 3. make sparse happy. It is difficult for two inline
 601  * functions to perfectly solve these three problems at the same time.
 602  */
 603 #define virtnet_xdp_get_sq(vi) ({                                       \
 604         int cpu = smp_processor_id();                                   \
 605         struct netdev_queue *txq;                                       \
 606         typeof(vi) v = (vi);                                            \
 607         unsigned int qp;                                                \
 608                                                                         \
 609         if (v->curr_queue_pairs > nr_cpu_ids) {                         \
 610                 qp = v->curr_queue_pairs - v->xdp_queue_pairs;          \
 611                 qp += cpu;                                              \
 612                 txq = netdev_get_tx_queue(v->dev, qp);                  \
 613                 __netif_tx_acquire(txq);                                \
 614         } else {                                                        \
 615                 qp = cpu % v->curr_queue_pairs;                         \
 616                 txq = netdev_get_tx_queue(v->dev, qp);                  \
 617                 __netif_tx_lock(txq, cpu);                              \
 618         }                                                               \
 619         v->sq + qp;                                                     \
 620 })
 621
 622 #define virtnet_xdp_put_sq(vi, q) {                                     \
 623         struct netdev_queue *txq;                                       \
 624         typeof(vi) v = (vi);                                            \
 625                                                                         \
 626         txq = netdev_get_tx_queue(v->dev, (q) - v->sq);                 \
 627         if (v->curr_queue_pairs > nr_cpu_ids)                           \
 628                 __netif_tx_release(txq);                                \
 629         else                                                            \
 630                 __netif_tx_unlock(txq);                                 \
 631 }
 632
 633 static int virtnet_xdp_xmit(struct net_device *dev,
 634                             int n, struct xdp_frame **frames, u32 flags)
 635 {
 636         struct virtnet_info *vi = netdev_priv(dev);
 637         struct receive_queue *rq = vi->rq;
 638         struct bpf_prog *xdp_prog;
 639         struct send_queue *sq;
 640         unsigned int len;
 641         int packets = 0;
 642         int bytes = 0;
 643         int nxmit = 0;
 644         int kicks = 0;
 645         void *ptr;
 646         int ret;
 647         int i;
 648
 649         /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 650          * indicate XDP resources have been successfully allocated.
 651          */
 652         xdp_prog = rcu_access_pointer(rq->xdp_prog);
 653         if (!xdp_prog)
 654                 return -ENXIO;
 655
 656         sq = virtnet_xdp_get_sq(vi);
 657
 658         if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
 659                 ret = -EINVAL;
 660                 goto out;
 661         }
 662
 663         /* Free up any pending old buffers before queueing new ones. */
 664         while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 665                 if (likely(is_xdp_frame(ptr))) {
 666                         struct xdp_frame *frame = ptr_to_xdp(ptr);
 667
 668                         bytes += frame->len;
 669                         xdp_return_frame(frame);
 670                 } else {
 671                         struct sk_buff *skb = ptr;
 672
 673                         bytes += skb->len;
 674                         napi_consume_skb(skb, false);
 675                 }
 676                 packets++;
 677         }
 678
 679         for (i = 0; i < n; i++) {
 680                 struct xdp_frame *xdpf = frames[i];
 681
 682                 if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
 683                         break;
 684                 nxmit++;
 685         }
 686         ret = nxmit;
 687
 688         if (flags & XDP_XMIT_FLUSH) {
 689                 if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
 690                         kicks = 1;
 691         }
 692 out:
 693         u64_stats_update_begin(&sq->stats.syncp);
 694         sq->stats.bytes += bytes;
 695         sq->stats.packets += packets;
 696         sq->stats.xdp_tx += n;
 697         sq->stats.xdp_tx_drops += n - nxmit;
 698         sq->stats.kicks += kicks;
 699         u64_stats_update_end(&sq->stats.syncp);
 700
 701         virtnet_xdp_put_sq(vi, sq);
 702         return ret;
 703 }
 704
 705 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 706 {
 707         return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
 708 }
 709
 710 /* We copy the packet for XDP in the following cases:
 711  *
 712  * 1) Packet is scattered across multiple rx buffers.
 713  * 2) Headroom space is insufficient.
 714  *
 715  * This is inefficient but it's a temporary condition that
 716  * we hit right after XDP is enabled and until queue is refilled
 717  * with large buffers with sufficient headroom - so it should affect
 718  * at most queue size packets.
 719  * Afterwards, the conditions to enable
 720  * XDP should preclude the underlying device from sending packets
 721  * across multiple buffers (num_buf > 1), and we make sure buffers
 722  * have enough headroom.
 723  */
 724 static struct page *xdp_linearize_page(struct receive_queue *rq,
 725                                        u16 *num_buf,
 726                                        struct page *p,
 727                                        int offset,
 728                                        int page_off,
 729                                        unsigned int *len)
 730 {
 731         struct page *page = alloc_page(GFP_ATOMIC);
 732
 733         if (!page)
 734                 return NULL;
 735
 736         memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 737         page_off += *len;
 738
 739         while (--*num_buf) {
 740                 int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 741                 unsigned int buflen;
 742                 void *buf;
 743                 int off;
 744
 745                 buf = virtqueue_get_buf(rq->vq, &buflen);
 746                 if (unlikely(!buf))
 747                         goto err_buf;
 748
 749                 p = virt_to_head_page(buf);
 750                 off = buf - page_address(p);
 751
 752                 /* guard against a misconfigured or uncooperative backend that
 753                  * is sending packet larger than the MTU.
 754                  */
 755                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
 756                         put_page(p);
 757                         goto err_buf;
 758                 }
 759
 760                 memcpy(page_address(page) + page_off,
 761                        page_address(p) + off, buflen);
 762                 page_off += buflen;
 763                 put_page(p);
 764         }
 765
 766         /* Headroom does not contribute to packet length */
 767         *len = page_off - VIRTIO_XDP_HEADROOM;
 768         return page;
 769 err_buf:
 770         __free_pages(page, 0);
 771         return NULL;
 772 }
 773
 774 static struct sk_buff *receive_small(struct net_device *dev,
 775                                      struct virtnet_info *vi,
 776                                      struct receive_queue *rq,
 777                                      void *buf, void *ctx,
 778                                      unsigned int len,
 779                                      unsigned int *xdp_xmit,
 780                                      struct virtnet_rq_stats *stats)
 781 {
 782         struct sk_buff *skb;
 783         struct bpf_prog *xdp_prog;
 784         unsigned int xdp_headroom = (unsigned long)ctx;
 785         unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
 786         unsigned int headroom = vi->hdr_len + header_offset;
 787         unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 788                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 789         struct page *page = virt_to_head_page(buf);
 790         unsigned int delta = 0;
 791         struct page *xdp_page;
 792         int err;
 793         unsigned int metasize = 0;
 794
 795         len -= vi->hdr_len;
 796         stats->bytes += len;
 797
 798         if (unlikely(len > GOOD_PACKET_LEN)) {
 799                 pr_debug("%s: rx error: len %u exceeds max size %d\n",
 800                          dev->name, len, GOOD_PACKET_LEN);
 801                 dev->stats.rx_length_errors++;
 802                 goto err;
 803         }
 804
 805         if (likely(!vi->xdp_enabled)) {
 806                 xdp_prog = NULL;
 807                 goto skip_xdp;
 808         }
 809
 810         rcu_read_lock();
 811         xdp_prog = rcu_dereference(rq->xdp_prog);
 812         if (xdp_prog) {
 813                 struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
 814                 struct xdp_frame *xdpf;
 815                 struct xdp_buff xdp;
 816                 void *orig_data;
 817                 u32 act;
 818
 819                 if (unlikely(hdr->hdr.gso_type))
 820                         goto err_xdp;
 821
 822                 if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
 823                         int offset = buf - page_address(page) + header_offset;
 824                         unsigned int tlen = len + vi->hdr_len;
 825                         u16 num_buf = 1;
 826
 827                         xdp_headroom = virtnet_get_headroom(vi);
 828                         header_offset = VIRTNET_RX_PAD + xdp_headroom;
 829                         headroom = vi->hdr_len + header_offset;
 830                         buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 831                                  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 832                         xdp_page = xdp_linearize_page(rq, &num_buf, page,
 833                                                       offset, header_offset,
 834                                                       &tlen);
 835                         if (!xdp_page)
 836                                 goto err_xdp;
 837
 838                         buf = page_address(xdp_page);
 839                         put_page(page);
 840                         page = xdp_page;
 841                 }
 842
 843                 xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
 844                 xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
 845                                  xdp_headroom, len, true);
 846                 orig_data = xdp.data;
 847                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
 848                 stats->xdp_packets++;
 849
 850                 switch (act) {
 851                 case XDP_PASS:
 852                         /* Recalculate length in case bpf program changed it */
 853                         delta = orig_data - xdp.data;
 854                         len = xdp.data_end - xdp.data;
 855                         metasize = xdp.data - xdp.data_meta;
 856                         break;
 857                 case XDP_TX:
 858                         stats->xdp_tx++;
 859                         xdpf = xdp_convert_buff_to_frame(&xdp);
 860                         if (unlikely(!xdpf))
 861                                 goto err_xdp;
 862                         err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
 863                         if (unlikely(!err)) {
 864                                 xdp_return_frame_rx_napi(xdpf);
 865                         } else if (unlikely(err < 0)) {
 866                                 trace_xdp_exception(vi->dev, xdp_prog, act);
 867                                 goto err_xdp;
 868                         }
 869                         *xdp_xmit |= VIRTIO_XDP_TX;
 870                         rcu_read_unlock();
 871                         goto xdp_xmit;
 872                 case XDP_REDIRECT:
 873                         stats->xdp_redirects++;
 874                         err = xdp_do_redirect(dev, &xdp, xdp_prog);
 875                         if (err)
 876                                 goto err_xdp;
 877                         *xdp_xmit |= VIRTIO_XDP_REDIR;
 878                         rcu_read_unlock();
 879                         goto xdp_xmit;
 880                 default:
 881                         bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
 882                         fallthrough;
 883                 case XDP_ABORTED:
 884                         trace_xdp_exception(vi->dev, xdp_prog, act);
 885                         goto err_xdp;
 886                 case XDP_DROP:
 887                         goto err_xdp;
 888                 }
 889         }
 890         rcu_read_unlock();
 891
 892 skip_xdp:
 893         skb = build_skb(buf, buflen);
 894         if (!skb)
 895                 goto err;
 896         skb_reserve(skb, headroom - delta);
 897         skb_put(skb, len);
 898         if (!xdp_prog) {
 899                 buf += header_offset;
 900                 memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
 901         } /* keep zeroed vnet hdr since XDP is loaded */
 902
 903         if (metasize)
 904                 skb_metadata_set(skb, metasize);
 905
 906         return skb;
 907
 908 err_xdp:
 909         rcu_read_unlock();
 910         stats->xdp_drops++;
 911 err:
 912         stats->drops++;
 913         put_page(page);
 914 xdp_xmit:
 915         return NULL;
 916 }
 917
 918 static struct sk_buff *receive_big(struct net_device *dev,
 919                                    struct virtnet_info *vi,
 920                                    struct receive_queue *rq,
 921                                    void *buf,
 922                                    unsigned int len,
 923                                    struct virtnet_rq_stats *stats)
 924 {
 925         struct page *page = buf;
 926         struct sk_buff *skb =
 927                 page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 0);
 928
 929         stats->bytes += len - vi->hdr_len;
 930         if (unlikely(!skb))
 931                 goto err;
 932
 933         return skb;
 934
 935 err:
 936         stats->drops++;
 937         give_pages(rq, page);
 938         return NULL;
 939 }
 940
 941 static struct sk_buff *receive_mergeable(struct net_device *dev,
 942                                          struct virtnet_info *vi,
 943                                          struct receive_queue *rq,
 944                                          void *buf,
 945                                          void *ctx,
 946                                          unsigned int len,
 947                                          unsigned int *xdp_xmit,
 948                                          struct virtnet_rq_stats *stats)
 949 {
 950         struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 951         u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 952         struct page *page = virt_to_head_page(buf);
 953         int offset = buf - page_address(page);
 954         struct sk_buff *head_skb, *curr_skb;
 955         struct bpf_prog *xdp_prog;
 956         unsigned int truesize = mergeable_ctx_to_truesize(ctx);
 957         unsigned int headroom = mergeable_ctx_to_headroom(ctx);
 958         unsigned int metasize = 0;
 959         unsigned int frame_sz;
 960         int err;
 961
 962         head_skb = NULL;
 963         stats->bytes += len - vi->hdr_len;
 964
 965         if (unlikely(len > truesize)) {
 966                 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 967                          dev->name, len, (unsigned long)ctx);
 968                 dev->stats.rx_length_errors++;
 969                 goto err_skb;
 970         }
 971
 972         if (likely(!vi->xdp_enabled)) {
 973                 xdp_prog = NULL;
 974                 goto skip_xdp;
 975         }
 976
 977         rcu_read_lock();
 978         xdp_prog = rcu_dereference(rq->xdp_prog);
 979         if (xdp_prog) {
 980                 struct xdp_frame *xdpf;
 981                 struct page *xdp_page;
 982                 struct xdp_buff xdp;
 983                 void *data;
 984                 u32 act;
 985
 986                 /* Transient failure which in theory could occur if
 987                  * in-flight packets from before XDP was enabled reach
 988                  * the receive path after XDP is loaded.
 989                  */
 990                 if (unlikely(hdr->hdr.gso_type))
 991                         goto err_xdp;
 992
 993                 /* Buffers with headroom use PAGE_SIZE as alloc size,
 994                  * see add_recvbuf_mergeable() + get_mergeable_buf_len()
 995                  */
 996                 frame_sz = headroom ? PAGE_SIZE : truesize;
 997
 998                 /* This happens when rx buffer size is underestimated
 999                  * or headroom is not enough because of the buffer
1000                  * was refilled before XDP is set. This should only
1001                  * happen for the first several packets, so we don't
1002                  * care much about its performance.
1003                  */
1004                 if (unlikely(num_buf > 1 ||
1005                              headroom < virtnet_get_headroom(vi))) {
1006                         /* linearize data for XDP */
1007                         xdp_page = xdp_linearize_page(rq, &num_buf,
1008                                                       page, offset,
1009                                                       VIRTIO_XDP_HEADROOM,
1010                                                       &len);
1011                         frame_sz = PAGE_SIZE;
1012
1013                         if (!xdp_page)
1014                                 goto err_xdp;
1015                         offset = VIRTIO_XDP_HEADROOM;
1016                 } else {
1017                         xdp_page = page;
1018                 }
1019
1020                 /* Allow consuming headroom but reserve enough space to push
1021                  * the descriptor on if we get an XDP_TX return code.
1022                  */
1023                 data = page_address(xdp_page) + offset;
1024                 xdp_init_buff(&xdp, frame_sz - vi->hdr_len, &rq->xdp_rxq);
1025                 xdp_prepare_buff(&xdp, data - VIRTIO_XDP_HEADROOM + vi->hdr_len,
1026                                  VIRTIO_XDP_HEADROOM, len - vi->hdr_len, true);
1027
1028                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
1029                 stats->xdp_packets++;
1030
1031                 switch (act) {
1032                 case XDP_PASS:
1033                         metasize = xdp.data - xdp.data_meta;
1034
1035                         /* recalculate offset to account for any header
1036                          * adjustments and minus the metasize to copy the
1037                          * metadata in page_to_skb(). Note other cases do not
1038                          * build an skb and avoid using offset
1039                          */
1040                         offset = xdp.data - page_address(xdp_page) -
1041                                  vi->hdr_len - metasize;
1042
1043                         /* recalculate len if xdp.data, xdp.data_end or
1044                          * xdp.data_meta were adjusted
1045                          */
1046                         len = xdp.data_end - xdp.data + vi->hdr_len + metasize;
1047
1048                         /* recalculate headroom if xdp.data or xdp_data_meta
1049                          * were adjusted, note that offset should always point
1050                          * to the start of the reserved bytes for virtio_net
1051                          * header which are followed by xdp.data, that means
1052                          * that offset is equal to the headroom (when buf is
1053                          * starting at the beginning of the page, otherwise
1054                          * there is a base offset inside the page) but it's used
1055                          * with a different starting point (buf start) than
1056                          * xdp.data (buf start + vnet hdr size). If xdp.data or
1057                          * data_meta were adjusted by the xdp prog then the
1058                          * headroom size has changed and so has the offset, we
1059                          * can use data_hard_start, which points at buf start +
1060                          * vnet hdr size, to calculate the new headroom and use
1061                          * it later to compute buf start in page_to_skb()
1062                          */
1063                         headroom = xdp.data - xdp.data_hard_start - metasize;
1064
1065                         /* We can only create skb based on xdp_page. */
1066                         if (unlikely(xdp_page != page)) {
1067                                 rcu_read_unlock();
1068                                 put_page(page);
1069                                 head_skb = page_to_skb(vi, rq, xdp_page, offset,
1070                                                        len, PAGE_SIZE, false,
1071                                                        metasize,
1072                                                        headroom);
1073                                 return head_skb;
1074                         }
1075                         break;
1076                 case XDP_TX:
1077                         stats->xdp_tx++;
1078                         xdpf = xdp_convert_buff_to_frame(&xdp);
1079                         if (unlikely(!xdpf)) {
1080                                 if (unlikely(xdp_page != page))
1081                                         put_page(xdp_page);
1082                                 goto err_xdp;
1083                         }
1084                         err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
1085                         if (unlikely(!err)) {
1086                                 xdp_return_frame_rx_napi(xdpf);
1087                         } else if (unlikely(err < 0)) {
1088                                 trace_xdp_exception(vi->dev, xdp_prog, act);
1089                                 if (unlikely(xdp_page != page))
1090                                         put_page(xdp_page);
1091                                 goto err_xdp;
1092                         }
1093                         *xdp_xmit |= VIRTIO_XDP_TX;
1094                         if (unlikely(xdp_page != page))
1095                                 put_page(page);
1096                         rcu_read_unlock();
1097                         goto xdp_xmit;
1098                 case XDP_REDIRECT:
1099                         stats->xdp_redirects++;
1100                         err = xdp_do_redirect(dev, &xdp, xdp_prog);
1101                         if (err) {
1102                                 if (unlikely(xdp_page != page))
1103                                         put_page(xdp_page);
1104                                 goto err_xdp;
1105                         }
1106                         *xdp_xmit |= VIRTIO_XDP_REDIR;
1107                         if (unlikely(xdp_page != page))
1108                                 put_page(page);
1109                         rcu_read_unlock();
1110                         goto xdp_xmit;
1111                 default:
1112                         bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
1113                         fallthrough;
1114                 case XDP_ABORTED:
1115                         trace_xdp_exception(vi->dev, xdp_prog, act);
1116                         fallthrough;
1117                 case XDP_DROP:
1118                         if (unlikely(xdp_page != page))
1119                                 __free_pages(xdp_page, 0);
1120                         goto err_xdp;
1121                 }
1122         }
1123         rcu_read_unlock();
1124
1125 skip_xdp:
1126         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog,
1127                                metasize, headroom);
1128         curr_skb = head_skb;
1129
1130         if (unlikely(!curr_skb))
1131                 goto err_skb;
1132         while (--num_buf) {
1133                 int num_skb_frags;
1134
1135                 buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1136                 if (unlikely(!buf)) {
1137                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
1138                                  dev->name, num_buf,
1139                                  virtio16_to_cpu(vi->vdev,
1140                                                  hdr->num_buffers));
1141                         dev->stats.rx_length_errors++;
1142                         goto err_buf;
1143                 }
1144
1145                 stats->bytes += len;
1146                 page = virt_to_head_page(buf);
1147
1148                 truesize = mergeable_ctx_to_truesize(ctx);
1149                 if (unlikely(len > truesize)) {
1150                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1151                                  dev->name, len, (unsigned long)ctx);
1152                         dev->stats.rx_length_errors++;
1153                         goto err_skb;
1154                 }
1155
1156                 num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
1157                 if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
1158                         struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
1159
1160                         if (unlikely(!nskb))
1161                                 goto err_skb;
1162                         if (curr_skb == head_skb)
1163                                 skb_shinfo(curr_skb)->frag_list = nskb;
1164                         else
1165                                 curr_skb->next = nskb;
1166                         curr_skb = nskb;
1167                         head_skb->truesize += nskb->truesize;
1168                         num_skb_frags = 0;
1169                 }
1170                 if (curr_skb != head_skb) {
1171                         head_skb->data_len += len;
1172                         head_skb->len += len;
1173                         head_skb->truesize += truesize;
1174                 }
1175                 offset = buf - page_address(page);
1176                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
1177                         put_page(page);
1178                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1179                                              len, truesize);
1180                 } else {
1181                         skb_add_rx_frag(curr_skb, num_skb_frags, page,
1182                                         offset, len, truesize);
1183                 }
1184         }
1185
1186         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1187         return head_skb;
1188
1189 err_xdp:
1190         rcu_read_unlock();
1191         stats->xdp_drops++;
1192 err_skb:
1193         put_page(page);
1194         while (num_buf-- > 1) {
1195                 buf = virtqueue_get_buf(rq->vq, &len);
1196                 if (unlikely(!buf)) {
1197                         pr_debug("%s: rx error: %d buffers missing\n",
1198                                  dev->name, num_buf);
1199                         dev->stats.rx_length_errors++;
1200                         break;
1201                 }
1202                 stats->bytes += len;
1203                 page = virt_to_head_page(buf);
1204                 put_page(page);
1205         }
1206 err_buf:
1207         stats->drops++;
1208         dev_kfree_skb(head_skb);
1209 xdp_xmit:
1210         return NULL;
1211 }
1212
1213 static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash,
1214                                 struct sk_buff *skb)
1215 {
1216         enum pkt_hash_types rss_hash_type;
1217
1218         if (!hdr_hash || !skb)
1219                 return;
1220
1221         switch (__le16_to_cpu(hdr_hash->hash_report)) {
1222         case VIRTIO_NET_HASH_REPORT_TCPv4:
1223         case VIRTIO_NET_HASH_REPORT_UDPv4:
1224         case VIRTIO_NET_HASH_REPORT_TCPv6:
1225         case VIRTIO_NET_HASH_REPORT_UDPv6:
1226         case VIRTIO_NET_HASH_REPORT_TCPv6_EX:
1227         case VIRTIO_NET_HASH_REPORT_UDPv6_EX:
1228                 rss_hash_type = PKT_HASH_TYPE_L4;
1229                 break;
1230         case VIRTIO_NET_HASH_REPORT_IPv4:
1231         case VIRTIO_NET_HASH_REPORT_IPv6:
1232         case VIRTIO_NET_HASH_REPORT_IPv6_EX:
1233                 rss_hash_type = PKT_HASH_TYPE_L3;
1234                 break;
1235         case VIRTIO_NET_HASH_REPORT_NONE:
1236         default:
1237                 rss_hash_type = PKT_HASH_TYPE_NONE;
1238         }
1239         skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type);
1240 }
1241
1242 static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
1243                         void *buf, unsigned int len, void **ctx,
1244                         unsigned int *xdp_xmit,
1245                         struct virtnet_rq_stats *stats)
1246 {
1247         struct net_device *dev = vi->dev;
1248         struct sk_buff *skb;
1249         struct virtio_net_hdr_mrg_rxbuf *hdr;
1250
1251         if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1252                 pr_debug("%s: short packet %i\n", dev->name, len);
1253                 dev->stats.rx_length_errors++;
1254                 if (vi->mergeable_rx_bufs) {
1255                         put_page(virt_to_head_page(buf));
1256                 } else if (vi->big_packets) {
1257                         give_pages(rq, buf);
1258                 } else {
1259                         put_page(virt_to_head_page(buf));
1260                 }
1261                 return;
1262         }
1263
1264         if (vi->mergeable_rx_bufs)
1265                 skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1266                                         stats);
1267         else if (vi->big_packets)
1268                 skb = receive_big(dev, vi, rq, buf, len, stats);
1269         else
1270                 skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1271
1272         if (unlikely(!skb))
1273                 return;
1274
1275         hdr = skb_vnet_hdr(skb);
1276         if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
1277                 virtio_skb_set_hash((const struct virtio_net_hdr_v1_hash *)hdr, skb);
1278
1279         if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1280                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1281
1282         if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
1283                                   virtio_is_little_endian(vi->vdev))) {
1284                 net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
1285                                      dev->name, hdr->hdr.gso_type,
1286                                      hdr->hdr.gso_size);
1287                 goto frame_err;
1288         }
1289
1290         skb_record_rx_queue(skb, vq2rxq(rq->vq));
1291         skb->protocol = eth_type_trans(skb, dev);
1292         pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
1293                  ntohs(skb->protocol), skb->len, skb->pkt_type);
1294
1295         napi_gro_receive(&rq->napi, skb);
1296         return;
1297
1298 frame_err:
1299         dev->stats.rx_frame_errors++;
1300         dev_kfree_skb(skb);
1301 }
1302
1303 /* Unlike mergeable buffers, all buffers are allocated to the
1304  * same size, except for the headroom. For this reason we do
1305  * not need to use  mergeable_len_to_ctx here - it is enough
1306  * to store the headroom as the context ignoring the truesize.
1307  */
1308 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
1309                              gfp_t gfp)
1310 {
1311         struct page_frag *alloc_frag = &rq->alloc_frag;
1312         char *buf;
1313         unsigned int xdp_headroom = virtnet_get_headroom(vi);
1314         void *ctx = (void *)(unsigned long)xdp_headroom;
1315         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1316         int err;
1317
1318         len = SKB_DATA_ALIGN(len) +
1319               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1320         if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1321                 return -ENOMEM;
1322
1323         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1324         get_page(alloc_frag->page);
1325         alloc_frag->offset += len;
1326         sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
1327                     vi->hdr_len + GOOD_PACKET_LEN);
1328         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1329         if (err < 0)
1330                 put_page(virt_to_head_page(buf));
1331         return err;
1332 }
1333
1334 static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
1335                            gfp_t gfp)
1336 {
1337         struct page *first, *list = NULL;
1338         char *p;
1339         int i, err, offset;
1340
1341         sg_init_table(rq->sg, vi->big_packets_num_skbfrags + 2);
1342
1343         /* page in rq->sg[vi->big_packets_num_skbfrags + 1] is list tail */
1344         for (i = vi->big_packets_num_skbfrags + 1; i > 1; --i) {
1345                 first = get_a_page(rq, gfp);
1346                 if (!first) {
1347                         if (list)
1348                                 give_pages(rq, list);
1349                         return -ENOMEM;
1350                 }
1351                 sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1352
1353                 /* chain new page in list head to match sg */
1354                 first->private = (unsigned long)list;
1355                 list = first;
1356         }
1357
1358         first = get_a_page(rq, gfp);
1359         if (!first) {
1360                 give_pages(rq, list);
1361                 return -ENOMEM;
1362         }
1363         p = page_address(first);
1364
1365         /* rq->sg[0], rq->sg[1] share the same page */
1366         /* a separated rq->sg[0] for header - required in case !any_header_sg */
1367         sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1368
1369         /* rq->sg[1] for data packet, from offset */
1370         offset = sizeof(struct padded_vnet_hdr);
1371         sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1372
1373         /* chain first in list head */
1374         first->private = (unsigned long)list;
1375         err = virtqueue_add_inbuf(rq->vq, rq->sg, vi->big_packets_num_skbfrags + 2,
1376                                   first, gfp);
1377         if (err < 0)
1378                 give_pages(rq, first);
1379
1380         return err;
1381 }
1382
1383 static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1384                                           struct ewma_pkt_len *avg_pkt_len,
1385                                           unsigned int room)
1386 {
1387         struct virtnet_info *vi = rq->vq->vdev->priv;
1388         const size_t hdr_len = vi->hdr_len;
1389         unsigned int len;
1390
1391         if (room)
1392                 return PAGE_SIZE - room;
1393
1394         len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1395                                 rq->min_buf_len, PAGE_SIZE - hdr_len);
1396
1397         return ALIGN(len, L1_CACHE_BYTES);
1398 }
1399
1400 static int add_recvbuf_mergeable(struct virtnet_info *vi,
1401                                  struct receive_queue *rq, gfp_t gfp)
1402 {
1403         struct page_frag *alloc_frag = &rq->alloc_frag;
1404         unsigned int headroom = virtnet_get_headroom(vi);
1405         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1406         unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1407         char *buf;
1408         void *ctx;
1409         int err;
1410         unsigned int len, hole;
1411
1412         /* Extra tailroom is needed to satisfy XDP's assumption. This
1413          * means rx frags coalescing won't work, but consider we've
1414          * disabled GSO for XDP, it won't be a big issue.
1415          */
1416         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
1417         if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1418                 return -ENOMEM;
1419
1420         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1421         buf += headroom; /* advance address leaving hole at front of pkt */
1422         get_page(alloc_frag->page);
1423         alloc_frag->offset += len + room;
1424         hole = alloc_frag->size - alloc_frag->offset;
1425         if (hole < len + room) {
1426                 /* To avoid internal fragmentation, if there is very likely not
1427                  * enough space for another buffer, add the remaining space to
1428                  * the current buffer.
1429                  */
1430                 len += hole;
1431                 alloc_frag->offset += hole;
1432         }
1433
1434         sg_init_one(rq->sg, buf, len);
1435         ctx = mergeable_len_to_ctx(len, headroom);
1436         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1437         if (err < 0)
1438                 put_page(virt_to_head_page(buf));
1439
1440         return err;
1441 }
1442
1443 /*
1444  * Returns false if we couldn't fill entirely (OOM).
1445  *
1446  * Normally run in the receive path, but can also be run from ndo_open
1447  * before we're receiving packets, or from refill_work which is
1448  * careful to disable receiving (using napi_disable).
1449  */
1450 static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
1451                           gfp_t gfp)
1452 {
1453         int err;
1454         bool oom;
1455
1456         do {
1457                 if (vi->mergeable_rx_bufs)
1458                         err = add_recvbuf_mergeable(vi, rq, gfp);
1459                 else if (vi->big_packets)
1460                         err = add_recvbuf_big(vi, rq, gfp);
1461                 else
1462                         err = add_recvbuf_small(vi, rq, gfp);
1463
1464                 oom = err == -ENOMEM;
1465                 if (err)
1466                         break;
1467         } while (rq->vq->num_free);
1468         if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1469                 unsigned long flags;
1470
1471                 flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1472                 rq->stats.kicks++;
1473                 u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
1474         }
1475
1476         return !oom;
1477 }
1478
1479 static void skb_recv_done(struct virtqueue *rvq)
1480 {
1481         struct virtnet_info *vi = rvq->vdev->priv;
1482         struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1483
1484         virtqueue_napi_schedule(&rq->napi, rvq);
1485 }
1486
1487 static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1488 {
1489         napi_enable(napi);
1490
1491         /* If all buffers were filled by other side before we napi_enabled, we
1492          * won't get another interrupt, so process any outstanding packets now.
1493          * Call local_bh_enable after to trigger softIRQ processing.
1494          */
1495         local_bh_disable();
1496         virtqueue_napi_schedule(napi, vq);
1497         local_bh_enable();
1498 }
1499
1500 static void virtnet_napi_tx_enable(struct virtnet_info *vi,
1501                                    struct virtqueue *vq,
1502                                    struct napi_struct *napi)
1503 {
1504         if (!napi->weight)
1505                 return;
1506
1507         /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
1508          * enable the feature if this is likely affine with the transmit path.
1509          */
1510         if (!vi->affinity_hint_set) {
1511                 napi->weight = 0;
1512                 return;
1513         }
1514
1515         return virtnet_napi_enable(vq, napi);
1516 }
1517
1518 static void virtnet_napi_tx_disable(struct napi_struct *napi)
1519 {
1520         if (napi->weight)
1521                 napi_disable(napi);
1522 }
1523
1524 static void refill_work(struct work_struct *work)
1525 {
1526         struct virtnet_info *vi =
1527                 container_of(work, struct virtnet_info, refill.work);
1528         bool still_empty;
1529         int i;
1530
1531         for (i = 0; i < vi->curr_queue_pairs; i++) {
1532                 struct receive_queue *rq = &vi->rq[i];
1533
1534                 napi_disable(&rq->napi);
1535                 still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1536                 virtnet_napi_enable(rq->vq, &rq->napi);
1537
1538                 /* In theory, this can happen: if we don't get any buffers in
1539                  * we will *never* try to fill again.
1540                  */
1541                 if (still_empty)
1542                         schedule_delayed_work(&vi->refill, HZ/2);
1543         }
1544 }
1545
1546 static int virtnet_receive(struct receive_queue *rq, int budget,
1547                            unsigned int *xdp_xmit)
1548 {
1549         struct virtnet_info *vi = rq->vq->vdev->priv;
1550         struct virtnet_rq_stats stats = {};
1551         unsigned int len;
1552         void *buf;
1553         int i;
1554
1555         if (!vi->big_packets || vi->mergeable_rx_bufs) {
1556                 void *ctx;
1557
1558                 while (stats.packets < budget &&
1559                        (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1560                         receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1561                         stats.packets++;
1562                 }
1563         } else {
1564                 while (stats.packets < budget &&
1565                        (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1566                         receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1567                         stats.packets++;
1568                 }
1569         }
1570
1571         if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
1572                 if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
1573                         spin_lock(&vi->refill_lock);
1574                         if (vi->refill_enabled)
1575                                 schedule_delayed_work(&vi->refill, 0);
1576                         spin_unlock(&vi->refill_lock);
1577                 }
1578         }
1579
1580         u64_stats_update_begin(&rq->stats.syncp);
1581         for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
1582                 size_t offset = virtnet_rq_stats_desc[i].offset;
1583                 u64 *item;
1584
1585                 item = (u64 *)((u8 *)&rq->stats + offset);
1586                 *item += *(u64 *)((u8 *)&stats + offset);
1587         }
1588         u64_stats_update_end(&rq->stats.syncp);
1589
1590         return stats.packets;
1591 }
1592
1593 static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1594 {
1595         unsigned int len;
1596         unsigned int packets = 0;
1597         unsigned int bytes = 0;
1598         void *ptr;
1599
1600         while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
1601                 if (likely(!is_xdp_frame(ptr))) {
1602                         struct sk_buff *skb = ptr;
1603
1604                         pr_debug("Sent skb %p\n", skb);
1605
1606                         bytes += skb->len;
1607                         napi_consume_skb(skb, in_napi);
1608                 } else {
1609                         struct xdp_frame *frame = ptr_to_xdp(ptr);
1610
1611                         bytes += frame->len;
1612                         xdp_return_frame(frame);
1613                 }
1614                 packets++;
1615         }
1616
1617         /* Avoid overhead when no packets have been processed
1618          * happens when called speculatively from start_xmit.
1619          */
1620         if (!packets)
1621                 return;
1622
1623         u64_stats_update_begin(&sq->stats.syncp);
1624         sq->stats.bytes += bytes;
1625         sq->stats.packets += packets;
1626         u64_stats_update_end(&sq->stats.syncp);
1627 }
1628
1629 static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
1630 {
1631         if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
1632                 return false;
1633         else if (q < vi->curr_queue_pairs)
1634                 return true;
1635         else
1636                 return false;
1637 }
1638
1639 static void virtnet_poll_cleantx(struct receive_queue *rq)
1640 {
1641         struct virtnet_info *vi = rq->vq->vdev->priv;
1642         unsigned int index = vq2rxq(rq->vq);
1643         struct send_queue *sq = &vi->sq[index];
1644         struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
1645
1646         if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1647                 return;
1648
1649         if (__netif_tx_trylock(txq)) {
1650                 if (sq->reset) {
1651                         __netif_tx_unlock(txq);
1652                         return;
1653                 }
1654
1655                 do {
1656                         virtqueue_disable_cb(sq->vq);
1657                         free_old_xmit_skbs(sq, true);
1658                 } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1659
1660                 if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1661                         netif_tx_wake_queue(txq);
1662
1663                 __netif_tx_unlock(txq);
1664         }
1665 }
1666
1667 static int virtnet_poll(struct napi_struct *napi, int budget)
1668 {
1669         struct receive_queue *rq =
1670                 container_of(napi, struct receive_queue, napi);
1671         struct virtnet_info *vi = rq->vq->vdev->priv;
1672         struct send_queue *sq;
1673         unsigned int received;
1674         unsigned int xdp_xmit = 0;
1675
1676         virtnet_poll_cleantx(rq);
1677
1678         received = virtnet_receive(rq, budget, &xdp_xmit);
1679
1680         /* Out of packets? */
1681         if (received < budget)
1682                 virtqueue_napi_complete(napi, rq->vq, received);
1683
1684         if (xdp_xmit & VIRTIO_XDP_REDIR)
1685                 xdp_do_flush();
1686
1687         if (xdp_xmit & VIRTIO_XDP_TX) {
1688                 sq = virtnet_xdp_get_sq(vi);
1689                 if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1690                         u64_stats_update_begin(&sq->stats.syncp);
1691                         sq->stats.kicks++;
1692                         u64_stats_update_end(&sq->stats.syncp);
1693                 }
1694                 virtnet_xdp_put_sq(vi, sq);
1695         }
1696
1697         return received;
1698 }
1699
1700 static int virtnet_open(struct net_device *dev)
1701 {
1702         struct virtnet_info *vi = netdev_priv(dev);
1703         int i, err;
1704
1705         enable_delayed_refill(vi);
1706
1707         for (i = 0; i < vi->max_queue_pairs; i++) {
1708                 if (i < vi->curr_queue_pairs)
1709                         /* Make sure we have some buffers: if oom use wq. */
1710                         if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1711                                 schedule_delayed_work(&vi->refill, 0);
1712
1713                 err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1714                 if (err < 0)
1715                         return err;
1716
1717                 err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
1718                                                  MEM_TYPE_PAGE_SHARED, NULL);
1719                 if (err < 0) {
1720                         xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
1721                         return err;
1722                 }
1723
1724                 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1725                 virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
1726         }
1727
1728         return 0;
1729 }
1730
1731 static int virtnet_poll_tx(struct napi_struct *napi, int budget)
1732 {
1733         struct send_queue *sq = container_of(napi, struct send_queue, napi);
1734         struct virtnet_info *vi = sq->vq->vdev->priv;
1735         unsigned int index = vq2txq(sq->vq);
1736         struct netdev_queue *txq;
1737         int opaque;
1738         bool done;
1739
1740         if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
1741                 /* We don't need to enable cb for XDP */
1742                 napi_complete_done(napi, 0);
1743                 return 0;
1744         }
1745
1746         txq = netdev_get_tx_queue(vi->dev, index);
1747         __netif_tx_lock(txq, raw_smp_processor_id());
1748         virtqueue_disable_cb(sq->vq);
1749         free_old_xmit_skbs(sq, true);
1750
1751         if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1752                 netif_tx_wake_queue(txq);
1753
1754         opaque = virtqueue_enable_cb_prepare(sq->vq);
1755
1756         done = napi_complete_done(napi, 0);
1757
1758         if (!done)
1759                 virtqueue_disable_cb(sq->vq);
1760
1761         __netif_tx_unlock(txq);
1762
1763         if (done) {
1764                 if (unlikely(virtqueue_poll(sq->vq, opaque))) {
1765                         if (napi_schedule_prep(napi)) {
1766                                 __netif_tx_lock(txq, raw_smp_processor_id());
1767                                 virtqueue_disable_cb(sq->vq);
1768                                 __netif_tx_unlock(txq);
1769                                 __napi_schedule(napi);
1770                         }
1771                 }
1772         }
1773
1774         return 0;
1775 }
1776
1777 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
1778 {
1779         struct virtio_net_hdr_mrg_rxbuf *hdr;
1780         const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1781         struct virtnet_info *vi = sq->vq->vdev->priv;
1782         int num_sg;
1783         unsigned hdr_len = vi->hdr_len;
1784         bool can_push;
1785
1786         pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1787
1788         can_push = vi->any_header_sg &&
1789                 !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
1790                 !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
1791         /* Even if we can, don't push here yet as this would skew
1792          * csum_start offset below. */
1793         if (can_push)
1794                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1795         else
1796                 hdr = skb_vnet_hdr(skb);
1797
1798         if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1799                                     virtio_is_little_endian(vi->vdev), false,
1800                                     0))
1801                 return -EPROTO;
1802
1803         if (vi->mergeable_rx_bufs)
1804                 hdr->num_buffers = 0;
1805
1806         sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1807         if (can_push) {
1808                 __skb_push(skb, hdr_len);
1809                 num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1810                 if (unlikely(num_sg < 0))
1811                         return num_sg;
1812                 /* Pull header back to avoid skew in tx bytes calculations. */
1813                 __skb_pull(skb, hdr_len);
1814         } else {
1815                 sg_set_buf(sq->sg, hdr, hdr_len);
1816                 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
1817                 if (unlikely(num_sg < 0))
1818                         return num_sg;
1819                 num_sg++;
1820         }
1821         return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1822 }
1823
1824 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1825 {
1826         struct virtnet_info *vi = netdev_priv(dev);
1827         int qnum = skb_get_queue_mapping(skb);
1828         struct send_queue *sq = &vi->sq[qnum];
1829         int err;
1830         struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1831         bool kick = !netdev_xmit_more();
1832         bool use_napi = sq->napi.weight;
1833
1834         /* Free up any pending old buffers before queueing new ones. */
1835         do {
1836                 if (use_napi)
1837                         virtqueue_disable_cb(sq->vq);
1838
1839                 free_old_xmit_skbs(sq, false);
1840
1841         } while (use_napi && kick &&
1842                unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1843
1844         /* timestamp packet in software */
1845         skb_tx_timestamp(skb);
1846
1847         /* Try to transmit */
1848         err = xmit_skb(sq, skb);
1849
1850         /* This should not happen! */
1851         if (unlikely(err)) {
1852                 dev->stats.tx_fifo_errors++;
1853                 if (net_ratelimit())
1854                         dev_warn(&dev->dev,
1855                                  "Unexpected TXQ (%d) queue failure: %d\n",
1856                                  qnum, err);
1857                 dev->stats.tx_dropped++;
1858                 dev_kfree_skb_any(skb);
1859                 return NETDEV_TX_OK;
1860         }
1861
1862         /* Don't wait up for transmitted skbs to be freed. */
1863         if (!use_napi) {
1864                 skb_orphan(skb);
1865                 nf_reset_ct(skb);
1866         }
1867
1868         /* If running out of space, stop queue to avoid getting packets that we
1869          * are then unable to transmit.
1870          * An alternative would be to force queuing layer to requeue the skb by
1871          * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
1872          * returned in a normal path of operation: it means that driver is not
1873          * maintaining the TX queue stop/start state properly, and causes
1874          * the stack to do a non-trivial amount of useless work.
1875          * Since most packets only take 1 or 2 ring slots, stopping the queue
1876          * early means 16 slots are typically wasted.
1877          */
1878         if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
1879                 netif_stop_subqueue(dev, qnum);
1880                 if (!use_napi &&
1881                     unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1882                         /* More just got used, free them then recheck. */
1883                         free_old_xmit_skbs(sq, false);
1884                         if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
1885                                 netif_start_subqueue(dev, qnum);
1886                                 virtqueue_disable_cb(sq->vq);
1887                         }
1888                 }
1889         }
1890
1891         if (kick || netif_xmit_stopped(txq)) {
1892                 if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1893                         u64_stats_update_begin(&sq->stats.syncp);
1894                         sq->stats.kicks++;
1895                         u64_stats_update_end(&sq->stats.syncp);
1896                 }
1897         }
1898
1899         return NETDEV_TX_OK;
1900 }
1901
1902 static int virtnet_rx_resize(struct virtnet_info *vi,
1903                              struct receive_queue *rq, u32 ring_num)
1904 {
1905         bool running = netif_running(vi->dev);
1906         int err, qindex;
1907
1908         qindex = rq - vi->rq;
1909
1910         if (running)
1911                 napi_disable(&rq->napi);
1912
1913         err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_free_unused_buf);
1914         if (err)
1915                 netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
1916
1917         if (!try_fill_recv(vi, rq, GFP_KERNEL))
1918                 schedule_delayed_work(&vi->refill, 0);
1919
1920         if (running)
1921                 virtnet_napi_enable(rq->vq, &rq->napi);
1922         return err;
1923 }
1924
1925 static int virtnet_tx_resize(struct virtnet_info *vi,
1926                              struct send_queue *sq, u32 ring_num)
1927 {
1928         bool running = netif_running(vi->dev);
1929         struct netdev_queue *txq;
1930         int err, qindex;
1931
1932         qindex = sq - vi->sq;
1933
1934         if (running)
1935                 virtnet_napi_tx_disable(&sq->napi);
1936
1937         txq = netdev_get_tx_queue(vi->dev, qindex);
1938
1939         /* 1. wait all ximt complete
1940          * 2. fix the race of netif_stop_subqueue() vs netif_start_subqueue()
1941          */
1942         __netif_tx_lock_bh(txq);
1943
1944         /* Prevent rx poll from accessing sq. */
1945         sq->reset = true;
1946
1947         /* Prevent the upper layer from trying to send packets. */
1948         netif_stop_subqueue(vi->dev, qindex);
1949
1950         __netif_tx_unlock_bh(txq);
1951
1952         err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
1953         if (err)
1954                 netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
1955
1956         __netif_tx_lock_bh(txq);
1957         sq->reset = false;
1958         netif_tx_wake_queue(txq);
1959         __netif_tx_unlock_bh(txq);
1960
1961         if (running)
1962                 virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
1963         return err;
1964 }
1965
1966 /*
1967  * Send command via the control virtqueue and check status.  Commands
1968  * supported by the hypervisor, as indicated by feature bits, should
1969  * never fail unless improperly formatted.
1970  */
1971 static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1972                                  struct scatterlist *out)
1973 {
1974         struct scatterlist *sgs[4], hdr, stat;
1975         unsigned out_num = 0, tmp;
1976         int ret;
1977
1978         /* Caller should know better */
1979         BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1980
1981         vi->ctrl->status = ~0;
1982         vi->ctrl->hdr.class = class;
1983         vi->ctrl->hdr.cmd = cmd;
1984         /* Add header */
1985         sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
1986         sgs[out_num++] = &hdr;
1987
1988         if (out)
1989                 sgs[out_num++] = out;
1990
1991         /* Add return status. */
1992         sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
1993         sgs[out_num] = &stat;
1994
1995         BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1996         ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1997         if (ret < 0) {
1998                 dev_warn(&vi->vdev->dev,
1999                          "Failed to add sgs for command vq: %d\n.", ret);
2000                 return false;
2001         }
2002
2003         if (unlikely(!virtqueue_kick(vi->cvq)))
2004                 return vi->ctrl->status == VIRTIO_NET_OK;
2005
2006         /* Spin for a response, the kick causes an ioport write, trapping
2007          * into the hypervisor, so the request should be handled immediately.
2008          */
2009         while (!virtqueue_get_buf(vi->cvq, &tmp) &&
2010                !virtqueue_is_broken(vi->cvq))
2011                 cpu_relax();
2012
2013         return vi->ctrl->status == VIRTIO_NET_OK;
2014 }
2015
2016 static int virtnet_set_mac_address(struct net_device *dev, void *p)
2017 {
2018         struct virtnet_info *vi = netdev_priv(dev);
2019         struct virtio_device *vdev = vi->vdev;
2020         int ret;
2021         struct sockaddr *addr;
2022         struct scatterlist sg;
2023
2024         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
2025                 return -EOPNOTSUPP;
2026
2027         addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
2028         if (!addr)
2029                 return -ENOMEM;
2030
2031         ret = eth_prepare_mac_addr_change(dev, addr);
2032         if (ret)
2033                 goto out;
2034
2035         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
2036                 sg_init_one(&sg, addr->sa_data, dev->addr_len);
2037                 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
2038                                           VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
2039                         dev_warn(&vdev->dev,
2040                                  "Failed to set mac address by vq command.\n");
2041                         ret = -EINVAL;
2042                         goto out;
2043                 }
2044         } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
2045                    !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
2046                 unsigned int i;
2047
2048                 /* Naturally, this has an atomicity problem. */
2049                 for (i = 0; i < dev->addr_len; i++)
2050                         virtio_cwrite8(vdev,
2051                                        offsetof(struct virtio_net_config, mac) +
2052                                        i, addr->sa_data[i]);
2053         }
2054
2055         eth_commit_mac_addr_change(dev, p);
2056         ret = 0;
2057
2058 out:
2059         kfree(addr);
2060         return ret;
2061 }
2062
2063 static void virtnet_stats(struct net_device *dev,
2064                           struct rtnl_link_stats64 *tot)
2065 {
2066         struct virtnet_info *vi = netdev_priv(dev);
2067         unsigned int start;
2068         int i;
2069
2070         for (i = 0; i < vi->max_queue_pairs; i++) {
2071                 u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
2072                 struct receive_queue *rq = &vi->rq[i];
2073                 struct send_queue *sq = &vi->sq[i];
2074
2075                 do {
2076                         start = u64_stats_fetch_begin(&sq->stats.syncp);
2077                         tpackets = sq->stats.packets;
2078                         tbytes   = sq->stats.bytes;
2079                         terrors  = sq->stats.tx_timeouts;
2080                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
2081
2082                 do {
2083                         start = u64_stats_fetch_begin(&rq->stats.syncp);
2084                         rpackets = rq->stats.packets;
2085                         rbytes   = rq->stats.bytes;
2086                         rdrops   = rq->stats.drops;
2087                 } while (u64_stats_fetch_retry(&rq->stats.syncp, start));
2088
2089                 tot->rx_packets += rpackets;
2090                 tot->tx_packets += tpackets;
2091                 tot->rx_bytes   += rbytes;
2092                 tot->tx_bytes   += tbytes;
2093                 tot->rx_dropped += rdrops;
2094                 tot->tx_errors  += terrors;
2095         }
2096
2097         tot->tx_dropped = dev->stats.tx_dropped;
2098         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
2099         tot->rx_length_errors = dev->stats.rx_length_errors;
2100         tot->rx_frame_errors = dev->stats.rx_frame_errors;
2101 }
2102
2103 static void virtnet_ack_link_announce(struct virtnet_info *vi)
2104 {
2105         rtnl_lock();
2106         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
2107                                   VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
2108                 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
2109         rtnl_unlock();
2110 }
2111
2112 static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
2113 {
2114         struct scatterlist sg;
2115         struct net_device *dev = vi->dev;
2116
2117         if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
2118                 return 0;
2119
2120         vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
2121         sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
2122
2123         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
2124                                   VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
2125                 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
2126                          queue_pairs);
2127                 return -EINVAL;
2128         } else {
2129                 vi->curr_queue_pairs = queue_pairs;
2130                 /* virtnet_open() will refill when device is going to up. */
2131                 if (dev->flags & IFF_UP)
2132                         schedule_delayed_work(&vi->refill, 0);
2133         }
2134
2135         return 0;
2136 }
2137
2138 static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
2139 {
2140         int err;
2141
2142         rtnl_lock();
2143         err = _virtnet_set_queues(vi, queue_pairs);
2144         rtnl_unlock();
2145         return err;
2146 }
2147
2148 static int virtnet_close(struct net_device *dev)
2149 {
2150         struct virtnet_info *vi = netdev_priv(dev);
2151         int i;
2152
2153         /* Make sure NAPI doesn't schedule refill work */
2154         disable_delayed_refill(vi);
2155         /* Make sure refill_work doesn't re-enable napi! */
2156         cancel_delayed_work_sync(&vi->refill);
2157
2158         for (i = 0; i < vi->max_queue_pairs; i++) {
2159                 xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
2160                 napi_disable(&vi->rq[i].napi);
2161                 virtnet_napi_tx_disable(&vi->sq[i].napi);
2162         }
2163
2164         return 0;
2165 }
2166
2167 static void virtnet_set_rx_mode(struct net_device *dev)
2168 {
2169         struct virtnet_info *vi = netdev_priv(dev);
2170         struct scatterlist sg[2];
2171         struct virtio_net_ctrl_mac *mac_data;
2172         struct netdev_hw_addr *ha;
2173         int uc_count;
2174         int mc_count;
2175         void *buf;
2176         int i;
2177
2178         /* We can't dynamically set ndo_set_rx_mode, so return gracefully */
2179         if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
2180                 return;
2181
2182         vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
2183         vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
2184
2185         sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
2186
2187         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
2188                                   VIRTIO_NET_CTRL_RX_PROMISC, sg))
2189                 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
2190                          vi->ctrl->promisc ? "en" : "dis");
2191
2192         sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
2193
2194         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
2195                                   VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
2196                 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
2197                          vi->ctrl->allmulti ? "en" : "dis");
2198
2199         uc_count = netdev_uc_count(dev);
2200         mc_count = netdev_mc_count(dev);
2201         /* MAC filter - use one buffer for both lists */
2202         buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
2203                       (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
2204         mac_data = buf;
2205         if (!buf)
2206                 return;
2207
2208         sg_init_table(sg, 2);
2209
2210         /* Store the unicast list and count in the front of the buffer */
2211         mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
2212         i = 0;
2213         netdev_for_each_uc_addr(ha, dev)
2214                 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2215
2216         sg_set_buf(&sg[0], mac_data,
2217                    sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
2218
2219         /* multicast list and count fill the end */
2220         mac_data = (void *)&mac_data->macs[uc_count][0];
2221
2222         mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
2223         i = 0;
2224         netdev_for_each_mc_addr(ha, dev)
2225                 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2226
2227         sg_set_buf(&sg[1], mac_data,
2228                    sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
2229
2230         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
2231                                   VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
2232                 dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
2233
2234         kfree(buf);
2235 }
2236
2237 static int virtnet_vlan_rx_add_vid(struct net_device *dev,
2238                                    __be16 proto, u16 vid)
2239 {
2240         struct virtnet_info *vi = netdev_priv(dev);
2241         struct scatterlist sg;
2242
2243         vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2244         sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2245
2246         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2247                                   VIRTIO_NET_CTRL_VLAN_ADD, &sg))
2248                 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
2249         return 0;
2250 }
2251
2252 static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
2253                                     __be16 proto, u16 vid)
2254 {
2255         struct virtnet_info *vi = netdev_priv(dev);
2256         struct scatterlist sg;
2257
2258         vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2259         sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2260
2261         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2262                                   VIRTIO_NET_CTRL_VLAN_DEL, &sg))
2263                 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
2264         return 0;
2265 }
2266
2267 static void virtnet_clean_affinity(struct virtnet_info *vi)
2268 {
2269         int i;
2270
2271         if (vi->affinity_hint_set) {
2272                 for (i = 0; i < vi->max_queue_pairs; i++) {
2273                         virtqueue_set_affinity(vi->rq[i].vq, NULL);
2274                         virtqueue_set_affinity(vi->sq[i].vq, NULL);
2275                 }
2276
2277                 vi->affinity_hint_set = false;
2278         }
2279 }
2280
2281 static void virtnet_set_affinity(struct virtnet_info *vi)
2282 {
2283         cpumask_var_t mask;
2284         int stragglers;
2285         int group_size;
2286         int i, j, cpu;
2287         int num_cpu;
2288         int stride;
2289
2290         if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
2291                 virtnet_clean_affinity(vi);
2292                 return;
2293         }
2294
2295         num_cpu = num_online_cpus();
2296         stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
2297         stragglers = num_cpu >= vi->curr_queue_pairs ?
2298                         num_cpu % vi->curr_queue_pairs :
2299                         0;
2300         cpu = cpumask_first(cpu_online_mask);
2301
2302         for (i = 0; i < vi->curr_queue_pairs; i++) {
2303                 group_size = stride + (i < stragglers ? 1 : 0);
2304
2305                 for (j = 0; j < group_size; j++) {
2306                         cpumask_set_cpu(cpu, mask);
2307                         cpu = cpumask_next_wrap(cpu, cpu_online_mask,
2308                                                 nr_cpu_ids, false);
2309                 }
2310                 virtqueue_set_affinity(vi->rq[i].vq, mask);
2311                 virtqueue_set_affinity(vi->sq[i].vq, mask);
2312                 __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
2313                 cpumask_clear(mask);
2314         }
2315
2316         vi->affinity_hint_set = true;
2317         free_cpumask_var(mask);
2318 }
2319
2320 static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
2321 {
2322         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2323                                                    node);
2324         virtnet_set_affinity(vi);
2325         return 0;
2326 }
2327
2328 static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
2329 {
2330         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2331                                                    node_dead);
2332         virtnet_set_affinity(vi);
2333         return 0;
2334 }
2335
2336 static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
2337 {
2338         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2339                                                    node);
2340
2341         virtnet_clean_affinity(vi);
2342         return 0;
2343 }
2344
2345 static enum cpuhp_state virtionet_online;
2346
2347 static int virtnet_cpu_notif_add(struct virtnet_info *vi)
2348 {
2349         int ret;
2350
2351         ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
2352         if (ret)
2353                 return ret;
2354         ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2355                                                &vi->node_dead);
2356         if (!ret)
2357                 return ret;
2358         cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2359         return ret;
2360 }
2361
2362 static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
2363 {
2364         cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2365         cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2366                                             &vi->node_dead);
2367 }
2368
2369 static void virtnet_get_ringparam(struct net_device *dev,
2370                                   struct ethtool_ringparam *ring,
2371                                   struct kernel_ethtool_ringparam *kernel_ring,
2372                                   struct netlink_ext_ack *extack)
2373 {
2374         struct virtnet_info *vi = netdev_priv(dev);
2375
2376         ring->rx_max_pending = vi->rq[0].vq->num_max;
2377         ring->tx_max_pending = vi->sq[0].vq->num_max;
2378         ring->rx_pending = virtqueue_get_vring_size(vi->rq[0].vq);
2379         ring->tx_pending = virtqueue_get_vring_size(vi->sq[0].vq);
2380 }
2381
2382 static int virtnet_set_ringparam(struct net_device *dev,
2383                                  struct ethtool_ringparam *ring,
2384                                  struct kernel_ethtool_ringparam *kernel_ring,
2385                                  struct netlink_ext_ack *extack)
2386 {
2387         struct virtnet_info *vi = netdev_priv(dev);
2388         u32 rx_pending, tx_pending;
2389         struct receive_queue *rq;
2390         struct send_queue *sq;
2391         int i, err;
2392
2393         if (ring->rx_mini_pending || ring->rx_jumbo_pending)
2394                 return -EINVAL;
2395
2396         rx_pending = virtqueue_get_vring_size(vi->rq[0].vq);
2397         tx_pending = virtqueue_get_vring_size(vi->sq[0].vq);
2398
2399         if (ring->rx_pending == rx_pending &&
2400             ring->tx_pending == tx_pending)
2401                 return 0;
2402
2403         if (ring->rx_pending > vi->rq[0].vq->num_max)
2404                 return -EINVAL;
2405
2406         if (ring->tx_pending > vi->sq[0].vq->num_max)
2407                 return -EINVAL;
2408
2409         for (i = 0; i < vi->max_queue_pairs; i++) {
2410                 rq = vi->rq + i;
2411                 sq = vi->sq + i;
2412
2413                 if (ring->tx_pending != tx_pending) {
2414                         err = virtnet_tx_resize(vi, sq, ring->tx_pending);
2415                         if (err)
2416                                 return err;
2417                 }
2418
2419                 if (ring->rx_pending != rx_pending) {
2420                         err = virtnet_rx_resize(vi, rq, ring->rx_pending);
2421                         if (err)
2422                                 return err;
2423                 }
2424         }
2425
2426         return 0;
2427 }
2428
2429 static bool virtnet_commit_rss_command(struct virtnet_info *vi)
2430 {
2431         struct net_device *dev = vi->dev;
2432         struct scatterlist sgs[4];
2433         unsigned int sg_buf_size;
2434
2435         /* prepare sgs */
2436         sg_init_table(sgs, 4);
2437
2438         sg_buf_size = offsetof(struct virtio_net_ctrl_rss, indirection_table);
2439         sg_set_buf(&sgs[0], &vi->ctrl->rss, sg_buf_size);
2440
2441         sg_buf_size = sizeof(uint16_t) * (vi->ctrl->rss.indirection_table_mask + 1);
2442         sg_set_buf(&sgs[1], vi->ctrl->rss.indirection_table, sg_buf_size);
2443
2444         sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key)
2445                         - offsetof(struct virtio_net_ctrl_rss, max_tx_vq);
2446         sg_set_buf(&sgs[2], &vi->ctrl->rss.max_tx_vq, sg_buf_size);
2447
2448         sg_buf_size = vi->rss_key_size;
2449         sg_set_buf(&sgs[3], vi->ctrl->rss.key, sg_buf_size);
2450
2451         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
2452                                   vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG
2453                                   : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) {
2454                 dev_warn(&dev->dev, "VIRTIONET issue with committing RSS sgs\n");
2455                 return false;
2456         }
2457         return true;
2458 }
2459
2460 static void virtnet_init_default_rss(struct virtnet_info *vi)
2461 {
2462         u32 indir_val = 0;
2463         int i = 0;
2464
2465         vi->ctrl->rss.hash_types = vi->rss_hash_types_supported;
2466         vi->rss_hash_types_saved = vi->rss_hash_types_supported;
2467         vi->ctrl->rss.indirection_table_mask = vi->rss_indir_table_size
2468                                                 ? vi->rss_indir_table_size - 1 : 0;
2469         vi->ctrl->rss.unclassified_queue = 0;
2470
2471         for (; i < vi->rss_indir_table_size; ++i) {
2472                 indir_val = ethtool_rxfh_indir_default(i, vi->curr_queue_pairs);
2473                 vi->ctrl->rss.indirection_table[i] = indir_val;
2474         }
2475
2476         vi->ctrl->rss.max_tx_vq = vi->curr_queue_pairs;
2477         vi->ctrl->rss.hash_key_length = vi->rss_key_size;
2478
2479         netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size);
2480 }
2481
2482 static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info)
2483 {
2484         info->data = 0;
2485         switch (info->flow_type) {
2486         case TCP_V4_FLOW:
2487                 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
2488                         info->data = RXH_IP_SRC | RXH_IP_DST |
2489                                                  RXH_L4_B_0_1 | RXH_L4_B_2_3;
2490                 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
2491                         info->data = RXH_IP_SRC | RXH_IP_DST;
2492                 }
2493                 break;
2494         case TCP_V6_FLOW:
2495                 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
2496                         info->data = RXH_IP_SRC | RXH_IP_DST |
2497                                                  RXH_L4_B_0_1 | RXH_L4_B_2_3;
2498                 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
2499                         info->data = RXH_IP_SRC | RXH_IP_DST;
2500                 }
2501                 break;
2502         case UDP_V4_FLOW:
2503                 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
2504                         info->data = RXH_IP_SRC | RXH_IP_DST |
2505                                                  RXH_L4_B_0_1 | RXH_L4_B_2_3;
2506                 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
2507                         info->data = RXH_IP_SRC | RXH_IP_DST;
2508                 }
2509                 break;
2510         case UDP_V6_FLOW:
2511                 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
2512                         info->data = RXH_IP_SRC | RXH_IP_DST |
2513                                                  RXH_L4_B_0_1 | RXH_L4_B_2_3;
2514                 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
2515                         info->data = RXH_IP_SRC | RXH_IP_DST;
2516                 }
2517                 break;
2518         case IPV4_FLOW:
2519                 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
2520                         info->data = RXH_IP_SRC | RXH_IP_DST;
2521
2522                 break;
2523         case IPV6_FLOW:
2524                 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
2525                         info->data = RXH_IP_SRC | RXH_IP_DST;
2526
2527                 break;
2528         default:
2529                 info->data = 0;
2530                 break;
2531         }
2532 }
2533
2534 static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc *info)
2535 {
2536         u32 new_hashtypes = vi->rss_hash_types_saved;
2537         bool is_disable = info->data & RXH_DISCARD;
2538         bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3);
2539
2540         /* supports only 'sd', 'sdfn' and 'r' */
2541         if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable))
2542                 return false;
2543
2544         switch (info->flow_type) {
2545         case TCP_V4_FLOW:
2546                 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_TCPv4);
2547                 if (!is_disable)
2548                         new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
2549                                 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv4 : 0);
2550                 break;
2551         case UDP_V4_FLOW:
2552                 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_UDPv4);
2553                 if (!is_disable)
2554                         new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
2555                                 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv4 : 0);
2556                 break;
2557         case IPV4_FLOW:
2558                 new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv4;
2559                 if (!is_disable)
2560                         new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv4;
2561                 break;
2562         case TCP_V6_FLOW:
2563                 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_TCPv6);
2564                 if (!is_disable)
2565                         new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
2566                                 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv6 : 0);
2567                 break;
2568         case UDP_V6_FLOW:
2569                 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_UDPv6);
2570                 if (!is_disable)
2571                         new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
2572                                 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv6 : 0);
2573                 break;
2574         case IPV6_FLOW:
2575                 new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv6;
2576                 if (!is_disable)
2577                         new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv6;
2578                 break;
2579         default:
2580                 /* unsupported flow */
2581                 return false;
2582         }
2583
2584         /* if unsupported hashtype was set */
2585         if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported))
2586                 return false;
2587
2588         if (new_hashtypes != vi->rss_hash_types_saved) {
2589                 vi->rss_hash_types_saved = new_hashtypes;
2590                 vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
2591                 if (vi->dev->features & NETIF_F_RXHASH)
2592                         return virtnet_commit_rss_command(vi);
2593         }
2594
2595         return true;
2596 }
2597
2598 static void virtnet_get_drvinfo(struct net_device *dev,
2599                                 struct ethtool_drvinfo *info)
2600 {
2601         struct virtnet_info *vi = netdev_priv(dev);
2602         struct virtio_device *vdev = vi->vdev;
2603
2604         strscpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
2605         strscpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
2606         strscpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
2607
2608 }
2609
2610 /* TODO: Eliminate OOO packets during switching */
2611 static int virtnet_set_channels(struct net_device *dev,
2612                                 struct ethtool_channels *channels)
2613 {
2614         struct virtnet_info *vi = netdev_priv(dev);
2615         u16 queue_pairs = channels->combined_count;
2616         int err;
2617
2618         /* We don't support separate rx/tx channels.
2619          * We don't allow setting 'other' channels.
2620          */
2621         if (channels->rx_count || channels->tx_count || channels->other_count)
2622                 return -EINVAL;
2623
2624         if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
2625                 return -EINVAL;
2626
2627         /* For now we don't support modifying channels while XDP is loaded
2628          * also when XDP is loaded all RX queues have XDP programs so we only
2629          * need to check a single RX queue.
2630          */
2631         if (vi->rq[0].xdp_prog)
2632                 return -EINVAL;
2633
2634         cpus_read_lock();
2635         err = _virtnet_set_queues(vi, queue_pairs);
2636         if (err) {
2637                 cpus_read_unlock();
2638                 goto err;
2639         }
2640         virtnet_set_affinity(vi);
2641         cpus_read_unlock();
2642
2643         netif_set_real_num_tx_queues(dev, queue_pairs);
2644         netif_set_real_num_rx_queues(dev, queue_pairs);
2645  err:
2646         return err;
2647 }
2648
2649 static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
2650 {
2651         struct virtnet_info *vi = netdev_priv(dev);
2652         unsigned int i, j;
2653         u8 *p = data;
2654
2655         switch (stringset) {
2656         case ETH_SS_STATS:
2657                 for (i = 0; i < vi->curr_queue_pairs; i++) {
2658                         for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
2659                                 ethtool_sprintf(&p, "rx_queue_%u_%s", i,
2660                                                 virtnet_rq_stats_desc[j].desc);
2661                 }
2662
2663                 for (i = 0; i < vi->curr_queue_pairs; i++) {
2664                         for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
2665                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
2666                                                 virtnet_sq_stats_desc[j].desc);
2667                 }
2668                 break;
2669         }
2670 }
2671
2672 static int virtnet_get_sset_count(struct net_device *dev, int sset)
2673 {
2674         struct virtnet_info *vi = netdev_priv(dev);
2675
2676         switch (sset) {
2677         case ETH_SS_STATS:
2678                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
2679                                                VIRTNET_SQ_STATS_LEN);
2680         default:
2681                 return -EOPNOTSUPP;
2682         }
2683 }
2684
2685 static void virtnet_get_ethtool_stats(struct net_device *dev,
2686                                       struct ethtool_stats *stats, u64 *data)
2687 {
2688         struct virtnet_info *vi = netdev_priv(dev);
2689         unsigned int idx = 0, start, i, j;
2690         const u8 *stats_base;
2691         size_t offset;
2692
2693         for (i = 0; i < vi->curr_queue_pairs; i++) {
2694                 struct receive_queue *rq = &vi->rq[i];
2695
2696                 stats_base = (u8 *)&rq->stats;
2697                 do {
2698                         start = u64_stats_fetch_begin(&rq->stats.syncp);
2699                         for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
2700                                 offset = virtnet_rq_stats_desc[j].offset;
2701                                 data[idx + j] = *(u64 *)(stats_base + offset);
2702                         }
2703                 } while (u64_stats_fetch_retry(&rq->stats.syncp, start));
2704                 idx += VIRTNET_RQ_STATS_LEN;
2705         }
2706
2707         for (i = 0; i < vi->curr_queue_pairs; i++) {
2708                 struct send_queue *sq = &vi->sq[i];
2709
2710                 stats_base = (u8 *)&sq->stats;
2711                 do {
2712                         start = u64_stats_fetch_begin(&sq->stats.syncp);
2713                         for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
2714                                 offset = virtnet_sq_stats_desc[j].offset;
2715                                 data[idx + j] = *(u64 *)(stats_base + offset);
2716                         }
2717                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
2718                 idx += VIRTNET_SQ_STATS_LEN;
2719         }
2720 }
2721
2722 static void virtnet_get_channels(struct net_device *dev,
2723                                  struct ethtool_channels *channels)
2724 {
2725         struct virtnet_info *vi = netdev_priv(dev);
2726
2727         channels->combined_count = vi->curr_queue_pairs;
2728         channels->max_combined = vi->max_queue_pairs;
2729         channels->max_other = 0;
2730         channels->rx_count = 0;
2731         channels->tx_count = 0;
2732         channels->other_count = 0;
2733 }
2734
2735 static int virtnet_set_link_ksettings(struct net_device *dev,
2736                                       const struct ethtool_link_ksettings *cmd)
2737 {
2738         struct virtnet_info *vi = netdev_priv(dev);
2739
2740         return ethtool_virtdev_set_link_ksettings(dev, cmd,
2741                                                   &vi->speed, &vi->duplex);
2742 }
2743
2744 static int virtnet_get_link_ksettings(struct net_device *dev,
2745                                       struct ethtool_link_ksettings *cmd)
2746 {
2747         struct virtnet_info *vi = netdev_priv(dev);
2748
2749         cmd->base.speed = vi->speed;
2750         cmd->base.duplex = vi->duplex;
2751         cmd->base.port = PORT_OTHER;
2752
2753         return 0;
2754 }
2755
2756 static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi,
2757                                        struct ethtool_coalesce *ec)
2758 {
2759         struct scatterlist sgs_tx, sgs_rx;
2760         struct virtio_net_ctrl_coal_tx coal_tx;
2761         struct virtio_net_ctrl_coal_rx coal_rx;
2762
2763         coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs);
2764         coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames);
2765         sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx));
2766
2767         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
2768                                   VIRTIO_NET_CTRL_NOTF_COAL_TX_SET,
2769                                   &sgs_tx))
2770                 return -EINVAL;
2771
2772         /* Save parameters */
2773         vi->tx_usecs = ec->tx_coalesce_usecs;
2774         vi->tx_max_packets = ec->tx_max_coalesced_frames;
2775
2776         coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
2777         coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames);
2778         sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx));
2779
2780         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
2781                                   VIRTIO_NET_CTRL_NOTF_COAL_RX_SET,
2782                                   &sgs_rx))
2783                 return -EINVAL;
2784
2785         /* Save parameters */
2786         vi->rx_usecs = ec->rx_coalesce_usecs;
2787         vi->rx_max_packets = ec->rx_max_coalesced_frames;
2788
2789         return 0;
2790 }
2791
2792 static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
2793 {
2794         /* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL
2795          * feature is negotiated.
2796          */
2797         if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs)
2798                 return -EOPNOTSUPP;
2799
2800         if (ec->tx_max_coalesced_frames > 1 ||
2801             ec->rx_max_coalesced_frames != 1)
2802                 return -EINVAL;
2803
2804         return 0;
2805 }
2806
2807 static int virtnet_set_coalesce(struct net_device *dev,
2808                                 struct ethtool_coalesce *ec,
2809                                 struct kernel_ethtool_coalesce *kernel_coal,
2810                                 struct netlink_ext_ack *extack)
2811 {
2812         struct virtnet_info *vi = netdev_priv(dev);
2813         int ret, i, napi_weight;
2814         bool update_napi = false;
2815
2816         /* Can't change NAPI weight if the link is up */
2817         napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
2818         if (napi_weight ^ vi->sq[0].napi.weight) {
2819                 if (dev->flags & IFF_UP)
2820                         return -EBUSY;
2821                 else
2822                         update_napi = true;
2823         }
2824
2825         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
2826                 ret = virtnet_send_notf_coal_cmds(vi, ec);
2827         else
2828                 ret = virtnet_coal_params_supported(ec);
2829
2830         if (ret)
2831                 return ret;
2832
2833         if (update_napi) {
2834                 for (i = 0; i < vi->max_queue_pairs; i++)
2835                         vi->sq[i].napi.weight = napi_weight;
2836         }
2837
2838         return ret;
2839 }
2840
2841 static int virtnet_get_coalesce(struct net_device *dev,
2842                                 struct ethtool_coalesce *ec,
2843                                 struct kernel_ethtool_coalesce *kernel_coal,
2844                                 struct netlink_ext_ack *extack)
2845 {
2846         struct virtnet_info *vi = netdev_priv(dev);
2847
2848         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
2849                 ec->rx_coalesce_usecs = vi->rx_usecs;
2850                 ec->tx_coalesce_usecs = vi->tx_usecs;
2851                 ec->tx_max_coalesced_frames = vi->tx_max_packets;
2852                 ec->rx_max_coalesced_frames = vi->rx_max_packets;
2853         } else {
2854                 ec->rx_max_coalesced_frames = 1;
2855
2856                 if (vi->sq[0].napi.weight)
2857                         ec->tx_max_coalesced_frames = 1;
2858         }
2859
2860         return 0;
2861 }
2862
2863 static void virtnet_init_settings(struct net_device *dev)
2864 {
2865         struct virtnet_info *vi = netdev_priv(dev);
2866
2867         vi->speed = SPEED_UNKNOWN;
2868         vi->duplex = DUPLEX_UNKNOWN;
2869 }
2870
2871 static void virtnet_update_settings(struct virtnet_info *vi)
2872 {
2873         u32 speed;
2874         u8 duplex;
2875
2876         if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
2877                 return;
2878
2879         virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
2880
2881         if (ethtool_validate_speed(speed))
2882                 vi->speed = speed;
2883
2884         virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
2885
2886         if (ethtool_validate_duplex(duplex))
2887                 vi->duplex = duplex;
2888 }
2889
2890 static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
2891 {
2892         return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size;
2893 }
2894
2895 static u32 virtnet_get_rxfh_indir_size(struct net_device *dev)
2896 {
2897         return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size;
2898 }
2899
2900 static int virtnet_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
2901 {
2902         struct virtnet_info *vi = netdev_priv(dev);
2903         int i;
2904
2905         if (indir) {
2906                 for (i = 0; i < vi->rss_indir_table_size; ++i)
2907                         indir[i] = vi->ctrl->rss.indirection_table[i];
2908         }
2909
2910         if (key)
2911                 memcpy(key, vi->ctrl->rss.key, vi->rss_key_size);
2912
2913         if (hfunc)
2914                 *hfunc = ETH_RSS_HASH_TOP;
2915
2916         return 0;
2917 }
2918
2919 static int virtnet_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, const u8 hfunc)
2920 {
2921         struct virtnet_info *vi = netdev_priv(dev);
2922         int i;
2923
2924         if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
2925                 return -EOPNOTSUPP;
2926
2927         if (indir) {
2928                 for (i = 0; i < vi->rss_indir_table_size; ++i)
2929                         vi->ctrl->rss.indirection_table[i] = indir[i];
2930         }
2931         if (key)
2932                 memcpy(vi->ctrl->rss.key, key, vi->rss_key_size);
2933
2934         virtnet_commit_rss_command(vi);
2935
2936         return 0;
2937 }
2938
2939 static int virtnet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs)
2940 {
2941         struct virtnet_info *vi = netdev_priv(dev);
2942         int rc = 0;
2943
2944         switch (info->cmd) {
2945         case ETHTOOL_GRXRINGS:
2946                 info->data = vi->curr_queue_pairs;
2947                 break;
2948         case ETHTOOL_GRXFH:
2949                 virtnet_get_hashflow(vi, info);
2950                 break;
2951         default:
2952                 rc = -EOPNOTSUPP;
2953         }
2954
2955         return rc;
2956 }
2957
2958 static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
2959 {
2960         struct virtnet_info *vi = netdev_priv(dev);
2961         int rc = 0;
2962
2963         switch (info->cmd) {
2964         case ETHTOOL_SRXFH:
2965                 if (!virtnet_set_hashflow(vi, info))
2966                         rc = -EINVAL;
2967
2968                 break;
2969         default:
2970                 rc = -EOPNOTSUPP;
2971         }
2972
2973         return rc;
2974 }
2975
2976 static const struct ethtool_ops virtnet_ethtool_ops = {
2977         .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES |
2978                 ETHTOOL_COALESCE_USECS,
2979         .get_drvinfo = virtnet_get_drvinfo,
2980         .get_link = ethtool_op_get_link,
2981         .get_ringparam = virtnet_get_ringparam,
2982         .set_ringparam = virtnet_set_ringparam,
2983         .get_strings = virtnet_get_strings,
2984         .get_sset_count = virtnet_get_sset_count,
2985         .get_ethtool_stats = virtnet_get_ethtool_stats,
2986         .set_channels = virtnet_set_channels,
2987         .get_channels = virtnet_get_channels,
2988         .get_ts_info = ethtool_op_get_ts_info,
2989         .get_link_ksettings = virtnet_get_link_ksettings,
2990         .set_link_ksettings = virtnet_set_link_ksettings,
2991         .set_coalesce = virtnet_set_coalesce,
2992         .get_coalesce = virtnet_get_coalesce,
2993         .get_rxfh_key_size = virtnet_get_rxfh_key_size,
2994         .get_rxfh_indir_size = virtnet_get_rxfh_indir_size,
2995         .get_rxfh = virtnet_get_rxfh,
2996         .set_rxfh = virtnet_set_rxfh,
2997         .get_rxnfc = virtnet_get_rxnfc,
2998         .set_rxnfc = virtnet_set_rxnfc,
2999 };
3000
3001 static void virtnet_freeze_down(struct virtio_device *vdev)
3002 {
3003         struct virtnet_info *vi = vdev->priv;
3004
3005         /* Make sure no work handler is accessing the device */
3006         flush_work(&vi->config_work);
3007
3008         netif_tx_lock_bh(vi->dev);
3009         netif_device_detach(vi->dev);
3010         netif_tx_unlock_bh(vi->dev);
3011         if (netif_running(vi->dev))
3012                 virtnet_close(vi->dev);
3013 }
3014
3015 static int init_vqs(struct virtnet_info *vi);
3016
3017 static int virtnet_restore_up(struct virtio_device *vdev)
3018 {
3019         struct virtnet_info *vi = vdev->priv;
3020         int err;
3021
3022         err = init_vqs(vi);
3023         if (err)
3024                 return err;
3025
3026         virtio_device_ready(vdev);
3027
3028         enable_delayed_refill(vi);
3029
3030         if (netif_running(vi->dev)) {
3031                 err = virtnet_open(vi->dev);
3032                 if (err)
3033                         return err;
3034         }
3035
3036         netif_tx_lock_bh(vi->dev);
3037         netif_device_attach(vi->dev);
3038         netif_tx_unlock_bh(vi->dev);
3039         return err;
3040 }
3041
3042 static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
3043 {
3044         struct scatterlist sg;
3045         vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
3046
3047         sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
3048
3049         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
3050                                   VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
3051                 dev_warn(&vi->dev->dev, "Fail to set guest offload.\n");
3052                 return -EINVAL;
3053         }
3054
3055         return 0;
3056 }
3057
3058 static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
3059 {
3060         u64 offloads = 0;
3061
3062         if (!vi->guest_offloads)
3063                 return 0;
3064
3065         return virtnet_set_guest_offloads(vi, offloads);
3066 }
3067
3068 static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
3069 {
3070         u64 offloads = vi->guest_offloads;
3071
3072         if (!vi->guest_offloads)
3073                 return 0;
3074
3075         return virtnet_set_guest_offloads(vi, offloads);
3076 }
3077
3078 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
3079                            struct netlink_ext_ack *extack)
3080 {
3081         unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
3082         struct virtnet_info *vi = netdev_priv(dev);
3083         struct bpf_prog *old_prog;
3084         u16 xdp_qp = 0, curr_qp;
3085         int i, err;
3086
3087         if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
3088             && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3089                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
3090                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
3091                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
3092                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM) ||
3093                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) ||
3094                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6))) {
3095                 NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
3096                 return -EOPNOTSUPP;
3097         }
3098
3099         if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
3100                 NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
3101                 return -EINVAL;
3102         }
3103
3104         if (dev->mtu > max_sz) {
3105                 NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
3106                 netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
3107                 return -EINVAL;
3108         }
3109
3110         curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
3111         if (prog)
3112                 xdp_qp = nr_cpu_ids;
3113
3114         /* XDP requires extra queues for XDP_TX */
3115         if (curr_qp + xdp_qp > vi->max_queue_pairs) {
3116                 netdev_warn_once(dev, "XDP request %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
3117                                  curr_qp + xdp_qp, vi->max_queue_pairs);
3118                 xdp_qp = 0;
3119         }
3120
3121         old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
3122         if (!prog && !old_prog)
3123                 return 0;
3124
3125         if (prog)
3126                 bpf_prog_add(prog, vi->max_queue_pairs - 1);
3127
3128         /* Make sure NAPI is not using any XDP TX queues for RX. */
3129         if (netif_running(dev)) {
3130                 for (i = 0; i < vi->max_queue_pairs; i++) {
3131                         napi_disable(&vi->rq[i].napi);
3132                         virtnet_napi_tx_disable(&vi->sq[i].napi);
3133                 }
3134         }
3135
3136         if (!prog) {
3137                 for (i = 0; i < vi->max_queue_pairs; i++) {
3138                         rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
3139                         if (i == 0)
3140                                 virtnet_restore_guest_offloads(vi);
3141                 }
3142                 synchronize_net();
3143         }
3144
3145         err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
3146         if (err)
3147                 goto err;
3148         netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
3149         vi->xdp_queue_pairs = xdp_qp;
3150
3151         if (prog) {
3152                 vi->xdp_enabled = true;
3153                 for (i = 0; i < vi->max_queue_pairs; i++) {
3154                         rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
3155                         if (i == 0 && !old_prog)
3156                                 virtnet_clear_guest_offloads(vi);
3157                 }
3158         } else {
3159                 vi->xdp_enabled = false;
3160         }
3161
3162         for (i = 0; i < vi->max_queue_pairs; i++) {
3163                 if (old_prog)
3164                         bpf_prog_put(old_prog);
3165                 if (netif_running(dev)) {
3166                         virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
3167                         virtnet_napi_tx_enable(vi, vi->sq[i].vq,
3168                                                &vi->sq[i].napi);
3169                 }
3170         }
3171
3172         return 0;
3173
3174 err:
3175         if (!prog) {
3176                 virtnet_clear_guest_offloads(vi);
3177                 for (i = 0; i < vi->max_queue_pairs; i++)
3178                         rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
3179         }
3180
3181         if (netif_running(dev)) {
3182                 for (i = 0; i < vi->max_queue_pairs; i++) {
3183                         virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
3184                         virtnet_napi_tx_enable(vi, vi->sq[i].vq,
3185                                                &vi->sq[i].napi);
3186                 }
3187         }
3188         if (prog)
3189                 bpf_prog_sub(prog, vi->max_queue_pairs - 1);
3190         return err;
3191 }
3192
3193 static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
3194 {
3195         switch (xdp->command) {
3196         case XDP_SETUP_PROG:
3197                 return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
3198         default:
3199                 return -EINVAL;
3200         }
3201 }
3202
3203 static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
3204                                       size_t len)
3205 {
3206         struct virtnet_info *vi = netdev_priv(dev);
3207         int ret;
3208
3209         if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
3210                 return -EOPNOTSUPP;
3211
3212         ret = snprintf(buf, len, "sby");
3213         if (ret >= len)
3214                 return -EOPNOTSUPP;
3215
3216         return 0;
3217 }
3218
3219 static int virtnet_set_features(struct net_device *dev,
3220                                 netdev_features_t features)
3221 {
3222         struct virtnet_info *vi = netdev_priv(dev);
3223         u64 offloads;
3224         int err;
3225
3226         if ((dev->features ^ features) & NETIF_F_GRO_HW) {
3227                 if (vi->xdp_enabled)
3228                         return -EBUSY;
3229
3230                 if (features & NETIF_F_GRO_HW)
3231                         offloads = vi->guest_offloads_capable;
3232                 else
3233                         offloads = vi->guest_offloads_capable &
3234                                    ~GUEST_OFFLOAD_GRO_HW_MASK;
3235
3236                 err = virtnet_set_guest_offloads(vi, offloads);
3237                 if (err)
3238                         return err;
3239                 vi->guest_offloads = offloads;
3240         }
3241
3242         if ((dev->features ^ features) & NETIF_F_RXHASH) {
3243                 if (features & NETIF_F_RXHASH)
3244                         vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
3245                 else
3246                         vi->ctrl->rss.hash_types = VIRTIO_NET_HASH_REPORT_NONE;
3247
3248                 if (!virtnet_commit_rss_command(vi))
3249                         return -EINVAL;
3250         }
3251
3252         return 0;
3253 }
3254
3255 static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
3256 {
3257         struct virtnet_info *priv = netdev_priv(dev);
3258         struct send_queue *sq = &priv->sq[txqueue];
3259         struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue);
3260
3261         u64_stats_update_begin(&sq->stats.syncp);
3262         sq->stats.tx_timeouts++;
3263         u64_stats_update_end(&sq->stats.syncp);
3264
3265         netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n",
3266                    txqueue, sq->name, sq->vq->index, sq->vq->name,
3267                    jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
3268 }
3269
3270 static const struct net_device_ops virtnet_netdev = {
3271         .ndo_open            = virtnet_open,
3272         .ndo_stop            = virtnet_close,
3273         .ndo_start_xmit      = start_xmit,
3274         .ndo_validate_addr   = eth_validate_addr,
3275         .ndo_set_mac_address = virtnet_set_mac_address,
3276         .ndo_set_rx_mode     = virtnet_set_rx_mode,
3277         .ndo_get_stats64     = virtnet_stats,
3278         .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
3279         .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
3280         .ndo_bpf                = virtnet_xdp,
3281         .ndo_xdp_xmit           = virtnet_xdp_xmit,
3282         .ndo_features_check     = passthru_features_check,
3283         .ndo_get_phys_port_name = virtnet_get_phys_port_name,
3284         .ndo_set_features       = virtnet_set_features,
3285         .ndo_tx_timeout         = virtnet_tx_timeout,
3286 };
3287
3288 static void virtnet_config_changed_work(struct work_struct *work)
3289 {
3290         struct virtnet_info *vi =
3291                 container_of(work, struct virtnet_info, config_work);
3292         u16 v;
3293
3294         if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
3295                                  struct virtio_net_config, status, &v) < 0)
3296                 return;
3297
3298         if (v & VIRTIO_NET_S_ANNOUNCE) {
3299                 netdev_notify_peers(vi->dev);
3300                 virtnet_ack_link_announce(vi);
3301         }
3302
3303         /* Ignore unknown (future) status bits */
3304         v &= VIRTIO_NET_S_LINK_UP;
3305
3306         if (vi->status == v)
3307                 return;
3308
3309         vi->status = v;
3310
3311         if (vi->status & VIRTIO_NET_S_LINK_UP) {
3312                 virtnet_update_settings(vi);
3313                 netif_carrier_on(vi->dev);
3314                 netif_tx_wake_all_queues(vi->dev);
3315         } else {
3316                 netif_carrier_off(vi->dev);
3317                 netif_tx_stop_all_queues(vi->dev);
3318         }
3319 }
3320
3321 static void virtnet_config_changed(struct virtio_device *vdev)
3322 {
3323         struct virtnet_info *vi = vdev->priv;
3324
3325         schedule_work(&vi->config_work);
3326 }
3327
3328 static void virtnet_free_queues(struct virtnet_info *vi)
3329 {
3330         int i;
3331
3332         for (i = 0; i < vi->max_queue_pairs; i++) {
3333                 __netif_napi_del(&vi->rq[i].napi);
3334                 __netif_napi_del(&vi->sq[i].napi);
3335         }
3336
3337         /* We called __netif_napi_del(),
3338          * we need to respect an RCU grace period before freeing vi->rq
3339          */
3340         synchronize_net();
3341
3342         kfree(vi->rq);
3343         kfree(vi->sq);
3344         kfree(vi->ctrl);
3345 }
3346
3347 static void _free_receive_bufs(struct virtnet_info *vi)
3348 {
3349         struct bpf_prog *old_prog;
3350         int i;
3351
3352         for (i = 0; i < vi->max_queue_pairs; i++) {
3353                 while (vi->rq[i].pages)
3354                         __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
3355
3356                 old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
3357                 RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
3358                 if (old_prog)
3359                         bpf_prog_put(old_prog);
3360         }
3361 }
3362
3363 static void free_receive_bufs(struct virtnet_info *vi)
3364 {
3365         rtnl_lock();
3366         _free_receive_bufs(vi);
3367         rtnl_unlock();
3368 }
3369
3370 static void free_receive_page_frags(struct virtnet_info *vi)
3371 {
3372         int i;
3373         for (i = 0; i < vi->max_queue_pairs; i++)
3374                 if (vi->rq[i].alloc_frag.page)
3375                         put_page(vi->rq[i].alloc_frag.page);
3376 }
3377
3378 static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
3379 {
3380         if (!is_xdp_frame(buf))
3381                 dev_kfree_skb(buf);
3382         else
3383                 xdp_return_frame(ptr_to_xdp(buf));
3384 }
3385
3386 static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
3387 {
3388         struct virtnet_info *vi = vq->vdev->priv;
3389         int i = vq2rxq(vq);
3390
3391         if (vi->mergeable_rx_bufs)
3392                 put_page(virt_to_head_page(buf));
3393         else if (vi->big_packets)
3394                 give_pages(&vi->rq[i], buf);
3395         else
3396                 put_page(virt_to_head_page(buf));
3397 }
3398
3399 static void free_unused_bufs(struct virtnet_info *vi)
3400 {
3401         void *buf;
3402         int i;
3403
3404         for (i = 0; i < vi->max_queue_pairs; i++) {
3405                 struct virtqueue *vq = vi->sq[i].vq;
3406                 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
3407                         virtnet_sq_free_unused_buf(vq, buf);
3408         }
3409
3410         for (i = 0; i < vi->max_queue_pairs; i++) {
3411                 struct virtqueue *vq = vi->rq[i].vq;
3412                 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
3413                         virtnet_rq_free_unused_buf(vq, buf);
3414         }
3415 }
3416
3417 static void virtnet_del_vqs(struct virtnet_info *vi)
3418 {
3419         struct virtio_device *vdev = vi->vdev;
3420
3421         virtnet_clean_affinity(vi);
3422
3423         vdev->config->del_vqs(vdev);
3424
3425         virtnet_free_queues(vi);
3426 }
3427
3428 /* How large should a single buffer be so a queue full of these can fit at
3429  * least one full packet?
3430  * Logic below assumes the mergeable buffer header is used.
3431  */
3432 static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
3433 {
3434         const unsigned int hdr_len = vi->hdr_len;
3435         unsigned int rq_size = virtqueue_get_vring_size(vq);
3436         unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
3437         unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
3438         unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
3439
3440         return max(max(min_buf_len, hdr_len) - hdr_len,
3441                    (unsigned int)GOOD_PACKET_LEN);
3442 }
3443
3444 static int virtnet_find_vqs(struct virtnet_info *vi)
3445 {
3446         vq_callback_t **callbacks;
3447         struct virtqueue **vqs;
3448         int ret = -ENOMEM;
3449         int i, total_vqs;
3450         const char **names;
3451         bool *ctx;
3452
3453         /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
3454          * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
3455          * possible control vq.
3456          */
3457         total_vqs = vi->max_queue_pairs * 2 +
3458                     virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
3459
3460         /* Allocate space for find_vqs parameters */
3461         vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
3462         if (!vqs)
3463                 goto err_vq;
3464         callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
3465         if (!callbacks)
3466                 goto err_callback;
3467         names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
3468         if (!names)
3469                 goto err_names;
3470         if (!vi->big_packets || vi->mergeable_rx_bufs) {
3471                 ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
3472                 if (!ctx)
3473                         goto err_ctx;
3474         } else {
3475                 ctx = NULL;
3476         }
3477
3478         /* Parameters for control virtqueue, if any */
3479         if (vi->has_cvq) {
3480                 callbacks[total_vqs - 1] = NULL;
3481                 names[total_vqs - 1] = "control";
3482         }
3483
3484         /* Allocate/initialize parameters for send/receive virtqueues */
3485         for (i = 0; i < vi->max_queue_pairs; i++) {
3486                 callbacks[rxq2vq(i)] = skb_recv_done;
3487                 callbacks[txq2vq(i)] = skb_xmit_done;
3488                 sprintf(vi->rq[i].name, "input.%d", i);
3489                 sprintf(vi->sq[i].name, "output.%d", i);
3490                 names[rxq2vq(i)] = vi->rq[i].name;
3491                 names[txq2vq(i)] = vi->sq[i].name;
3492                 if (ctx)
3493                         ctx[rxq2vq(i)] = true;
3494         }
3495
3496         ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
3497                                   names, ctx, NULL);
3498         if (ret)
3499                 goto err_find;
3500
3501         if (vi->has_cvq) {
3502                 vi->cvq = vqs[total_vqs - 1];
3503                 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
3504                         vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
3505         }
3506
3507         for (i = 0; i < vi->max_queue_pairs; i++) {
3508                 vi->rq[i].vq = vqs[rxq2vq(i)];
3509                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
3510                 vi->sq[i].vq = vqs[txq2vq(i)];
3511         }
3512
3513         /* run here: ret == 0. */
3514
3515
3516 err_find:
3517         kfree(ctx);
3518 err_ctx:
3519         kfree(names);
3520 err_names:
3521         kfree(callbacks);
3522 err_callback:
3523         kfree(vqs);
3524 err_vq:
3525         return ret;
3526 }
3527
3528 static int virtnet_alloc_queues(struct virtnet_info *vi)
3529 {
3530         int i;
3531
3532         if (vi->has_cvq) {
3533                 vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
3534                 if (!vi->ctrl)
3535                         goto err_ctrl;
3536         } else {
3537                 vi->ctrl = NULL;
3538         }
3539         vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
3540         if (!vi->sq)
3541                 goto err_sq;
3542         vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
3543         if (!vi->rq)
3544                 goto err_rq;
3545
3546         INIT_DELAYED_WORK(&vi->refill, refill_work);
3547         for (i = 0; i < vi->max_queue_pairs; i++) {
3548                 vi->rq[i].pages = NULL;
3549                 netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll,
3550                                       napi_weight);
3551                 netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi,
3552                                          virtnet_poll_tx,
3553                                          napi_tx ? napi_weight : 0);
3554
3555                 sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
3556                 ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
3557                 sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
3558
3559                 u64_stats_init(&vi->rq[i].stats.syncp);
3560                 u64_stats_init(&vi->sq[i].stats.syncp);
3561         }
3562
3563         return 0;
3564
3565 err_rq:
3566         kfree(vi->sq);
3567 err_sq:
3568         kfree(vi->ctrl);
3569 err_ctrl:
3570         return -ENOMEM;
3571 }
3572
3573 static int init_vqs(struct virtnet_info *vi)
3574 {
3575         int ret;
3576
3577         /* Allocate send & receive queues */
3578         ret = virtnet_alloc_queues(vi);
3579         if (ret)
3580                 goto err;
3581
3582         ret = virtnet_find_vqs(vi);
3583         if (ret)
3584                 goto err_free;
3585
3586         cpus_read_lock();
3587         virtnet_set_affinity(vi);
3588         cpus_read_unlock();
3589
3590         return 0;
3591
3592 err_free:
3593         virtnet_free_queues(vi);
3594 err:
3595         return ret;
3596 }
3597
3598 #ifdef CONFIG_SYSFS
3599 static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
3600                 char *buf)
3601 {
3602         struct virtnet_info *vi = netdev_priv(queue->dev);
3603         unsigned int queue_index = get_netdev_rx_queue_index(queue);
3604         unsigned int headroom = virtnet_get_headroom(vi);
3605         unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
3606         struct ewma_pkt_len *avg;
3607
3608         BUG_ON(queue_index >= vi->max_queue_pairs);
3609         avg = &vi->rq[queue_index].mrg_avg_pkt_len;
3610         return sprintf(buf, "%u\n",
3611                        get_mergeable_buf_len(&vi->rq[queue_index], avg,
3612                                        SKB_DATA_ALIGN(headroom + tailroom)));
3613 }
3614
3615 static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
3616         __ATTR_RO(mergeable_rx_buffer_size);
3617
3618 static struct attribute *virtio_net_mrg_rx_attrs[] = {
3619         &mergeable_rx_buffer_size_attribute.attr,
3620         NULL
3621 };
3622
3623 static const struct attribute_group virtio_net_mrg_rx_group = {
3624         .name = "virtio_net",
3625         .attrs = virtio_net_mrg_rx_attrs
3626 };
3627 #endif
3628
3629 static bool virtnet_fail_on_feature(struct virtio_device *vdev,
3630                                     unsigned int fbit,
3631                                     const char *fname, const char *dname)
3632 {
3633         if (!virtio_has_feature(vdev, fbit))
3634                 return false;
3635
3636         dev_err(&vdev->dev, "device advertises feature %s but not %s",
3637                 fname, dname);
3638
3639         return true;
3640 }
3641
3642 #define VIRTNET_FAIL_ON(vdev, fbit, dbit)                       \
3643         virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
3644
3645 static bool virtnet_validate_features(struct virtio_device *vdev)
3646 {
3647         if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
3648             (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
3649                              "VIRTIO_NET_F_CTRL_VQ") ||
3650              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
3651                              "VIRTIO_NET_F_CTRL_VQ") ||
3652              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
3653                              "VIRTIO_NET_F_CTRL_VQ") ||
3654              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
3655              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
3656                              "VIRTIO_NET_F_CTRL_VQ") ||
3657              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS,
3658                              "VIRTIO_NET_F_CTRL_VQ") ||
3659              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT,
3660                              "VIRTIO_NET_F_CTRL_VQ") ||
3661              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL,
3662                              "VIRTIO_NET_F_CTRL_VQ"))) {
3663                 return false;
3664         }
3665
3666         return true;
3667 }
3668
3669 #define MIN_MTU ETH_MIN_MTU
3670 #define MAX_MTU ETH_MAX_MTU
3671
3672 static int virtnet_validate(struct virtio_device *vdev)
3673 {
3674         if (!vdev->config->get) {
3675                 dev_err(&vdev->dev, "%s failure: config access disabled\n",
3676                         __func__);
3677                 return -EINVAL;
3678         }
3679
3680         if (!virtnet_validate_features(vdev))
3681                 return -EINVAL;
3682
3683         if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
3684                 int mtu = virtio_cread16(vdev,
3685                                          offsetof(struct virtio_net_config,
3686                                                   mtu));
3687                 if (mtu < MIN_MTU)
3688                         __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
3689         }
3690
3691         return 0;
3692 }
3693
3694 static bool virtnet_check_guest_gso(const struct virtnet_info *vi)
3695 {
3696         return virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3697                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
3698                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
3699                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
3700                 (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) &&
3701                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6));
3702 }
3703
3704 static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
3705 {
3706         bool guest_gso = virtnet_check_guest_gso(vi);
3707
3708         /* If device can receive ANY guest GSO packets, regardless of mtu,
3709          * allocate packets of maximum size, otherwise limit it to only
3710          * mtu size worth only.
3711          */
3712         if (mtu > ETH_DATA_LEN || guest_gso) {
3713                 vi->big_packets = true;
3714                 vi->big_packets_num_skbfrags = guest_gso ? MAX_SKB_FRAGS : DIV_ROUND_UP(mtu, PAGE_SIZE);
3715         }
3716 }
3717
3718 static int virtnet_probe(struct virtio_device *vdev)
3719 {
3720         int i, err = -ENOMEM;
3721         struct net_device *dev;
3722         struct virtnet_info *vi;
3723         u16 max_queue_pairs;
3724         int mtu = 0;
3725
3726         /* Find if host supports multiqueue/rss virtio_net device */
3727         max_queue_pairs = 1;
3728         if (virtio_has_feature(vdev, VIRTIO_NET_F_MQ) || virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
3729                 max_queue_pairs =
3730                      virtio_cread16(vdev, offsetof(struct virtio_net_config, max_virtqueue_pairs));
3731
3732         /* We need at least 2 queue's */
3733         if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
3734             max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
3735             !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
3736                 max_queue_pairs = 1;
3737
3738         /* Allocate ourselves a network device with room for our info */
3739         dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
3740         if (!dev)
3741                 return -ENOMEM;
3742
3743         /* Set up network device as normal. */
3744         dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
3745                            IFF_TX_SKB_NO_LINEAR;
3746         dev->netdev_ops = &virtnet_netdev;
3747         dev->features = NETIF_F_HIGHDMA;
3748
3749         dev->ethtool_ops = &virtnet_ethtool_ops;
3750         SET_NETDEV_DEV(dev, &vdev->dev);
3751
3752         /* Do we support "hardware" checksums? */
3753         if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
3754                 /* This opens up the world of extra features. */
3755                 dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3756                 if (csum)
3757                         dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3758
3759                 if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
3760                         dev->hw_features |= NETIF_F_TSO
3761                                 | NETIF_F_TSO_ECN | NETIF_F_TSO6;
3762                 }
3763                 /* Individual feature bits: what can host handle? */
3764                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
3765                         dev->hw_features |= NETIF_F_TSO;
3766                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
3767                         dev->hw_features |= NETIF_F_TSO6;
3768                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
3769                         dev->hw_features |= NETIF_F_TSO_ECN;
3770                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO))
3771                         dev->hw_features |= NETIF_F_GSO_UDP_L4;
3772
3773                 dev->features |= NETIF_F_GSO_ROBUST;
3774
3775                 if (gso)
3776                         dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
3777                 /* (!csum && gso) case will be fixed by register_netdev() */
3778         }
3779         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
3780                 dev->features |= NETIF_F_RXCSUM;
3781         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3782             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
3783                 dev->features |= NETIF_F_GRO_HW;
3784         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
3785                 dev->hw_features |= NETIF_F_GRO_HW;
3786
3787         dev->vlan_features = dev->features;
3788
3789         /* MTU range: 68 - 65535 */
3790         dev->min_mtu = MIN_MTU;
3791         dev->max_mtu = MAX_MTU;
3792
3793         /* Configuration may specify what MAC to use.  Otherwise random. */
3794         if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
3795                 u8 addr[ETH_ALEN];
3796
3797                 virtio_cread_bytes(vdev,
3798                                    offsetof(struct virtio_net_config, mac),
3799                                    addr, ETH_ALEN);
3800                 eth_hw_addr_set(dev, addr);
3801         } else {
3802                 eth_hw_addr_random(dev);
3803         }
3804
3805         /* Set up our device-specific information */
3806         vi = netdev_priv(dev);
3807         vi->dev = dev;
3808         vi->vdev = vdev;
3809         vdev->priv = vi;
3810
3811         INIT_WORK(&vi->config_work, virtnet_config_changed_work);
3812         spin_lock_init(&vi->refill_lock);
3813
3814         if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
3815                 vi->mergeable_rx_bufs = true;
3816
3817         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
3818                 vi->rx_usecs = 0;
3819                 vi->tx_usecs = 0;
3820                 vi->tx_max_packets = 0;
3821                 vi->rx_max_packets = 0;
3822         }
3823
3824         if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
3825                 vi->has_rss_hash_report = true;
3826
3827         if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
3828                 vi->has_rss = true;
3829
3830         if (vi->has_rss || vi->has_rss_hash_report) {
3831                 vi->rss_indir_table_size =
3832                         virtio_cread16(vdev, offsetof(struct virtio_net_config,
3833                                 rss_max_indirection_table_length));
3834                 vi->rss_key_size =
3835                         virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size));
3836
3837                 vi->rss_hash_types_supported =
3838                     virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types));
3839                 vi->rss_hash_types_supported &=
3840                                 ~(VIRTIO_NET_RSS_HASH_TYPE_IP_EX |
3841                                   VIRTIO_NET_RSS_HASH_TYPE_TCP_EX |
3842                                   VIRTIO_NET_RSS_HASH_TYPE_UDP_EX);
3843
3844                 dev->hw_features |= NETIF_F_RXHASH;
3845         }
3846
3847         if (vi->has_rss_hash_report)
3848                 vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash);
3849         else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
3850                  virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3851                 vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3852         else
3853                 vi->hdr_len = sizeof(struct virtio_net_hdr);
3854
3855         if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
3856             virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3857                 vi->any_header_sg = true;
3858
3859         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
3860                 vi->has_cvq = true;
3861
3862         if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
3863                 mtu = virtio_cread16(vdev,
3864                                      offsetof(struct virtio_net_config,
3865                                               mtu));
3866                 if (mtu < dev->min_mtu) {
3867                         /* Should never trigger: MTU was previously validated
3868                          * in virtnet_validate.
3869                          */
3870                         dev_err(&vdev->dev,
3871                                 "device MTU appears to have changed it is now %d < %d",
3872                                 mtu, dev->min_mtu);
3873                         err = -EINVAL;
3874                         goto free;
3875                 }
3876
3877                 dev->mtu = mtu;
3878                 dev->max_mtu = mtu;
3879         }
3880
3881         virtnet_set_big_packets(vi, mtu);
3882
3883         if (vi->any_header_sg)
3884                 dev->needed_headroom = vi->hdr_len;
3885
3886         /* Enable multiqueue by default */
3887         if (num_online_cpus() >= max_queue_pairs)
3888                 vi->curr_queue_pairs = max_queue_pairs;
3889         else
3890                 vi->curr_queue_pairs = num_online_cpus();
3891         vi->max_queue_pairs = max_queue_pairs;
3892
3893         /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
3894         err = init_vqs(vi);
3895         if (err)
3896                 goto free;
3897
3898 #ifdef CONFIG_SYSFS
3899         if (vi->mergeable_rx_bufs)
3900                 dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
3901 #endif
3902         netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
3903         netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
3904
3905         virtnet_init_settings(dev);
3906
3907         if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3908                 vi->failover = net_failover_create(vi->dev);
3909                 if (IS_ERR(vi->failover)) {
3910                         err = PTR_ERR(vi->failover);
3911                         goto free_vqs;
3912                 }
3913         }
3914
3915         if (vi->has_rss || vi->has_rss_hash_report)
3916                 virtnet_init_default_rss(vi);
3917
3918         /* serialize netdev register + virtio_device_ready() with ndo_open() */
3919         rtnl_lock();
3920
3921         err = register_netdevice(dev);
3922         if (err) {
3923                 pr_debug("virtio_net: registering device failed\n");
3924                 rtnl_unlock();
3925                 goto free_failover;
3926         }
3927
3928         virtio_device_ready(vdev);
3929
3930         rtnl_unlock();
3931
3932         err = virtnet_cpu_notif_add(vi);
3933         if (err) {
3934                 pr_debug("virtio_net: registering cpu notifier failed\n");
3935                 goto free_unregister_netdev;
3936         }
3937
3938         virtnet_set_queues(vi, vi->curr_queue_pairs);
3939
3940         /* Assume link up if device can't report link status,
3941            otherwise get link status from config. */
3942         netif_carrier_off(dev);
3943         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
3944                 schedule_work(&vi->config_work);
3945         } else {
3946                 vi->status = VIRTIO_NET_S_LINK_UP;
3947                 virtnet_update_settings(vi);
3948                 netif_carrier_on(dev);
3949         }
3950
3951         for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
3952                 if (virtio_has_feature(vi->vdev, guest_offloads[i]))
3953                         set_bit(guest_offloads[i], &vi->guest_offloads);
3954         vi->guest_offloads_capable = vi->guest_offloads;
3955
3956         pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
3957                  dev->name, max_queue_pairs);
3958
3959         return 0;
3960
3961 free_unregister_netdev:
3962         unregister_netdev(dev);
3963 free_failover:
3964         net_failover_destroy(vi->failover);
3965 free_vqs:
3966         virtio_reset_device(vdev);
3967         cancel_delayed_work_sync(&vi->refill);
3968         free_receive_page_frags(vi);
3969         virtnet_del_vqs(vi);
3970 free:
3971         free_netdev(dev);
3972         return err;
3973 }
3974
3975 static void remove_vq_common(struct virtnet_info *vi)
3976 {
3977         virtio_reset_device(vi->vdev);
3978
3979         /* Free unused buffers in both send and recv, if any. */
3980         free_unused_bufs(vi);
3981
3982         free_receive_bufs(vi);
3983
3984         free_receive_page_frags(vi);
3985
3986         virtnet_del_vqs(vi);
3987 }
3988
3989 static void virtnet_remove(struct virtio_device *vdev)
3990 {
3991         struct virtnet_info *vi = vdev->priv;
3992
3993         virtnet_cpu_notif_remove(vi);
3994
3995         /* Make sure no work handler is accessing the device. */
3996         flush_work(&vi->config_work);
3997
3998         unregister_netdev(vi->dev);
3999
4000         net_failover_destroy(vi->failover);
4001
4002         remove_vq_common(vi);
4003
4004         free_netdev(vi->dev);
4005 }
4006
4007 static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
4008 {
4009         struct virtnet_info *vi = vdev->priv;
4010
4011         virtnet_cpu_notif_remove(vi);
4012         virtnet_freeze_down(vdev);
4013         remove_vq_common(vi);
4014
4015         return 0;
4016 }
4017
4018 static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
4019 {
4020         struct virtnet_info *vi = vdev->priv;
4021         int err;
4022
4023         err = virtnet_restore_up(vdev);
4024         if (err)
4025                 return err;
4026         virtnet_set_queues(vi, vi->curr_queue_pairs);
4027
4028         err = virtnet_cpu_notif_add(vi);
4029         if (err) {
4030                 virtnet_freeze_down(vdev);
4031                 remove_vq_common(vi);
4032                 return err;
4033         }
4034
4035         return 0;
4036 }
4037
4038 static struct virtio_device_id id_table[] = {
4039         { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
4040         { 0 },
4041 };
4042
4043 #define VIRTNET_FEATURES \
4044         VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
4045         VIRTIO_NET_F_MAC, \
4046         VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
4047         VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
4048         VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
4049         VIRTIO_NET_F_HOST_USO, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, \
4050         VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
4051         VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
4052         VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
4053         VIRTIO_NET_F_CTRL_MAC_ADDR, \
4054         VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
4055         VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
4056         VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL
4057
4058 static unsigned int features[] = {
4059         VIRTNET_FEATURES,
4060 };
4061
4062 static unsigned int features_legacy[] = {
4063         VIRTNET_FEATURES,
4064         VIRTIO_NET_F_GSO,
4065         VIRTIO_F_ANY_LAYOUT,
4066 };
4067
4068 static struct virtio_driver virtio_net_driver = {
4069         .feature_table = features,
4070         .feature_table_size = ARRAY_SIZE(features),
4071         .feature_table_legacy = features_legacy,
4072         .feature_table_size_legacy = ARRAY_SIZE(features_legacy),
4073         .driver.name =  KBUILD_MODNAME,
4074         .driver.owner = THIS_MODULE,
4075         .id_table =     id_table,
4076         .validate =     virtnet_validate,
4077         .probe =        virtnet_probe,
4078         .remove =       virtnet_remove,
4079         .config_changed = virtnet_config_changed,
4080 #ifdef CONFIG_PM_SLEEP
4081         .freeze =       virtnet_freeze,
4082         .restore =      virtnet_restore,
4083 #endif
4084 };
4085
4086 static __init int virtio_net_driver_init(void)
4087 {
4088         int ret;
4089
4090         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
4091                                       virtnet_cpu_online,
4092                                       virtnet_cpu_down_prep);
4093         if (ret < 0)
4094                 goto out;
4095         virtionet_online = ret;
4096         ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
4097                                       NULL, virtnet_cpu_dead);
4098         if (ret)
4099                 goto err_dead;
4100         ret = register_virtio_driver(&virtio_net_driver);
4101         if (ret)
4102                 goto err_virtio;
4103         return 0;
4104 err_virtio:
4105         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
4106 err_dead:
4107         cpuhp_remove_multi_state(virtionet_online);
4108 out:
4109         return ret;
4110 }
4111 module_init(virtio_net_driver_init);
4112
4113 static __exit void virtio_net_driver_exit(void)
4114 {
4115         unregister_virtio_driver(&virtio_net_driver);
4116         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
4117         cpuhp_remove_multi_state(virtionet_online);
4118 }
4119 module_exit(virtio_net_driver_exit);
4120
4121 MODULE_DEVICE_TABLE(virtio, id_table);
4122 MODULE_DESCRIPTION("Virtio network driver");
4123 MODULE_LICENSE("GPL");