1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* A network driver using virtio.
7 #include <linux/netdevice.h>
8 #include <linux/etherdevice.h>
9 #include <linux/ethtool.h>
10 #include <linux/module.h>
11 #include <linux/virtio.h>
12 #include <linux/virtio_net.h>
13 #include <linux/bpf.h>
14 #include <linux/bpf_trace.h>
15 #include <linux/scatterlist.h>
16 #include <linux/if_vlan.h>
17 #include <linux/slab.h>
18 #include <linux/cpu.h>
19 #include <linux/average.h>
20 #include <linux/filter.h>
21 #include <linux/kernel.h>
22 #include <net/route.h>
24 #include <net/net_failover.h>
26 static int napi_weight = NAPI_POLL_WEIGHT;
27 module_param(napi_weight, int, 0444);
29 static bool csum = true, gso = true, napi_tx = true;
30 module_param(csum, bool, 0444);
31 module_param(gso, bool, 0444);
32 module_param(napi_tx, bool, 0644);
34 /* FIXME: MTU in config. */
35 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
36 #define GOOD_COPY_LEN 128
38 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
40 /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
41 #define VIRTIO_XDP_HEADROOM 256
43 /* Separating two types of XDP xmit */
44 #define VIRTIO_XDP_TX BIT(0)
45 #define VIRTIO_XDP_REDIR BIT(1)
47 #define VIRTIO_XDP_FLAG BIT(0)
49 /* RX packet size EWMA. The average packet size is used to determine the packet
50 * buffer size when refilling RX rings. As the entire RX ring may be refilled
51 * at once, the weight is chosen so that the EWMA will be insensitive to short-
52 * term, transient changes in packet size.
54 DECLARE_EWMA(pkt_len, 0, 64)
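/* Illustrative sketch (hypothetical helper, not used by the driver): how the
 * pkt_len EWMA declared above is fed and read.  DECLARE_EWMA(pkt_len, 0, 64)
 * generates ewma_pkt_len_init(), ewma_pkt_len_add() and ewma_pkt_len_read();
 * with a weight factor of 64, each new sample only moves the average by about
 * 1/64th of the difference, so a short burst of unusually sized packets
 * barely perturbs the refill buffer size.
 */
static inline unsigned long virtnet_pkt_len_ewma_example(void)
{
	struct ewma_pkt_len avg;

	ewma_pkt_len_init(&avg);
	ewma_pkt_len_add(&avg, 1500);	/* hypothetical received packet size */
	return ewma_pkt_len_read(&avg);
}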
56 #define VIRTNET_DRIVER_VERSION "1.0.0"
58 static const unsigned long guest_offloads[] = {
59 VIRTIO_NET_F_GUEST_TSO4,
60 VIRTIO_NET_F_GUEST_TSO6,
61 VIRTIO_NET_F_GUEST_ECN,
62 VIRTIO_NET_F_GUEST_UFO,
63 VIRTIO_NET_F_GUEST_CSUM,
64 VIRTIO_NET_F_GUEST_USO4,
65 VIRTIO_NET_F_GUEST_USO6
68 #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
69 (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
70 (1ULL << VIRTIO_NET_F_GUEST_ECN) | \
71 (1ULL << VIRTIO_NET_F_GUEST_UFO) | \
72 (1ULL << VIRTIO_NET_F_GUEST_USO4) | \
73 (1ULL << VIRTIO_NET_F_GUEST_USO6))
75 struct virtnet_stat_desc {
76 char desc[ETH_GSTRING_LEN];
80 struct virtnet_sq_stats {
81 struct u64_stats_sync syncp;
90 struct virtnet_rq_stats {
91 struct u64_stats_sync syncp;
102 #define VIRTNET_SQ_STAT(m) offsetof(struct virtnet_sq_stats, m)
103 #define VIRTNET_RQ_STAT(m) offsetof(struct virtnet_rq_stats, m)
105 static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
106 { "packets", VIRTNET_SQ_STAT(packets) },
107 { "bytes", VIRTNET_SQ_STAT(bytes) },
108 { "xdp_tx", VIRTNET_SQ_STAT(xdp_tx) },
109 { "xdp_tx_drops", VIRTNET_SQ_STAT(xdp_tx_drops) },
110 { "kicks", VIRTNET_SQ_STAT(kicks) },
111 { "tx_timeouts", VIRTNET_SQ_STAT(tx_timeouts) },
114 static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
115 { "packets", VIRTNET_RQ_STAT(packets) },
116 { "bytes", VIRTNET_RQ_STAT(bytes) },
117 { "drops", VIRTNET_RQ_STAT(drops) },
118 { "xdp_packets", VIRTNET_RQ_STAT(xdp_packets) },
119 { "xdp_tx", VIRTNET_RQ_STAT(xdp_tx) },
120 { "xdp_redirects", VIRTNET_RQ_STAT(xdp_redirects) },
121 { "xdp_drops", VIRTNET_RQ_STAT(xdp_drops) },
122 { "kicks", VIRTNET_RQ_STAT(kicks) },
125 #define VIRTNET_SQ_STATS_LEN ARRAY_SIZE(virtnet_sq_stats_desc)
126 #define VIRTNET_RQ_STATS_LEN ARRAY_SIZE(virtnet_rq_stats_desc)
128 /* Internal representation of a send virtqueue */
130 /* Virtqueue associated with this send_queue */
131 struct virtqueue *vq;
133 /* TX: fragments + linear part + virtio header */
134 struct scatterlist sg[MAX_SKB_FRAGS + 2];
136 /* Name of the send queue: output.$index */
139 struct virtnet_sq_stats stats;
141 struct napi_struct napi;
143 /* Record whether sq is in reset state. */
147 /* Internal representation of a receive virtqueue */
148 struct receive_queue {
149 /* Virtqueue associated with this receive_queue */
150 struct virtqueue *vq;
152 struct napi_struct napi;
154 struct bpf_prog __rcu *xdp_prog;
156 struct virtnet_rq_stats stats;
158 /* Chain pages by the private ptr. */
161 /* Average packet length for mergeable receive buffers. */
162 struct ewma_pkt_len mrg_avg_pkt_len;
164 /* Page frag for packet buffer allocation. */
165 struct page_frag alloc_frag;
167 /* RX: fragments + linear part + virtio header */
168 struct scatterlist sg[MAX_SKB_FRAGS + 2];
170 /* Min single buffer size for mergeable buffers case. */
171 unsigned int min_buf_len;
173 /* Name of this receive queue: input.$index */
176 struct xdp_rxq_info xdp_rxq;
179 /* This structure can hold an RSS message with the maximum supported indirection table and key size.
180 * Note that the default structure describing an RSS configuration, virtio_net_rss_config,
181 * carries the same information but cannot hold the table values.
182 * Either way, the structure is passed to the virtio device through sg_buf split into parts,
183 * because the table sizes may differ depending on the device configuration.
185 #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
186 #define VIRTIO_NET_RSS_MAX_TABLE_LEN 128
187 struct virtio_net_ctrl_rss {
189 u16 indirection_table_mask;
190 u16 unclassified_queue;
191 u16 indirection_table[VIRTIO_NET_RSS_MAX_TABLE_LEN];
194 u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
197 /* Control VQ buffers: protected by the rtnl lock */
199 struct virtio_net_ctrl_hdr hdr;
200 virtio_net_ctrl_ack status;
201 struct virtio_net_ctrl_mq mq;
206 struct virtio_net_ctrl_rss rss;
209 struct virtnet_info {
210 struct virtio_device *vdev;
211 struct virtqueue *cvq;
212 struct net_device *dev;
213 struct send_queue *sq;
214 struct receive_queue *rq;
217 /* Max # of queue pairs supported by the device */
220 /* # of queue pairs currently used by the driver */
221 u16 curr_queue_pairs;
223 /* # of XDP queue pairs currently used by the driver */
226 /* xdp_queue_pairs may be 0 even when an XDP program is loaded, so track that separately. */
229 /* I like... big packets and I cannot lie! */
232 /* number of sg entries allocated for big packets */
233 unsigned int big_packets_num_skbfrags;
235 /* Host will merge rx buffers for big packets (shake it! shake it!) */
236 bool mergeable_rx_bufs;
238 /* Host supports rss and/or hash report */
240 bool has_rss_hash_report;
242 u16 rss_indir_table_size;
243 u32 rss_hash_types_supported;
244 u32 rss_hash_types_saved;
246 /* Has control virtqueue */
249 /* Host can handle any s/g split between our header and packet data */
252 /* Packet virtio header size */
255 /* Work struct for delayed refilling if we run low on memory. */
256 struct delayed_work refill;
258 /* Is delayed refill enabled? */
261 /* The lock to synchronize the access to refill_enabled */
262 spinlock_t refill_lock;
264 /* Work struct for config space updates */
265 struct work_struct config_work;
267 /* Is the affinity hint set for the virtqueues? */
268 bool affinity_hint_set;
270 /* CPU hotplug instances for online & dead */
271 struct hlist_node node;
272 struct hlist_node node_dead;
274 struct control_buf *ctrl;
276 /* Ethtool settings */
280 /* Interrupt coalescing settings */
286 unsigned long guest_offloads;
287 unsigned long guest_offloads_capable;
289 /* failover when STANDBY feature enabled */
290 struct failover *failover;
293 struct padded_vnet_hdr {
294 struct virtio_net_hdr_v1_hash hdr;
296 * hdr is in a separate sg buffer, and the data sg buffer shares the same page
297 * with this header sg. This padding makes the next sg 16-byte aligned
303 static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf);
304 static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
306 static bool is_xdp_frame(void *ptr)
308 return (unsigned long)ptr & VIRTIO_XDP_FLAG;
311 static void *xdp_to_ptr(struct xdp_frame *ptr)
313 return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
316 static struct xdp_frame *ptr_to_xdp(void *ptr)
318 return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
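/* Illustrative sketch (hypothetical helper, not used by the driver): a
 * completed tx buffer token is either a struct sk_buff * or a struct
 * xdp_frame * with bit 0 set.  Both objects are at least word aligned, so
 * bit 0 is always free to carry the tag used by the helpers above.
 */
static inline struct xdp_frame *virtnet_xdp_ptr_example(struct xdp_frame *xdpf)
{
	void *token = xdp_to_ptr(xdpf);

	return is_xdp_frame(token) ? ptr_to_xdp(token) : NULL;
}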
321 /* Converting between virtqueue no. and kernel tx/rx queue no.
322 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
324 static int vq2txq(struct virtqueue *vq)
326 return (vq->index - 1) / 2;
329 static int txq2vq(int txq)
334 static int vq2rxq(struct virtqueue *vq)
336 return vq->index / 2;
339 static int rxq2vq(int rxq)
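/* Worked example of the numbering above for a device with two queue pairs
 * (sketch): vq0=rx0, vq1=tx0, vq2=rx1, vq3=tx1 and vq4=cvq (if present), so
 * vq2rxq() maps vq2 -> 1, vq2txq() maps vq3 -> 1, rxq2vq(1) == 2 and
 * txq2vq(1) == 3.
 */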
344 static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
346 return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
350 * private is used to chain pages for big packets; put the whole
351 * most recently used list at the beginning for reuse
353 static void give_pages(struct receive_queue *rq, struct page *page)
357 /* Find end of list, sew whole thing into vi->rq.pages. */
358 for (end = page; end->private; end = (struct page *)end->private);
359 end->private = (unsigned long)rq->pages;
363 static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
365 struct page *p = rq->pages;
368 rq->pages = (struct page *)p->private;
369 /* clear private here, it is used to chain pages */
372 p = alloc_page(gfp_mask);
376 static void enable_delayed_refill(struct virtnet_info *vi)
378 spin_lock_bh(&vi->refill_lock);
379 vi->refill_enabled = true;
380 spin_unlock_bh(&vi->refill_lock);
383 static void disable_delayed_refill(struct virtnet_info *vi)
385 spin_lock_bh(&vi->refill_lock);
386 vi->refill_enabled = false;
387 spin_unlock_bh(&vi->refill_lock);
390 static void virtqueue_napi_schedule(struct napi_struct *napi,
391 struct virtqueue *vq)
393 if (napi_schedule_prep(napi)) {
394 virtqueue_disable_cb(vq);
395 __napi_schedule(napi);
399 static void virtqueue_napi_complete(struct napi_struct *napi,
400 struct virtqueue *vq, int processed)
404 opaque = virtqueue_enable_cb_prepare(vq);
405 if (napi_complete_done(napi, processed)) {
406 if (unlikely(virtqueue_poll(vq, opaque)))
407 virtqueue_napi_schedule(napi, vq);
409 virtqueue_disable_cb(vq);
413 static void skb_xmit_done(struct virtqueue *vq)
415 struct virtnet_info *vi = vq->vdev->priv;
416 struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
418 /* Suppress further interrupts. */
419 virtqueue_disable_cb(vq);
422 virtqueue_napi_schedule(napi, vq);
424 /* We were probably waiting for more output buffers. */
425 netif_wake_subqueue(vi->dev, vq2txq(vq));
428 #define MRG_CTX_HEADER_SHIFT 22
429 static void *mergeable_len_to_ctx(unsigned int truesize,
430 unsigned int headroom)
432 return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
435 static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
437 return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
440 static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
442 return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
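/* Worked example (illustrative values): mergeable_len_to_ctx(1536, 256)
 * stores truesize 1536 in the low 22 bits and headroom 256 in the bits above
 * them, so mergeable_ctx_to_truesize() returns 1536 and
 * mergeable_ctx_to_headroom() returns 256.  The 22-bit split is safe because
 * a single buffer's truesize is always far below 1 << MRG_CTX_HEADER_SHIFT.
 */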
445 /* Called from bottom half context */
446 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
447 struct receive_queue *rq,
448 struct page *page, unsigned int offset,
449 unsigned int len, unsigned int truesize)
452 struct virtio_net_hdr_mrg_rxbuf *hdr;
453 unsigned int copy, hdr_len, hdr_padded_len;
454 struct page *page_to_free = NULL;
455 int tailroom, shinfo_size;
456 char *p, *hdr_p, *buf;
458 p = page_address(page) + offset;
461 hdr_len = vi->hdr_len;
462 if (vi->mergeable_rx_bufs)
463 hdr_padded_len = hdr_len;
465 hdr_padded_len = sizeof(struct padded_vnet_hdr);
469 offset += hdr_padded_len;
471 tailroom = truesize - hdr_padded_len - len;
473 shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
475 /* For a large enough packet with tailroom for shinfo, build the skb around the page so no copy is needed */
476 if (!NET_IP_ALIGN && len > GOOD_COPY_LEN && tailroom >= shinfo_size) {
477 skb = build_skb(buf, truesize);
481 skb_reserve(skb, p - buf);
484 page = (struct page *)page->private;
486 give_pages(rq, page);
490 /* copy small packet so we can reuse these pages for small data */
491 skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
495 /* Copy the whole frame if it fits in skb->head, otherwise
496 * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
498 if (len <= skb_tailroom(skb))
502 skb_put_data(skb, p, copy);
507 if (vi->mergeable_rx_bufs) {
509 skb_add_rx_frag(skb, 0, page, offset, len, truesize);
516 * Verify that we can indeed put this data into a skb.
517 * This is here to handle cases when the device erroneously
518 * tries to receive more than is possible. This is usually
519 * the case of a broken device.
521 if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
522 net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
526 BUG_ON(offset >= PAGE_SIZE);
528 unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
529 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
530 frag_size, truesize);
532 page = (struct page *)page->private;
537 give_pages(rq, page);
540 hdr = skb_vnet_hdr(skb);
541 memcpy(hdr, hdr_p, hdr_len);
543 put_page(page_to_free);
548 static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
549 struct send_queue *sq,
550 struct xdp_frame *xdpf)
552 struct virtio_net_hdr_mrg_rxbuf *hdr;
553 struct skb_shared_info *shinfo;
557 if (unlikely(xdpf->headroom < vi->hdr_len))
560 if (unlikely(xdp_frame_has_frags(xdpf))) {
561 shinfo = xdp_get_shared_info_from_frame(xdpf);
562 nr_frags = shinfo->nr_frags;
565 /* The wrapping function virtnet_xdp_xmit() needs to free up the
566 * pending old buffers, and xdp_get_frame_len() and
567 * xdp_return_frame() compute the position of skb_shared_info from
568 * xdpf->data and xdpf->headroom while doing so. Therefore the
569 * headroom must be updated here, in sync with the data pointer
570 * adjustment below.
572 xdpf->headroom -= vi->hdr_len;
573 xdpf->data -= vi->hdr_len;
574 /* Zero header and leave csum up to XDP layers */
576 memset(hdr, 0, vi->hdr_len);
577 xdpf->len += vi->hdr_len;
579 sg_init_table(sq->sg, nr_frags + 1);
580 sg_set_buf(sq->sg, xdpf->data, xdpf->len);
581 for (i = 0; i < nr_frags; i++) {
582 skb_frag_t *frag = &shinfo->frags[i];
584 sg_set_page(&sq->sg[i + 1], skb_frag_page(frag),
585 skb_frag_size(frag), skb_frag_off(frag));
588 err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
589 xdp_to_ptr(xdpf), GFP_ATOMIC);
591 return -ENOSPC; /* Caller handles free/refcnt */
596 /* When vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for XDP tx on
597 * the current cpu, so it does not need to be locked.
599 * A macro is used here instead of inline functions because we have to deal with
600 * three issues at the same time: 1. the choice of sq, 2. deciding whether to
601 * lock/unlock the txq, 3. keeping sparse happy. It is difficult for a pair of
602 * inline functions to solve all three problems cleanly.
604 #define virtnet_xdp_get_sq(vi) ({ \
605 int cpu = smp_processor_id(); \
606 struct netdev_queue *txq; \
607 typeof(vi) v = (vi); \
610 if (v->curr_queue_pairs > nr_cpu_ids) { \
611 qp = v->curr_queue_pairs - v->xdp_queue_pairs; \
613 txq = netdev_get_tx_queue(v->dev, qp); \
614 __netif_tx_acquire(txq); \
616 qp = cpu % v->curr_queue_pairs; \
617 txq = netdev_get_tx_queue(v->dev, qp); \
618 __netif_tx_lock(txq, cpu); \
623 #define virtnet_xdp_put_sq(vi, q) { \
624 struct netdev_queue *txq; \
625 typeof(vi) v = (vi); \
627 txq = netdev_get_tx_queue(v->dev, (q) - v->sq); \
628 if (v->curr_queue_pairs > nr_cpu_ids) \
629 __netif_tx_release(txq); \
631 __netif_tx_unlock(txq); \
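/* Worked example for the two helpers above (hypothetical configuration): with
 * nr_cpu_ids = 8, xdp_queue_pairs = 8 and curr_queue_pairs = 12 (4 pairs for
 * the stack plus 8 for XDP), every CPU owns its own XDP sq in the range
 * sq[4]..sq[11], so no txq lock is needed; with curr_queue_pairs = 4 on the
 * same machine, CPUs share sq[cpu % 4] and the txq lock serializes the XDP
 * transmitters.
 */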
634 static int virtnet_xdp_xmit(struct net_device *dev,
635 int n, struct xdp_frame **frames, u32 flags)
637 struct virtnet_info *vi = netdev_priv(dev);
638 struct receive_queue *rq = vi->rq;
639 struct bpf_prog *xdp_prog;
640 struct send_queue *sq;
650 /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
651 * indicates that XDP resources have been successfully allocated.
653 xdp_prog = rcu_access_pointer(rq->xdp_prog);
657 sq = virtnet_xdp_get_sq(vi);
659 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
664 /* Free up any pending old buffers before queueing new ones. */
665 while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
666 if (likely(is_xdp_frame(ptr))) {
667 struct xdp_frame *frame = ptr_to_xdp(ptr);
669 bytes += xdp_get_frame_len(frame);
670 xdp_return_frame(frame);
672 struct sk_buff *skb = ptr;
675 napi_consume_skb(skb, false);
680 for (i = 0; i < n; i++) {
681 struct xdp_frame *xdpf = frames[i];
683 if (__virtnet_xdp_xmit_one(vi, sq, xdpf))
689 if (flags & XDP_XMIT_FLUSH) {
690 if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
694 u64_stats_update_begin(&sq->stats.syncp);
695 sq->stats.bytes += bytes;
696 sq->stats.packets += packets;
697 sq->stats.xdp_tx += n;
698 sq->stats.xdp_tx_drops += n - nxmit;
699 sq->stats.kicks += kicks;
700 u64_stats_update_end(&sq->stats.syncp);
702 virtnet_xdp_put_sq(vi, sq);
706 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
708 return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0;
711 /* We copy the packet for XDP in the following cases:
713 * 1) Packet is scattered across multiple rx buffers.
714 * 2) Headroom space is insufficient.
716 * This is inefficient but it is a temporary condition that
717 * we hit right after XDP is enabled and until the queue is refilled
718 * with large buffers with sufficient headroom - so it should affect
719 * at most one queue's worth of packets.
720 * Afterwards, the conditions to enable
721 * XDP should preclude the underlying device from sending packets
722 * across multiple buffers (num_buf > 1), and we make sure buffers
723 * have enough headroom.
725 static struct page *xdp_linearize_page(struct receive_queue *rq,
732 struct page *page = alloc_page(GFP_ATOMIC);
737 memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
741 int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
746 buf = virtqueue_get_buf(rq->vq, &buflen);
750 p = virt_to_head_page(buf);
751 off = buf - page_address(p);
753 /* guard against a misconfigured or uncooperative backend that
754 * is sending packets larger than the MTU.
756 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
761 memcpy(page_address(page) + page_off,
762 page_address(p) + off, buflen);
767 /* Headroom does not contribute to packet length */
768 *len = page_off - VIRTIO_XDP_HEADROOM;
771 __free_pages(page, 0);
775 static struct sk_buff *receive_small(struct net_device *dev,
776 struct virtnet_info *vi,
777 struct receive_queue *rq,
778 void *buf, void *ctx,
780 unsigned int *xdp_xmit,
781 struct virtnet_rq_stats *stats)
784 struct bpf_prog *xdp_prog;
785 unsigned int xdp_headroom = (unsigned long)ctx;
786 unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
787 unsigned int headroom = vi->hdr_len + header_offset;
788 unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
789 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
790 struct page *page = virt_to_head_page(buf);
791 unsigned int delta = 0;
792 struct page *xdp_page;
794 unsigned int metasize = 0;
799 if (unlikely(len > GOOD_PACKET_LEN)) {
800 pr_debug("%s: rx error: len %u exceeds max size %d\n",
801 dev->name, len, GOOD_PACKET_LEN);
802 dev->stats.rx_length_errors++;
806 if (likely(!vi->xdp_enabled)) {
812 xdp_prog = rcu_dereference(rq->xdp_prog);
814 struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
815 struct xdp_frame *xdpf;
820 if (unlikely(hdr->hdr.gso_type))
823 if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
824 int offset = buf - page_address(page) + header_offset;
825 unsigned int tlen = len + vi->hdr_len;
828 xdp_headroom = virtnet_get_headroom(vi);
829 header_offset = VIRTNET_RX_PAD + xdp_headroom;
830 headroom = vi->hdr_len + header_offset;
831 buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
832 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
833 xdp_page = xdp_linearize_page(rq, &num_buf, page,
834 offset, header_offset,
839 buf = page_address(xdp_page);
844 xdp_init_buff(&xdp, buflen, &rq->xdp_rxq);
845 xdp_prepare_buff(&xdp, buf + VIRTNET_RX_PAD + vi->hdr_len,
846 xdp_headroom, len, true);
847 orig_data = xdp.data;
848 act = bpf_prog_run_xdp(xdp_prog, &xdp);
849 stats->xdp_packets++;
853 /* Recalculate length in case bpf program changed it */
854 delta = orig_data - xdp.data;
855 len = xdp.data_end - xdp.data;
856 metasize = xdp.data - xdp.data_meta;
860 xdpf = xdp_convert_buff_to_frame(&xdp);
863 err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
864 if (unlikely(!err)) {
865 xdp_return_frame_rx_napi(xdpf);
866 } else if (unlikely(err < 0)) {
867 trace_xdp_exception(vi->dev, xdp_prog, act);
870 *xdp_xmit |= VIRTIO_XDP_TX;
874 stats->xdp_redirects++;
875 err = xdp_do_redirect(dev, &xdp, xdp_prog);
878 *xdp_xmit |= VIRTIO_XDP_REDIR;
882 bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
885 trace_xdp_exception(vi->dev, xdp_prog, act);
894 skb = build_skb(buf, buflen);
897 skb_reserve(skb, headroom - delta);
900 buf += header_offset;
901 memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
902 } /* keep zeroed vnet hdr since XDP is loaded */
905 skb_metadata_set(skb, metasize);
919 static struct sk_buff *receive_big(struct net_device *dev,
920 struct virtnet_info *vi,
921 struct receive_queue *rq,
924 struct virtnet_rq_stats *stats)
926 struct page *page = buf;
927 struct sk_buff *skb =
928 page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
930 stats->bytes += len - vi->hdr_len;
938 give_pages(rq, page);
942 /* Why not use xdp_build_skb_from_frame() ?
943 * XDP core assumes that xdp frags are PAGE_SIZE in length, while in
944 * virtio-net there are 2 points that do not match its requirements:
945 * 1. The size of the prefilled buffer is not fixed before xdp is set.
946 * 2. xdp_build_skb_from_frame() does more checks that we don't need,
947 * like eth_type_trans() (which virtio-net does in receive_buf()).
949 static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev,
950 struct virtnet_info *vi,
951 struct xdp_buff *xdp,
952 unsigned int xdp_frags_truesz)
954 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
955 unsigned int headroom, data_len;
960 if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
961 pr_debug("Error building skb as missing reserved tailroom for xdp");
965 if (unlikely(xdp_buff_has_frags(xdp)))
966 nr_frags = sinfo->nr_frags;
968 skb = build_skb(xdp->data_hard_start, xdp->frame_sz);
972 headroom = xdp->data - xdp->data_hard_start;
973 data_len = xdp->data_end - xdp->data;
974 skb_reserve(skb, headroom);
975 __skb_put(skb, data_len);
977 metasize = xdp->data - xdp->data_meta;
978 metasize = metasize > 0 ? metasize : 0;
980 skb_metadata_set(skb, metasize);
982 if (unlikely(xdp_buff_has_frags(xdp)))
983 xdp_update_skb_shared_info(skb, nr_frags,
984 sinfo->xdp_frags_size,
986 xdp_buff_is_frag_pfmemalloc(xdp));
991 /* TODO: build xdp in big mode */
992 static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
993 struct virtnet_info *vi,
994 struct receive_queue *rq,
995 struct xdp_buff *xdp,
998 unsigned int frame_sz,
1000 unsigned int *xdp_frags_truesize,
1001 struct virtnet_rq_stats *stats)
1003 struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
1004 unsigned int headroom, tailroom, room;
1005 unsigned int truesize, cur_frag_size;
1006 struct skb_shared_info *shinfo;
1007 unsigned int xdp_frags_truesz = 0;
1013 xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
1014 xdp_prepare_buff(xdp, buf - VIRTIO_XDP_HEADROOM,
1015 VIRTIO_XDP_HEADROOM + vi->hdr_len, len - vi->hdr_len, true);
1021 /* To build a multi-buffer xdp_buff, the
1022 * XDP_FLAGS_HAS_FRAGS bit must be set in
1023 * the xdp_buff flags.
1025 if (!xdp_buff_has_frags(xdp))
1026 xdp_buff_set_frags_flag(xdp);
1028 shinfo = xdp_get_shared_info_from_buff(xdp);
1029 shinfo->nr_frags = 0;
1030 shinfo->xdp_frags_size = 0;
1033 if (*num_buf > MAX_SKB_FRAGS + 1)
1036 while (--*num_buf > 0) {
1037 buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1038 if (unlikely(!buf)) {
1039 pr_debug("%s: rx error: %d buffers out of %d missing\n",
1040 dev->name, *num_buf,
1041 virtio16_to_cpu(vi->vdev, hdr->num_buffers));
1042 dev->stats.rx_length_errors++;
1046 stats->bytes += len;
1047 page = virt_to_head_page(buf);
1048 offset = buf - page_address(page);
1050 truesize = mergeable_ctx_to_truesize(ctx);
1051 headroom = mergeable_ctx_to_headroom(ctx);
1052 tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1053 room = SKB_DATA_ALIGN(headroom + tailroom);
1055 cur_frag_size = truesize;
1056 xdp_frags_truesz += cur_frag_size;
1057 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
1059 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1060 dev->name, len, (unsigned long)(truesize - room));
1061 dev->stats.rx_length_errors++;
1065 frag = &shinfo->frags[shinfo->nr_frags++];
1066 __skb_frag_set_page(frag, page);
1067 skb_frag_off_set(frag, offset);
1068 skb_frag_size_set(frag, len);
1069 if (page_is_pfmemalloc(page))
1070 xdp_buff_set_frag_pfmemalloc(xdp);
1072 shinfo->xdp_frags_size += len;
1075 *xdp_frags_truesize = xdp_frags_truesz;
1079 static struct sk_buff *receive_mergeable(struct net_device *dev,
1080 struct virtnet_info *vi,
1081 struct receive_queue *rq,
1085 unsigned int *xdp_xmit,
1086 struct virtnet_rq_stats *stats)
1088 struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
1089 int num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
1090 struct page *page = virt_to_head_page(buf);
1091 int offset = buf - page_address(page);
1092 struct sk_buff *head_skb, *curr_skb;
1093 struct bpf_prog *xdp_prog;
1094 unsigned int truesize = mergeable_ctx_to_truesize(ctx);
1095 unsigned int headroom = mergeable_ctx_to_headroom(ctx);
1096 unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1097 unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1098 unsigned int frame_sz, xdp_room;
1102 stats->bytes += len - vi->hdr_len;
1104 if (unlikely(len > truesize - room)) {
1105 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1106 dev->name, len, (unsigned long)(truesize - room));
1107 dev->stats.rx_length_errors++;
1111 if (likely(!vi->xdp_enabled)) {
1117 xdp_prog = rcu_dereference(rq->xdp_prog);
1119 unsigned int xdp_frags_truesz = 0;
1120 struct skb_shared_info *shinfo;
1121 struct xdp_frame *xdpf;
1122 struct page *xdp_page;
1123 struct xdp_buff xdp;
1128 /* Transient failure which in theory could occur if
1129 * in-flight packets from before XDP was enabled reach
1130 * the receive path after XDP is loaded.
1132 if (unlikely(hdr->hdr.gso_type))
1135 /* The XDP core assumes the frag size is PAGE_SIZE, but buffers
1136 * with headroom may add a hole to the truesize, which
1137 * makes their length exceed PAGE_SIZE. So the
1138 * hole mechanism is disabled for XDP. See add_recvbuf_mergeable().
1140 frame_sz = truesize;
1142 /* This happens when the headroom is not enough because
1143 * the buffer was prefilled before XDP was set.
1144 * This should only happen for the first several packets.
1145 * In fact, vq reset can be used here to help us clean up
1146 * the prefilled buffers, but many existing devices do not
1147 * support it, and we don't want to bother users who are
1148 * using xdp normally.
1150 if (!xdp_prog->aux->xdp_has_frags &&
1151 (num_buf > 1 || headroom < virtnet_get_headroom(vi))) {
1152 /* linearize data for XDP */
1153 xdp_page = xdp_linearize_page(rq, &num_buf,
1155 VIRTIO_XDP_HEADROOM,
1157 frame_sz = PAGE_SIZE;
1161 offset = VIRTIO_XDP_HEADROOM;
1162 } else if (unlikely(headroom < virtnet_get_headroom(vi))) {
1163 xdp_room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM +
1164 sizeof(struct skb_shared_info));
1165 if (len + xdp_room > PAGE_SIZE)
1168 xdp_page = alloc_page(GFP_ATOMIC);
1172 memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
1173 page_address(page) + offset, len);
1174 frame_sz = PAGE_SIZE;
1175 offset = VIRTIO_XDP_HEADROOM;
1180 data = page_address(xdp_page) + offset;
1181 err = virtnet_build_xdp_buff_mrg(dev, vi, rq, &xdp, data, len, frame_sz,
1182 &num_buf, &xdp_frags_truesz, stats);
1186 act = bpf_prog_run_xdp(xdp_prog, &xdp);
1187 stats->xdp_packets++;
1191 if (unlikely(xdp_page != page))
1193 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
1198 xdpf = xdp_convert_buff_to_frame(&xdp);
1199 if (unlikely(!xdpf)) {
1200 netdev_dbg(dev, "convert buff to frame failed for xdp\n");
1203 err = virtnet_xdp_xmit(dev, 1, &xdpf, 0);
1204 if (unlikely(!err)) {
1205 xdp_return_frame_rx_napi(xdpf);
1206 } else if (unlikely(err < 0)) {
1207 trace_xdp_exception(vi->dev, xdp_prog, act);
1210 *xdp_xmit |= VIRTIO_XDP_TX;
1211 if (unlikely(xdp_page != page))
1216 stats->xdp_redirects++;
1217 err = xdp_do_redirect(dev, &xdp, xdp_prog);
1220 *xdp_xmit |= VIRTIO_XDP_REDIR;
1221 if (unlikely(xdp_page != page))
1226 bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act);
1229 trace_xdp_exception(vi->dev, xdp_prog, act);
1235 if (unlikely(xdp_page != page))
1236 __free_pages(xdp_page, 0);
1238 if (xdp_buff_has_frags(&xdp)) {
1239 shinfo = xdp_get_shared_info_from_buff(&xdp);
1240 for (i = 0; i < shinfo->nr_frags; i++) {
1241 xdp_page = skb_frag_page(&shinfo->frags[i]);
1251 head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
1252 curr_skb = head_skb;
1254 if (unlikely(!curr_skb))
1259 buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
1260 if (unlikely(!buf)) {
1261 pr_debug("%s: rx error: %d buffers out of %d missing\n",
1263 virtio16_to_cpu(vi->vdev,
1265 dev->stats.rx_length_errors++;
1269 stats->bytes += len;
1270 page = virt_to_head_page(buf);
1272 truesize = mergeable_ctx_to_truesize(ctx);
1273 headroom = mergeable_ctx_to_headroom(ctx);
1274 tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1275 room = SKB_DATA_ALIGN(headroom + tailroom);
1276 if (unlikely(len > truesize - room)) {
1277 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
1278 dev->name, len, (unsigned long)(truesize - room));
1279 dev->stats.rx_length_errors++;
1283 num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
1284 if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
1285 struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
1287 if (unlikely(!nskb))
1289 if (curr_skb == head_skb)
1290 skb_shinfo(curr_skb)->frag_list = nskb;
1292 curr_skb->next = nskb;
1294 head_skb->truesize += nskb->truesize;
1297 if (curr_skb != head_skb) {
1298 head_skb->data_len += len;
1299 head_skb->len += len;
1300 head_skb->truesize += truesize;
1302 offset = buf - page_address(page);
1303 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
1305 skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
1308 skb_add_rx_frag(curr_skb, num_skb_frags, page,
1309 offset, len, truesize);
1313 ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
1321 while (num_buf-- > 1) {
1322 buf = virtqueue_get_buf(rq->vq, &len);
1323 if (unlikely(!buf)) {
1324 pr_debug("%s: rx error: %d buffers missing\n",
1325 dev->name, num_buf);
1326 dev->stats.rx_length_errors++;
1329 stats->bytes += len;
1330 page = virt_to_head_page(buf);
1335 dev_kfree_skb(head_skb);
1340 static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash,
1341 struct sk_buff *skb)
1343 enum pkt_hash_types rss_hash_type;
1345 if (!hdr_hash || !skb)
1348 switch (__le16_to_cpu(hdr_hash->hash_report)) {
1349 case VIRTIO_NET_HASH_REPORT_TCPv4:
1350 case VIRTIO_NET_HASH_REPORT_UDPv4:
1351 case VIRTIO_NET_HASH_REPORT_TCPv6:
1352 case VIRTIO_NET_HASH_REPORT_UDPv6:
1353 case VIRTIO_NET_HASH_REPORT_TCPv6_EX:
1354 case VIRTIO_NET_HASH_REPORT_UDPv6_EX:
1355 rss_hash_type = PKT_HASH_TYPE_L4;
1357 case VIRTIO_NET_HASH_REPORT_IPv4:
1358 case VIRTIO_NET_HASH_REPORT_IPv6:
1359 case VIRTIO_NET_HASH_REPORT_IPv6_EX:
1360 rss_hash_type = PKT_HASH_TYPE_L3;
1362 case VIRTIO_NET_HASH_REPORT_NONE:
1364 rss_hash_type = PKT_HASH_TYPE_NONE;
1366 skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type);
1369 static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
1370 void *buf, unsigned int len, void **ctx,
1371 unsigned int *xdp_xmit,
1372 struct virtnet_rq_stats *stats)
1374 struct net_device *dev = vi->dev;
1375 struct sk_buff *skb;
1376 struct virtio_net_hdr_mrg_rxbuf *hdr;
1378 if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
1379 pr_debug("%s: short packet %i\n", dev->name, len);
1380 dev->stats.rx_length_errors++;
1381 virtnet_rq_free_unused_buf(rq->vq, buf);
1385 if (vi->mergeable_rx_bufs)
1386 skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
1388 else if (vi->big_packets)
1389 skb = receive_big(dev, vi, rq, buf, len, stats);
1391 skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
1396 hdr = skb_vnet_hdr(skb);
1397 if (dev->features & NETIF_F_RXHASH && vi->has_rss_hash_report)
1398 virtio_skb_set_hash((const struct virtio_net_hdr_v1_hash *)hdr, skb);
1400 if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
1401 skb->ip_summed = CHECKSUM_UNNECESSARY;
1403 if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
1404 virtio_is_little_endian(vi->vdev))) {
1405 net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
1406 dev->name, hdr->hdr.gso_type,
1411 skb_record_rx_queue(skb, vq2rxq(rq->vq));
1412 skb->protocol = eth_type_trans(skb, dev);
1413 pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
1414 ntohs(skb->protocol), skb->len, skb->pkt_type);
1416 napi_gro_receive(&rq->napi, skb);
1420 dev->stats.rx_frame_errors++;
1424 /* Unlike mergeable buffers, all buffers are allocated to the
1425 * same size, except for the headroom. For this reason we do
1426 * not need to use mergeable_len_to_ctx here - it is enough
1427 * to store the headroom as the context ignoring the truesize.
1429 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
1432 struct page_frag *alloc_frag = &rq->alloc_frag;
1434 unsigned int xdp_headroom = virtnet_get_headroom(vi);
1435 void *ctx = (void *)(unsigned long)xdp_headroom;
1436 int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
1439 len = SKB_DATA_ALIGN(len) +
1440 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1441 if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
1444 buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1445 get_page(alloc_frag->page);
1446 alloc_frag->offset += len;
1447 sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
1448 vi->hdr_len + GOOD_PACKET_LEN);
1449 err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1451 put_page(virt_to_head_page(buf));
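/* Layout of one small receive buffer as carved above (sketch, default sizes):
 *
 *	buf
 *	| VIRTNET_RX_PAD | xdp_headroom | vnet hdr | GOOD_PACKET_LEN |
 *	                                ^--- rq->sg[0] covers hdr + data
 *
 * plus SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) of tailroom, so
 * receive_small() can later hand the fragment straight to build_skb().
 */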
1455 static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
1458 struct page *first, *list = NULL;
1462 sg_init_table(rq->sg, vi->big_packets_num_skbfrags + 2);
1464 /* page in rq->sg[vi->big_packets_num_skbfrags + 1] is list tail */
1465 for (i = vi->big_packets_num_skbfrags + 1; i > 1; --i) {
1466 first = get_a_page(rq, gfp);
1469 give_pages(rq, list);
1472 sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
1474 /* chain new page in list head to match sg */
1475 first->private = (unsigned long)list;
1479 first = get_a_page(rq, gfp);
1481 give_pages(rq, list);
1484 p = page_address(first);
1486 /* rq->sg[0], rq->sg[1] share the same page */
1487 /* a separate rq->sg[0] for the header - required in case !any_header_sg */
1488 sg_set_buf(&rq->sg[0], p, vi->hdr_len);
1490 /* rq->sg[1] for data packet, from offset */
1491 offset = sizeof(struct padded_vnet_hdr);
1492 sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
1494 /* chain first in list head */
1495 first->private = (unsigned long)list;
1496 err = virtqueue_add_inbuf(rq->vq, rq->sg, vi->big_packets_num_skbfrags + 2,
1499 give_pages(rq, first);
1504 static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
1505 struct ewma_pkt_len *avg_pkt_len,
1508 struct virtnet_info *vi = rq->vq->vdev->priv;
1509 const size_t hdr_len = vi->hdr_len;
1513 return PAGE_SIZE - room;
1515 len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
1516 rq->min_buf_len, PAGE_SIZE - hdr_len);
1518 return ALIGN(len, L1_CACHE_BYTES);
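/* Worked example (illustrative numbers): with hdr_len = 12, min_buf_len = 128
 * and an EWMA average of 1460 bytes, this returns
 * ALIGN(12 + clamp(1460, 128, PAGE_SIZE - 12), L1_CACHE_BYTES), i.e. 1472
 * with 64-byte cache lines.  With XDP headroom in play (room != 0) it simply
 * returns PAGE_SIZE - room, so buffer plus headroom still fit in one page.
 */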
1521 static int add_recvbuf_mergeable(struct virtnet_info *vi,
1522 struct receive_queue *rq, gfp_t gfp)
1524 struct page_frag *alloc_frag = &rq->alloc_frag;
1525 unsigned int headroom = virtnet_get_headroom(vi);
1526 unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
1527 unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
1531 unsigned int len, hole;
1533 /* Extra tailroom is needed to satisfy XDP's assumption. This
1534 * means rx frag coalescing won't work, but considering we've
1535 * disabled GSO for XDP, it won't be a big issue.
1537 len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
1538 if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
1541 buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1542 buf += headroom; /* advance address leaving hole at front of pkt */
1543 get_page(alloc_frag->page);
1544 alloc_frag->offset += len + room;
1545 hole = alloc_frag->size - alloc_frag->offset;
1546 if (hole < len + room) {
1547 /* To avoid internal fragmentation, if there is very likely not
1548 * enough space for another buffer, add the remaining space to
1549 * the current buffer.
1550 * XDP core assumes that frame_size of xdp_buff and the length
1551 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
1555 alloc_frag->offset += hole;
1558 sg_init_one(rq->sg, buf, len);
1559 ctx = mergeable_len_to_ctx(len + room, headroom);
1560 err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
1562 put_page(virt_to_head_page(buf));
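/* Sketch of how the page frag is carved up above (XDP off, so headroom = 0):
 *
 *	alloc_frag->page: | buf #1 (len) | buf #2 (len) | ... | hole |
 *
 * When the trailing hole can no longer hold another len + room bytes it is
 * folded into the last buffer instead of being wasted; with XDP enabled the
 * hole is skipped rather than merged, so the frag length stays compatible
 * with the XDP core's PAGE_SIZE assumption.
 */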
1568 * Returns false if we couldn't fill entirely (OOM).
1570 * Normally run in the receive path, but can also be run from ndo_open
1571 * before we're receiving packets, or from refill_work which is
1572 * careful to disable receiving (using napi_disable).
1574 static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
1581 if (vi->mergeable_rx_bufs)
1582 err = add_recvbuf_mergeable(vi, rq, gfp);
1583 else if (vi->big_packets)
1584 err = add_recvbuf_big(vi, rq, gfp);
1586 err = add_recvbuf_small(vi, rq, gfp);
1588 oom = err == -ENOMEM;
1591 } while (rq->vq->num_free);
1592 if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) {
1593 unsigned long flags;
1595 flags = u64_stats_update_begin_irqsave(&rq->stats.syncp);
1597 u64_stats_update_end_irqrestore(&rq->stats.syncp, flags);
1603 static void skb_recv_done(struct virtqueue *rvq)
1605 struct virtnet_info *vi = rvq->vdev->priv;
1606 struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1608 virtqueue_napi_schedule(&rq->napi, rvq);
1611 static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1615 /* If all buffers were filled by the other side before we enabled napi, we
1616 * won't get another interrupt, so process any outstanding packets now.
1617 * Call local_bh_enable afterwards to trigger softIRQ processing.
1620 virtqueue_napi_schedule(napi, vq);
1624 static void virtnet_napi_tx_enable(struct virtnet_info *vi,
1625 struct virtqueue *vq,
1626 struct napi_struct *napi)
1631 /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
1632 * enable the feature if this is likely affine with the transmit path.
1634 if (!vi->affinity_hint_set) {
1639 return virtnet_napi_enable(vq, napi);
1642 static void virtnet_napi_tx_disable(struct napi_struct *napi)
1648 static void refill_work(struct work_struct *work)
1650 struct virtnet_info *vi =
1651 container_of(work, struct virtnet_info, refill.work);
1655 for (i = 0; i < vi->curr_queue_pairs; i++) {
1656 struct receive_queue *rq = &vi->rq[i];
1658 napi_disable(&rq->napi);
1659 still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1660 virtnet_napi_enable(rq->vq, &rq->napi);
1662 /* In theory, this can happen: if we don't get any buffers in
1663 * we will *never* try to fill again.
1666 schedule_delayed_work(&vi->refill, HZ/2);
1670 static int virtnet_receive(struct receive_queue *rq, int budget,
1671 unsigned int *xdp_xmit)
1673 struct virtnet_info *vi = rq->vq->vdev->priv;
1674 struct virtnet_rq_stats stats = {};
1679 if (!vi->big_packets || vi->mergeable_rx_bufs) {
1682 while (stats.packets < budget &&
1683 (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1684 receive_buf(vi, rq, buf, len, ctx, xdp_xmit, &stats);
1688 while (stats.packets < budget &&
1689 (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1690 receive_buf(vi, rq, buf, len, NULL, xdp_xmit, &stats);
1695 if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
1696 if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
1697 spin_lock(&vi->refill_lock);
1698 if (vi->refill_enabled)
1699 schedule_delayed_work(&vi->refill, 0);
1700 spin_unlock(&vi->refill_lock);
1704 u64_stats_update_begin(&rq->stats.syncp);
1705 for (i = 0; i < VIRTNET_RQ_STATS_LEN; i++) {
1706 size_t offset = virtnet_rq_stats_desc[i].offset;
1709 item = (u64 *)((u8 *)&rq->stats + offset);
1710 *item += *(u64 *)((u8 *)&stats + offset);
1712 u64_stats_update_end(&rq->stats.syncp);
1714 return stats.packets;
1717 static void free_old_xmit_skbs(struct send_queue *sq, bool in_napi)
1720 unsigned int packets = 0;
1721 unsigned int bytes = 0;
1724 while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
1725 if (likely(!is_xdp_frame(ptr))) {
1726 struct sk_buff *skb = ptr;
1728 pr_debug("Sent skb %p\n", skb);
1731 napi_consume_skb(skb, in_napi);
1733 struct xdp_frame *frame = ptr_to_xdp(ptr);
1735 bytes += xdp_get_frame_len(frame);
1736 xdp_return_frame(frame);
1741 /* Avoid the overhead when no packets have been processed; this
1742 * happens when called speculatively from start_xmit.
1747 u64_stats_update_begin(&sq->stats.syncp);
1748 sq->stats.bytes += bytes;
1749 sq->stats.packets += packets;
1750 u64_stats_update_end(&sq->stats.syncp);
1753 static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
1755 if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
1757 else if (q < vi->curr_queue_pairs)
1763 static void virtnet_poll_cleantx(struct receive_queue *rq)
1765 struct virtnet_info *vi = rq->vq->vdev->priv;
1766 unsigned int index = vq2rxq(rq->vq);
1767 struct send_queue *sq = &vi->sq[index];
1768 struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
1770 if (!sq->napi.weight || is_xdp_raw_buffer_queue(vi, index))
1773 if (__netif_tx_trylock(txq)) {
1775 __netif_tx_unlock(txq);
1780 virtqueue_disable_cb(sq->vq);
1781 free_old_xmit_skbs(sq, true);
1782 } while (unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1784 if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1785 netif_tx_wake_queue(txq);
1787 __netif_tx_unlock(txq);
1791 static int virtnet_poll(struct napi_struct *napi, int budget)
1793 struct receive_queue *rq =
1794 container_of(napi, struct receive_queue, napi);
1795 struct virtnet_info *vi = rq->vq->vdev->priv;
1796 struct send_queue *sq;
1797 unsigned int received;
1798 unsigned int xdp_xmit = 0;
1800 virtnet_poll_cleantx(rq);
1802 received = virtnet_receive(rq, budget, &xdp_xmit);
1804 if (xdp_xmit & VIRTIO_XDP_REDIR)
1807 /* Out of packets? */
1808 if (received < budget)
1809 virtqueue_napi_complete(napi, rq->vq, received);
1811 if (xdp_xmit & VIRTIO_XDP_TX) {
1812 sq = virtnet_xdp_get_sq(vi);
1813 if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
1814 u64_stats_update_begin(&sq->stats.syncp);
1816 u64_stats_update_end(&sq->stats.syncp);
1818 virtnet_xdp_put_sq(vi, sq);
1824 static int virtnet_open(struct net_device *dev)
1826 struct virtnet_info *vi = netdev_priv(dev);
1829 enable_delayed_refill(vi);
1831 for (i = 0; i < vi->max_queue_pairs; i++) {
1832 if (i < vi->curr_queue_pairs)
1833 /* Make sure we have some buffers: if oom use wq. */
1834 if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1835 schedule_delayed_work(&vi->refill, 0);
1837 err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i, vi->rq[i].napi.napi_id);
1841 err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
1842 MEM_TYPE_PAGE_SHARED, NULL);
1844 xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
1848 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1849 virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
1855 static int virtnet_poll_tx(struct napi_struct *napi, int budget)
1857 struct send_queue *sq = container_of(napi, struct send_queue, napi);
1858 struct virtnet_info *vi = sq->vq->vdev->priv;
1859 unsigned int index = vq2txq(sq->vq);
1860 struct netdev_queue *txq;
1864 if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
1865 /* We don't need to enable cb for XDP */
1866 napi_complete_done(napi, 0);
1870 txq = netdev_get_tx_queue(vi->dev, index);
1871 __netif_tx_lock(txq, raw_smp_processor_id());
1872 virtqueue_disable_cb(sq->vq);
1873 free_old_xmit_skbs(sq, true);
1875 if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1876 netif_tx_wake_queue(txq);
1878 opaque = virtqueue_enable_cb_prepare(sq->vq);
1880 done = napi_complete_done(napi, 0);
1883 virtqueue_disable_cb(sq->vq);
1885 __netif_tx_unlock(txq);
1888 if (unlikely(virtqueue_poll(sq->vq, opaque))) {
1889 if (napi_schedule_prep(napi)) {
1890 __netif_tx_lock(txq, raw_smp_processor_id());
1891 virtqueue_disable_cb(sq->vq);
1892 __netif_tx_unlock(txq);
1893 __napi_schedule(napi);
1901 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
1903 struct virtio_net_hdr_mrg_rxbuf *hdr;
1904 const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1905 struct virtnet_info *vi = sq->vq->vdev->priv;
1907 unsigned hdr_len = vi->hdr_len;
1910 pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1912 can_push = vi->any_header_sg &&
1913 !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
1914 !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
1915 /* Even if we can, don't push here yet as this would skew
1916 * csum_start offset below. */
1918 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1920 hdr = skb_vnet_hdr(skb);
1922 if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1923 virtio_is_little_endian(vi->vdev), false,
1927 if (vi->mergeable_rx_bufs)
1928 hdr->num_buffers = 0;
1930 sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1932 __skb_push(skb, hdr_len);
1933 num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1934 if (unlikely(num_sg < 0))
1936 /* Pull header back to avoid skew in tx bytes calculations. */
1937 __skb_pull(skb, hdr_len);
1939 sg_set_buf(sq->sg, hdr, hdr_len);
1940 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
1941 if (unlikely(num_sg < 0))
1945 return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
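/* Scatterlist layout produced by xmit_skb() above (sketch):
 *
 *	can_push:  the virtio header is pushed in front of skb->data, so
 *	           sg[0] holds hdr + linear data followed by the page frags;
 *	!can_push: sg[0] holds the header kept in skb->cb and sg[1..] hold the
 *	           linear data and frags, one extra entry overall - hence the
 *	           "fragments + linear part + virtio header" sizing of
 *	           send_queue::sg.
 */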
1948 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1950 struct virtnet_info *vi = netdev_priv(dev);
1951 int qnum = skb_get_queue_mapping(skb);
1952 struct send_queue *sq = &vi->sq[qnum];
1954 struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1955 bool kick = !netdev_xmit_more();
1956 bool use_napi = sq->napi.weight;
1958 /* Free up any pending old buffers before queueing new ones. */
1961 virtqueue_disable_cb(sq->vq);
1963 free_old_xmit_skbs(sq, false);
1965 } while (use_napi && kick &&
1966 unlikely(!virtqueue_enable_cb_delayed(sq->vq)));
1968 /* timestamp packet in software */
1969 skb_tx_timestamp(skb);
1971 /* Try to transmit */
1972 err = xmit_skb(sq, skb);
1974 /* This should not happen! */
1975 if (unlikely(err)) {
1976 dev->stats.tx_fifo_errors++;
1977 if (net_ratelimit())
1979 "Unexpected TXQ (%d) queue failure: %d\n",
1981 dev->stats.tx_dropped++;
1982 dev_kfree_skb_any(skb);
1983 return NETDEV_TX_OK;
1986 /* Don't wait up for transmitted skbs to be freed. */
1992 /* If running out of space, stop queue to avoid getting packets that we
1993 * are then unable to transmit.
1994 * An alternative would be to force queuing layer to requeue the skb by
1995 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
1996 * returned in a normal path of operation: it means that driver is not
1997 * maintaining the TX queue stop/start state properly, and causes
1998 * the stack to do a non-trivial amount of useless work.
1999 * Since most packets only take 1 or 2 ring slots, stopping the queue
2000 * early means 16 slots are typically wasted.
2002 if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
2003 netif_stop_subqueue(dev, qnum);
2005 if (unlikely(!virtqueue_enable_cb_delayed(sq->vq)))
2006 virtqueue_napi_schedule(&sq->napi, sq->vq);
2007 } else if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
2008 /* More just got used, free them then recheck. */
2009 free_old_xmit_skbs(sq, false);
2010 if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
2011 netif_start_subqueue(dev, qnum);
2012 virtqueue_disable_cb(sq->vq);
2017 if (kick || netif_xmit_stopped(txq)) {
2018 if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) {
2019 u64_stats_update_begin(&sq->stats.syncp);
2021 u64_stats_update_end(&sq->stats.syncp);
2025 return NETDEV_TX_OK;
2028 static int virtnet_rx_resize(struct virtnet_info *vi,
2029 struct receive_queue *rq, u32 ring_num)
2031 bool running = netif_running(vi->dev);
2034 qindex = rq - vi->rq;
2037 napi_disable(&rq->napi);
2039 err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_free_unused_buf);
2041 netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
2043 if (!try_fill_recv(vi, rq, GFP_KERNEL))
2044 schedule_delayed_work(&vi->refill, 0);
2047 virtnet_napi_enable(rq->vq, &rq->napi);
2051 static int virtnet_tx_resize(struct virtnet_info *vi,
2052 struct send_queue *sq, u32 ring_num)
2054 bool running = netif_running(vi->dev);
2055 struct netdev_queue *txq;
2058 qindex = sq - vi->sq;
2061 virtnet_napi_tx_disable(&sq->napi);
2063 txq = netdev_get_tx_queue(vi->dev, qindex);
2065 /* 1. wait for all in-flight xmit to complete
2066 * 2. fix the race of netif_stop_subqueue() vs netif_start_subqueue()
2068 __netif_tx_lock_bh(txq);
2070 /* Prevent rx poll from accessing sq. */
2073 /* Prevent the upper layer from trying to send packets. */
2074 netif_stop_subqueue(vi->dev, qindex);
2076 __netif_tx_unlock_bh(txq);
2078 err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
2080 netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
2082 __netif_tx_lock_bh(txq);
2084 netif_tx_wake_queue(txq);
2085 __netif_tx_unlock_bh(txq);
2088 virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
2093 * Send command via the control virtqueue and check status. Commands
2094 * supported by the hypervisor, as indicated by feature bits, should
2095 * never fail unless improperly formatted.
2097 static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
2098 struct scatterlist *out)
2100 struct scatterlist *sgs[4], hdr, stat;
2101 unsigned out_num = 0, tmp;
2104 /* Caller should know better */
2105 BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
2107 vi->ctrl->status = ~0;
2108 vi->ctrl->hdr.class = class;
2109 vi->ctrl->hdr.cmd = cmd;
2111 sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
2112 sgs[out_num++] = &hdr;
2115 sgs[out_num++] = out;
2117 /* Add return status. */
2118 sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
2119 sgs[out_num] = &stat;
2121 BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
2122 ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
2124 dev_warn(&vi->vdev->dev,
2125 "Failed to add sgs for command vq: %d\n.", ret);
2129 if (unlikely(!virtqueue_kick(vi->cvq)))
2130 return vi->ctrl->status == VIRTIO_NET_OK;
2132 /* Spin for a response, the kick causes an ioport write, trapping
2133 * into the hypervisor, so the request should be handled immediately.
2135 while (!virtqueue_get_buf(vi->cvq, &tmp) &&
2136 !virtqueue_is_broken(vi->cvq))
2139 return vi->ctrl->status == VIRTIO_NET_OK;
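/* Shape of a control request as assembled above (sketch):
 *
 *	sgs[0]    -> vi->ctrl->hdr   { class, cmd }      (driver -> device)
 *	sgs[1]    -> command-specific payload, if any    (driver -> device)
 *	sgs[last] -> vi->ctrl->status, one byte          (device -> driver)
 *
 * e.g. _virtnet_set_queues() below passes VIRTIO_NET_CTRL_MQ /
 * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET with a struct virtio_net_ctrl_mq payload;
 * the call only succeeds if the device writes back VIRTIO_NET_OK.
 */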
2142 static int virtnet_set_mac_address(struct net_device *dev, void *p)
2144 struct virtnet_info *vi = netdev_priv(dev);
2145 struct virtio_device *vdev = vi->vdev;
2147 struct sockaddr *addr;
2148 struct scatterlist sg;
2150 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
2153 addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
2157 ret = eth_prepare_mac_addr_change(dev, addr);
2161 if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
2162 sg_init_one(&sg, addr->sa_data, dev->addr_len);
2163 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
2164 VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
2165 dev_warn(&vdev->dev,
2166 "Failed to set mac address by vq command.\n");
2170 } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
2171 !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
2174 /* Naturally, this has an atomicity problem. */
2175 for (i = 0; i < dev->addr_len; i++)
2176 virtio_cwrite8(vdev,
2177 offsetof(struct virtio_net_config, mac) +
2178 i, addr->sa_data[i]);
2181 eth_commit_mac_addr_change(dev, p);
2189 static void virtnet_stats(struct net_device *dev,
2190 struct rtnl_link_stats64 *tot)
2192 struct virtnet_info *vi = netdev_priv(dev);
2196 for (i = 0; i < vi->max_queue_pairs; i++) {
2197 u64 tpackets, tbytes, terrors, rpackets, rbytes, rdrops;
2198 struct receive_queue *rq = &vi->rq[i];
2199 struct send_queue *sq = &vi->sq[i];
2202 start = u64_stats_fetch_begin(&sq->stats.syncp);
2203 tpackets = sq->stats.packets;
2204 tbytes = sq->stats.bytes;
2205 terrors = sq->stats.tx_timeouts;
2206 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
2209 start = u64_stats_fetch_begin(&rq->stats.syncp);
2210 rpackets = rq->stats.packets;
2211 rbytes = rq->stats.bytes;
2212 rdrops = rq->stats.drops;
2213 } while (u64_stats_fetch_retry(&rq->stats.syncp, start));
2215 tot->rx_packets += rpackets;
2216 tot->tx_packets += tpackets;
2217 tot->rx_bytes += rbytes;
2218 tot->tx_bytes += tbytes;
2219 tot->rx_dropped += rdrops;
2220 tot->tx_errors += terrors;
2223 tot->tx_dropped = dev->stats.tx_dropped;
2224 tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
2225 tot->rx_length_errors = dev->stats.rx_length_errors;
2226 tot->rx_frame_errors = dev->stats.rx_frame_errors;
2229 static void virtnet_ack_link_announce(struct virtnet_info *vi)
2232 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
2233 VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
2234 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
2238 static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
2240 struct scatterlist sg;
2241 struct net_device *dev = vi->dev;
2243 if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
2246 vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
2247 sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));
2249 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
2250 VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
2251 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
2255 vi->curr_queue_pairs = queue_pairs;
2256 /* virtnet_open() will refill when the device goes up. */
2257 if (dev->flags & IFF_UP)
2258 schedule_delayed_work(&vi->refill, 0);
2264 static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
2269 err = _virtnet_set_queues(vi, queue_pairs);
2274 static int virtnet_close(struct net_device *dev)
2276 struct virtnet_info *vi = netdev_priv(dev);
2279 /* Make sure NAPI doesn't schedule refill work */
2280 disable_delayed_refill(vi);
2281 /* Make sure refill_work doesn't re-enable napi! */
2282 cancel_delayed_work_sync(&vi->refill);
2284 for (i = 0; i < vi->max_queue_pairs; i++) {
2285 virtnet_napi_tx_disable(&vi->sq[i].napi);
2286 napi_disable(&vi->rq[i].napi);
2287 xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
2293 static void virtnet_set_rx_mode(struct net_device *dev)
2295 struct virtnet_info *vi = netdev_priv(dev);
2296 struct scatterlist sg[2];
2297 struct virtio_net_ctrl_mac *mac_data;
2298 struct netdev_hw_addr *ha;
2304 /* We can't dynamically set ndo_set_rx_mode, so return gracefully */
2305 if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
2308 vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
2309 vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
2311 sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));
2313 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
2314 VIRTIO_NET_CTRL_RX_PROMISC, sg))
2315 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
2316 vi->ctrl->promisc ? "en" : "dis");
2318 sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));
2320 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
2321 VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
2322 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
2323 vi->ctrl->allmulti ? "en" : "dis");
2325 uc_count = netdev_uc_count(dev);
2326 mc_count = netdev_mc_count(dev);
2327 /* MAC filter - use one buffer for both lists */
2328 buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
2329 (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
2334 sg_init_table(sg, 2);
2336 /* Store the unicast list and count in the front of the buffer */
2337 mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
2339 netdev_for_each_uc_addr(ha, dev)
2340 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2342 sg_set_buf(&sg[0], mac_data,
2343 sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
2345 /* multicast list and count fill the end */
2346 mac_data = (void *)&mac_data->macs[uc_count][0];
2348 mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
2350 netdev_for_each_mc_addr(ha, dev)
2351 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
2353 sg_set_buf(&sg[1], mac_data,
2354 sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
2356 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
2357 VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
2358 dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
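/* The two ndo_vlan_rx_*_vid hooks below program the device's VLAN filter
 * table through the control virtqueue (VIRTIO_NET_CTRL_VLAN_ADD/DEL).  The
 * VLAN core invokes them when NETIF_F_HW_VLAN_CTAG_FILTER is set, which
 * virtnet_find_vqs() does once VIRTIO_NET_F_CTRL_VLAN has been negotiated.
 */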
2363 static int virtnet_vlan_rx_add_vid(struct net_device *dev,
2364 __be16 proto, u16 vid)
2366 struct virtnet_info *vi = netdev_priv(dev);
2367 struct scatterlist sg;
2369 vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2370 sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2372 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2373 VIRTIO_NET_CTRL_VLAN_ADD, &sg))
2374 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
2378 static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
2379 __be16 proto, u16 vid)
2381 struct virtnet_info *vi = netdev_priv(dev);
2382 struct scatterlist sg;
2384 vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
2385 sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));
2387 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
2388 VIRTIO_NET_CTRL_VLAN_DEL, &sg))
2389 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
2393 static void virtnet_clean_affinity(struct virtnet_info *vi)
2397 if (vi->affinity_hint_set) {
2398 for (i = 0; i < vi->max_queue_pairs; i++) {
2399 virtqueue_set_affinity(vi->rq[i].vq, NULL);
2400 virtqueue_set_affinity(vi->sq[i].vq, NULL);
2403 vi->affinity_hint_set = false;
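/* virtnet_set_affinity() below spreads the online CPUs across the active
 * queue pairs: each pair gets 'stride' CPUs, and the first 'stragglers'
 * pairs get one extra.  As an illustrative example, with 6 online CPUs and
 * 4 queue pairs, stride = 1 and stragglers = 2, so the pairs are assigned
 * 2, 2, 1 and 1 CPUs respectively.  The same per-pair mask is also used for
 * XPS via __netif_set_xps_queue().
 */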
2407 static void virtnet_set_affinity(struct virtnet_info *vi)
2416 if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
2417 virtnet_clean_affinity(vi);
2421 num_cpu = num_online_cpus();
2422 stride = max_t(int, num_cpu / vi->curr_queue_pairs, 1);
2423 stragglers = num_cpu >= vi->curr_queue_pairs ?
2424 num_cpu % vi->curr_queue_pairs :
2426 cpu = cpumask_first(cpu_online_mask);
2428 for (i = 0; i < vi->curr_queue_pairs; i++) {
2429 group_size = stride + (i < stragglers ? 1 : 0);
2431 for (j = 0; j < group_size; j++) {
2432 cpumask_set_cpu(cpu, mask);
2433 cpu = cpumask_next_wrap(cpu, cpu_online_mask,
2436 virtqueue_set_affinity(vi->rq[i].vq, mask);
2437 virtqueue_set_affinity(vi->sq[i].vq, mask);
2438 __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
2439 cpumask_clear(mask);
2442 vi->affinity_hint_set = true;
2443 free_cpumask_var(mask);
2446 static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
2448 struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2450 virtnet_set_affinity(vi);
2454 static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
2456 struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2458 virtnet_set_affinity(vi);
2462 static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
2464 struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
2467 virtnet_clean_affinity(vi);
2471 static enum cpuhp_state virtionet_online;
2473 static int virtnet_cpu_notif_add(struct virtnet_info *vi)
2477 ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
2480 ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2484 cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2488 static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
2490 cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
2491 cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
2495 static void virtnet_get_ringparam(struct net_device *dev,
2496 struct ethtool_ringparam *ring,
2497 struct kernel_ethtool_ringparam *kernel_ring,
2498 struct netlink_ext_ack *extack)
2500 struct virtnet_info *vi = netdev_priv(dev);
2502 ring->rx_max_pending = vi->rq[0].vq->num_max;
2503 ring->tx_max_pending = vi->sq[0].vq->num_max;
2504 ring->rx_pending = virtqueue_get_vring_size(vi->rq[0].vq);
2505 ring->tx_pending = virtqueue_get_vring_size(vi->sq[0].vq);
2508 static int virtnet_set_ringparam(struct net_device *dev,
2509 struct ethtool_ringparam *ring,
2510 struct kernel_ethtool_ringparam *kernel_ring,
2511 struct netlink_ext_ack *extack)
2513 struct virtnet_info *vi = netdev_priv(dev);
2514 u32 rx_pending, tx_pending;
2515 struct receive_queue *rq;
2516 struct send_queue *sq;
2519 if (ring->rx_mini_pending || ring->rx_jumbo_pending)
2522 rx_pending = virtqueue_get_vring_size(vi->rq[0].vq);
2523 tx_pending = virtqueue_get_vring_size(vi->sq[0].vq);
2525 if (ring->rx_pending == rx_pending &&
2526 ring->tx_pending == tx_pending)
2529 if (ring->rx_pending > vi->rq[0].vq->num_max)
2532 if (ring->tx_pending > vi->sq[0].vq->num_max)
2535 for (i = 0; i < vi->max_queue_pairs; i++) {
2539 if (ring->tx_pending != tx_pending) {
2540 err = virtnet_tx_resize(vi, sq, ring->tx_pending);
2545 if (ring->rx_pending != rx_pending) {
2546 err = virtnet_rx_resize(vi, rq, ring->rx_pending);
2555 static bool virtnet_commit_rss_command(struct virtnet_info *vi)
2557 struct net_device *dev = vi->dev;
2558 struct scatterlist sgs[4];
2559 unsigned int sg_buf_size;
2562 sg_init_table(sgs, 4);
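/* The four scatterlist entries describe the RSS/hash configuration in the
 * layout the device expects: sgs[0] is the fixed header up to the
 * indirection table, sgs[1] is the indirection table itself
 * (indirection_table_mask + 1 u16 entries), sgs[2] covers max_tx_vq and the
 * key length, and sgs[3] is the hash key of rss_key_size bytes.
 */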
2564 sg_buf_size = offsetof(struct virtio_net_ctrl_rss, indirection_table);
2565 sg_set_buf(&sgs[0], &vi->ctrl->rss, sg_buf_size);
2567 sg_buf_size = sizeof(uint16_t) * (vi->ctrl->rss.indirection_table_mask + 1);
2568 sg_set_buf(&sgs[1], vi->ctrl->rss.indirection_table, sg_buf_size);
2570 sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key)
2571 - offsetof(struct virtio_net_ctrl_rss, max_tx_vq);
2572 sg_set_buf(&sgs[2], &vi->ctrl->rss.max_tx_vq, sg_buf_size);
2574 sg_buf_size = vi->rss_key_size;
2575 sg_set_buf(&sgs[3], vi->ctrl->rss.key, sg_buf_size);
2577 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
2578 vi->has_rss ? VIRTIO_NET_CTRL_MQ_RSS_CONFIG
2579 : VIRTIO_NET_CTRL_MQ_HASH_CONFIG, sgs)) {
2580 dev_warn(&dev->dev, "Failed to commit RSS configuration to the device\n");
2586 static void virtnet_init_default_rss(struct virtnet_info *vi)
2591 vi->ctrl->rss.hash_types = vi->rss_hash_types_supported;
2592 vi->rss_hash_types_saved = vi->rss_hash_types_supported;
2593 vi->ctrl->rss.indirection_table_mask = vi->rss_indir_table_size
2594 ? vi->rss_indir_table_size - 1 : 0;
2595 vi->ctrl->rss.unclassified_queue = 0;
2597 for (; i < vi->rss_indir_table_size; ++i) {
2598 indir_val = ethtool_rxfh_indir_default(i, vi->curr_queue_pairs);
2599 vi->ctrl->rss.indirection_table[i] = indir_val;
2602 vi->ctrl->rss.max_tx_vq = vi->curr_queue_pairs;
2603 vi->ctrl->rss.hash_key_length = vi->rss_key_size;
2605 netdev_rss_key_fill(vi->ctrl->rss.key, vi->rss_key_size);
2608 static void virtnet_get_hashflow(const struct virtnet_info *vi, struct ethtool_rxnfc *info)
2611 switch (info->flow_type) {
2613 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
2614 info->data = RXH_IP_SRC | RXH_IP_DST |
2615 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2616 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
2617 info->data = RXH_IP_SRC | RXH_IP_DST;
2621 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
2622 info->data = RXH_IP_SRC | RXH_IP_DST |
2623 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2624 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
2625 info->data = RXH_IP_SRC | RXH_IP_DST;
2629 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
2630 info->data = RXH_IP_SRC | RXH_IP_DST |
2631 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2632 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
2633 info->data = RXH_IP_SRC | RXH_IP_DST;
2637 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
2638 info->data = RXH_IP_SRC | RXH_IP_DST |
2639 RXH_L4_B_0_1 | RXH_L4_B_2_3;
2640 } else if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
2641 info->data = RXH_IP_SRC | RXH_IP_DST;
2645 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv4)
2646 info->data = RXH_IP_SRC | RXH_IP_DST;
2650 if (vi->rss_hash_types_saved & VIRTIO_NET_RSS_HASH_TYPE_IPv6)
2651 info->data = RXH_IP_SRC | RXH_IP_DST;
2660 static bool virtnet_set_hashflow(struct virtnet_info *vi, struct ethtool_rxnfc *info)
2662 u32 new_hashtypes = vi->rss_hash_types_saved;
2663 bool is_disable = info->data & RXH_DISCARD;
2664 bool is_l4 = info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3);
2666 /* supports only 'sd', 'sdfn' and 'r' */
2667 if (!((info->data == (RXH_IP_SRC | RXH_IP_DST)) | is_l4 | is_disable))
2670 switch (info->flow_type) {
2672 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_TCPv4);
2674 new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
2675 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv4 : 0);
2678 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv4 | VIRTIO_NET_RSS_HASH_TYPE_UDPv4);
2680 new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv4
2681 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv4 : 0);
2684 new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv4;
2686 new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv4;
2689 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_TCPv6);
2691 new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
2692 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_TCPv6 : 0);
2695 new_hashtypes &= ~(VIRTIO_NET_RSS_HASH_TYPE_IPv6 | VIRTIO_NET_RSS_HASH_TYPE_UDPv6);
2697 new_hashtypes |= VIRTIO_NET_RSS_HASH_TYPE_IPv6
2698 | (is_l4 ? VIRTIO_NET_RSS_HASH_TYPE_UDPv6 : 0);
2701 new_hashtypes &= ~VIRTIO_NET_RSS_HASH_TYPE_IPv6;
2703 new_hashtypes = VIRTIO_NET_RSS_HASH_TYPE_IPv6;
2706 /* unsupported flow */
2710 /* if unsupported hashtype was set */
2711 if (new_hashtypes != (new_hashtypes & vi->rss_hash_types_supported))
2714 if (new_hashtypes != vi->rss_hash_types_saved) {
2715 vi->rss_hash_types_saved = new_hashtypes;
2716 vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
2717 if (vi->dev->features & NETIF_F_RXHASH)
2718 return virtnet_commit_rss_command(vi);
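/* Userspace reaches virtnet_set_hashflow() via the ETHTOOL_SRXFH request.
 * An illustrative example (the interface name is a placeholder):
 *   ethtool -N eth0 rx-flow-hash tcp4 sdfn
 * enables hashing on source/destination IP plus the L4 ports for TCPv4,
 * 'sd' restricts it to the IP addresses, and 'r' (RXH_DISCARD) clears
 * hashing for that flow type, matching the 'supports only' comment above.
 */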
2724 static void virtnet_get_drvinfo(struct net_device *dev,
2725 struct ethtool_drvinfo *info)
2727 struct virtnet_info *vi = netdev_priv(dev);
2728 struct virtio_device *vdev = vi->vdev;
2730 strscpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
2731 strscpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
2732 strscpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
2736 /* TODO: Eliminate OOO packets during switching */
2737 static int virtnet_set_channels(struct net_device *dev,
2738 struct ethtool_channels *channels)
2740 struct virtnet_info *vi = netdev_priv(dev);
2741 u16 queue_pairs = channels->combined_count;
2744 /* We don't support separate rx/tx channels.
2745 * We don't allow setting 'other' channels.
2747 if (channels->rx_count || channels->tx_count || channels->other_count)
2750 if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
2753 /* For now we don't support modifying channels while XDP is loaded.
2754 * Also, when XDP is loaded all RX queues have XDP programs, so we only
2755 * need to check a single RX queue.
2757 if (vi->rq[0].xdp_prog)
2761 err = _virtnet_set_queues(vi, queue_pairs);
2766 virtnet_set_affinity(vi);
2769 netif_set_real_num_tx_queues(dev, queue_pairs);
2770 netif_set_real_num_rx_queues(dev, queue_pairs);
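/* Channel counts map 1:1 to virtio queue pairs.  For example (interface
 * name is a placeholder):
 *   ethtool -L eth0 combined 4
 * requests four RX/TX queue pairs, which this handler forwards to the
 * device via VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET in _virtnet_set_queues().
 */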
2775 static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
2777 struct virtnet_info *vi = netdev_priv(dev);
2781 switch (stringset) {
2783 for (i = 0; i < vi->curr_queue_pairs; i++) {
2784 for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++)
2785 ethtool_sprintf(&p, "rx_queue_%u_%s", i,
2786 virtnet_rq_stats_desc[j].desc);
2789 for (i = 0; i < vi->curr_queue_pairs; i++) {
2790 for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++)
2791 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
2792 virtnet_sq_stats_desc[j].desc);
2798 static int virtnet_get_sset_count(struct net_device *dev, int sset)
2800 struct virtnet_info *vi = netdev_priv(dev);
2804 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
2805 VIRTNET_SQ_STATS_LEN);
2811 static void virtnet_get_ethtool_stats(struct net_device *dev,
2812 struct ethtool_stats *stats, u64 *data)
2814 struct virtnet_info *vi = netdev_priv(dev);
2815 unsigned int idx = 0, start, i, j;
2816 const u8 *stats_base;
2819 for (i = 0; i < vi->curr_queue_pairs; i++) {
2820 struct receive_queue *rq = &vi->rq[i];
2822 stats_base = (u8 *)&rq->stats;
2824 start = u64_stats_fetch_begin(&rq->stats.syncp);
2825 for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
2826 offset = virtnet_rq_stats_desc[j].offset;
2827 data[idx + j] = *(u64 *)(stats_base + offset);
2829 } while (u64_stats_fetch_retry(&rq->stats.syncp, start));
2830 idx += VIRTNET_RQ_STATS_LEN;
2833 for (i = 0; i < vi->curr_queue_pairs; i++) {
2834 struct send_queue *sq = &vi->sq[i];
2836 stats_base = (u8 *)&sq->stats;
2838 start = u64_stats_fetch_begin(&sq->stats.syncp);
2839 for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
2840 offset = virtnet_sq_stats_desc[j].offset;
2841 data[idx + j] = *(u64 *)(stats_base + offset);
2843 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
2844 idx += VIRTNET_SQ_STATS_LEN;
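/* The u64_stats_fetch_begin()/u64_stats_fetch_retry() loops above re-read a
 * queue's counters until no writer raced with the reader, so each exported
 * group of 64-bit statistics forms a consistent snapshot even on 32-bit
 * systems where the counters cannot be updated atomically.
 */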
2848 static void virtnet_get_channels(struct net_device *dev,
2849 struct ethtool_channels *channels)
2851 struct virtnet_info *vi = netdev_priv(dev);
2853 channels->combined_count = vi->curr_queue_pairs;
2854 channels->max_combined = vi->max_queue_pairs;
2855 channels->max_other = 0;
2856 channels->rx_count = 0;
2857 channels->tx_count = 0;
2858 channels->other_count = 0;
2861 static int virtnet_set_link_ksettings(struct net_device *dev,
2862 const struct ethtool_link_ksettings *cmd)
2864 struct virtnet_info *vi = netdev_priv(dev);
2866 return ethtool_virtdev_set_link_ksettings(dev, cmd,
2867 &vi->speed, &vi->duplex);
2870 static int virtnet_get_link_ksettings(struct net_device *dev,
2871 struct ethtool_link_ksettings *cmd)
2873 struct virtnet_info *vi = netdev_priv(dev);
2875 cmd->base.speed = vi->speed;
2876 cmd->base.duplex = vi->duplex;
2877 cmd->base.port = PORT_OTHER;
2882 static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi,
2883 struct ethtool_coalesce *ec)
2885 struct scatterlist sgs_tx, sgs_rx;
2886 struct virtio_net_ctrl_coal_tx coal_tx;
2887 struct virtio_net_ctrl_coal_rx coal_rx;
2889 coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs);
2890 coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames);
2891 sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx));
2893 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
2894 VIRTIO_NET_CTRL_NOTF_COAL_TX_SET,
2898 /* Save parameters */
2899 vi->tx_usecs = ec->tx_coalesce_usecs;
2900 vi->tx_max_packets = ec->tx_max_coalesced_frames;
2902 coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
2903 coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames);
2904 sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx));
2906 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
2907 VIRTIO_NET_CTRL_NOTF_COAL_RX_SET,
2911 /* Save parameters */
2912 vi->rx_usecs = ec->rx_coalesce_usecs;
2913 vi->rx_max_packets = ec->rx_max_coalesced_frames;
2918 static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
2920 /* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL
2921 * feature is negotiated.
2923 if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs)
2926 if (ec->tx_max_coalesced_frames > 1 ||
2927 ec->rx_max_coalesced_frames != 1)
2933 static int virtnet_set_coalesce(struct net_device *dev,
2934 struct ethtool_coalesce *ec,
2935 struct kernel_ethtool_coalesce *kernel_coal,
2936 struct netlink_ext_ack *extack)
2938 struct virtnet_info *vi = netdev_priv(dev);
2939 int ret, i, napi_weight;
2940 bool update_napi = false;
2942 /* Can't change NAPI weight if the link is up */
2943 napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
2944 if (napi_weight ^ vi->sq[0].napi.weight) {
2945 if (dev->flags & IFF_UP)
2951 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
2952 ret = virtnet_send_notf_coal_cmds(vi, ec);
2954 ret = virtnet_coal_params_supported(ec);
2960 for (i = 0; i < vi->max_queue_pairs; i++)
2961 vi->sq[i].napi.weight = napi_weight;
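/* Without VIRTIO_NET_F_NOTF_COAL the only tunable here is tx-frames, which
 * simply toggles TX NAPI.  For example (interface name is a placeholder,
 * and the interface must be down per the IFF_UP check above):
 *   ethtool -C eth0 tx-frames 0
 * sets the TX NAPI weight to 0, while a non-zero value restores
 * NAPI_POLL_WEIGHT.
 */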
2967 static int virtnet_get_coalesce(struct net_device *dev,
2968 struct ethtool_coalesce *ec,
2969 struct kernel_ethtool_coalesce *kernel_coal,
2970 struct netlink_ext_ack *extack)
2972 struct virtnet_info *vi = netdev_priv(dev);
2974 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
2975 ec->rx_coalesce_usecs = vi->rx_usecs;
2976 ec->tx_coalesce_usecs = vi->tx_usecs;
2977 ec->tx_max_coalesced_frames = vi->tx_max_packets;
2978 ec->rx_max_coalesced_frames = vi->rx_max_packets;
2980 ec->rx_max_coalesced_frames = 1;
2982 if (vi->sq[0].napi.weight)
2983 ec->tx_max_coalesced_frames = 1;
2989 static void virtnet_init_settings(struct net_device *dev)
2991 struct virtnet_info *vi = netdev_priv(dev);
2993 vi->speed = SPEED_UNKNOWN;
2994 vi->duplex = DUPLEX_UNKNOWN;
2997 static void virtnet_update_settings(struct virtnet_info *vi)
3002 if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
3005 virtio_cread_le(vi->vdev, struct virtio_net_config, speed, &speed);
3007 if (ethtool_validate_speed(speed))
3010 virtio_cread_le(vi->vdev, struct virtio_net_config, duplex, &duplex);
3012 if (ethtool_validate_duplex(duplex))
3013 vi->duplex = duplex;
3016 static u32 virtnet_get_rxfh_key_size(struct net_device *dev)
3018 return ((struct virtnet_info *)netdev_priv(dev))->rss_key_size;
3021 static u32 virtnet_get_rxfh_indir_size(struct net_device *dev)
3023 return ((struct virtnet_info *)netdev_priv(dev))->rss_indir_table_size;
3026 static int virtnet_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc)
3028 struct virtnet_info *vi = netdev_priv(dev);
3032 for (i = 0; i < vi->rss_indir_table_size; ++i)
3033 indir[i] = vi->ctrl->rss.indirection_table[i];
3037 memcpy(key, vi->ctrl->rss.key, vi->rss_key_size);
3040 *hfunc = ETH_RSS_HASH_TOP;
3045 static int virtnet_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, const u8 hfunc)
3047 struct virtnet_info *vi = netdev_priv(dev);
3050 if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
3054 for (i = 0; i < vi->rss_indir_table_size; ++i)
3055 vi->ctrl->rss.indirection_table[i] = indir[i];
3058 memcpy(vi->ctrl->rss.key, key, vi->rss_key_size);
3060 virtnet_commit_rss_command(vi);
3065 static int virtnet_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, u32 *rule_locs)
3067 struct virtnet_info *vi = netdev_priv(dev);
3070 switch (info->cmd) {
3071 case ETHTOOL_GRXRINGS:
3072 info->data = vi->curr_queue_pairs;
3075 virtnet_get_hashflow(vi, info);
3084 static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
3086 struct virtnet_info *vi = netdev_priv(dev);
3089 switch (info->cmd) {
3091 if (!virtnet_set_hashflow(vi, info))
3102 static const struct ethtool_ops virtnet_ethtool_ops = {
3103 .supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES |
3104 ETHTOOL_COALESCE_USECS,
3105 .get_drvinfo = virtnet_get_drvinfo,
3106 .get_link = ethtool_op_get_link,
3107 .get_ringparam = virtnet_get_ringparam,
3108 .set_ringparam = virtnet_set_ringparam,
3109 .get_strings = virtnet_get_strings,
3110 .get_sset_count = virtnet_get_sset_count,
3111 .get_ethtool_stats = virtnet_get_ethtool_stats,
3112 .set_channels = virtnet_set_channels,
3113 .get_channels = virtnet_get_channels,
3114 .get_ts_info = ethtool_op_get_ts_info,
3115 .get_link_ksettings = virtnet_get_link_ksettings,
3116 .set_link_ksettings = virtnet_set_link_ksettings,
3117 .set_coalesce = virtnet_set_coalesce,
3118 .get_coalesce = virtnet_get_coalesce,
3119 .get_rxfh_key_size = virtnet_get_rxfh_key_size,
3120 .get_rxfh_indir_size = virtnet_get_rxfh_indir_size,
3121 .get_rxfh = virtnet_get_rxfh,
3122 .set_rxfh = virtnet_set_rxfh,
3123 .get_rxnfc = virtnet_get_rxnfc,
3124 .set_rxnfc = virtnet_set_rxnfc,
3127 static void virtnet_freeze_down(struct virtio_device *vdev)
3129 struct virtnet_info *vi = vdev->priv;
3131 /* Make sure no work handler is accessing the device */
3132 flush_work(&vi->config_work);
3134 netif_tx_lock_bh(vi->dev);
3135 netif_device_detach(vi->dev);
3136 netif_tx_unlock_bh(vi->dev);
3137 if (netif_running(vi->dev))
3138 virtnet_close(vi->dev);
3141 static int init_vqs(struct virtnet_info *vi);
3143 static int virtnet_restore_up(struct virtio_device *vdev)
3145 struct virtnet_info *vi = vdev->priv;
3152 virtio_device_ready(vdev);
3154 enable_delayed_refill(vi);
3156 if (netif_running(vi->dev)) {
3157 err = virtnet_open(vi->dev);
3162 netif_tx_lock_bh(vi->dev);
3163 netif_device_attach(vi->dev);
3164 netif_tx_unlock_bh(vi->dev);
3168 static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
3170 struct scatterlist sg;
3171 vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);
3173 sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));
3175 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
3176 VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
3177 dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
3184 static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
3188 if (!vi->guest_offloads)
3191 return virtnet_set_guest_offloads(vi, offloads);
3194 static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
3196 u64 offloads = vi->guest_offloads;
3198 if (!vi->guest_offloads)
3201 return virtnet_set_guest_offloads(vi, offloads);
3204 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
3205 struct netlink_ext_ack *extack)
3207 unsigned int room = SKB_DATA_ALIGN(VIRTIO_XDP_HEADROOM +
3208 sizeof(struct skb_shared_info));
3209 unsigned int max_sz = PAGE_SIZE - room - ETH_HLEN;
3210 struct virtnet_info *vi = netdev_priv(dev);
3211 struct bpf_prog *old_prog;
3212 u16 xdp_qp = 0, curr_qp;
3215 if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
3216 && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3217 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
3218 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
3219 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
3220 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM) ||
3221 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) ||
3222 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6))) {
3223 NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first");
3227 if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
3228 NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
3232 if (prog && !prog->aux->xdp_has_frags && dev->mtu > max_sz) {
3233 NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP without frags");
3234 netdev_warn(dev, "single-buffer XDP requires MTU no larger than %u\n", max_sz);
3238 curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
3240 xdp_qp = nr_cpu_ids;
3242 /* XDP requires extra queues for XDP_TX */
3243 if (curr_qp + xdp_qp > vi->max_queue_pairs) {
3244 netdev_warn_once(dev, "XDP requests %i queues but max is %i. XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n",
3245 curr_qp + xdp_qp, vi->max_queue_pairs);
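/* Ideally every CPU gets its own XDP_TX queue (xdp_qp = nr_cpu_ids above) so
 * transmits from XDP can proceed without locking.  When the device cannot
 * provide that many queue pairs, the fallback path drops the extra-queue
 * request and XDP_TX/XDP_REDIRECT share the regular TX queues under the
 * per-queue lock, which is the "slower locked tx mode" warned about above.
 */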
3249 old_prog = rtnl_dereference(vi->rq[0].xdp_prog);
3250 if (!prog && !old_prog)
3254 bpf_prog_add(prog, vi->max_queue_pairs - 1);
3256 /* Make sure NAPI is not using any XDP TX queues for RX. */
3257 if (netif_running(dev)) {
3258 for (i = 0; i < vi->max_queue_pairs; i++) {
3259 napi_disable(&vi->rq[i].napi);
3260 virtnet_napi_tx_disable(&vi->sq[i].napi);
3265 for (i = 0; i < vi->max_queue_pairs; i++) {
3266 rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
3268 virtnet_restore_guest_offloads(vi);
3273 err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
3276 netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
3277 vi->xdp_queue_pairs = xdp_qp;
3280 vi->xdp_enabled = true;
3281 for (i = 0; i < vi->max_queue_pairs; i++) {
3282 rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
3283 if (i == 0 && !old_prog)
3284 virtnet_clear_guest_offloads(vi);
3287 xdp_features_set_redirect_target(dev, true);
3289 xdp_features_clear_redirect_target(dev);
3290 vi->xdp_enabled = false;
3293 for (i = 0; i < vi->max_queue_pairs; i++) {
3295 bpf_prog_put(old_prog);
3296 if (netif_running(dev)) {
3297 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
3298 virtnet_napi_tx_enable(vi, vi->sq[i].vq,
3307 virtnet_clear_guest_offloads(vi);
3308 for (i = 0; i < vi->max_queue_pairs; i++)
3309 rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
3312 if (netif_running(dev)) {
3313 for (i = 0; i < vi->max_queue_pairs; i++) {
3314 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
3315 virtnet_napi_tx_enable(vi, vi->sq[i].vq,
3320 bpf_prog_sub(prog, vi->max_queue_pairs - 1);
3324 static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
3326 switch (xdp->command) {
3327 case XDP_SETUP_PROG:
3328 return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
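/* XDP programs are attached through the generic netdev_bpf path above.  An
 * illustrative way to exercise it from userspace (object file and interface
 * name are placeholders):
 *   ip link set dev eth0 xdp obj xdp_prog.o sec xdp
 */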
3334 static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
3337 struct virtnet_info *vi = netdev_priv(dev);
3340 if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
3343 ret = snprintf(buf, len, "sby");
3350 static int virtnet_set_features(struct net_device *dev,
3351 netdev_features_t features)
3353 struct virtnet_info *vi = netdev_priv(dev);
3357 if ((dev->features ^ features) & NETIF_F_GRO_HW) {
3358 if (vi->xdp_enabled)
3361 if (features & NETIF_F_GRO_HW)
3362 offloads = vi->guest_offloads_capable;
3364 offloads = vi->guest_offloads_capable &
3365 ~GUEST_OFFLOAD_GRO_HW_MASK;
3367 err = virtnet_set_guest_offloads(vi, offloads);
3370 vi->guest_offloads = offloads;
3373 if ((dev->features ^ features) & NETIF_F_RXHASH) {
3374 if (features & NETIF_F_RXHASH)
3375 vi->ctrl->rss.hash_types = vi->rss_hash_types_saved;
3377 vi->ctrl->rss.hash_types = VIRTIO_NET_HASH_REPORT_NONE;
3379 if (!virtnet_commit_rss_command(vi))
3386 static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
3388 struct virtnet_info *priv = netdev_priv(dev);
3389 struct send_queue *sq = &priv->sq[txqueue];
3390 struct netdev_queue *txq = netdev_get_tx_queue(dev, txqueue);
3392 u64_stats_update_begin(&sq->stats.syncp);
3393 sq->stats.tx_timeouts++;
3394 u64_stats_update_end(&sq->stats.syncp);
3396 netdev_err(dev, "TX timeout on queue: %u, sq: %s, vq: 0x%x, name: %s, %u usecs ago\n",
3397 txqueue, sq->name, sq->vq->index, sq->vq->name,
3398 jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
3401 static const struct net_device_ops virtnet_netdev = {
3402 .ndo_open = virtnet_open,
3403 .ndo_stop = virtnet_close,
3404 .ndo_start_xmit = start_xmit,
3405 .ndo_validate_addr = eth_validate_addr,
3406 .ndo_set_mac_address = virtnet_set_mac_address,
3407 .ndo_set_rx_mode = virtnet_set_rx_mode,
3408 .ndo_get_stats64 = virtnet_stats,
3409 .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
3410 .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
3411 .ndo_bpf = virtnet_xdp,
3412 .ndo_xdp_xmit = virtnet_xdp_xmit,
3413 .ndo_features_check = passthru_features_check,
3414 .ndo_get_phys_port_name = virtnet_get_phys_port_name,
3415 .ndo_set_features = virtnet_set_features,
3416 .ndo_tx_timeout = virtnet_tx_timeout,
3419 static void virtnet_config_changed_work(struct work_struct *work)
3421 struct virtnet_info *vi =
3422 container_of(work, struct virtnet_info, config_work);
3425 if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
3426 struct virtio_net_config, status, &v) < 0)
3429 if (v & VIRTIO_NET_S_ANNOUNCE) {
3430 netdev_notify_peers(vi->dev);
3431 virtnet_ack_link_announce(vi);
3434 /* Ignore unknown (future) status bits */
3435 v &= VIRTIO_NET_S_LINK_UP;
3437 if (vi->status == v)
3442 if (vi->status & VIRTIO_NET_S_LINK_UP) {
3443 virtnet_update_settings(vi);
3444 netif_carrier_on(vi->dev);
3445 netif_tx_wake_all_queues(vi->dev);
3447 netif_carrier_off(vi->dev);
3448 netif_tx_stop_all_queues(vi->dev);
3452 static void virtnet_config_changed(struct virtio_device *vdev)
3454 struct virtnet_info *vi = vdev->priv;
3456 schedule_work(&vi->config_work);
3459 static void virtnet_free_queues(struct virtnet_info *vi)
3463 for (i = 0; i < vi->max_queue_pairs; i++) {
3464 __netif_napi_del(&vi->rq[i].napi);
3465 __netif_napi_del(&vi->sq[i].napi);
3468 /* We called __netif_napi_del(),
3469 * we need to respect an RCU grace period before freeing vi->rq
3478 static void _free_receive_bufs(struct virtnet_info *vi)
3480 struct bpf_prog *old_prog;
3483 for (i = 0; i < vi->max_queue_pairs; i++) {
3484 while (vi->rq[i].pages)
3485 __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
3487 old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
3488 RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
3490 bpf_prog_put(old_prog);
3494 static void free_receive_bufs(struct virtnet_info *vi)
3497 _free_receive_bufs(vi);
3501 static void free_receive_page_frags(struct virtnet_info *vi)
3504 for (i = 0; i < vi->max_queue_pairs; i++)
3505 if (vi->rq[i].alloc_frag.page)
3506 put_page(vi->rq[i].alloc_frag.page);
3509 static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
3511 if (!is_xdp_frame(buf))
3514 xdp_return_frame(ptr_to_xdp(buf));
3517 static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
3519 struct virtnet_info *vi = vq->vdev->priv;
3522 if (vi->mergeable_rx_bufs)
3523 put_page(virt_to_head_page(buf));
3524 else if (vi->big_packets)
3525 give_pages(&vi->rq[i], buf);
3527 put_page(virt_to_head_page(buf));
3530 static void free_unused_bufs(struct virtnet_info *vi)
3535 for (i = 0; i < vi->max_queue_pairs; i++) {
3536 struct virtqueue *vq = vi->sq[i].vq;
3537 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
3538 virtnet_sq_free_unused_buf(vq, buf);
3541 for (i = 0; i < vi->max_queue_pairs; i++) {
3542 struct virtqueue *vq = vi->rq[i].vq;
3543 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
3544 virtnet_rq_free_unused_buf(vq, buf);
3548 static void virtnet_del_vqs(struct virtnet_info *vi)
3550 struct virtio_device *vdev = vi->vdev;
3552 virtnet_clean_affinity(vi);
3554 vdev->config->del_vqs(vdev);
3556 virtnet_free_queues(vi);
3559 /* How large should a single buffer be so a queue full of these can fit at
3560 * least one full packet?
3561 * Logic below assumes the mergeable buffer header is used.
3563 static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
3565 const unsigned int hdr_len = vi->hdr_len;
3566 unsigned int rq_size = virtqueue_get_vring_size(vq);
3567 unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
3568 unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
3569 unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
3571 return max(max(min_buf_len, hdr_len) - hdr_len,
3572 (unsigned int)GOOD_PACKET_LEN);
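/* Illustrative numbers: with the 12-byte mergeable header, a 1500-byte
 * max_mtu and a 256-entry ring, buf_len = 12 + 14 + 4 + 1500 = 1530 and
 * DIV_ROUND_UP(1530, 256) = 6, so the result is clamped up to
 * GOOD_PACKET_LEN (1518) -- small rings and small MTUs still get buffers
 * large enough for a full frame.
 */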
3575 static int virtnet_find_vqs(struct virtnet_info *vi)
3577 vq_callback_t **callbacks;
3578 struct virtqueue **vqs;
3584 /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
3585 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
3586 * possible control vq.
3588 total_vqs = vi->max_queue_pairs * 2 +
3589 virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
3591 /* Allocate space for find_vqs parameters */
3592 vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
3595 callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
3598 names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
3601 if (!vi->big_packets || vi->mergeable_rx_bufs) {
3602 ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
3609 /* Parameters for control virtqueue, if any */
3611 callbacks[total_vqs - 1] = NULL;
3612 names[total_vqs - 1] = "control";
3615 /* Allocate/initialize parameters for send/receive virtqueues */
3616 for (i = 0; i < vi->max_queue_pairs; i++) {
3617 callbacks[rxq2vq(i)] = skb_recv_done;
3618 callbacks[txq2vq(i)] = skb_xmit_done;
3619 sprintf(vi->rq[i].name, "input.%d", i);
3620 sprintf(vi->sq[i].name, "output.%d", i);
3621 names[rxq2vq(i)] = vi->rq[i].name;
3622 names[txq2vq(i)] = vi->sq[i].name;
3624 ctx[rxq2vq(i)] = true;
3627 ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
3633 vi->cvq = vqs[total_vqs - 1];
3634 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
3635 vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
3638 for (i = 0; i < vi->max_queue_pairs; i++) {
3639 vi->rq[i].vq = vqs[rxq2vq(i)];
3640 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
3641 vi->sq[i].vq = vqs[txq2vq(i)];
3644 /* Only reached on success: ret == 0. */
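/* vqs[] follows the layout described above: queue pair i uses the
 * rxq2vq(i)/txq2vq(i) slots (the helpers defined earlier in this file
 * interleave them as indices 2*i and 2*i + 1), and the control virtqueue,
 * when negotiated, takes the final slot (total_vqs - 1).
 */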
3659 static int virtnet_alloc_queues(struct virtnet_info *vi)
3664 vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
3670 vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
3673 vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
3677 INIT_DELAYED_WORK(&vi->refill, refill_work);
3678 for (i = 0; i < vi->max_queue_pairs; i++) {
3679 vi->rq[i].pages = NULL;
3680 netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll,
3682 netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi,
3684 napi_tx ? napi_weight : 0);
3686 sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
3687 ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
3688 sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
3690 u64_stats_init(&vi->rq[i].stats.syncp);
3691 u64_stats_init(&vi->sq[i].stats.syncp);
3704 static int init_vqs(struct virtnet_info *vi)
3708 /* Allocate send & receive queues */
3709 ret = virtnet_alloc_queues(vi);
3713 ret = virtnet_find_vqs(vi);
3718 virtnet_set_affinity(vi);
3724 virtnet_free_queues(vi);
3730 static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
3733 struct virtnet_info *vi = netdev_priv(queue->dev);
3734 unsigned int queue_index = get_netdev_rx_queue_index(queue);
3735 unsigned int headroom = virtnet_get_headroom(vi);
3736 unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
3737 struct ewma_pkt_len *avg;
3739 BUG_ON(queue_index >= vi->max_queue_pairs);
3740 avg = &vi->rq[queue_index].mrg_avg_pkt_len;
3741 return sprintf(buf, "%u\n",
3742 get_mergeable_buf_len(&vi->rq[queue_index], avg,
3743 SKB_DATA_ALIGN(headroom + tailroom)));
3746 static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
3747 __ATTR_RO(mergeable_rx_buffer_size);
3749 static struct attribute *virtio_net_mrg_rx_attrs[] = {
3750 &mergeable_rx_buffer_size_attribute.attr,
3754 static const struct attribute_group virtio_net_mrg_rx_group = {
3755 .name = "virtio_net",
3756 .attrs = virtio_net_mrg_rx_attrs
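/* With mergeable RX buffers this attribute group is attached per RX queue,
 * e.g. /sys/class/net/<iface>/queues/rx-0/virtio_net/mergeable_rx_buffer_size,
 * and reports the current EWMA-derived receive buffer size for that queue.
 */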
3760 static bool virtnet_fail_on_feature(struct virtio_device *vdev,
3762 const char *fname, const char *dname)
3764 if (!virtio_has_feature(vdev, fbit))
3767 dev_err(&vdev->dev, "device advertises feature %s but not %s",
3773 #define VIRTNET_FAIL_ON(vdev, fbit, dbit) \
3774 virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
3776 static bool virtnet_validate_features(struct virtio_device *vdev)
3778 if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
3779 (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
3780 "VIRTIO_NET_F_CTRL_VQ") ||
3781 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
3782 "VIRTIO_NET_F_CTRL_VQ") ||
3783 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
3784 "VIRTIO_NET_F_CTRL_VQ") ||
3785 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
3786 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
3787 "VIRTIO_NET_F_CTRL_VQ") ||
3788 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS,
3789 "VIRTIO_NET_F_CTRL_VQ") ||
3790 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT,
3791 "VIRTIO_NET_F_CTRL_VQ") ||
3792 VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL,
3793 "VIRTIO_NET_F_CTRL_VQ"))) {
3800 #define MIN_MTU ETH_MIN_MTU
3801 #define MAX_MTU ETH_MAX_MTU
3803 static int virtnet_validate(struct virtio_device *vdev)
3805 if (!vdev->config->get) {
3806 dev_err(&vdev->dev, "%s failure: config access disabled\n",
3811 if (!virtnet_validate_features(vdev))
3814 if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
3815 int mtu = virtio_cread16(vdev,
3816 offsetof(struct virtio_net_config,
3819 __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
3822 if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY) &&
3823 !virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
3824 dev_warn(&vdev->dev, "device advertises feature VIRTIO_NET_F_STANDBY but not VIRTIO_NET_F_MAC, disabling standby");
3825 __virtio_clear_bit(vdev, VIRTIO_NET_F_STANDBY);
3831 static bool virtnet_check_guest_gso(const struct virtnet_info *vi)
3833 return virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3834 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
3835 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
3836 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) ||
3837 (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO4) &&
3838 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_USO6));
3841 static void virtnet_set_big_packets(struct virtnet_info *vi, const int mtu)
3843 bool guest_gso = virtnet_check_guest_gso(vi);
3845 /* If the device can receive ANY guest GSO packets, regardless of mtu,
3846 * allocate buffers of maximum size; otherwise limit them to
3847 * mtu-sized packets only.
3849 if (mtu > ETH_DATA_LEN || guest_gso) {
3850 vi->big_packets = true;
3851 vi->big_packets_num_skbfrags = guest_gso ? MAX_SKB_FRAGS : DIV_ROUND_UP(mtu, PAGE_SIZE);
3855 static int virtnet_probe(struct virtio_device *vdev)
3857 int i, err = -ENOMEM;
3858 struct net_device *dev;
3859 struct virtnet_info *vi;
3860 u16 max_queue_pairs;
3863 /* Find if host supports multiqueue/rss virtio_net device */
3864 max_queue_pairs = 1;
3865 if (virtio_has_feature(vdev, VIRTIO_NET_F_MQ) || virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
3867 virtio_cread16(vdev, offsetof(struct virtio_net_config, max_virtqueue_pairs));
3869 /* We need at least 2 queues */
3870 if (max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
3871 max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
3872 !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
3873 max_queue_pairs = 1;
3875 /* Allocate ourselves a network device with room for our info */
3876 dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
3880 /* Set up network device as normal. */
3881 dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
3882 IFF_TX_SKB_NO_LINEAR;
3883 dev->netdev_ops = &virtnet_netdev;
3884 dev->features = NETIF_F_HIGHDMA;
3886 dev->ethtool_ops = &virtnet_ethtool_ops;
3887 SET_NETDEV_DEV(dev, &vdev->dev);
3889 /* Do we support "hardware" checksums? */
3890 if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
3891 /* This opens up the world of extra features. */
3892 dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3894 dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
3896 if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
3897 dev->hw_features |= NETIF_F_TSO
3898 | NETIF_F_TSO_ECN | NETIF_F_TSO6;
3900 /* Individual feature bits: what can host handle? */
3901 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
3902 dev->hw_features |= NETIF_F_TSO;
3903 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
3904 dev->hw_features |= NETIF_F_TSO6;
3905 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
3906 dev->hw_features |= NETIF_F_TSO_ECN;
3907 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_USO))
3908 dev->hw_features |= NETIF_F_GSO_UDP_L4;
3910 dev->features |= NETIF_F_GSO_ROBUST;
3913 dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
3914 /* (!csum && gso) case will be fixed by register_netdev() */
3916 if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
3917 dev->features |= NETIF_F_RXCSUM;
3918 if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
3919 virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6))
3920 dev->features |= NETIF_F_GRO_HW;
3921 if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS))
3922 dev->hw_features |= NETIF_F_GRO_HW;
3924 dev->vlan_features = dev->features;
3925 dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;
3927 /* MTU range: 68 - 65535 */
3928 dev->min_mtu = MIN_MTU;
3929 dev->max_mtu = MAX_MTU;
3931 /* Configuration may specify what MAC to use. Otherwise random. */
3932 if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
3935 virtio_cread_bytes(vdev,
3936 offsetof(struct virtio_net_config, mac),
3938 eth_hw_addr_set(dev, addr);
3940 eth_hw_addr_random(dev);
3941 dev_info(&vdev->dev, "Assigned random MAC address %pM\n",
3945 /* Set up our device-specific information */
3946 vi = netdev_priv(dev);
3951 INIT_WORK(&vi->config_work, virtnet_config_changed_work);
3952 spin_lock_init(&vi->refill_lock);
3954 if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
3955 vi->mergeable_rx_bufs = true;
3956 dev->xdp_features |= NETDEV_XDP_ACT_RX_SG;
3959 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
3962 vi->tx_max_packets = 0;
3963 vi->rx_max_packets = 0;
3966 if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
3967 vi->has_rss_hash_report = true;
3969 if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
3972 if (vi->has_rss || vi->has_rss_hash_report) {
3973 vi->rss_indir_table_size =
3974 virtio_cread16(vdev, offsetof(struct virtio_net_config,
3975 rss_max_indirection_table_length));
3977 virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size));
3979 vi->rss_hash_types_supported =
3980 virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types));
3981 vi->rss_hash_types_supported &=
3982 ~(VIRTIO_NET_RSS_HASH_TYPE_IP_EX |
3983 VIRTIO_NET_RSS_HASH_TYPE_TCP_EX |
3984 VIRTIO_NET_RSS_HASH_TYPE_UDP_EX);
3986 dev->hw_features |= NETIF_F_RXHASH;
3989 if (vi->has_rss_hash_report)
3990 vi->hdr_len = sizeof(struct virtio_net_hdr_v1_hash);
3991 else if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
3992 virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3993 vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
3995 vi->hdr_len = sizeof(struct virtio_net_hdr);
3997 if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
3998 virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
3999 vi->any_header_sg = true;
4001 if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
4004 if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
4005 mtu = virtio_cread16(vdev,
4006 offsetof(struct virtio_net_config,
4008 if (mtu < dev->min_mtu) {
4009 /* Should never trigger: MTU was previously validated
4010 * in virtnet_validate.
4013 "device MTU appears to have changed it is now %d < %d",
4023 virtnet_set_big_packets(vi, mtu);
4025 if (vi->any_header_sg)
4026 dev->needed_headroom = vi->hdr_len;
4028 /* Enable multiqueue by default */
4029 if (num_online_cpus() >= max_queue_pairs)
4030 vi->curr_queue_pairs = max_queue_pairs;
4032 vi->curr_queue_pairs = num_online_cpus();
4033 vi->max_queue_pairs = max_queue_pairs;
4035 /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
4041 if (vi->mergeable_rx_bufs)
4042 dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
4044 netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
4045 netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
4047 virtnet_init_settings(dev);
4049 if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
4050 vi->failover = net_failover_create(vi->dev);
4051 if (IS_ERR(vi->failover)) {
4052 err = PTR_ERR(vi->failover);
4057 if (vi->has_rss || vi->has_rss_hash_report)
4058 virtnet_init_default_rss(vi);
4060 /* serialize netdev register + virtio_device_ready() with ndo_open() */
4063 err = register_netdevice(dev);
4065 pr_debug("virtio_net: registering device failed\n");
4070 virtio_device_ready(vdev);
4072 /* A random MAC address has been assigned; notify the device.
4073 * We don't fail probe if VIRTIO_NET_F_CTRL_MAC_ADDR is not there,
4074 * because many devices work fine without getting the MAC set explicitly.
4076 if (!virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
4077 virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
4078 struct scatterlist sg;
4080 sg_init_one(&sg, dev->dev_addr, dev->addr_len);
4081 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
4082 VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
4083 pr_debug("virtio_net: setting MAC address failed\n");
4086 goto free_unregister_netdev;
4092 err = virtnet_cpu_notif_add(vi);
4094 pr_debug("virtio_net: registering cpu notifier failed\n");
4095 goto free_unregister_netdev;
4098 virtnet_set_queues(vi, vi->curr_queue_pairs);
4100 /* Assume link up if device can't report link status,
4101 otherwise get link status from config. */
4102 netif_carrier_off(dev);
4103 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
4104 schedule_work(&vi->config_work);
4106 vi->status = VIRTIO_NET_S_LINK_UP;
4107 virtnet_update_settings(vi);
4108 netif_carrier_on(dev);
4111 for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
4112 if (virtio_has_feature(vi->vdev, guest_offloads[i]))
4113 set_bit(guest_offloads[i], &vi->guest_offloads);
4114 vi->guest_offloads_capable = vi->guest_offloads;
4116 pr_debug("virtnet: registered device %s with %d RX and TX vqs\n",
4117 dev->name, max_queue_pairs);
4121 free_unregister_netdev:
4122 unregister_netdev(dev);
4124 net_failover_destroy(vi->failover);
4126 virtio_reset_device(vdev);
4127 cancel_delayed_work_sync(&vi->refill);
4128 free_receive_page_frags(vi);
4129 virtnet_del_vqs(vi);
4135 static void remove_vq_common(struct virtnet_info *vi)
4137 virtio_reset_device(vi->vdev);
4139 /* Free unused buffers in both send and recv, if any. */
4140 free_unused_bufs(vi);
4142 free_receive_bufs(vi);
4144 free_receive_page_frags(vi);
4146 virtnet_del_vqs(vi);
4149 static void virtnet_remove(struct virtio_device *vdev)
4151 struct virtnet_info *vi = vdev->priv;
4153 virtnet_cpu_notif_remove(vi);
4155 /* Make sure no work handler is accessing the device. */
4156 flush_work(&vi->config_work);
4158 unregister_netdev(vi->dev);
4160 net_failover_destroy(vi->failover);
4162 remove_vq_common(vi);
4164 free_netdev(vi->dev);
4167 static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
4169 struct virtnet_info *vi = vdev->priv;
4171 virtnet_cpu_notif_remove(vi);
4172 virtnet_freeze_down(vdev);
4173 remove_vq_common(vi);
4178 static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
4180 struct virtnet_info *vi = vdev->priv;
4183 err = virtnet_restore_up(vdev);
4186 virtnet_set_queues(vi, vi->curr_queue_pairs);
4188 err = virtnet_cpu_notif_add(vi);
4190 virtnet_freeze_down(vdev);
4191 remove_vq_common(vi);
4198 static struct virtio_device_id id_table[] = {
4199 { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
4203 #define VIRTNET_FEATURES \
4204 VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
4206 VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
4207 VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
4208 VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
4209 VIRTIO_NET_F_HOST_USO, VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, \
4210 VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
4211 VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
4212 VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
4213 VIRTIO_NET_F_CTRL_MAC_ADDR, \
4214 VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
4215 VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
4216 VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL
4218 static unsigned int features[] = {
4222 static unsigned int features_legacy[] = {
4225 VIRTIO_F_ANY_LAYOUT,
4228 static struct virtio_driver virtio_net_driver = {
4229 .feature_table = features,
4230 .feature_table_size = ARRAY_SIZE(features),
4231 .feature_table_legacy = features_legacy,
4232 .feature_table_size_legacy = ARRAY_SIZE(features_legacy),
4233 .driver.name = KBUILD_MODNAME,
4234 .driver.owner = THIS_MODULE,
4235 .id_table = id_table,
4236 .validate = virtnet_validate,
4237 .probe = virtnet_probe,
4238 .remove = virtnet_remove,
4239 .config_changed = virtnet_config_changed,
4240 #ifdef CONFIG_PM_SLEEP
4241 .freeze = virtnet_freeze,
4242 .restore = virtnet_restore,
4246 static __init int virtio_net_driver_init(void)
4250 ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
4252 virtnet_cpu_down_prep);
4255 virtionet_online = ret;
4256 ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
4257 NULL, virtnet_cpu_dead);
4260 ret = register_virtio_driver(&virtio_net_driver);
4265 cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
4267 cpuhp_remove_multi_state(virtionet_online);
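/* Two hotplug states back the affinity management above: a dynamically
 * allocated CPUHP_AP_ONLINE_DYN state (saved in virtionet_online), whose
 * online callback applies queue affinity and whose down_prep callback
 * clears it, and CPUHP_VIRT_NET_DEAD, whose callback re-applies affinity
 * once the CPU is fully offline.  Per-device instances are attached in
 * virtnet_cpu_notif_add() at probe time.
 */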
4271 module_init(virtio_net_driver_init);
4273 static __exit void virtio_net_driver_exit(void)
4275 unregister_virtio_driver(&virtio_net_driver);
4276 cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
4277 cpuhp_remove_multi_state(virtionet_online);
4279 module_exit(virtio_net_driver_exit);
4281 MODULE_DEVICE_TABLE(virtio, id_table);
4282 MODULE_DESCRIPTION("Virtio network driver");
4283 MODULE_LICENSE("GPL");