drivers/net/virtio_net.c

   1 /* A network driver using virtio.
   2  *
   3  * Copyright 2007 Rusty Russell <[email protected]> IBM Corporation
   4  *
   5  * This program is free software; you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation; either version 2 of the License, or
   8  * (at your option) any later version.
   9  *
  10  * This program is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  17  */
  18 //#define DEBUG
  19 #include <linux/netdevice.h>
  20 #include <linux/etherdevice.h>
  21 #include <linux/ethtool.h>
  22 #include <linux/module.h>
  23 #include <linux/virtio.h>
  24 #include <linux/virtio_net.h>
  25 #include <linux/bpf.h>
  26 #include <linux/bpf_trace.h>
  27 #include <linux/scatterlist.h>
  28 #include <linux/if_vlan.h>
  29 #include <linux/slab.h>
  30 #include <linux/cpu.h>
  31 #include <linux/average.h>
  32 #include <net/route.h>
  33
  34 static int napi_weight = NAPI_POLL_WEIGHT;
  35 module_param(napi_weight, int, 0444);
  36
  37 static bool csum = true, gso = true, napi_tx;
  38 module_param(csum, bool, 0444);
  39 module_param(gso, bool, 0444);
  40 module_param(napi_tx, bool, 0644);
  41
  42 /* FIXME: MTU in config. */
  43 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
  44 #define GOOD_COPY_LEN   128
  45
  46 #define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
  47
  48 /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
  49 #define VIRTIO_XDP_HEADROOM 256
  50
  51 /* RX packet size EWMA. The average packet size is used to determine the packet
  52  * buffer size when refilling RX rings. As the entire RX ring may be refilled
  53  * at once, the weight is chosen so that the EWMA will be insensitive to short-
  54  * term, transient changes in packet size.
  55  */
  56 DECLARE_EWMA(pkt_len, 0, 64)
  57
  58 #define VIRTNET_DRIVER_VERSION "1.0.0"
  59
  60 static const unsigned long guest_offloads[] = {
  61         VIRTIO_NET_F_GUEST_TSO4,
  62         VIRTIO_NET_F_GUEST_TSO6,
  63         VIRTIO_NET_F_GUEST_ECN,
  64         VIRTIO_NET_F_GUEST_UFO
  65 };
  66
  67 struct virtnet_stats {
  68         struct u64_stats_sync tx_syncp;
  69         struct u64_stats_sync rx_syncp;
  70         u64 tx_bytes;
  71         u64 tx_packets;
  72
  73         u64 rx_bytes;
  74         u64 rx_packets;
  75 };
  76
  77 /* Internal representation of a send virtqueue */
  78 struct send_queue {
  79         /* Virtqueue associated with this send _queue */
  80         struct virtqueue *vq;
  81
  82         /* TX: fragments + linear part + virtio header */
  83         struct scatterlist sg[MAX_SKB_FRAGS + 2];
  84
  85         /* Name of the send queue: output.$index */
  86         char name[40];
  87
  88         struct napi_struct napi;
  89 };
  90
  91 /* Internal representation of a receive virtqueue */
  92 struct receive_queue {
  93         /* Virtqueue associated with this receive_queue */
  94         struct virtqueue *vq;
  95
  96         struct napi_struct napi;
  97
  98         struct bpf_prog __rcu *xdp_prog;
  99
 100         /* Chain pages by the private ptr. */
 101         struct page *pages;
 102
 103         /* Average packet length for mergeable receive buffers. */
 104         struct ewma_pkt_len mrg_avg_pkt_len;
 105
 106         /* Page frag for packet buffer allocation. */
 107         struct page_frag alloc_frag;
 108
 109         /* RX: fragments + linear part + virtio header */
 110         struct scatterlist sg[MAX_SKB_FRAGS + 2];
 111
 112         /* Min single buffer size for mergeable buffers case. */
 113         unsigned int min_buf_len;
 114
 115         /* Name of this receive queue: input.$index */
 116         char name[40];
 117 };
 118
 119 struct virtnet_info {
 120         struct virtio_device *vdev;
 121         struct virtqueue *cvq;
 122         struct net_device *dev;
 123         struct send_queue *sq;
 124         struct receive_queue *rq;
 125         unsigned int status;
 126
 127         /* Max # of queue pairs supported by the device */
 128         u16 max_queue_pairs;
 129
 130         /* # of queue pairs currently used by the driver */
 131         u16 curr_queue_pairs;
 132
 133         /* # of XDP queue pairs currently used by the driver */
 134         u16 xdp_queue_pairs;
 135
 136         /* I like... big packets and I cannot lie! */
 137         bool big_packets;
 138
 139         /* Host will merge rx buffers for big packets (shake it! shake it!) */
 140         bool mergeable_rx_bufs;
 141
 142         /* Has control virtqueue */
 143         bool has_cvq;
 144
 145         /* Host can handle any s/g split between our header and packet data */
 146         bool any_header_sg;
 147
 148         /* Packet virtio header size */
 149         u8 hdr_len;
 150
 151         /* Active statistics */
 152         struct virtnet_stats __percpu *stats;
 153
 154         /* Work struct for refilling if we run low on memory. */
 155         struct delayed_work refill;
 156
 157         /* Work struct for config space updates */
 158         struct work_struct config_work;
 159
 160         /* Does the affinity hint is set for virtqueues? */
 161         bool affinity_hint_set;
 162
 163         /* CPU hotplug instances for online & dead */
 164         struct hlist_node node;
 165         struct hlist_node node_dead;
 166
 167         /* Control VQ buffers: protected by the rtnl lock */
 168         struct virtio_net_ctrl_hdr ctrl_hdr;
 169         virtio_net_ctrl_ack ctrl_status;
 170         struct virtio_net_ctrl_mq ctrl_mq;
 171         u8 ctrl_promisc;
 172         u8 ctrl_allmulti;
 173         u16 ctrl_vid;
 174         u64 ctrl_offloads;
 175
 176         /* Ethtool settings */
 177         u8 duplex;
 178         u32 speed;
 179
 180         unsigned long guest_offloads;
 181 };
 182
 183 struct padded_vnet_hdr {
 184         struct virtio_net_hdr_mrg_rxbuf hdr;
 185         /*
 186          * hdr is in a separate sg buffer, and data sg buffer shares same page
 187          * with this header sg. This padding makes next sg 16 byte aligned
 188          * after the header.
 189          */
 190         char padding[4];
 191 };
 192
 193 /* Converting between virtqueue no. and kernel tx/rx queue no.
 194  * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 195  */
 196 static int vq2txq(struct virtqueue *vq)
 197 {
 198         return (vq->index - 1) / 2;
 199 }
 200
 201 static int txq2vq(int txq)
 202 {
 203         return txq * 2 + 1;
 204 }
 205
 206 static int vq2rxq(struct virtqueue *vq)
 207 {
 208         return vq->index / 2;
 209 }
 210
 211 static int rxq2vq(int rxq)
 212 {
 213         return rxq * 2;
 214 }
 215
 216 static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
 217 {
 218         return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
 219 }
 220
 221 /*
 222  * private is used to chain pages for big packets, put the whole
 223  * most recent used list in the beginning for reuse
 224  */
 225 static void give_pages(struct receive_queue *rq, struct page *page)
 226 {
 227         struct page *end;
 228
 229         /* Find end of list, sew whole thing into vi->rq.pages. */
 230         for (end = page; end->private; end = (struct page *)end->private);
 231         end->private = (unsigned long)rq->pages;
 232         rq->pages = page;
 233 }
 234
 235 static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
 236 {
 237         struct page *p = rq->pages;
 238
 239         if (p) {
 240                 rq->pages = (struct page *)p->private;
 241                 /* clear private here, it is used to chain pages */
 242                 p->private = 0;
 243         } else
 244                 p = alloc_page(gfp_mask);
 245         return p;
 246 }
 247
 248 static void virtqueue_napi_schedule(struct napi_struct *napi,
 249                                     struct virtqueue *vq)
 250 {
 251         if (napi_schedule_prep(napi)) {
 252                 virtqueue_disable_cb(vq);
 253                 __napi_schedule(napi);
 254         }
 255 }
 256
 257 static void virtqueue_napi_complete(struct napi_struct *napi,
 258                                     struct virtqueue *vq, int processed)
 259 {
 260         int opaque;
 261
 262         opaque = virtqueue_enable_cb_prepare(vq);
 263         if (napi_complete_done(napi, processed) &&
 264             unlikely(virtqueue_poll(vq, opaque)))
 265                 virtqueue_napi_schedule(napi, vq);
 266 }
 267
 268 static void skb_xmit_done(struct virtqueue *vq)
 269 {
 270         struct virtnet_info *vi = vq->vdev->priv;
 271         struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;
 272
 273         /* Suppress further interrupts. */
 274         virtqueue_disable_cb(vq);
 275
 276         if (napi->weight)
 277                 virtqueue_napi_schedule(napi, vq);
 278         else
 279                 /* We were probably waiting for more output buffers. */
 280                 netif_wake_subqueue(vi->dev, vq2txq(vq));
 281 }
 282
 283 #define MRG_CTX_HEADER_SHIFT 22
 284 static void *mergeable_len_to_ctx(unsigned int truesize,
 285                                   unsigned int headroom)
 286 {
 287         return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
 288 }
 289
 290 static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
 291 {
 292         return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
 293 }
 294
 295 static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
 296 {
 297         return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
 298 }
 299
 300 /* Called from bottom half context */
 301 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 302                                    struct receive_queue *rq,
 303                                    struct page *page, unsigned int offset,
 304                                    unsigned int len, unsigned int truesize)
 305 {
 306         struct sk_buff *skb;
 307         struct virtio_net_hdr_mrg_rxbuf *hdr;
 308         unsigned int copy, hdr_len, hdr_padded_len;
 309         char *p;
 310
 311         p = page_address(page) + offset;
 312
 313         /* copy small packet so we can reuse these pages for small data */
 314         skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
 315         if (unlikely(!skb))
 316                 return NULL;
 317
 318         hdr = skb_vnet_hdr(skb);
 319
 320         hdr_len = vi->hdr_len;
 321         if (vi->mergeable_rx_bufs)
 322                 hdr_padded_len = sizeof(*hdr);
 323         else
 324                 hdr_padded_len = sizeof(struct padded_vnet_hdr);
 325
 326         memcpy(hdr, p, hdr_len);
 327
 328         len -= hdr_len;
 329         offset += hdr_padded_len;
 330         p += hdr_padded_len;
 331
 332         copy = len;
 333         if (copy > skb_tailroom(skb))
 334                 copy = skb_tailroom(skb);
 335         skb_put_data(skb, p, copy);
 336
 337         len -= copy;
 338         offset += copy;
 339
 340         if (vi->mergeable_rx_bufs) {
 341                 if (len)
 342                         skb_add_rx_frag(skb, 0, page, offset, len, truesize);
 343                 else
 344                         put_page(page);
 345                 return skb;
 346         }
 347
 348         /*
 349          * Verify that we can indeed put this data into a skb.
 350          * This is here to handle cases when the device erroneously
 351          * tries to receive more than is possible. This is usually
 352          * the case of a broken device.
 353          */
 354         if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
 355                 net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
 356                 dev_kfree_skb(skb);
 357                 return NULL;
 358         }
 359         BUG_ON(offset >= PAGE_SIZE);
 360         while (len) {
 361                 unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
 362                 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
 363                                 frag_size, truesize);
 364                 len -= frag_size;
 365                 page = (struct page *)page->private;
 366                 offset = 0;
 367         }
 368
 369         if (page)
 370                 give_pages(rq, page);
 371
 372         return skb;
 373 }
 374
 375 static bool virtnet_xdp_xmit(struct virtnet_info *vi,
 376                              struct receive_queue *rq,
 377                              struct xdp_buff *xdp)
 378 {
 379         struct virtio_net_hdr_mrg_rxbuf *hdr;
 380         unsigned int len;
 381         struct send_queue *sq;
 382         unsigned int qp;
 383         void *xdp_sent;
 384         int err;
 385
 386         qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 387         sq = &vi->sq[qp];
 388
 389         /* Free up any pending old buffers before queueing new ones. */
 390         while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
 391                 struct page *sent_page = virt_to_head_page(xdp_sent);
 392
 393                 put_page(sent_page);
 394         }
 395
 396         xdp->data -= vi->hdr_len;
 397         /* Zero header and leave csum up to XDP layers */
 398         hdr = xdp->data;
 399         memset(hdr, 0, vi->hdr_len);
 400
 401         sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
 402
 403         err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
 404         if (unlikely(err)) {
 405                 struct page *page = virt_to_head_page(xdp->data);
 406
 407                 put_page(page);
 408                 return false;
 409         }
 410
 411         virtqueue_kick(sq->vq);
 412         return true;
 413 }
 414
 415 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 416 {
 417         return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
 418 }
 419
 420 /* We copy the packet for XDP in the following cases:
 421  *
 422  * 1) Packet is scattered across multiple rx buffers.
 423  * 2) Headroom space is insufficient.
 424  *
 425  * This is inefficient but it's a temporary condition that
 426  * we hit right after XDP is enabled and until queue is refilled
 427  * with large buffers with sufficient headroom - so it should affect
 428  * at most queue size packets.
 429  * Afterwards, the conditions to enable
 430  * XDP should preclude the underlying device from sending packets
 431  * across multiple buffers (num_buf > 1), and we make sure buffers
 432  * have enough headroom.
 433  */
 434 static struct page *xdp_linearize_page(struct receive_queue *rq,
 435                                        u16 *num_buf,
 436                                        struct page *p,
 437                                        int offset,
 438                                        int page_off,
 439                                        unsigned int *len)
 440 {
 441         struct page *page = alloc_page(GFP_ATOMIC);
 442
 443         if (!page)
 444                 return NULL;
 445
 446         memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
 447         page_off += *len;
 448
 449         while (--*num_buf) {
 450                 unsigned int buflen;
 451                 void *buf;
 452                 int off;
 453
 454                 buf = virtqueue_get_buf(rq->vq, &buflen);
 455                 if (unlikely(!buf))
 456                         goto err_buf;
 457
 458                 p = virt_to_head_page(buf);
 459                 off = buf - page_address(p);
 460
 461                 /* guard against a misconfigured or uncooperative backend that
 462                  * is sending packet larger than the MTU.
 463                  */
 464                 if ((page_off + buflen) > PAGE_SIZE) {
 465                         put_page(p);
 466                         goto err_buf;
 467                 }
 468
 469                 memcpy(page_address(page) + page_off,
 470                        page_address(p) + off, buflen);
 471                 page_off += buflen;
 472                 put_page(p);
 473         }
 474
 475         /* Headroom does not contribute to packet length */
 476         *len = page_off - VIRTIO_XDP_HEADROOM;
 477         return page;
 478 err_buf:
 479         __free_pages(page, 0);
 480         return NULL;
 481 }
 482
 483 static struct sk_buff *receive_small(struct net_device *dev,
 484                                      struct virtnet_info *vi,
 485                                      struct receive_queue *rq,
 486                                      void *buf, void *ctx,
 487                                      unsigned int len)
 488 {
 489         struct sk_buff *skb;
 490         struct bpf_prog *xdp_prog;
 491         unsigned int xdp_headroom = (unsigned long)ctx;
 492         unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
 493         unsigned int headroom = vi->hdr_len + header_offset;
 494         unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 495                               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 496         struct page *page = virt_to_head_page(buf);
 497         unsigned int delta = 0;
 498         struct page *xdp_page;
 499         len -= vi->hdr_len;
 500
 501         rcu_read_lock();
 502         xdp_prog = rcu_dereference(rq->xdp_prog);
 503         if (xdp_prog) {
 504                 struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
 505                 struct xdp_buff xdp;
 506                 void *orig_data;
 507                 u32 act;
 508
 509                 if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
 510                         goto err_xdp;
 511
 512                 if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
 513                         int offset = buf - page_address(page) + header_offset;
 514                         unsigned int tlen = len + vi->hdr_len;
 515                         u16 num_buf = 1;
 516
 517                         xdp_headroom = virtnet_get_headroom(vi);
 518                         header_offset = VIRTNET_RX_PAD + xdp_headroom;
 519                         headroom = vi->hdr_len + header_offset;
 520                         buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
 521                                  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 522                         xdp_page = xdp_linearize_page(rq, &num_buf, page,
 523                                                       offset, header_offset,
 524                                                       &tlen);
 525                         if (!xdp_page)
 526                                 goto err_xdp;
 527
 528                         buf = page_address(xdp_page);
 529                         put_page(page);
 530                         page = xdp_page;
 531                 }
 532
 533                 xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
 534                 xdp.data = xdp.data_hard_start + xdp_headroom;
 535                 xdp.data_end = xdp.data + len;
 536                 orig_data = xdp.data;
 537                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
 538
 539                 switch (act) {
 540                 case XDP_PASS:
 541                         /* Recalculate length in case bpf program changed it */
 542                         delta = orig_data - xdp.data;
 543                         break;
 544                 case XDP_TX:
 545                         if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
 546                                 trace_xdp_exception(vi->dev, xdp_prog, act);
 547                         rcu_read_unlock();
 548                         goto xdp_xmit;
 549                 default:
 550                         bpf_warn_invalid_xdp_action(act);
 551                 case XDP_ABORTED:
 552                         trace_xdp_exception(vi->dev, xdp_prog, act);
 553                 case XDP_DROP:
 554                         goto err_xdp;
 555                 }
 556         }
 557         rcu_read_unlock();
 558
 559         skb = build_skb(buf, buflen);
 560         if (!skb) {
 561                 put_page(page);
 562                 goto err;
 563         }
 564         skb_reserve(skb, headroom - delta);
 565         skb_put(skb, len + delta);
 566         if (!delta) {
 567                 buf += header_offset;
 568                 memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
 569         } /* keep zeroed vnet hdr since packet was changed by bpf */
 570
 571 err:
 572         return skb;
 573
 574 err_xdp:
 575         rcu_read_unlock();
 576         dev->stats.rx_dropped++;
 577         put_page(page);
 578 xdp_xmit:
 579         return NULL;
 580 }
 581
 582 static struct sk_buff *receive_big(struct net_device *dev,
 583                                    struct virtnet_info *vi,
 584                                    struct receive_queue *rq,
 585                                    void *buf,
 586                                    unsigned int len)
 587 {
 588         struct page *page = buf;
 589         struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 590
 591         if (unlikely(!skb))
 592                 goto err;
 593
 594         return skb;
 595
 596 err:
 597         dev->stats.rx_dropped++;
 598         give_pages(rq, page);
 599         return NULL;
 600 }
 601
 602 static struct sk_buff *receive_mergeable(struct net_device *dev,
 603                                          struct virtnet_info *vi,
 604                                          struct receive_queue *rq,
 605                                          void *buf,
 606                                          void *ctx,
 607                                          unsigned int len)
 608 {
 609         struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
 610         u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 611         struct page *page = virt_to_head_page(buf);
 612         int offset = buf - page_address(page);
 613         struct sk_buff *head_skb, *curr_skb;
 614         struct bpf_prog *xdp_prog;
 615         unsigned int truesize;
 616         unsigned int headroom = mergeable_ctx_to_headroom(ctx);
 617
 618         head_skb = NULL;
 619
 620         rcu_read_lock();
 621         xdp_prog = rcu_dereference(rq->xdp_prog);
 622         if (xdp_prog) {
 623                 struct page *xdp_page;
 624                 struct xdp_buff xdp;
 625                 void *data;
 626                 u32 act;
 627
 628                 /* This happens when rx buffer size is underestimated */
 629                 if (unlikely(num_buf > 1 ||
 630                              headroom < virtnet_get_headroom(vi))) {
 631                         /* linearize data for XDP */
 632                         xdp_page = xdp_linearize_page(rq, &num_buf,
 633                                                       page, offset,
 634                                                       VIRTIO_XDP_HEADROOM,
 635                                                       &len);
 636                         if (!xdp_page)
 637                                 goto err_xdp;
 638                         offset = VIRTIO_XDP_HEADROOM;
 639                 } else {
 640                         xdp_page = page;
 641                 }
 642
 643                 /* Transient failure which in theory could occur if
 644                  * in-flight packets from before XDP was enabled reach
 645                  * the receive path after XDP is loaded. In practice I
 646                  * was not able to create this condition.
 647                  */
 648                 if (unlikely(hdr->hdr.gso_type))
 649                         goto err_xdp;
 650
 651                 /* Allow consuming headroom but reserve enough space to push
 652                  * the descriptor on if we get an XDP_TX return code.
 653                  */
 654                 data = page_address(xdp_page) + offset;
 655                 xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 656                 xdp.data = data + vi->hdr_len;
 657                 xdp.data_end = xdp.data + (len - vi->hdr_len);
 658                 act = bpf_prog_run_xdp(xdp_prog, &xdp);
 659
 660                 switch (act) {
 661                 case XDP_PASS:
 662                         /* recalculate offset to account for any header
 663                          * adjustments. Note other cases do not build an
 664                          * skb and avoid using offset
 665                          */
 666                         offset = xdp.data -
 667                                         page_address(xdp_page) - vi->hdr_len;
 668
 669                         /* We can only create skb based on xdp_page. */
 670                         if (unlikely(xdp_page != page)) {
 671                                 rcu_read_unlock();
 672                                 put_page(page);
 673                                 head_skb = page_to_skb(vi, rq, xdp_page,
 674                                                        offset, len, PAGE_SIZE);
 675                                 ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 676                                 return head_skb;
 677                         }
 678                         break;
 679                 case XDP_TX:
 680                         if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
 681                                 trace_xdp_exception(vi->dev, xdp_prog, act);
 682                         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 683                         if (unlikely(xdp_page != page))
 684                                 goto err_xdp;
 685                         rcu_read_unlock();
 686                         goto xdp_xmit;
 687                 default:
 688                         bpf_warn_invalid_xdp_action(act);
 689                 case XDP_ABORTED:
 690                         trace_xdp_exception(vi->dev, xdp_prog, act);
 691                 case XDP_DROP:
 692                         if (unlikely(xdp_page != page))
 693                                 __free_pages(xdp_page, 0);
 694                         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 695                         goto err_xdp;
 696                 }
 697         }
 698         rcu_read_unlock();
 699
 700         truesize = mergeable_ctx_to_truesize(ctx);
 701         if (unlikely(len > truesize)) {
 702                 pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 703                          dev->name, len, (unsigned long)ctx);
 704                 dev->stats.rx_length_errors++;
 705                 goto err_skb;
 706         }
 707
 708         head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
 709         curr_skb = head_skb;
 710
 711         if (unlikely(!curr_skb))
 712                 goto err_skb;
 713         while (--num_buf) {
 714                 int num_skb_frags;
 715
 716                 buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
 717                 if (unlikely(!ctx)) {
 718                         pr_debug("%s: rx error: %d buffers out of %d missing\n",
 719                                  dev->name, num_buf,
 720                                  virtio16_to_cpu(vi->vdev,
 721                                                  hdr->num_buffers));
 722                         dev->stats.rx_length_errors++;
 723                         goto err_buf;
 724                 }
 725
 726                 page = virt_to_head_page(buf);
 727
 728                 truesize = mergeable_ctx_to_truesize(ctx);
 729                 if (unlikely(len > truesize)) {
 730                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 731                                  dev->name, len, (unsigned long)ctx);
 732                         dev->stats.rx_length_errors++;
 733                         goto err_skb;
 734                 }
 735
 736                 num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
 737                 if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
 738                         struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
 739
 740                         if (unlikely(!nskb))
 741                                 goto err_skb;
 742                         if (curr_skb == head_skb)
 743                                 skb_shinfo(curr_skb)->frag_list = nskb;
 744                         else
 745                                 curr_skb->next = nskb;
 746                         curr_skb = nskb;
 747                         head_skb->truesize += nskb->truesize;
 748                         num_skb_frags = 0;
 749                 }
 750                 if (curr_skb != head_skb) {
 751                         head_skb->data_len += len;
 752                         head_skb->len += len;
 753                         head_skb->truesize += truesize;
 754                 }
 755                 offset = buf - page_address(page);
 756                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 757                         put_page(page);
 758                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
 759                                              len, truesize);
 760                 } else {
 761                         skb_add_rx_frag(curr_skb, num_skb_frags, page,
 762                                         offset, len, truesize);
 763                 }
 764         }
 765
 766         ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
 767         return head_skb;
 768
 769 err_xdp:
 770         rcu_read_unlock();
 771 err_skb:
 772         put_page(page);
 773         while (--num_buf) {
 774                 buf = virtqueue_get_buf(rq->vq, &len);
 775                 if (unlikely(!buf)) {
 776                         pr_debug("%s: rx error: %d buffers missing\n",
 777                                  dev->name, num_buf);
 778                         dev->stats.rx_length_errors++;
 779                         break;
 780                 }
 781                 page = virt_to_head_page(buf);
 782                 put_page(page);
 783         }
 784 err_buf:
 785         dev->stats.rx_dropped++;
 786         dev_kfree_skb(head_skb);
 787 xdp_xmit:
 788         return NULL;
 789 }
 790
 791 static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 792                        void *buf, unsigned int len, void **ctx)
 793 {
 794         struct net_device *dev = vi->dev;
 795         struct sk_buff *skb;
 796         struct virtio_net_hdr_mrg_rxbuf *hdr;
 797         int ret;
 798
 799         if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
 800                 pr_debug("%s: short packet %i\n", dev->name, len);
 801                 dev->stats.rx_length_errors++;
 802                 if (vi->mergeable_rx_bufs) {
 803                         put_page(virt_to_head_page(buf));
 804                 } else if (vi->big_packets) {
 805                         give_pages(rq, buf);
 806                 } else {
 807                         put_page(virt_to_head_page(buf));
 808                 }
 809                 return 0;
 810         }
 811
 812         if (vi->mergeable_rx_bufs)
 813                 skb = receive_mergeable(dev, vi, rq, buf, ctx, len);
 814         else if (vi->big_packets)
 815                 skb = receive_big(dev, vi, rq, buf, len);
 816         else
 817                 skb = receive_small(dev, vi, rq, buf, ctx, len);
 818
 819         if (unlikely(!skb))
 820                 return 0;
 821
 822         hdr = skb_vnet_hdr(skb);
 823
 824         ret = skb->len;
 825
 826         if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
 827                 skb->ip_summed = CHECKSUM_UNNECESSARY;
 828
 829         if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
 830                                   virtio_is_little_endian(vi->vdev))) {
 831                 net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
 832                                      dev->name, hdr->hdr.gso_type,
 833                                      hdr->hdr.gso_size);
 834                 goto frame_err;
 835         }
 836
 837         skb->protocol = eth_type_trans(skb, dev);
 838         pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
 839                  ntohs(skb->protocol), skb->len, skb->pkt_type);
 840
 841         napi_gro_receive(&rq->napi, skb);
 842         return ret;
 843
 844 frame_err:
 845         dev->stats.rx_frame_errors++;
 846         dev_kfree_skb(skb);
 847         return 0;
 848 }
 849
 850 /* Unlike mergeable buffers, all buffers are allocated to the
 851  * same size, except for the headroom. For this reason we do
 852  * not need to use  mergeable_len_to_ctx here - it is enough
 853  * to store the headroom as the context ignoring the truesize.
 854  */
 855 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 856                              gfp_t gfp)
 857 {
 858         struct page_frag *alloc_frag = &rq->alloc_frag;
 859         char *buf;
 860         unsigned int xdp_headroom = virtnet_get_headroom(vi);
 861         void *ctx = (void *)(unsigned long)xdp_headroom;
 862         int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
 863         int err;
 864
 865         len = SKB_DATA_ALIGN(len) +
 866               SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 867         if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 868                 return -ENOMEM;
 869
 870         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 871         get_page(alloc_frag->page);
 872         alloc_frag->offset += len;
 873         sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
 874                     vi->hdr_len + GOOD_PACKET_LEN);
 875         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
 876         if (err < 0)
 877                 put_page(virt_to_head_page(buf));
 878         return err;
 879 }
 880
 881 static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
 882                            gfp_t gfp)
 883 {
 884         struct page *first, *list = NULL;
 885         char *p;
 886         int i, err, offset;
 887
 888         sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);
 889
 890         /* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
 891         for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
 892                 first = get_a_page(rq, gfp);
 893                 if (!first) {
 894                         if (list)
 895                                 give_pages(rq, list);
 896                         return -ENOMEM;
 897                 }
 898                 sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
 899
 900                 /* chain new page in list head to match sg */
 901                 first->private = (unsigned long)list;
 902                 list = first;
 903         }
 904
 905         first = get_a_page(rq, gfp);
 906         if (!first) {
 907                 give_pages(rq, list);
 908                 return -ENOMEM;
 909         }
 910         p = page_address(first);
 911
 912         /* rq->sg[0], rq->sg[1] share the same page */
 913         /* a separated rq->sg[0] for header - required in case !any_header_sg */
 914         sg_set_buf(&rq->sg[0], p, vi->hdr_len);
 915
 916         /* rq->sg[1] for data packet, from offset */
 917         offset = sizeof(struct padded_vnet_hdr);
 918         sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
 919
 920         /* chain first in list head */
 921         first->private = (unsigned long)list;
 922         err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
 923                                   first, gfp);
 924         if (err < 0)
 925                 give_pages(rq, first);
 926
 927         return err;
 928 }
 929
 930 static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
 931                                           struct ewma_pkt_len *avg_pkt_len)
 932 {
 933         const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 934         unsigned int len;
 935
 936         len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
 937                                 rq->min_buf_len, PAGE_SIZE - hdr_len);
 938         return ALIGN(len, L1_CACHE_BYTES);
 939 }
 940
 941 static int add_recvbuf_mergeable(struct virtnet_info *vi,
 942                                  struct receive_queue *rq, gfp_t gfp)
 943 {
 944         struct page_frag *alloc_frag = &rq->alloc_frag;
 945         unsigned int headroom = virtnet_get_headroom(vi);
 946         char *buf;
 947         void *ctx;
 948         int err;
 949         unsigned int len, hole;
 950
 951         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
 952         if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
 953                 return -ENOMEM;
 954
 955         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 956         buf += headroom; /* advance address leaving hole at front of pkt */
 957         get_page(alloc_frag->page);
 958         alloc_frag->offset += len + headroom;
 959         hole = alloc_frag->size - alloc_frag->offset;
 960         if (hole < len + headroom) {
 961                 /* To avoid internal fragmentation, if there is very likely not
 962                  * enough space for another buffer, add the remaining space to
 963                  * the current buffer.
 964                  */
 965                 len += hole;
 966                 alloc_frag->offset += hole;
 967         }
 968
 969         sg_init_one(rq->sg, buf, len);
 970         ctx = mergeable_len_to_ctx(len, headroom);
 971         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
 972         if (err < 0)
 973                 put_page(virt_to_head_page(buf));
 974
 975         return err;
 976 }
 977
 978 /*
 979  * Returns false if we couldn't fill entirely (OOM).
 980  *
 981  * Normally run in the receive path, but can also be run from ndo_open
 982  * before we're receiving packets, or from refill_work which is
 983  * careful to disable receiving (using napi_disable).
 984  */
 985 static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
 986                           gfp_t gfp)
 987 {
 988         int err;
 989         bool oom;
 990
 991         gfp |= __GFP_COLD;
 992         do {
 993                 if (vi->mergeable_rx_bufs)
 994                         err = add_recvbuf_mergeable(vi, rq, gfp);
 995                 else if (vi->big_packets)
 996                         err = add_recvbuf_big(vi, rq, gfp);
 997                 else
 998                         err = add_recvbuf_small(vi, rq, gfp);
 999
1000                 oom = err == -ENOMEM;
1001                 if (err)
1002                         break;
1003         } while (rq->vq->num_free);
1004         virtqueue_kick(rq->vq);
1005         return !oom;
1006 }
1007
1008 static void skb_recv_done(struct virtqueue *rvq)
1009 {
1010         struct virtnet_info *vi = rvq->vdev->priv;
1011         struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
1012
1013         virtqueue_napi_schedule(&rq->napi, rvq);
1014 }
1015
1016 static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
1017 {
1018         napi_enable(napi);
1019
1020         /* If all buffers were filled by other side before we napi_enabled, we
1021          * won't get another interrupt, so process any outstanding packets now.
1022          * Call local_bh_enable after to trigger softIRQ processing.
1023          */
1024         local_bh_disable();
1025         virtqueue_napi_schedule(napi, vq);
1026         local_bh_enable();
1027 }
1028
1029 static void virtnet_napi_tx_enable(struct virtnet_info *vi,
1030                                    struct virtqueue *vq,
1031                                    struct napi_struct *napi)
1032 {
1033         if (!napi->weight)
1034                 return;
1035
1036         /* Tx napi touches cachelines on the cpu handling tx interrupts. Only
1037          * enable the feature if this is likely affine with the transmit path.
1038          */
1039         if (!vi->affinity_hint_set) {
1040                 napi->weight = 0;
1041                 return;
1042         }
1043
1044         return virtnet_napi_enable(vq, napi);
1045 }
1046
1047 static void virtnet_napi_tx_disable(struct napi_struct *napi)
1048 {
1049         if (napi->weight)
1050                 napi_disable(napi);
1051 }
1052
1053 static void refill_work(struct work_struct *work)
1054 {
1055         struct virtnet_info *vi =
1056                 container_of(work, struct virtnet_info, refill.work);
1057         bool still_empty;
1058         int i;
1059
1060         for (i = 0; i < vi->curr_queue_pairs; i++) {
1061                 struct receive_queue *rq = &vi->rq[i];
1062
1063                 napi_disable(&rq->napi);
1064                 still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
1065                 virtnet_napi_enable(rq->vq, &rq->napi);
1066
1067                 /* In theory, this can happen: if we don't get any buffers in
1068                  * we will *never* try to fill again.
1069                  */
1070                 if (still_empty)
1071                         schedule_delayed_work(&vi->refill, HZ/2);
1072         }
1073 }
1074
1075 static int virtnet_receive(struct receive_queue *rq, int budget)
1076 {
1077         struct virtnet_info *vi = rq->vq->vdev->priv;
1078         unsigned int len, received = 0, bytes = 0;
1079         void *buf;
1080         struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
1081
1082         if (!vi->big_packets || vi->mergeable_rx_bufs) {
1083                 void *ctx;
1084
1085                 while (received < budget &&
1086                        (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
1087                         bytes += receive_buf(vi, rq, buf, len, ctx);
1088                         received++;
1089                 }
1090         } else {
1091                 while (received < budget &&
1092                        (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
1093                         bytes += receive_buf(vi, rq, buf, len, NULL);
1094                         received++;
1095                 }
1096         }
1097
1098         if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
1099                 if (!try_fill_recv(vi, rq, GFP_ATOMIC))
1100                         schedule_delayed_work(&vi->refill, 0);
1101         }
1102
1103         u64_stats_update_begin(&stats->rx_syncp);
1104         stats->rx_bytes += bytes;
1105         stats->rx_packets += received;
1106         u64_stats_update_end(&stats->rx_syncp);
1107
1108         return received;
1109 }
1110
1111 static void free_old_xmit_skbs(struct send_queue *sq)
1112 {
1113         struct sk_buff *skb;
1114         unsigned int len;
1115         struct virtnet_info *vi = sq->vq->vdev->priv;
1116         struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
1117         unsigned int packets = 0;
1118         unsigned int bytes = 0;
1119
1120         while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
1121                 pr_debug("Sent skb %p\n", skb);
1122
1123                 bytes += skb->len;
1124                 packets++;
1125
1126                 dev_consume_skb_any(skb);
1127         }
1128
1129         /* Avoid overhead when no packets have been processed
1130          * happens when called speculatively from start_xmit.
1131          */
1132         if (!packets)
1133                 return;
1134
1135         u64_stats_update_begin(&stats->tx_syncp);
1136         stats->tx_bytes += bytes;
1137         stats->tx_packets += packets;
1138         u64_stats_update_end(&stats->tx_syncp);
1139 }
1140
1141 static void virtnet_poll_cleantx(struct receive_queue *rq)
1142 {
1143         struct virtnet_info *vi = rq->vq->vdev->priv;
1144         unsigned int index = vq2rxq(rq->vq);
1145         struct send_queue *sq = &vi->sq[index];
1146         struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);
1147
1148         if (!sq->napi.weight)
1149                 return;
1150
1151         if (__netif_tx_trylock(txq)) {
1152                 free_old_xmit_skbs(sq);
1153                 __netif_tx_unlock(txq);
1154         }
1155
1156         if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1157                 netif_tx_wake_queue(txq);
1158 }
1159
1160 static int virtnet_poll(struct napi_struct *napi, int budget)
1161 {
1162         struct receive_queue *rq =
1163                 container_of(napi, struct receive_queue, napi);
1164         unsigned int received;
1165
1166         virtnet_poll_cleantx(rq);
1167
1168         received = virtnet_receive(rq, budget);
1169
1170         /* Out of packets? */
1171         if (received < budget)
1172                 virtqueue_napi_complete(napi, rq->vq, received);
1173
1174         return received;
1175 }
1176
1177 static int virtnet_open(struct net_device *dev)
1178 {
1179         struct virtnet_info *vi = netdev_priv(dev);
1180         int i;
1181
1182         for (i = 0; i < vi->max_queue_pairs; i++) {
1183                 if (i < vi->curr_queue_pairs)
1184                         /* Make sure we have some buffers: if oom use wq. */
1185                         if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1186                                 schedule_delayed_work(&vi->refill, 0);
1187                 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1188                 virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
1189         }
1190
1191         return 0;
1192 }
1193
1194 static int virtnet_poll_tx(struct napi_struct *napi, int budget)
1195 {
1196         struct send_queue *sq = container_of(napi, struct send_queue, napi);
1197         struct virtnet_info *vi = sq->vq->vdev->priv;
1198         struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));
1199
1200         __netif_tx_lock(txq, raw_smp_processor_id());
1201         free_old_xmit_skbs(sq);
1202         __netif_tx_unlock(txq);
1203
1204         virtqueue_napi_complete(napi, sq->vq, 0);
1205
1206         if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
1207                 netif_tx_wake_queue(txq);
1208
1209         return 0;
1210 }
1211
1212 static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
1213 {
1214         struct virtio_net_hdr_mrg_rxbuf *hdr;
1215         const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
1216         struct virtnet_info *vi = sq->vq->vdev->priv;
1217         int num_sg;
1218         unsigned hdr_len = vi->hdr_len;
1219         bool can_push;
1220
1221         pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
1222
1223         can_push = vi->any_header_sg &&
1224                 !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
1225                 !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
1226         /* Even if we can, don't push here yet as this would skew
1227          * csum_start offset below. */
1228         if (can_push)
1229                 hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
1230         else
1231                 hdr = skb_vnet_hdr(skb);
1232
1233         if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
1234                                     virtio_is_little_endian(vi->vdev), false))
1235                 BUG();
1236
1237         if (vi->mergeable_rx_bufs)
1238                 hdr->num_buffers = 0;
1239
1240         sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
1241         if (can_push) {
1242                 __skb_push(skb, hdr_len);
1243                 num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
1244                 if (unlikely(num_sg < 0))
1245                         return num_sg;
1246                 /* Pull header back to avoid skew in tx bytes calculations. */
1247                 __skb_pull(skb, hdr_len);
1248         } else {
1249                 sg_set_buf(sq->sg, hdr, hdr_len);
1250                 num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
1251                 if (unlikely(num_sg < 0))
1252                         return num_sg;
1253                 num_sg++;
1254         }
1255         return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
1256 }
1257
1258 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
1259 {
1260         struct virtnet_info *vi = netdev_priv(dev);
1261         int qnum = skb_get_queue_mapping(skb);
1262         struct send_queue *sq = &vi->sq[qnum];
1263         int err;
1264         struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
1265         bool kick = !skb->xmit_more;
1266         bool use_napi = sq->napi.weight;
1267
1268         /* Free up any pending old buffers before queueing new ones. */
1269         free_old_xmit_skbs(sq);
1270
1271         if (use_napi && kick)
1272                 virtqueue_enable_cb_delayed(sq->vq);
1273
1274         /* timestamp packet in software */
1275         skb_tx_timestamp(skb);
1276
1277         /* Try to transmit */
1278         err = xmit_skb(sq, skb);
1279
1280         /* This should not happen! */
1281         if (unlikely(err)) {
1282                 dev->stats.tx_fifo_errors++;
1283                 if (net_ratelimit())
1284                         dev_warn(&dev->dev,
1285                                  "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
1286                 dev->stats.tx_dropped++;
1287                 dev_kfree_skb_any(skb);
1288                 return NETDEV_TX_OK;
1289         }
1290
1291         /* Don't wait up for transmitted skbs to be freed. */
1292         if (!use_napi) {
1293                 skb_orphan(skb);
1294                 nf_reset(skb);
1295         }
1296
1297         /* If running out of space, stop queue to avoid getting packets that we
1298          * are then unable to transmit.
1299          * An alternative would be to force queuing layer to requeue the skb by
1300          * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
1301          * returned in a normal path of operation: it means that driver is not
1302          * maintaining the TX queue stop/start state properly, and causes
1303          * the stack to do a non-trivial amount of useless work.
1304          * Since most packets only take 1 or 2 ring slots, stopping the queue
1305          * early means 16 slots are typically wasted.
1306          */
1307         if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
1308                 netif_stop_subqueue(dev, qnum);
1309                 if (!use_napi &&
1310                     unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
1311                         /* More just got used, free them then recheck. */
1312                         free_old_xmit_skbs(sq);
1313                         if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
1314                                 netif_start_subqueue(dev, qnum);
1315                                 virtqueue_disable_cb(sq->vq);
1316                         }
1317                 }
1318         }
1319
1320         if (kick || netif_xmit_stopped(txq))
1321                 virtqueue_kick(sq->vq);
1322
1323         return NETDEV_TX_OK;
1324 }
1325
1326 /*
1327  * Send command via the control virtqueue and check status.  Commands
1328  * supported by the hypervisor, as indicated by feature bits, should
1329  * never fail unless improperly formatted.
1330  */
1331 static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
1332                                  struct scatterlist *out)
1333 {
1334         struct scatterlist *sgs[4], hdr, stat;
1335         unsigned out_num = 0, tmp;
1336
1337         /* Caller should know better */
1338         BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
1339
1340         vi->ctrl_status = ~0;
1341         vi->ctrl_hdr.class = class;
1342         vi->ctrl_hdr.cmd = cmd;
1343         /* Add header */
1344         sg_init_one(&hdr, &vi->ctrl_hdr, sizeof(vi->ctrl_hdr));
1345         sgs[out_num++] = &hdr;
1346
1347         if (out)
1348                 sgs[out_num++] = out;
1349
1350         /* Add return status. */
1351         sg_init_one(&stat, &vi->ctrl_status, sizeof(vi->ctrl_status));
1352         sgs[out_num] = &stat;
1353
1354         BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
1355         virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
1356
1357         if (unlikely(!virtqueue_kick(vi->cvq)))
1358                 return vi->ctrl_status == VIRTIO_NET_OK;
1359
1360         /* Spin for a response, the kick causes an ioport write, trapping
1361          * into the hypervisor, so the request should be handled immediately.
1362          */
1363         while (!virtqueue_get_buf(vi->cvq, &tmp) &&
1364                !virtqueue_is_broken(vi->cvq))
1365                 cpu_relax();
1366
1367         return vi->ctrl_status == VIRTIO_NET_OK;
1368 }
1369
1370 static int virtnet_set_mac_address(struct net_device *dev, void *p)
1371 {
1372         struct virtnet_info *vi = netdev_priv(dev);
1373         struct virtio_device *vdev = vi->vdev;
1374         int ret;
1375         struct sockaddr *addr;
1376         struct scatterlist sg;
1377
1378         addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
1379         if (!addr)
1380                 return -ENOMEM;
1381
1382         ret = eth_prepare_mac_addr_change(dev, addr);
1383         if (ret)
1384                 goto out;
1385
1386         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
1387                 sg_init_one(&sg, addr->sa_data, dev->addr_len);
1388                 if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1389                                           VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
1390                         dev_warn(&vdev->dev,
1391                                  "Failed to set mac address by vq command.\n");
1392                         ret = -EINVAL;
1393                         goto out;
1394                 }
1395         } else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
1396                    !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1397                 unsigned int i;
1398
1399                 /* Naturally, this has an atomicity problem. */
1400                 for (i = 0; i < dev->addr_len; i++)
1401                         virtio_cwrite8(vdev,
1402                                        offsetof(struct virtio_net_config, mac) +
1403                                        i, addr->sa_data[i]);
1404         }
1405
1406         eth_commit_mac_addr_change(dev, p);
1407         ret = 0;
1408
1409 out:
1410         kfree(addr);
1411         return ret;
1412 }
1413
1414 static void virtnet_stats(struct net_device *dev,
1415                           struct rtnl_link_stats64 *tot)
1416 {
1417         struct virtnet_info *vi = netdev_priv(dev);
1418         int cpu;
1419         unsigned int start;
1420
1421         for_each_possible_cpu(cpu) {
1422                 struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
1423                 u64 tpackets, tbytes, rpackets, rbytes;
1424
1425                 do {
1426                         start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
1427                         tpackets = stats->tx_packets;
1428                         tbytes   = stats->tx_bytes;
1429                 } while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
1430
1431                 do {
1432                         start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
1433                         rpackets = stats->rx_packets;
1434                         rbytes   = stats->rx_bytes;
1435                 } while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
1436
1437                 tot->rx_packets += rpackets;
1438                 tot->tx_packets += tpackets;
1439                 tot->rx_bytes   += rbytes;
1440                 tot->tx_bytes   += tbytes;
1441         }
1442
1443         tot->tx_dropped = dev->stats.tx_dropped;
1444         tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
1445         tot->rx_dropped = dev->stats.rx_dropped;
1446         tot->rx_length_errors = dev->stats.rx_length_errors;
1447         tot->rx_frame_errors = dev->stats.rx_frame_errors;
1448 }
1449
1450 #ifdef CONFIG_NET_POLL_CONTROLLER
1451 static void virtnet_netpoll(struct net_device *dev)
1452 {
1453         struct virtnet_info *vi = netdev_priv(dev);
1454         int i;
1455
1456         for (i = 0; i < vi->curr_queue_pairs; i++)
1457                 napi_schedule(&vi->rq[i].napi);
1458 }
1459 #endif
1460
1461 static void virtnet_ack_link_announce(struct virtnet_info *vi)
1462 {
1463         rtnl_lock();
1464         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
1465                                   VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
1466                 dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
1467         rtnl_unlock();
1468 }
1469
1470 static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1471 {
1472         struct scatterlist sg;
1473         struct net_device *dev = vi->dev;
1474
1475         if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
1476                 return 0;
1477
1478         vi->ctrl_mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
1479         sg_init_one(&sg, &vi->ctrl_mq, sizeof(vi->ctrl_mq));
1480
1481         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
1482                                   VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
1483                 dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
1484                          queue_pairs);
1485                 return -EINVAL;
1486         } else {
1487                 vi->curr_queue_pairs = queue_pairs;
1488                 /* virtnet_open() will refill when device is going to up. */
1489                 if (dev->flags & IFF_UP)
1490                         schedule_delayed_work(&vi->refill, 0);
1491         }
1492
1493         return 0;
1494 }
1495
1496 static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
1497 {
1498         int err;
1499
1500         rtnl_lock();
1501         err = _virtnet_set_queues(vi, queue_pairs);
1502         rtnl_unlock();
1503         return err;
1504 }
1505
1506 static int virtnet_close(struct net_device *dev)
1507 {
1508         struct virtnet_info *vi = netdev_priv(dev);
1509         int i;
1510
1511         /* Make sure refill_work doesn't re-enable napi! */
1512         cancel_delayed_work_sync(&vi->refill);
1513
1514         for (i = 0; i < vi->max_queue_pairs; i++) {
1515                 napi_disable(&vi->rq[i].napi);
1516                 virtnet_napi_tx_disable(&vi->sq[i].napi);
1517         }
1518
1519         return 0;
1520 }
1521
1522 static void virtnet_set_rx_mode(struct net_device *dev)
1523 {
1524         struct virtnet_info *vi = netdev_priv(dev);
1525         struct scatterlist sg[2];
1526         struct virtio_net_ctrl_mac *mac_data;
1527         struct netdev_hw_addr *ha;
1528         int uc_count;
1529         int mc_count;
1530         void *buf;
1531         int i;
1532
1533         /* We can't dynamically set ndo_set_rx_mode, so return gracefully */
1534         if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
1535                 return;
1536
1537         vi->ctrl_promisc = ((dev->flags & IFF_PROMISC) != 0);
1538         vi->ctrl_allmulti = ((dev->flags & IFF_ALLMULTI) != 0);
1539
1540         sg_init_one(sg, &vi->ctrl_promisc, sizeof(vi->ctrl_promisc));
1541
1542         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1543                                   VIRTIO_NET_CTRL_RX_PROMISC, sg))
1544                 dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
1545                          vi->ctrl_promisc ? "en" : "dis");
1546
1547         sg_init_one(sg, &vi->ctrl_allmulti, sizeof(vi->ctrl_allmulti));
1548
1549         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
1550                                   VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
1551                 dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
1552                          vi->ctrl_allmulti ? "en" : "dis");
1553
1554         uc_count = netdev_uc_count(dev);
1555         mc_count = netdev_mc_count(dev);
1556         /* MAC filter - use one buffer for both lists */
1557         buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
1558                       (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
1559         mac_data = buf;
1560         if (!buf)
1561                 return;
1562
1563         sg_init_table(sg, 2);
1564
1565         /* Store the unicast list and count in the front of the buffer */
1566         mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
1567         i = 0;
1568         netdev_for_each_uc_addr(ha, dev)
1569                 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1570
1571         sg_set_buf(&sg[0], mac_data,
1572                    sizeof(mac_data->entries) + (uc_count * ETH_ALEN));
1573
1574         /* multicast list and count fill the end */
1575         mac_data = (void *)&mac_data->macs[uc_count][0];
1576
1577         mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
1578         i = 0;
1579         netdev_for_each_mc_addr(ha, dev)
1580                 memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);
1581
1582         sg_set_buf(&sg[1], mac_data,
1583                    sizeof(mac_data->entries) + (mc_count * ETH_ALEN));
1584
1585         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
1586                                   VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
1587                 dev_warn(&dev->dev, "Failed to set MAC filter table.\n");
1588
1589         kfree(buf);
1590 }
1591
1592 static int virtnet_vlan_rx_add_vid(struct net_device *dev,
1593                                    __be16 proto, u16 vid)
1594 {
1595         struct virtnet_info *vi = netdev_priv(dev);
1596         struct scatterlist sg;
1597
1598         vi->ctrl_vid = vid;
1599         sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1600
1601         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1602                                   VIRTIO_NET_CTRL_VLAN_ADD, &sg))
1603                 dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
1604         return 0;
1605 }
1606
1607 static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
1608                                     __be16 proto, u16 vid)
1609 {
1610         struct virtnet_info *vi = netdev_priv(dev);
1611         struct scatterlist sg;
1612
1613         vi->ctrl_vid = vid;
1614         sg_init_one(&sg, &vi->ctrl_vid, sizeof(vi->ctrl_vid));
1615
1616         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
1617                                   VIRTIO_NET_CTRL_VLAN_DEL, &sg))
1618                 dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
1619         return 0;
1620 }
1621
1622 static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
1623 {
1624         int i;
1625
1626         if (vi->affinity_hint_set) {
1627                 for (i = 0; i < vi->max_queue_pairs; i++) {
1628                         virtqueue_set_affinity(vi->rq[i].vq, -1);
1629                         virtqueue_set_affinity(vi->sq[i].vq, -1);
1630                 }
1631
1632                 vi->affinity_hint_set = false;
1633         }
1634 }
1635
1636 static void virtnet_set_affinity(struct virtnet_info *vi)
1637 {
1638         int i;
1639         int cpu;
1640
1641         /* In multiqueue mode, when the number of cpu is equal to the number of
1642          * queue pairs, we let the queue pairs to be private to one cpu by
1643          * setting the affinity hint to eliminate the contention.
1644          */
1645         if (vi->curr_queue_pairs == 1 ||
1646             vi->max_queue_pairs != num_online_cpus()) {
1647                 virtnet_clean_affinity(vi, -1);
1648                 return;
1649         }
1650
1651         i = 0;
1652         for_each_online_cpu(cpu) {
1653                 virtqueue_set_affinity(vi->rq[i].vq, cpu);
1654                 virtqueue_set_affinity(vi->sq[i].vq, cpu);
1655                 netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
1656                 i++;
1657         }
1658
1659         vi->affinity_hint_set = true;
1660 }
1661
1662 static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
1663 {
1664         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
1665                                                    node);
1666         virtnet_set_affinity(vi);
1667         return 0;
1668 }
1669
1670 static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
1671 {
1672         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
1673                                                    node_dead);
1674         virtnet_set_affinity(vi);
1675         return 0;
1676 }
1677
1678 static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
1679 {
1680         struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
1681                                                    node);
1682
1683         virtnet_clean_affinity(vi, cpu);
1684         return 0;
1685 }
1686
1687 static enum cpuhp_state virtionet_online;
1688
1689 static int virtnet_cpu_notif_add(struct virtnet_info *vi)
1690 {
1691         int ret;
1692
1693         ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
1694         if (ret)
1695                 return ret;
1696         ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
1697                                                &vi->node_dead);
1698         if (!ret)
1699                 return ret;
1700         cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
1701         return ret;
1702 }
1703
1704 static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
1705 {
1706         cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
1707         cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
1708                                             &vi->node_dead);
1709 }
1710
1711 static void virtnet_get_ringparam(struct net_device *dev,
1712                                 struct ethtool_ringparam *ring)
1713 {
1714         struct virtnet_info *vi = netdev_priv(dev);
1715
1716         ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
1717         ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
1718         ring->rx_pending = ring->rx_max_pending;
1719         ring->tx_pending = ring->tx_max_pending;
1720 }
1721
1722
1723 static void virtnet_get_drvinfo(struct net_device *dev,
1724                                 struct ethtool_drvinfo *info)
1725 {
1726         struct virtnet_info *vi = netdev_priv(dev);
1727         struct virtio_device *vdev = vi->vdev;
1728
1729         strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
1730         strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
1731         strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
1732
1733 }
1734
1735 /* TODO: Eliminate OOO packets during switching */
1736 static int virtnet_set_channels(struct net_device *dev,
1737                                 struct ethtool_channels *channels)
1738 {
1739         struct virtnet_info *vi = netdev_priv(dev);
1740         u16 queue_pairs = channels->combined_count;
1741         int err;
1742
1743         /* We don't support separate rx/tx channels.
1744          * We don't allow setting 'other' channels.
1745          */
1746         if (channels->rx_count || channels->tx_count || channels->other_count)
1747                 return -EINVAL;
1748
1749         if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
1750                 return -EINVAL;
1751
1752         /* For now we don't support modifying channels while XDP is loaded
1753          * also when XDP is loaded all RX queues have XDP programs so we only
1754          * need to check a single RX queue.
1755          */
1756         if (vi->rq[0].xdp_prog)
1757                 return -EINVAL;
1758
1759         get_online_cpus();
1760         err = _virtnet_set_queues(vi, queue_pairs);
1761         if (!err) {
1762                 netif_set_real_num_tx_queues(dev, queue_pairs);
1763                 netif_set_real_num_rx_queues(dev, queue_pairs);
1764
1765                 virtnet_set_affinity(vi);
1766         }
1767         put_online_cpus();
1768
1769         return err;
1770 }
1771
1772 static void virtnet_get_channels(struct net_device *dev,
1773                                  struct ethtool_channels *channels)
1774 {
1775         struct virtnet_info *vi = netdev_priv(dev);
1776
1777         channels->combined_count = vi->curr_queue_pairs;
1778         channels->max_combined = vi->max_queue_pairs;
1779         channels->max_other = 0;
1780         channels->rx_count = 0;
1781         channels->tx_count = 0;
1782         channels->other_count = 0;
1783 }
1784
1785 /* Check if the user is trying to change anything besides speed/duplex */
1786 static bool
1787 virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
1788 {
1789         struct ethtool_link_ksettings diff1 = *cmd;
1790         struct ethtool_link_ksettings diff2 = {};
1791
1792         /* cmd is always set so we need to clear it, validate the port type
1793          * and also without autonegotiation we can ignore advertising
1794          */
1795         diff1.base.speed = 0;
1796         diff2.base.port = PORT_OTHER;
1797         ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
1798         diff1.base.duplex = 0;
1799         diff1.base.cmd = 0;
1800         diff1.base.link_mode_masks_nwords = 0;
1801
1802         return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
1803                 bitmap_empty(diff1.link_modes.supported,
1804                              __ETHTOOL_LINK_MODE_MASK_NBITS) &&
1805                 bitmap_empty(diff1.link_modes.advertising,
1806                              __ETHTOOL_LINK_MODE_MASK_NBITS) &&
1807                 bitmap_empty(diff1.link_modes.lp_advertising,
1808                              __ETHTOOL_LINK_MODE_MASK_NBITS);
1809 }
1810
1811 static int virtnet_set_link_ksettings(struct net_device *dev,
1812                                       const struct ethtool_link_ksettings *cmd)
1813 {
1814         struct virtnet_info *vi = netdev_priv(dev);
1815         u32 speed;
1816
1817         speed = cmd->base.speed;
1818         /* don't allow custom speed and duplex */
1819         if (!ethtool_validate_speed(speed) ||
1820             !ethtool_validate_duplex(cmd->base.duplex) ||
1821             !virtnet_validate_ethtool_cmd(cmd))
1822                 return -EINVAL;
1823         vi->speed = speed;
1824         vi->duplex = cmd->base.duplex;
1825
1826         return 0;
1827 }
1828
1829 static int virtnet_get_link_ksettings(struct net_device *dev,
1830                                       struct ethtool_link_ksettings *cmd)
1831 {
1832         struct virtnet_info *vi = netdev_priv(dev);
1833
1834         cmd->base.speed = vi->speed;
1835         cmd->base.duplex = vi->duplex;
1836         cmd->base.port = PORT_OTHER;
1837
1838         return 0;
1839 }
1840
1841 static void virtnet_init_settings(struct net_device *dev)
1842 {
1843         struct virtnet_info *vi = netdev_priv(dev);
1844
1845         vi->speed = SPEED_UNKNOWN;
1846         vi->duplex = DUPLEX_UNKNOWN;
1847 }
1848
1849 static const struct ethtool_ops virtnet_ethtool_ops = {
1850         .get_drvinfo = virtnet_get_drvinfo,
1851         .get_link = ethtool_op_get_link,
1852         .get_ringparam = virtnet_get_ringparam,
1853         .set_channels = virtnet_set_channels,
1854         .get_channels = virtnet_get_channels,
1855         .get_ts_info = ethtool_op_get_ts_info,
1856         .get_link_ksettings = virtnet_get_link_ksettings,
1857         .set_link_ksettings = virtnet_set_link_ksettings,
1858 };
1859
1860 static void virtnet_freeze_down(struct virtio_device *vdev)
1861 {
1862         struct virtnet_info *vi = vdev->priv;
1863         int i;
1864
1865         /* Make sure no work handler is accessing the device */
1866         flush_work(&vi->config_work);
1867
1868         netif_device_detach(vi->dev);
1869         netif_tx_disable(vi->dev);
1870         cancel_delayed_work_sync(&vi->refill);
1871
1872         if (netif_running(vi->dev)) {
1873                 for (i = 0; i < vi->max_queue_pairs; i++) {
1874                         napi_disable(&vi->rq[i].napi);
1875                         virtnet_napi_tx_disable(&vi->sq[i].napi);
1876                 }
1877         }
1878 }
1879
1880 static int init_vqs(struct virtnet_info *vi);
1881
1882 static int virtnet_restore_up(struct virtio_device *vdev)
1883 {
1884         struct virtnet_info *vi = vdev->priv;
1885         int err, i;
1886
1887         err = init_vqs(vi);
1888         if (err)
1889                 return err;
1890
1891         virtio_device_ready(vdev);
1892
1893         if (netif_running(vi->dev)) {
1894                 for (i = 0; i < vi->curr_queue_pairs; i++)
1895                         if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
1896                                 schedule_delayed_work(&vi->refill, 0);
1897
1898                 for (i = 0; i < vi->max_queue_pairs; i++) {
1899                         virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
1900                         virtnet_napi_tx_enable(vi, vi->sq[i].vq,
1901                                                &vi->sq[i].napi);
1902                 }
1903         }
1904
1905         netif_device_attach(vi->dev);
1906         return err;
1907 }
1908
1909 static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
1910 {
1911         struct scatterlist sg;
1912         vi->ctrl_offloads = cpu_to_virtio64(vi->vdev, offloads);
1913
1914         sg_init_one(&sg, &vi->ctrl_offloads, sizeof(vi->ctrl_offloads));
1915
1916         if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
1917                                   VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
1918                 dev_warn(&vi->dev->dev, "Fail to set guest offload. \n");
1919                 return -EINVAL;
1920         }
1921
1922         return 0;
1923 }
1924
1925 static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
1926 {
1927         u64 offloads = 0;
1928
1929         if (!vi->guest_offloads)
1930                 return 0;
1931
1932         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
1933                 offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM;
1934
1935         return virtnet_set_guest_offloads(vi, offloads);
1936 }
1937
1938 static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
1939 {
1940         u64 offloads = vi->guest_offloads;
1941
1942         if (!vi->guest_offloads)
1943                 return 0;
1944         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
1945                 offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM;
1946
1947         return virtnet_set_guest_offloads(vi, offloads);
1948 }
1949
1950 static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1951                            struct netlink_ext_ack *extack)
1952 {
1953         unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
1954         struct virtnet_info *vi = netdev_priv(dev);
1955         struct bpf_prog *old_prog;
1956         u16 xdp_qp = 0, curr_qp;
1957         int i, err;
1958
1959         if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
1960             && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
1961                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
1962                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
1963                 virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) {
1964                 NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
1965                 return -EOPNOTSUPP;
1966         }
1967
1968         if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
1969                 NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
1970                 return -EINVAL;
1971         }
1972
1973         if (dev->mtu > max_sz) {
1974                 NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
1975                 netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
1976                 return -EINVAL;
1977         }
1978
1979         curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
1980         if (prog)
1981                 xdp_qp = nr_cpu_ids;
1982
1983         /* XDP requires extra queues for XDP_TX */
1984         if (curr_qp + xdp_qp > vi->max_queue_pairs) {
1985                 NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
1986                 netdev_warn(dev, "request %i queues but max is %i\n",
1987                             curr_qp + xdp_qp, vi->max_queue_pairs);
1988                 return -ENOMEM;
1989         }
1990
1991         if (prog) {
1992                 prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
1993                 if (IS_ERR(prog))
1994                         return PTR_ERR(prog);
1995         }
1996
1997         /* Make sure NAPI is not using any XDP TX queues for RX. */
1998         for (i = 0; i < vi->max_queue_pairs; i++)
1999                 napi_disable(&vi->rq[i].napi);
2000
2001         netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
2002         err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
2003         if (err)
2004                 goto err;
2005         vi->xdp_queue_pairs = xdp_qp;
2006
2007         for (i = 0; i < vi->max_queue_pairs; i++) {
2008                 old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
2009                 rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
2010                 if (i == 0) {
2011                         if (!old_prog)
2012                                 virtnet_clear_guest_offloads(vi);
2013                         if (!prog)
2014                                 virtnet_restore_guest_offloads(vi);
2015                 }
2016                 if (old_prog)
2017                         bpf_prog_put(old_prog);
2018                 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2019         }
2020
2021         return 0;
2022
2023 err:
2024         for (i = 0; i < vi->max_queue_pairs; i++)
2025                 virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
2026         if (prog)
2027                 bpf_prog_sub(prog, vi->max_queue_pairs - 1);
2028         return err;
2029 }
2030
2031 static u32 virtnet_xdp_query(struct net_device *dev)
2032 {
2033         struct virtnet_info *vi = netdev_priv(dev);
2034         const struct bpf_prog *xdp_prog;
2035         int i;
2036
2037         for (i = 0; i < vi->max_queue_pairs; i++) {
2038                 xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog);
2039                 if (xdp_prog)
2040                         return xdp_prog->aux->id;
2041         }
2042         return 0;
2043 }
2044
2045 static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
2046 {
2047         switch (xdp->command) {
2048         case XDP_SETUP_PROG:
2049                 return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
2050         case XDP_QUERY_PROG:
2051                 xdp->prog_id = virtnet_xdp_query(dev);
2052                 xdp->prog_attached = !!xdp->prog_id;
2053                 return 0;
2054         default:
2055                 return -EINVAL;
2056         }
2057 }
2058
2059 static const struct net_device_ops virtnet_netdev = {
2060         .ndo_open            = virtnet_open,
2061         .ndo_stop            = virtnet_close,
2062         .ndo_start_xmit      = start_xmit,
2063         .ndo_validate_addr   = eth_validate_addr,
2064         .ndo_set_mac_address = virtnet_set_mac_address,
2065         .ndo_set_rx_mode     = virtnet_set_rx_mode,
2066         .ndo_get_stats64     = virtnet_stats,
2067         .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
2068         .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
2069 #ifdef CONFIG_NET_POLL_CONTROLLER
2070         .ndo_poll_controller = virtnet_netpoll,
2071 #endif
2072         .ndo_xdp                = virtnet_xdp,
2073         .ndo_features_check     = passthru_features_check,
2074 };
2075
2076 static void virtnet_config_changed_work(struct work_struct *work)
2077 {
2078         struct virtnet_info *vi =
2079                 container_of(work, struct virtnet_info, config_work);
2080         u16 v;
2081
2082         if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
2083                                  struct virtio_net_config, status, &v) < 0)
2084                 return;
2085
2086         if (v & VIRTIO_NET_S_ANNOUNCE) {
2087                 netdev_notify_peers(vi->dev);
2088                 virtnet_ack_link_announce(vi);
2089         }
2090
2091         /* Ignore unknown (future) status bits */
2092         v &= VIRTIO_NET_S_LINK_UP;
2093
2094         if (vi->status == v)
2095                 return;
2096
2097         vi->status = v;
2098
2099         if (vi->status & VIRTIO_NET_S_LINK_UP) {
2100                 netif_carrier_on(vi->dev);
2101                 netif_tx_wake_all_queues(vi->dev);
2102         } else {
2103                 netif_carrier_off(vi->dev);
2104                 netif_tx_stop_all_queues(vi->dev);
2105         }
2106 }
2107
2108 static void virtnet_config_changed(struct virtio_device *vdev)
2109 {
2110         struct virtnet_info *vi = vdev->priv;
2111
2112         schedule_work(&vi->config_work);
2113 }
2114
2115 static void virtnet_free_queues(struct virtnet_info *vi)
2116 {
2117         int i;
2118
2119         for (i = 0; i < vi->max_queue_pairs; i++) {
2120                 napi_hash_del(&vi->rq[i].napi);
2121                 netif_napi_del(&vi->rq[i].napi);
2122                 netif_napi_del(&vi->sq[i].napi);
2123         }
2124
2125         /* We called napi_hash_del() before netif_napi_del(),
2126          * we need to respect an RCU grace period before freeing vi->rq
2127          */
2128         synchronize_net();
2129
2130         kfree(vi->rq);
2131         kfree(vi->sq);
2132 }
2133
2134 static void _free_receive_bufs(struct virtnet_info *vi)
2135 {
2136         struct bpf_prog *old_prog;
2137         int i;
2138
2139         for (i = 0; i < vi->max_queue_pairs; i++) {
2140                 while (vi->rq[i].pages)
2141                         __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
2142
2143                 old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
2144                 RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
2145                 if (old_prog)
2146                         bpf_prog_put(old_prog);
2147         }
2148 }
2149
2150 static void free_receive_bufs(struct virtnet_info *vi)
2151 {
2152         rtnl_lock();
2153         _free_receive_bufs(vi);
2154         rtnl_unlock();
2155 }
2156
2157 static void free_receive_page_frags(struct virtnet_info *vi)
2158 {
2159         int i;
2160         for (i = 0; i < vi->max_queue_pairs; i++)
2161                 if (vi->rq[i].alloc_frag.page)
2162                         put_page(vi->rq[i].alloc_frag.page);
2163 }
2164
2165 static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
2166 {
2167         if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
2168                 return false;
2169         else if (q < vi->curr_queue_pairs)
2170                 return true;
2171         else
2172                 return false;
2173 }
2174
2175 static void free_unused_bufs(struct virtnet_info *vi)
2176 {
2177         void *buf;
2178         int i;
2179
2180         for (i = 0; i < vi->max_queue_pairs; i++) {
2181                 struct virtqueue *vq = vi->sq[i].vq;
2182                 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2183                         if (!is_xdp_raw_buffer_queue(vi, i))
2184                                 dev_kfree_skb(buf);
2185                         else
2186                                 put_page(virt_to_head_page(buf));
2187                 }
2188         }
2189
2190         for (i = 0; i < vi->max_queue_pairs; i++) {
2191                 struct virtqueue *vq = vi->rq[i].vq;
2192
2193                 while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
2194                         if (vi->mergeable_rx_bufs) {
2195                                 put_page(virt_to_head_page(buf));
2196                         } else if (vi->big_packets) {
2197                                 give_pages(&vi->rq[i], buf);
2198                         } else {
2199                                 put_page(virt_to_head_page(buf));
2200                         }
2201                 }
2202         }
2203 }
2204
2205 static void virtnet_del_vqs(struct virtnet_info *vi)
2206 {
2207         struct virtio_device *vdev = vi->vdev;
2208
2209         virtnet_clean_affinity(vi, -1);
2210
2211         vdev->config->del_vqs(vdev);
2212
2213         virtnet_free_queues(vi);
2214 }
2215
2216 /* How large should a single buffer be so a queue full of these can fit at
2217  * least one full packet?
2218  * Logic below assumes the mergeable buffer header is used.
2219  */
2220 static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
2221 {
2222         const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2223         unsigned int rq_size = virtqueue_get_vring_size(vq);
2224         unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
2225         unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
2226         unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);
2227
2228         return max(max(min_buf_len, hdr_len) - hdr_len,
2229                    (unsigned int)GOOD_PACKET_LEN);
2230 }
2231
2232 static int virtnet_find_vqs(struct virtnet_info *vi)
2233 {
2234         vq_callback_t **callbacks;
2235         struct virtqueue **vqs;
2236         int ret = -ENOMEM;
2237         int i, total_vqs;
2238         const char **names;
2239         bool *ctx;
2240
2241         /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
2242          * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
2243          * possible control vq.
2244          */
2245         total_vqs = vi->max_queue_pairs * 2 +
2246                     virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
2247
2248         /* Allocate space for find_vqs parameters */
2249         vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
2250         if (!vqs)
2251                 goto err_vq;
2252         callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
2253         if (!callbacks)
2254                 goto err_callback;
2255         names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
2256         if (!names)
2257                 goto err_names;
2258         if (!vi->big_packets || vi->mergeable_rx_bufs) {
2259                 ctx = kzalloc(total_vqs * sizeof(*ctx), GFP_KERNEL);
2260                 if (!ctx)
2261                         goto err_ctx;
2262         } else {
2263                 ctx = NULL;
2264         }
2265
2266         /* Parameters for control virtqueue, if any */
2267         if (vi->has_cvq) {
2268                 callbacks[total_vqs - 1] = NULL;
2269                 names[total_vqs - 1] = "control";
2270         }
2271
2272         /* Allocate/initialize parameters for send/receive virtqueues */
2273         for (i = 0; i < vi->max_queue_pairs; i++) {
2274                 callbacks[rxq2vq(i)] = skb_recv_done;
2275                 callbacks[txq2vq(i)] = skb_xmit_done;
2276                 sprintf(vi->rq[i].name, "input.%d", i);
2277                 sprintf(vi->sq[i].name, "output.%d", i);
2278                 names[rxq2vq(i)] = vi->rq[i].name;
2279                 names[txq2vq(i)] = vi->sq[i].name;
2280                 if (ctx)
2281                         ctx[rxq2vq(i)] = true;
2282         }
2283
2284         ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
2285                                          names, ctx, NULL);
2286         if (ret)
2287                 goto err_find;
2288
2289         if (vi->has_cvq) {
2290                 vi->cvq = vqs[total_vqs - 1];
2291                 if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
2292                         vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
2293         }
2294
2295         for (i = 0; i < vi->max_queue_pairs; i++) {
2296                 vi->rq[i].vq = vqs[rxq2vq(i)];
2297                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
2298                 vi->sq[i].vq = vqs[txq2vq(i)];
2299         }
2300
2301         kfree(names);
2302         kfree(callbacks);
2303         kfree(vqs);
2304         kfree(ctx);
2305
2306         return 0;
2307
2308 err_find:
2309         kfree(ctx);
2310 err_ctx:
2311         kfree(names);
2312 err_names:
2313         kfree(callbacks);
2314 err_callback:
2315         kfree(vqs);
2316 err_vq:
2317         return ret;
2318 }
2319
2320 static int virtnet_alloc_queues(struct virtnet_info *vi)
2321 {
2322         int i;
2323
2324         vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
2325         if (!vi->sq)
2326                 goto err_sq;
2327         vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
2328         if (!vi->rq)
2329                 goto err_rq;
2330
2331         INIT_DELAYED_WORK(&vi->refill, refill_work);
2332         for (i = 0; i < vi->max_queue_pairs; i++) {
2333                 vi->rq[i].pages = NULL;
2334                 netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
2335                                napi_weight);
2336                 netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
2337                                   napi_tx ? napi_weight : 0);
2338
2339                 sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
2340                 ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
2341                 sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
2342         }
2343
2344         return 0;
2345
2346 err_rq:
2347         kfree(vi->sq);
2348 err_sq:
2349         return -ENOMEM;
2350 }
2351
2352 static int init_vqs(struct virtnet_info *vi)
2353 {
2354         int ret;
2355
2356         /* Allocate send & receive queues */
2357         ret = virtnet_alloc_queues(vi);
2358         if (ret)
2359                 goto err;
2360
2361         ret = virtnet_find_vqs(vi);
2362         if (ret)
2363                 goto err_free;
2364
2365         get_online_cpus();
2366         virtnet_set_affinity(vi);
2367         put_online_cpus();
2368
2369         return 0;
2370
2371 err_free:
2372         virtnet_free_queues(vi);
2373 err:
2374         return ret;
2375 }
2376
2377 #ifdef CONFIG_SYSFS
2378 static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
2379                 char *buf)
2380 {
2381         struct virtnet_info *vi = netdev_priv(queue->dev);
2382         unsigned int queue_index = get_netdev_rx_queue_index(queue);
2383         struct ewma_pkt_len *avg;
2384
2385         BUG_ON(queue_index >= vi->max_queue_pairs);
2386         avg = &vi->rq[queue_index].mrg_avg_pkt_len;
2387         return sprintf(buf, "%u\n",
2388                        get_mergeable_buf_len(&vi->rq[queue_index], avg));
2389 }
2390
2391 static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
2392         __ATTR_RO(mergeable_rx_buffer_size);
2393
2394 static struct attribute *virtio_net_mrg_rx_attrs[] = {
2395         &mergeable_rx_buffer_size_attribute.attr,
2396         NULL
2397 };
2398
2399 static const struct attribute_group virtio_net_mrg_rx_group = {
2400         .name = "virtio_net",
2401         .attrs = virtio_net_mrg_rx_attrs
2402 };
2403 #endif
2404
2405 static bool virtnet_fail_on_feature(struct virtio_device *vdev,
2406                                     unsigned int fbit,
2407                                     const char *fname, const char *dname)
2408 {
2409         if (!virtio_has_feature(vdev, fbit))
2410                 return false;
2411
2412         dev_err(&vdev->dev, "device advertises feature %s but not %s",
2413                 fname, dname);
2414
2415         return true;
2416 }
2417
2418 #define VIRTNET_FAIL_ON(vdev, fbit, dbit)                       \
2419         virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)
2420
2421 static bool virtnet_validate_features(struct virtio_device *vdev)
2422 {
2423         if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
2424             (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
2425                              "VIRTIO_NET_F_CTRL_VQ") ||
2426              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
2427                              "VIRTIO_NET_F_CTRL_VQ") ||
2428              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
2429                              "VIRTIO_NET_F_CTRL_VQ") ||
2430              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
2431              VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
2432                              "VIRTIO_NET_F_CTRL_VQ"))) {
2433                 return false;
2434         }
2435
2436         return true;
2437 }
2438
2439 #define MIN_MTU ETH_MIN_MTU
2440 #define MAX_MTU ETH_MAX_MTU
2441
2442 static int virtnet_validate(struct virtio_device *vdev)
2443 {
2444         if (!vdev->config->get) {
2445                 dev_err(&vdev->dev, "%s failure: config access disabled\n",
2446                         __func__);
2447                 return -EINVAL;
2448         }
2449
2450         if (!virtnet_validate_features(vdev))
2451                 return -EINVAL;
2452
2453         if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
2454                 int mtu = virtio_cread16(vdev,
2455                                          offsetof(struct virtio_net_config,
2456                                                   mtu));
2457                 if (mtu < MIN_MTU)
2458                         __virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
2459         }
2460
2461         return 0;
2462 }
2463
2464 static int virtnet_probe(struct virtio_device *vdev)
2465 {
2466         int i, err;
2467         struct net_device *dev;
2468         struct virtnet_info *vi;
2469         u16 max_queue_pairs;
2470         int mtu;
2471
2472         /* Find if host supports multiqueue virtio_net device */
2473         err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
2474                                    struct virtio_net_config,
2475                                    max_virtqueue_pairs, &max_queue_pairs);
2476
2477         /* We need at least 2 queue's */
2478         if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
2479             max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
2480             !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
2481                 max_queue_pairs = 1;
2482
2483         /* Allocate ourselves a network device with room for our info */
2484         dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
2485         if (!dev)
2486                 return -ENOMEM;
2487
2488         /* Set up network device as normal. */
2489         dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
2490         dev->netdev_ops = &virtnet_netdev;
2491         dev->features = NETIF_F_HIGHDMA;
2492
2493         dev->ethtool_ops = &virtnet_ethtool_ops;
2494         SET_NETDEV_DEV(dev, &vdev->dev);
2495
2496         /* Do we support "hardware" checksums? */
2497         if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
2498                 /* This opens up the world of extra features. */
2499                 dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2500                 if (csum)
2501                         dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
2502
2503                 if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
2504                         dev->hw_features |= NETIF_F_TSO
2505                                 | NETIF_F_TSO_ECN | NETIF_F_TSO6;
2506                 }
2507                 /* Individual feature bits: what can host handle? */
2508                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
2509                         dev->hw_features |= NETIF_F_TSO;
2510                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
2511                         dev->hw_features |= NETIF_F_TSO6;
2512                 if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
2513                         dev->hw_features |= NETIF_F_TSO_ECN;
2514
2515                 dev->features |= NETIF_F_GSO_ROBUST;
2516
2517                 if (gso)
2518                         dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
2519                 /* (!csum && gso) case will be fixed by register_netdev() */
2520         }
2521         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
2522                 dev->features |= NETIF_F_RXCSUM;
2523
2524         dev->vlan_features = dev->features;
2525
2526         /* MTU range: 68 - 65535 */
2527         dev->min_mtu = MIN_MTU;
2528         dev->max_mtu = MAX_MTU;
2529
2530         /* Configuration may specify what MAC to use.  Otherwise random. */
2531         if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
2532                 virtio_cread_bytes(vdev,
2533                                    offsetof(struct virtio_net_config, mac),
2534                                    dev->dev_addr, dev->addr_len);
2535         else
2536                 eth_hw_addr_random(dev);
2537
2538         /* Set up our device-specific information */
2539         vi = netdev_priv(dev);
2540         vi->dev = dev;
2541         vi->vdev = vdev;
2542         vdev->priv = vi;
2543         vi->stats = alloc_percpu(struct virtnet_stats);
2544         err = -ENOMEM;
2545         if (vi->stats == NULL)
2546                 goto free;
2547
2548         for_each_possible_cpu(i) {
2549                 struct virtnet_stats *virtnet_stats;
2550                 virtnet_stats = per_cpu_ptr(vi->stats, i);
2551                 u64_stats_init(&virtnet_stats->tx_syncp);
2552                 u64_stats_init(&virtnet_stats->rx_syncp);
2553         }
2554
2555         INIT_WORK(&vi->config_work, virtnet_config_changed_work);
2556
2557         /* If we can receive ANY GSO packets, we must allocate large ones. */
2558         if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
2559             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
2560             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
2561             virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
2562                 vi->big_packets = true;
2563
2564         if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
2565                 vi->mergeable_rx_bufs = true;
2566
2567         if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
2568             virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
2569                 vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2570         else
2571                 vi->hdr_len = sizeof(struct virtio_net_hdr);
2572
2573         if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
2574             virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
2575                 vi->any_header_sg = true;
2576
2577         if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
2578                 vi->has_cvq = true;
2579
2580         if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
2581                 mtu = virtio_cread16(vdev,
2582                                      offsetof(struct virtio_net_config,
2583                                               mtu));
2584                 if (mtu < dev->min_mtu) {
2585                         /* Should never trigger: MTU was previously validated
2586                          * in virtnet_validate.
2587                          */
2588                         dev_err(&vdev->dev, "device MTU appears to have changed "
2589                                 "it is now %d < %d", mtu, dev->min_mtu);
2590                         goto free_stats;
2591                 }
2592
2593                 dev->mtu = mtu;
2594                 dev->max_mtu = mtu;
2595
2596                 /* TODO: size buffers correctly in this case. */
2597                 if (dev->mtu > ETH_DATA_LEN)
2598                         vi->big_packets = true;
2599         }
2600
2601         if (vi->any_header_sg)
2602                 dev->needed_headroom = vi->hdr_len;
2603
2604         /* Enable multiqueue by default */
2605         if (num_online_cpus() >= max_queue_pairs)
2606                 vi->curr_queue_pairs = max_queue_pairs;
2607         else
2608                 vi->curr_queue_pairs = num_online_cpus();
2609         vi->max_queue_pairs = max_queue_pairs;
2610
2611         /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
2612         err = init_vqs(vi);
2613         if (err)
2614                 goto free_stats;
2615
2616 #ifdef CONFIG_SYSFS
2617         if (vi->mergeable_rx_bufs)
2618                 dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
2619 #endif
2620         netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
2621         netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
2622
2623         virtnet_init_settings(dev);
2624
2625         err = register_netdev(dev);
2626         if (err) {
2627                 pr_debug("virtio_net: registering device failed\n");
2628                 goto free_vqs;
2629         }
2630
2631         virtio_device_ready(vdev);
2632
2633         err = virtnet_cpu_notif_add(vi);
2634         if (err) {
2635                 pr_debug("virtio_net: registering cpu notifier failed\n");
2636                 goto free_unregister_netdev;
2637         }
2638
2639         virtnet_set_queues(vi, vi->curr_queue_pairs);
2640
2641         /* Assume link up if device can't report link status,
2642            otherwise get link status from config. */
2643         if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
2644                 netif_carrier_off(dev);
2645                 schedule_work(&vi->config_work);
2646         } else {
2647                 vi->status = VIRTIO_NET_S_LINK_UP;
2648                 netif_carrier_on(dev);
2649         }
2650
2651         for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
2652                 if (virtio_has_feature(vi->vdev, guest_offloads[i]))
2653                         set_bit(guest_offloads[i], &vi->guest_offloads);
2654
2655         pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
2656                  dev->name, max_queue_pairs);
2657
2658         return 0;
2659
2660 free_unregister_netdev:
2661         vi->vdev->config->reset(vdev);
2662
2663         unregister_netdev(dev);
2664 free_vqs:
2665         cancel_delayed_work_sync(&vi->refill);
2666         free_receive_page_frags(vi);
2667         virtnet_del_vqs(vi);
2668 free_stats:
2669         free_percpu(vi->stats);
2670 free:
2671         free_netdev(dev);
2672         return err;
2673 }
2674
2675 static void remove_vq_common(struct virtnet_info *vi)
2676 {
2677         vi->vdev->config->reset(vi->vdev);
2678
2679         /* Free unused buffers in both send and recv, if any. */
2680         free_unused_bufs(vi);
2681
2682         free_receive_bufs(vi);
2683
2684         free_receive_page_frags(vi);
2685
2686         virtnet_del_vqs(vi);
2687 }
2688
2689 static void virtnet_remove(struct virtio_device *vdev)
2690 {
2691         struct virtnet_info *vi = vdev->priv;
2692
2693         virtnet_cpu_notif_remove(vi);
2694
2695         /* Make sure no work handler is accessing the device. */
2696         flush_work(&vi->config_work);
2697
2698         unregister_netdev(vi->dev);
2699
2700         remove_vq_common(vi);
2701
2702         free_percpu(vi->stats);
2703         free_netdev(vi->dev);
2704 }
2705
2706 static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
2707 {
2708         struct virtnet_info *vi = vdev->priv;
2709
2710         virtnet_cpu_notif_remove(vi);
2711         virtnet_freeze_down(vdev);
2712         remove_vq_common(vi);
2713
2714         return 0;
2715 }
2716
2717 static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
2718 {
2719         struct virtnet_info *vi = vdev->priv;
2720         int err;
2721
2722         err = virtnet_restore_up(vdev);
2723         if (err)
2724                 return err;
2725         virtnet_set_queues(vi, vi->curr_queue_pairs);
2726
2727         err = virtnet_cpu_notif_add(vi);
2728         if (err)
2729                 return err;
2730
2731         return 0;
2732 }
2733
2734 static struct virtio_device_id id_table[] = {
2735         { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2736         { 0 },
2737 };
2738
2739 #define VIRTNET_FEATURES \
2740         VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
2741         VIRTIO_NET_F_MAC, \
2742         VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
2743         VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
2744         VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
2745         VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
2746         VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
2747         VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
2748         VIRTIO_NET_F_CTRL_MAC_ADDR, \
2749         VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS
2750
2751 static unsigned int features[] = {
2752         VIRTNET_FEATURES,
2753 };
2754
2755 static unsigned int features_legacy[] = {
2756         VIRTNET_FEATURES,
2757         VIRTIO_NET_F_GSO,
2758         VIRTIO_F_ANY_LAYOUT,
2759 };
2760
2761 static struct virtio_driver virtio_net_driver = {
2762         .feature_table = features,
2763         .feature_table_size = ARRAY_SIZE(features),
2764         .feature_table_legacy = features_legacy,
2765         .feature_table_size_legacy = ARRAY_SIZE(features_legacy),
2766         .driver.name =  KBUILD_MODNAME,
2767         .driver.owner = THIS_MODULE,
2768         .id_table =     id_table,
2769         .validate =     virtnet_validate,
2770         .probe =        virtnet_probe,
2771         .remove =       virtnet_remove,
2772         .config_changed = virtnet_config_changed,
2773 #ifdef CONFIG_PM_SLEEP
2774         .freeze =       virtnet_freeze,
2775         .restore =      virtnet_restore,
2776 #endif
2777 };
2778
2779 static __init int virtio_net_driver_init(void)
2780 {
2781         int ret;
2782
2783         ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
2784                                       virtnet_cpu_online,
2785                                       virtnet_cpu_down_prep);
2786         if (ret < 0)
2787                 goto out;
2788         virtionet_online = ret;
2789         ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
2790                                       NULL, virtnet_cpu_dead);
2791         if (ret)
2792                 goto err_dead;
2793
2794         ret = register_virtio_driver(&virtio_net_driver);
2795         if (ret)
2796                 goto err_virtio;
2797         return 0;
2798 err_virtio:
2799         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
2800 err_dead:
2801         cpuhp_remove_multi_state(virtionet_online);
2802 out:
2803         return ret;
2804 }
2805 module_init(virtio_net_driver_init);
2806
2807 static __exit void virtio_net_driver_exit(void)
2808 {
2809         unregister_virtio_driver(&virtio_net_driver);
2810         cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
2811         cpuhp_remove_multi_state(virtionet_online);
2812 }
2813 module_exit(virtio_net_driver_exit);
2814
2815 MODULE_DEVICE_TABLE(virtio, id_table);
2816 MODULE_DESCRIPTION("Virtio network driver");
2817 MODULE_LICENSE("GPL");