net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <[email protected]>
  12  *                              Mark Evans, <[email protected]>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <[email protected]>
  16  *              Alan Cox <[email protected]>
  17  *              David Hinds <[email protected]>
  18  *              Alexey Kuznetsov <[email protected]>
  19  *              Adam Sulmicki <[email protected]>
  20  *              Pekka Riikonen <[email protected]>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134
 135 #include "net-sysfs.h"
 136
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/* Serializes updates of ptype_base[] and ptype_all (see dev_add_pack()). */
static DEFINE_SPINLOCK(ptype_lock);
/* Serializes updates of offload_base (see dev_add_offload()). */
static DEFINE_SPINLOCK(offload_lock);
/*
 * Protocol handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK
 * (see ptype_head()).
 */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered for ETH_P_ALL (see ptype_head()). */
struct list_head ptype_all __read_mostly;       /* Taps */
/* Registered packet_offload handlers (see dev_add_offload()). */
static struct list_head offload_base __read_mostly;
 148
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages, see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.  Within this file, list_netdevice() and
 * unlist_netdevice() follow exactly this pattern.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 170
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* Guarded by napi_hash_lock (see above); its users are outside this chunk. */
static unsigned int napi_gen_id;
/* Hash table declared with 8 hash bits; guarded by napi_hash_lock. */
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * NOTE(review): judging by the name, this sequence counter presumably
 * brackets device renames -- confirm against its users, which are not
 * visible in this part of the file.
 */
seqcount_t devnet_rename_seq;
 178
 179 static inline void dev_base_seq_inc(struct net *net)
 180 {
 181         while (++net->dev_base_seq == 0);
 182 }
 183
 184 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 185 {
 186         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 187
 188         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 189 }
 190
 191 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 192 {
 193         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 194 }
 195
 196 static inline void rps_lock(struct softnet_data *sd)
 197 {
 198 #ifdef CONFIG_RPS
 199         spin_lock(&sd->input_pkt_queue.lock);
 200 #endif
 201 }
 202
 203 static inline void rps_unlock(struct softnet_data *sd)
 204 {
 205 #ifdef CONFIG_RPS
 206         spin_unlock(&sd->input_pkt_queue.lock);
 207 #endif
 208 }
 209
 210 /* Device list insertion */
 211 static void list_netdevice(struct net_device *dev)
 212 {
 213         struct net *net = dev_net(dev);
 214
 215         ASSERT_RTNL();
 216
 217         write_lock_bh(&dev_base_lock);
 218         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 219         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 220         hlist_add_head_rcu(&dev->index_hlist,
 221                            dev_index_hash(net, dev->ifindex));
 222         write_unlock_bh(&dev_base_lock);
 223
 224         dev_base_seq_inc(net);
 225 }
 226
 227 /* Device list removal
 228  * caller must respect a RCU grace period before freeing/reusing dev
 229  */
 230 static void unlist_netdevice(struct net_device *dev)
 231 {
 232         ASSERT_RTNL();
 233
 234         /* Unlink dev from the device chain */
 235         write_lock_bh(&dev_base_lock);
 236         list_del_rcu(&dev->dev_list);
 237         hlist_del_rcu(&dev->name_hlist);
 238         hlist_del_rcu(&dev->index_hlist);
 239         write_unlock_bh(&dev_base_lock);
 240
 241         dev_base_seq_inc(dev_net(dev));
 242 }
 243
/*
 *      Our notifier list.
 *
 *      Declared as a raw notifier head.  NOTE(review): its callers are
 *      not visible in this part of the file; presumably they are
 *      serialized by the RTNL -- confirm.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
 249
/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 *
 *      One cache-aligned instance of struct softnet_data exists per CPU.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 257
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: entry i
 * of one belongs to entry i of the other, so they must be kept in the
 * same order and of the same length.  The last entry (ARPHRD_NONE) is
 * the fallback that netdev_lock_pos() returns for unlisted types.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-aligned with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry, for the xmit lock and the address-list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 299
 300 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 301 {
 302         int i;
 303
 304         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 305                 if (netdev_lock_type[i] == dev_type)
 306                         return i;
 307         /* the last key is used by default */
 308         return ARRAY_SIZE(netdev_lock_type) - 1;
 309 }
 310
 311 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 312                                                  unsigned short dev_type)
 313 {
 314         int i;
 315
 316         i = netdev_lock_pos(dev_type);
 317         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 318                                    netdev_lock_name[i]);
 319 }
 320
 321 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev->type);
 326         lockdep_set_class_and_name(&dev->addr_list_lock,
 327                                    &netdev_addr_lock_key[i],
 328                                    netdev_lock_name[i]);
 329 }
#else
/* Without CONFIG_LOCKDEP there are no lock classes to set: empty stubs. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
 339
 340 /*******************************************************************************
 341
 342                 Protocol management and registration routines
 343
 344 *******************************************************************************/
 345
 346 /*
 347  *      Add a protocol ID to the list. Now that the input handler is
 348  *      smarter we can dispense with all the messy stuff that used to be
 349  *      here.
 350  *
 351  *      BEWARE!!! Protocol handlers, mangling input packets,
 352  *      MUST BE last in hash buckets and checking protocol handlers
 353  *      MUST start from promiscuous ptype_all chain in net_bh.
 354  *      It is true now, do not change it.
 355  *      Explanation follows: if protocol handler, mangling packet, will
 356  *      be the first on list, it is not able to sense, that packet
 357  *      is cloned and should be copied-on-write, so that it will
 358  *      change it and subsequent readers will get broken packet.
 359  *                                                      --ANK (980803)
 360  */
 361
 362 static inline struct list_head *ptype_head(const struct packet_type *pt)
 363 {
 364         if (pt->type == htons(ETH_P_ALL))
 365                 return &ptype_all;
 366         else
 367                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 368 }
 369
 370 /**
 371  *      dev_add_pack - add packet handler
 372  *      @pt: packet type declaration
 373  *
 374  *      Add a protocol handler to the networking stack. The passed &packet_type
 375  *      is linked into kernel lists and may not be freed until it has been
 376  *      removed from the kernel lists.
 377  *
 378  *      This call does not sleep therefore it can not
 379  *      guarantee all CPU's that are in middle of receiving packets
 380  *      will see the new packet type (until the next received packet).
 381  */
 382
 383 void dev_add_pack(struct packet_type *pt)
 384 {
 385         struct list_head *head = ptype_head(pt);
 386
 387         spin_lock(&ptype_lock);
 388         list_add_rcu(&pt->list, head);
 389         spin_unlock(&ptype_lock);
 390 }
 391 EXPORT_SYMBOL(dev_add_pack);
 392
 393 /**
 394  *      __dev_remove_pack        - remove packet handler
 395  *      @pt: packet type declaration
 396  *
 397  *      Remove a protocol handler that was previously added to the kernel
 398  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 399  *      from the kernel lists and can be freed or reused once this function
 400  *      returns.
 401  *
 402  *      The packet type might still be in use by receivers
 403  *      and must not be freed until after all the CPU's have gone
 404  *      through a quiescent state.
 405  */
 406 void __dev_remove_pack(struct packet_type *pt)
 407 {
 408         struct list_head *head = ptype_head(pt);
 409         struct packet_type *pt1;
 410
 411         spin_lock(&ptype_lock);
 412
 413         list_for_each_entry(pt1, head, list) {
 414                 if (pt == pt1) {
 415                         list_del_rcu(&pt->list);
 416                         goto out;
 417                 }
 418         }
 419
 420         pr_warn("dev_remove_pack: %p not found\n", pt);
 421 out:
 422         spin_unlock(&ptype_lock);
 423 }
 424 EXPORT_SYMBOL(__dev_remove_pack);
 425
 426 /**
 427  *      dev_remove_pack  - remove packet handler
 428  *      @pt: packet type declaration
 429  *
 430  *      Remove a protocol handler that was previously added to the kernel
 431  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 432  *      from the kernel lists and can be freed or reused once this function
 433  *      returns.
 434  *
 435  *      This call sleeps to guarantee that no CPU is looking at the packet
 436  *      type after return.
 437  */
 438 void dev_remove_pack(struct packet_type *pt)
 439 {
 440         __dev_remove_pack(pt);
 441
 442         synchronize_net();
 443 }
 444 EXPORT_SYMBOL(dev_remove_pack);
 445
 446
 447 /**
 448  *      dev_add_offload - register offload handlers
 449  *      @po: protocol offload declaration
 450  *
 451  *      Add protocol offload handlers to the networking stack. The passed
 452  *      &proto_offload is linked into kernel lists and may not be freed until
 453  *      it has been removed from the kernel lists.
 454  *
 455  *      This call does not sleep therefore it can not
 456  *      guarantee all CPU's that are in middle of receiving packets
 457  *      will see the new offload handlers (until the next received packet).
 458  */
 459 void dev_add_offload(struct packet_offload *po)
 460 {
 461         struct list_head *head = &offload_base;
 462
 463         spin_lock(&offload_lock);
 464         list_add_rcu(&po->list, head);
 465         spin_unlock(&offload_lock);
 466 }
 467 EXPORT_SYMBOL(dev_add_offload);
 468
 469 /**
 470  *      __dev_remove_offload     - remove offload handler
 471  *      @po: packet offload declaration
 472  *
 473  *      Remove a protocol offload handler that was previously added to the
 474  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 475  *      is removed from the kernel lists and can be freed or reused once this
 476  *      function returns.
 477  *
 478  *      The packet type might still be in use by receivers
 479  *      and must not be freed until after all the CPU's have gone
 480  *      through a quiescent state.
 481  */
 482 void __dev_remove_offload(struct packet_offload *po)
 483 {
 484         struct list_head *head = &offload_base;
 485         struct packet_offload *po1;
 486
 487         spin_lock(&offload_lock);
 488
 489         list_for_each_entry(po1, head, list) {
 490                 if (po == po1) {
 491                         list_del_rcu(&po->list);
 492                         goto out;
 493                 }
 494         }
 495
 496         pr_warn("dev_remove_offload: %p not found\n", po);
 497 out:
 498         spin_unlock(&offload_lock);
 499 }
 500 EXPORT_SYMBOL(__dev_remove_offload);
 501
 502 /**
 503  *      dev_remove_offload       - remove packet offload handler
 504  *      @po: packet offload declaration
 505  *
 506  *      Remove a packet offload handler that was previously added to the kernel
 507  *      offload handlers by dev_add_offload(). The passed &offload_type is
 508  *      removed from the kernel lists and can be freed or reused once this
 509  *      function returns.
 510  *
 511  *      This call sleeps to guarantee that no CPU is looking at the packet
 512  *      type after return.
 513  */
 514 void dev_remove_offload(struct packet_offload *po)
 515 {
 516         __dev_remove_offload(po);
 517
 518         synchronize_net();
 519 }
 520 EXPORT_SYMBOL(dev_remove_offload);
 521
 522 /******************************************************************************
 523
 524                       Device Boot-time Settings Routines
 525
 526 *******************************************************************************/
 527
/*
 * Boot time configuration table.
 *
 * Filled by netdev_boot_setup_add(); a slot whose name starts with NUL
 * or a blank is treated as unused.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 530
 531 /**
 532  *      netdev_boot_setup_add   - add new setup entry
 533  *      @name: name of the device
 534  *      @map: configured settings for the device
 535  *
 536  *      Adds new setup entry to the dev_boot_setup list.  The function
 537  *      returns 0 on error and 1 on success.  This is a generic routine to
 538  *      all netdevices.
 539  */
 540 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 541 {
 542         struct netdev_boot_setup *s;
 543         int i;
 544
 545         s = dev_boot_setup;
 546         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 547                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 548                         memset(s[i].name, 0, sizeof(s[i].name));
 549                         strlcpy(s[i].name, name, IFNAMSIZ);
 550                         memcpy(&s[i].map, map, sizeof(s[i].map));
 551                         break;
 552                 }
 553         }
 554
 555         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 556 }
 557
 558 /**
 559  *      netdev_boot_setup_check - check boot time settings
 560  *      @dev: the netdevice
 561  *
 562  *      Check boot time settings for the device.
 563  *      The found settings are set for the device to be used
 564  *      later in the device probing.
 565  *      Returns 0 if no settings found, 1 if they are.
 566  */
 567 int netdev_boot_setup_check(struct net_device *dev)
 568 {
 569         struct netdev_boot_setup *s = dev_boot_setup;
 570         int i;
 571
 572         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 573                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 574                     !strcmp(dev->name, s[i].name)) {
 575                         dev->irq        = s[i].map.irq;
 576                         dev->base_addr  = s[i].map.base_addr;
 577                         dev->mem_start  = s[i].map.mem_start;
 578                         dev->mem_end    = s[i].map.mem_end;
 579                         return 1;
 580                 }
 581         }
 582         return 0;
 583 }
 584 EXPORT_SYMBOL(netdev_boot_setup_check);
 585
 586
 587 /**
 588  *      netdev_boot_base        - get address from boot time settings
 589  *      @prefix: prefix for network device
 590  *      @unit: id for network device
 591  *
 592  *      Check boot time settings for the base address of device.
 593  *      The found settings are set for the device to be used
 594  *      later in the device probing.
 595  *      Returns 0 if no settings found.
 596  */
 597 unsigned long netdev_boot_base(const char *prefix, int unit)
 598 {
 599         const struct netdev_boot_setup *s = dev_boot_setup;
 600         char name[IFNAMSIZ];
 601         int i;
 602
 603         sprintf(name, "%s%d", prefix, unit);
 604
 605         /*
 606          * If device already registered then return base of 1
 607          * to indicate not to probe for this interface
 608          */
 609         if (__dev_get_by_name(&init_net, name))
 610                 return 1;
 611
 612         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 613                 if (!strcmp(name, s[i].name))
 614                         return s[i].map.base_addr;
 615         return 0;
 616 }
 617
 618 /*
 619  * Saves at boot time configured settings for any netdevice.
 620  */
 621 int __init netdev_boot_setup(char *str)
 622 {
 623         int ints[5];
 624         struct ifmap map;
 625
 626         str = get_options(str, ARRAY_SIZE(ints), ints);
 627         if (!str || !*str)
 628                 return 0;
 629
 630         /* Save settings */
 631         memset(&map, 0, sizeof(map));
 632         if (ints[0] > 0)
 633                 map.irq = ints[1];
 634         if (ints[0] > 1)
 635                 map.base_addr = ints[2];
 636         if (ints[0] > 2)
 637                 map.mem_start = ints[3];
 638         if (ints[0] > 3)
 639                 map.mem_end = ints[4];
 640
 641         /* Add new entry to the list */
 642         return netdev_boot_setup_add(str, &map);
 643 }
 644
 645 __setup("netdev=", netdev_boot_setup);
 646
 647 /*******************************************************************************
 648
 649                             Device Interface Subroutines
 650
 651 *******************************************************************************/
 652
 653 /**
 654  *      __dev_get_by_name       - find a device by its name
 655  *      @net: the applicable net namespace
 656  *      @name: name to find
 657  *
 658  *      Find an interface by name. Must be called under RTNL semaphore
 659  *      or @dev_base_lock. If the name is found a pointer to the device
 660  *      is returned. If the name is not found then %NULL is returned. The
 661  *      reference counters are not incremented so the caller must be
 662  *      careful with locks.
 663  */
 664
 665 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 666 {
 667         struct net_device *dev;
 668         struct hlist_head *head = dev_name_hash(net, name);
 669
 670         hlist_for_each_entry(dev, head, name_hlist)
 671                 if (!strncmp(dev->name, name, IFNAMSIZ))
 672                         return dev;
 673
 674         return NULL;
 675 }
 676 EXPORT_SYMBOL(__dev_get_by_name);
 677
 678 /**
 679  *      dev_get_by_name_rcu     - find a device by its name
 680  *      @net: the applicable net namespace
 681  *      @name: name to find
 682  *
 683  *      Find an interface by name.
 684  *      If the name is found a pointer to the device is returned.
 685  *      If the name is not found then %NULL is returned.
 686  *      The reference counters are not incremented so the caller must be
 687  *      careful with locks. The caller must hold RCU lock.
 688  */
 689
 690 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 691 {
 692         struct net_device *dev;
 693         struct hlist_head *head = dev_name_hash(net, name);
 694
 695         hlist_for_each_entry_rcu(dev, head, name_hlist)
 696                 if (!strncmp(dev->name, name, IFNAMSIZ))
 697                         return dev;
 698
 699         return NULL;
 700 }
 701 EXPORT_SYMBOL(dev_get_by_name_rcu);
 702
 703 /**
 704  *      dev_get_by_name         - find a device by its name
 705  *      @net: the applicable net namespace
 706  *      @name: name to find
 707  *
 708  *      Find an interface by name. This can be called from any
 709  *      context and does its own locking. The returned handle has
 710  *      the usage count incremented and the caller must use dev_put() to
 711  *      release it when it is no longer needed. %NULL is returned if no
 712  *      matching device is found.
 713  */
 714
 715 struct net_device *dev_get_by_name(struct net *net, const char *name)
 716 {
 717         struct net_device *dev;
 718
 719         rcu_read_lock();
 720         dev = dev_get_by_name_rcu(net, name);
 721         if (dev)
 722                 dev_hold(dev);
 723         rcu_read_unlock();
 724         return dev;
 725 }
 726 EXPORT_SYMBOL(dev_get_by_name);
 727
 728 /**
 729  *      __dev_get_by_index - find a device by its ifindex
 730  *      @net: the applicable net namespace
 731  *      @ifindex: index of device
 732  *
 733  *      Search for an interface by index. Returns %NULL if the device
 734  *      is not found or a pointer to the device. The device has not
 735  *      had its reference counter increased so the caller must be careful
 736  *      about locking. The caller must hold either the RTNL semaphore
 737  *      or @dev_base_lock.
 738  */
 739
 740 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 741 {
 742         struct net_device *dev;
 743         struct hlist_head *head = dev_index_hash(net, ifindex);
 744
 745         hlist_for_each_entry(dev, head, index_hlist)
 746                 if (dev->ifindex == ifindex)
 747                         return dev;
 748
 749         return NULL;
 750 }
 751 EXPORT_SYMBOL(__dev_get_by_index);
 752
 753 /**
 754  *      dev_get_by_index_rcu - find a device by its ifindex
 755  *      @net: the applicable net namespace
 756  *      @ifindex: index of device
 757  *
 758  *      Search for an interface by index. Returns %NULL if the device
 759  *      is not found or a pointer to the device. The device has not
 760  *      had its reference counter increased so the caller must be careful
 761  *      about locking. The caller must hold RCU lock.
 762  */
 763
 764 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 765 {
 766         struct net_device *dev;
 767         struct hlist_head *head = dev_index_hash(net, ifindex);
 768
 769         hlist_for_each_entry_rcu(dev, head, index_hlist)
 770                 if (dev->ifindex == ifindex)
 771                         return dev;
 772
 773         return NULL;
 774 }
 775 EXPORT_SYMBOL(dev_get_by_index_rcu);
 776
 777
 778 /**
 779  *      dev_get_by_index - find a device by its ifindex
 780  *      @net: the applicable net namespace
 781  *      @ifindex: index of device
 782  *
 783  *      Search for an interface by index. Returns NULL if the device
 784  *      is not found or a pointer to the device. The device returned has
 785  *      had a reference added and the pointer is safe until the user calls
 786  *      dev_put to indicate they have finished with it.
 787  */
 788
 789 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 790 {
 791         struct net_device *dev;
 792
 793         rcu_read_lock();
 794         dev = dev_get_by_index_rcu(net, ifindex);
 795         if (dev)
 796                 dev_hold(dev);
 797         rcu_read_unlock();
 798         return dev;
 799 }
 800 EXPORT_SYMBOL(dev_get_by_index);
 801
 802 /**
 803  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 804  *      @net: the applicable net namespace
 805  *      @type: media type of device
 806  *      @ha: hardware address
 807  *
 808  *      Search for an interface by MAC address. Returns NULL if the device
 809  *      is not found or a pointer to the device.
 810  *      The caller must hold RCU or RTNL.
 811  *      The returned device has not had its ref count increased
 812  *      and the caller must therefore be careful about locking
 813  *
 814  */
 815
 816 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 817                                        const char *ha)
 818 {
 819         struct net_device *dev;
 820
 821         for_each_netdev_rcu(net, dev)
 822                 if (dev->type == type &&
 823                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 824                         return dev;
 825
 826         return NULL;
 827 }
 828 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 829
 830 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 831 {
 832         struct net_device *dev;
 833
 834         ASSERT_RTNL();
 835         for_each_netdev(net, dev)
 836                 if (dev->type == type)
 837                         return dev;
 838
 839         return NULL;
 840 }
 841 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 842
 843 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 844 {
 845         struct net_device *dev, *ret = NULL;
 846
 847         rcu_read_lock();
 848         for_each_netdev_rcu(net, dev)
 849                 if (dev->type == type) {
 850                         dev_hold(dev);
 851                         ret = dev;
 852                         break;
 853                 }
 854         rcu_read_unlock();
 855         return ret;
 856 }
 857 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 858
 859 /**
 860  *      dev_get_by_flags_rcu - find any device with given flags
 861  *      @net: the applicable net namespace
 862  *      @if_flags: IFF_* values
 863  *      @mask: bitmask of bits in if_flags to check
 864  *
 865  *      Search for any interface with the given flags. Returns NULL if a device
 866  *      is not found or a pointer to the device. Must be called inside
 867  *      rcu_read_lock(), and result refcount is unchanged.
 868  */
 869
 870 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 871                                     unsigned short mask)
 872 {
 873         struct net_device *dev, *ret;
 874
 875         ret = NULL;
 876         for_each_netdev_rcu(net, dev) {
 877                 if (((dev->flags ^ if_flags) & mask) == 0) {
 878                         ret = dev;
 879                         break;
 880                 }
 881         }
 882         return ret;
 883 }
 884 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 885
 886 /**
 887  *      dev_valid_name - check if name is okay for network device
 888  *      @name: name string
 889  *
 890  *      Network device names need to be valid file names to
 891  *      to allow sysfs to work.  We also disallow any kind of
 892  *      whitespace.
 893  */
 894 bool dev_valid_name(const char *name)
 895 {
 896         if (*name == '\0')
 897                 return false;
 898         if (strlen(name) >= IFNAMSIZ)
 899                 return false;
 900         if (!strcmp(name, ".") || !strcmp(name, ".."))
 901                 return false;
 902
 903         while (*name) {
 904                 if (*name == '/' || isspace(*name))
 905                         return false;
 906                 name++;
 907         }
 908         return true;
 909 }
 910 EXPORT_SYMBOL(dev_valid_name);
 911
 912 /**
 913  *      __dev_alloc_name - allocate a name for a device
 914  *      @net: network namespace to allocate the device name in
 915  *      @name: name format string
 916  *      @buf:  scratch buffer and result name string
 917  *
 918  *      Passed a format string - eg "lt%d" it will try and find a suitable
 919  *      id. It scans list of devices to build up a free map, then chooses
 920  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 921  *      while allocating the name and adding the device in order to avoid
 922  *      duplicates.
 923  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 924  *      Returns the number of the unit assigned or a negative errno code.
 925  */
 926
 927 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 928 {
 929         int i = 0;
 930         const char *p;
 931         const int max_netdevices = 8*PAGE_SIZE;
 932         unsigned long *inuse;
 933         struct net_device *d;
 934
 935         p = strnchr(name, IFNAMSIZ-1, '%');
 936         if (p) {
 937                 /*
 938                  * Verify the string as this thing may have come from
 939                  * the user.  There must be either one "%d" and no other "%"
 940                  * characters.
 941                  */
 942                 if (p[1] != 'd' || strchr(p + 2, '%'))
 943                         return -EINVAL;
 944
 945                 /* Use one page as a bit array of possible slots */
 946                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 947                 if (!inuse)
 948                         return -ENOMEM;
 949
 950                 for_each_netdev(net, d) {
 951                         if (!sscanf(d->name, name, &i))
 952                                 continue;
 953                         if (i < 0 || i >= max_netdevices)
 954                                 continue;
 955
 956                         /*  avoid cases where sscanf is not exact inverse of printf */
 957                         snprintf(buf, IFNAMSIZ, name, i);
 958                         if (!strncmp(buf, d->name, IFNAMSIZ))
 959                                 set_bit(i, inuse);
 960                 }
 961
 962                 i = find_first_zero_bit(inuse, max_netdevices);
 963                 free_page((unsigned long) inuse);
 964         }
 965
 966         if (buf != name)
 967                 snprintf(buf, IFNAMSIZ, name, i);
 968         if (!__dev_get_by_name(net, buf))
 969                 return i;
 970
 971         /* It is possible to run out of possible slots
 972          * when the name is long and there isn't enough space left
 973          * for the digits, or if all bits are used.
 974          */
 975         return -ENFILE;
 976 }
 977
 978 /**
 979  *      dev_alloc_name - allocate a name for a device
 980  *      @dev: device
 981  *      @name: name format string
 982  *
 983  *      Passed a format string - eg "lt%d" it will try and find a suitable
 984  *      id. It scans list of devices to build up a free map, then chooses
 985  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 986  *      while allocating the name and adding the device in order to avoid
 987  *      duplicates.
 988  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 989  *      Returns the number of the unit assigned or a negative errno code.
 990  */
 991
 992 int dev_alloc_name(struct net_device *dev, const char *name)
 993 {
 994         char buf[IFNAMSIZ];
 995         struct net *net;
 996         int ret;
 997
 998         BUG_ON(!dev_net(dev));
 999         net = dev_net(dev);
1000         ret = __dev_alloc_name(net, name, buf);
1001         if (ret >= 0)
1002                 strlcpy(dev->name, buf, IFNAMSIZ);
1003         return ret;
1004 }
1005 EXPORT_SYMBOL(dev_alloc_name);
1006
1007 static int dev_alloc_name_ns(struct net *net,
1008                              struct net_device *dev,
1009                              const char *name)
1010 {
1011         char buf[IFNAMSIZ];
1012         int ret;
1013
1014         ret = __dev_alloc_name(net, name, buf);
1015         if (ret >= 0)
1016                 strlcpy(dev->name, buf, IFNAMSIZ);
1017         return ret;
1018 }
1019
1020 static int dev_get_valid_name(struct net *net,
1021                               struct net_device *dev,
1022                               const char *name)
1023 {
1024         BUG_ON(!net);
1025
1026         if (!dev_valid_name(name))
1027                 return -EINVAL;
1028
1029         if (strchr(name, '%'))
1030                 return dev_alloc_name_ns(net, dev, name);
1031         else if (__dev_get_by_name(net, name))
1032                 return -EEXIST;
1033         else if (dev->name != name)
1034                 strlcpy(dev->name, name, IFNAMSIZ);
1035
1036         return 0;
1037 }
1038
1039 /**
1040  *      dev_change_name - change name of a device
1041  *      @dev: device
1042  *      @newname: name (or format string) must be at least IFNAMSIZ
1043  *
1044  *      Change name of a device, can pass format strings "eth%d".
1045  *      for wildcarding.
1046  */
1047 int dev_change_name(struct net_device *dev, const char *newname)
1048 {
1049         char oldname[IFNAMSIZ];
1050         int err = 0;
1051         int ret;
1052         struct net *net;
1053
1054         ASSERT_RTNL();
1055         BUG_ON(!dev_net(dev));
1056
1057         net = dev_net(dev);
1058         if (dev->flags & IFF_UP)
1059                 return -EBUSY;
1060
1061         write_seqcount_begin(&devnet_rename_seq);
1062
1063         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1064                 write_seqcount_end(&devnet_rename_seq);
1065                 return 0;
1066         }
1067
1068         memcpy(oldname, dev->name, IFNAMSIZ);
1069
1070         err = dev_get_valid_name(net, dev, newname);
1071         if (err < 0) {
1072                 write_seqcount_end(&devnet_rename_seq);
1073                 return err;
1074         }
1075
1076 rollback:
1077         ret = device_rename(&dev->dev, dev->name);
1078         if (ret) {
1079                 memcpy(dev->name, oldname, IFNAMSIZ);
1080                 write_seqcount_end(&devnet_rename_seq);
1081                 return ret;
1082         }
1083
1084         write_seqcount_end(&devnet_rename_seq);
1085
1086         write_lock_bh(&dev_base_lock);
1087         hlist_del_rcu(&dev->name_hlist);
1088         write_unlock_bh(&dev_base_lock);
1089
1090         synchronize_rcu();
1091
1092         write_lock_bh(&dev_base_lock);
1093         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1094         write_unlock_bh(&dev_base_lock);
1095
1096         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1097         ret = notifier_to_errno(ret);
1098
1099         if (ret) {
1100                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1101                 if (err >= 0) {
1102                         err = ret;
1103                         write_seqcount_begin(&devnet_rename_seq);
1104                         memcpy(dev->name, oldname, IFNAMSIZ);
1105                         goto rollback;
1106                 } else {
1107                         pr_err("%s: name change rollback failed: %d\n",
1108                                dev->name, ret);
1109                 }
1110         }
1111
1112         return err;
1113 }
1114
1115 /**
1116  *      dev_set_alias - change ifalias of a device
1117  *      @dev: device
1118  *      @alias: name up to IFALIASZ
1119  *      @len: limit of bytes to copy from info
1120  *
1121  *      Set ifalias for a device,
1122  */
1123 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1124 {
1125         char *new_ifalias;
1126
1127         ASSERT_RTNL();
1128
1129         if (len >= IFALIASZ)
1130                 return -EINVAL;
1131
1132         if (!len) {
1133                 kfree(dev->ifalias);
1134                 dev->ifalias = NULL;
1135                 return 0;
1136         }
1137
1138         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1139         if (!new_ifalias)
1140                 return -ENOMEM;
1141         dev->ifalias = new_ifalias;
1142
1143         strlcpy(dev->ifalias, alias, len+1);
1144         return len;
1145 }
1146
1147
1148 /**
1149  *      netdev_features_change - device changes features
1150  *      @dev: device to cause notification
1151  *
1152  *      Called to indicate a device has changed features.
1153  */
1154 void netdev_features_change(struct net_device *dev)
1155 {
1156         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1157 }
1158 EXPORT_SYMBOL(netdev_features_change);
1159
1160 /**
1161  *      netdev_state_change - device changes state
1162  *      @dev: device to cause notification
1163  *
1164  *      Called to indicate a device has changed state. This function calls
1165  *      the notifier chains for netdev_chain and sends a NEWLINK message
1166  *      to the routing socket.
1167  */
1168 void netdev_state_change(struct net_device *dev)
1169 {
1170         if (dev->flags & IFF_UP) {
1171                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1172                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1173         }
1174 }
1175 EXPORT_SYMBOL(netdev_state_change);
1176
1177 /**
1178  *      netdev_notify_peers - notify network peers about existence of @dev
1179  *      @dev: network device
1180  *
1181  * Generate traffic such that interested network peers are aware of
1182  * @dev, such as by generating a gratuitous ARP. This may be used when
1183  * a device wants to inform the rest of the network about some sort of
1184  * reconfiguration such as a failover event or virtual machine
1185  * migration.
1186  */
1187 void netdev_notify_peers(struct net_device *dev)
1188 {
1189         rtnl_lock();
1190         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1191         rtnl_unlock();
1192 }
1193 EXPORT_SYMBOL(netdev_notify_peers);
1194
1195 static int __dev_open(struct net_device *dev)
1196 {
1197         const struct net_device_ops *ops = dev->netdev_ops;
1198         int ret;
1199
1200         ASSERT_RTNL();
1201
1202         if (!netif_device_present(dev))
1203                 return -ENODEV;
1204
1205         /* Block netpoll from trying to do any rx path servicing.
1206          * If we don't do this there is a chance ndo_poll_controller
1207          * or ndo_poll may be running while we open the device
1208          */
1209         netpoll_rx_disable(dev);
1210
1211         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1212         ret = notifier_to_errno(ret);
1213         if (ret)
1214                 return ret;
1215
1216         set_bit(__LINK_STATE_START, &dev->state);
1217
1218         if (ops->ndo_validate_addr)
1219                 ret = ops->ndo_validate_addr(dev);
1220
1221         if (!ret && ops->ndo_open)
1222                 ret = ops->ndo_open(dev);
1223
1224         netpoll_rx_enable(dev);
1225
1226         if (ret)
1227                 clear_bit(__LINK_STATE_START, &dev->state);
1228         else {
1229                 dev->flags |= IFF_UP;
1230                 net_dmaengine_get();
1231                 dev_set_rx_mode(dev);
1232                 dev_activate(dev);
1233                 add_device_randomness(dev->dev_addr, dev->addr_len);
1234         }
1235
1236         return ret;
1237 }
1238
1239 /**
1240  *      dev_open        - prepare an interface for use.
1241  *      @dev:   device to open
1242  *
1243  *      Takes a device from down to up state. The device's private open
1244  *      function is invoked and then the multicast lists are loaded. Finally
1245  *      the device is moved into the up state and a %NETDEV_UP message is
1246  *      sent to the netdev notifier chain.
1247  *
1248  *      Calling this function on an active interface is a nop. On a failure
1249  *      a negative errno code is returned.
1250  */
1251 int dev_open(struct net_device *dev)
1252 {
1253         int ret;
1254
1255         if (dev->flags & IFF_UP)
1256                 return 0;
1257
1258         ret = __dev_open(dev);
1259         if (ret < 0)
1260                 return ret;
1261
1262         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1263         call_netdevice_notifiers(NETDEV_UP, dev);
1264
1265         return ret;
1266 }
1267 EXPORT_SYMBOL(dev_open);
1268
1269 static int __dev_close_many(struct list_head *head)
1270 {
1271         struct net_device *dev;
1272
1273         ASSERT_RTNL();
1274         might_sleep();
1275
1276         list_for_each_entry(dev, head, unreg_list) {
1277                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1278
1279                 clear_bit(__LINK_STATE_START, &dev->state);
1280
1281                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1282                  * can be even on different cpu. So just clear netif_running().
1283                  *
1284                  * dev->stop() will invoke napi_disable() on all of it's
1285                  * napi_struct instances on this device.
1286                  */
1287                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1288         }
1289
1290         dev_deactivate_many(head);
1291
1292         list_for_each_entry(dev, head, unreg_list) {
1293                 const struct net_device_ops *ops = dev->netdev_ops;
1294
1295                 /*
1296                  *      Call the device specific close. This cannot fail.
1297                  *      Only if device is UP
1298                  *
1299                  *      We allow it to be called even after a DETACH hot-plug
1300                  *      event.
1301                  */
1302                 if (ops->ndo_stop)
1303                         ops->ndo_stop(dev);
1304
1305                 dev->flags &= ~IFF_UP;
1306                 net_dmaengine_put();
1307         }
1308
1309         return 0;
1310 }
1311
1312 static int __dev_close(struct net_device *dev)
1313 {
1314         int retval;
1315         LIST_HEAD(single);
1316
1317         /* Temporarily disable netpoll until the interface is down */
1318         netpoll_rx_disable(dev);
1319
1320         list_add(&dev->unreg_list, &single);
1321         retval = __dev_close_many(&single);
1322         list_del(&single);
1323
1324         netpoll_rx_enable(dev);
1325         return retval;
1326 }
1327
1328 static int dev_close_many(struct list_head *head)
1329 {
1330         struct net_device *dev, *tmp;
1331         LIST_HEAD(tmp_list);
1332
1333         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1334                 if (!(dev->flags & IFF_UP))
1335                         list_move(&dev->unreg_list, &tmp_list);
1336
1337         __dev_close_many(head);
1338
1339         list_for_each_entry(dev, head, unreg_list) {
1340                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1341                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1342         }
1343
1344         /* rollback_registered_many needs the complete original list */
1345         list_splice(&tmp_list, head);
1346         return 0;
1347 }
1348
1349 /**
1350  *      dev_close - shutdown an interface.
1351  *      @dev: device to shutdown
1352  *
1353  *      This function moves an active device into down state. A
1354  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1355  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1356  *      chain.
1357  */
1358 int dev_close(struct net_device *dev)
1359 {
1360         if (dev->flags & IFF_UP) {
1361                 LIST_HEAD(single);
1362
1363                 /* Block netpoll rx while the interface is going down */
1364                 netpoll_rx_disable(dev);
1365
1366                 list_add(&dev->unreg_list, &single);
1367                 dev_close_many(&single);
1368                 list_del(&single);
1369
1370                 netpoll_rx_enable(dev);
1371         }
1372         return 0;
1373 }
1374 EXPORT_SYMBOL(dev_close);
1375
1376
1377 /**
1378  *      dev_disable_lro - disable Large Receive Offload on a device
1379  *      @dev: device
1380  *
1381  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1382  *      called under RTNL.  This is needed if received packets may be
1383  *      forwarded to another interface.
1384  */
1385 void dev_disable_lro(struct net_device *dev)
1386 {
1387         /*
1388          * If we're trying to disable lro on a vlan device
1389          * use the underlying physical device instead
1390          */
1391         if (is_vlan_dev(dev))
1392                 dev = vlan_dev_real_dev(dev);
1393
1394         dev->wanted_features &= ~NETIF_F_LRO;
1395         netdev_update_features(dev);
1396
1397         if (unlikely(dev->features & NETIF_F_LRO))
1398                 netdev_WARN(dev, "failed to disable LRO!\n");
1399 }
1400 EXPORT_SYMBOL(dev_disable_lro);
1401
1402 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1403                                    struct net_device *dev)
1404 {
1405         struct netdev_notifier_info info;
1406
1407         netdev_notifier_info_init(&info, dev);
1408         return nb->notifier_call(nb, val, &info);
1409 }
1410
1411 static int dev_boot_phase = 1;
1412
1413 /**
1414  *      register_netdevice_notifier - register a network notifier block
1415  *      @nb: notifier
1416  *
1417  *      Register a notifier to be called when network device events occur.
1418  *      The notifier passed is linked into the kernel structures and must
1419  *      not be reused until it has been unregistered. A negative errno code
1420  *      is returned on a failure.
1421  *
1422  *      When registered all registration and up events are replayed
1423  *      to the new notifier to allow device to have a race free
1424  *      view of the network device list.
1425  */
1426
1427 int register_netdevice_notifier(struct notifier_block *nb)
1428 {
1429         struct net_device *dev;
1430         struct net_device *last;
1431         struct net *net;
1432         int err;
1433
1434         rtnl_lock();
1435         err = raw_notifier_chain_register(&netdev_chain, nb);
1436         if (err)
1437                 goto unlock;
1438         if (dev_boot_phase)
1439                 goto unlock;
1440         for_each_net(net) {
1441                 for_each_netdev(net, dev) {
1442                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1443                         err = notifier_to_errno(err);
1444                         if (err)
1445                                 goto rollback;
1446
1447                         if (!(dev->flags & IFF_UP))
1448                                 continue;
1449
1450                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1451                 }
1452         }
1453
1454 unlock:
1455         rtnl_unlock();
1456         return err;
1457
1458 rollback:
1459         last = dev;
1460         for_each_net(net) {
1461                 for_each_netdev(net, dev) {
1462                         if (dev == last)
1463                                 goto outroll;
1464
1465                         if (dev->flags & IFF_UP) {
1466                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1467                                                         dev);
1468                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1469                         }
1470                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1471                 }
1472         }
1473
1474 outroll:
1475         raw_notifier_chain_unregister(&netdev_chain, nb);
1476         goto unlock;
1477 }
1478 EXPORT_SYMBOL(register_netdevice_notifier);
1479
1480 /**
1481  *      unregister_netdevice_notifier - unregister a network notifier block
1482  *      @nb: notifier
1483  *
1484  *      Unregister a notifier previously registered by
1485  *      register_netdevice_notifier(). The notifier is unlinked into the
1486  *      kernel structures and may then be reused. A negative errno code
1487  *      is returned on a failure.
1488  *
1489  *      After unregistering unregister and down device events are synthesized
1490  *      for all devices on the device list to the removed notifier to remove
1491  *      the need for special case cleanup code.
1492  */
1493
1494 int unregister_netdevice_notifier(struct notifier_block *nb)
1495 {
1496         struct net_device *dev;
1497         struct net *net;
1498         int err;
1499
1500         rtnl_lock();
1501         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1502         if (err)
1503                 goto unlock;
1504
1505         for_each_net(net) {
1506                 for_each_netdev(net, dev) {
1507                         if (dev->flags & IFF_UP) {
1508                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1509                                                         dev);
1510                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1511                         }
1512                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1513                 }
1514         }
1515 unlock:
1516         rtnl_unlock();
1517         return err;
1518 }
1519 EXPORT_SYMBOL(unregister_netdevice_notifier);
1520
1521 /**
1522  *      call_netdevice_notifiers_info - call all network notifier blocks
1523  *      @val: value passed unmodified to notifier function
1524  *      @dev: net_device pointer passed unmodified to notifier function
1525  *      @info: notifier information data
1526  *
1527  *      Call all network notifier blocks.  Parameters and return value
1528  *      are as for raw_notifier_call_chain().
1529  */
1530
1531 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1532                                   struct netdev_notifier_info *info)
1533 {
1534         ASSERT_RTNL();
1535         netdev_notifier_info_init(info, dev);
1536         return raw_notifier_call_chain(&netdev_chain, val, info);
1537 }
1538 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1539
1540 /**
1541  *      call_netdevice_notifiers - call all network notifier blocks
1542  *      @val: value passed unmodified to notifier function
1543  *      @dev: net_device pointer passed unmodified to notifier function
1544  *
1545  *      Call all network notifier blocks.  Parameters and return value
1546  *      are as for raw_notifier_call_chain().
1547  */
1548
1549 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1550 {
1551         struct netdev_notifier_info info;
1552
1553         return call_netdevice_notifiers_info(val, dev, &info);
1554 }
1555 EXPORT_SYMBOL(call_netdevice_notifiers);
1556
1557 static struct static_key netstamp_needed __read_mostly;
1558 #ifdef HAVE_JUMP_LABEL
1559 /* We are not allowed to call static_key_slow_dec() from irq context
1560  * If net_disable_timestamp() is called from irq context, defer the
1561  * static_key_slow_dec() calls.
1562  */
1563 static atomic_t netstamp_needed_deferred;
1564 #endif
1565
1566 void net_enable_timestamp(void)
1567 {
1568 #ifdef HAVE_JUMP_LABEL
1569         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1570
1571         if (deferred) {
1572                 while (--deferred)
1573                         static_key_slow_dec(&netstamp_needed);
1574                 return;
1575         }
1576 #endif
1577         static_key_slow_inc(&netstamp_needed);
1578 }
1579 EXPORT_SYMBOL(net_enable_timestamp);
1580
1581 void net_disable_timestamp(void)
1582 {
1583 #ifdef HAVE_JUMP_LABEL
1584         if (in_interrupt()) {
1585                 atomic_inc(&netstamp_needed_deferred);
1586                 return;
1587         }
1588 #endif
1589         static_key_slow_dec(&netstamp_needed);
1590 }
1591 EXPORT_SYMBOL(net_disable_timestamp);
1592
1593 static inline void net_timestamp_set(struct sk_buff *skb)
1594 {
1595         skb->tstamp.tv64 = 0;
1596         if (static_key_false(&netstamp_needed))
1597                 __net_timestamp(skb);
1598 }
1599
/*
 * Timestamp SKB if timestamping is enabled, COND holds and SKB does not
 * carry a timestamp yet. Wrapped in do { } while (0) so the macro acts
 * as a single statement (safe inside an unbraced if/else) and does not
 * end in a line continuation. Use it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1605
1606 static inline bool is_skb_forwardable(struct net_device *dev,
1607                                       struct sk_buff *skb)
1608 {
1609         unsigned int len;
1610
1611         if (!(dev->flags & IFF_UP))
1612                 return false;
1613
1614         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1615         if (skb->len <= len)
1616                 return true;
1617
1618         /* if TSO is enabled, we don't care about the length as the packet
1619          * could be forwarded without being segmented before
1620          */
1621         if (skb_is_gso(skb))
1622                 return true;
1623
1624         return false;
1625 }
1626
1627 /**
1628  * dev_forward_skb - loopback an skb to another netif
1629  *
1630  * @dev: destination network device
1631  * @skb: buffer to forward
1632  *
1633  * return values:
1634  *      NET_RX_SUCCESS  (no congestion)
1635  *      NET_RX_DROP     (packet was dropped, but freed)
1636  *
1637  * dev_forward_skb can be used for injecting an skb from the
1638  * start_xmit function of one device into the receive queue
1639  * of another device.
1640  *
1641  * The receiving device may be in another namespace, so
1642  * we have to clear all information in the skb that could
1643  * impact namespace isolation.
1644  */
1645 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1646 {
1647         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1648                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1649                         atomic_long_inc(&dev->rx_dropped);
1650                         kfree_skb(skb);
1651                         return NET_RX_DROP;
1652                 }
1653         }
1654
1655         if (unlikely(!is_skb_forwardable(dev, skb))) {
1656                 atomic_long_inc(&dev->rx_dropped);
1657                 kfree_skb(skb);
1658                 return NET_RX_DROP;
1659         }
1660         skb_scrub_packet(skb);
1661         skb->protocol = eth_type_trans(skb, dev);
1662
1663         /* eth_type_trans() can set pkt_type.
1664          * clear pkt_type _after_ calling eth_type_trans()
1665          */
1666         skb->pkt_type = PACKET_HOST;
1667
1668         return netif_rx(skb);
1669 }
1670 EXPORT_SYMBOL_GPL(dev_forward_skb);
1671
1672 static inline int deliver_skb(struct sk_buff *skb,
1673                               struct packet_type *pt_prev,
1674                               struct net_device *orig_dev)
1675 {
1676         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1677                 return -ENOMEM;
1678         atomic_inc(&skb->users);
1679         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1680 }
1681
1682 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1683 {
1684         if (!ptype->af_packet_priv || !skb->sk)
1685                 return false;
1686
1687         if (ptype->id_match)
1688                 return ptype->id_match(ptype, skb->sk);
1689         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1690                 return true;
1691
1692         return false;
1693 }
1694
1695 /*
1696  *      Support routine. Sends outgoing frames to any network
1697  *      taps currently in use.
1698  */
1699
1700 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1701 {
1702         struct packet_type *ptype;
1703         struct sk_buff *skb2 = NULL;
1704         struct packet_type *pt_prev = NULL;
1705
1706         rcu_read_lock();
1707         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1708                 /* Never send packets back to the socket
1709                  * they originated from - MvS ([email protected])
1710                  */
1711                 if ((ptype->dev == dev || !ptype->dev) &&
1712                     (!skb_loop_sk(ptype, skb))) {
1713                         if (pt_prev) {
1714                                 deliver_skb(skb2, pt_prev, skb->dev);
1715                                 pt_prev = ptype;
1716                                 continue;
1717                         }
1718
1719                         skb2 = skb_clone(skb, GFP_ATOMIC);
1720                         if (!skb2)
1721                                 break;
1722
1723                         net_timestamp_set(skb2);
1724
1725                         /* skb->nh should be correctly
1726                            set by sender, so that the second statement is
1727                            just protection against buggy protocols.
1728                          */
1729                         skb_reset_mac_header(skb2);
1730
1731                         if (skb_network_header(skb2) < skb2->data ||
1732                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1733                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1734                                                      ntohs(skb2->protocol),
1735                                                      dev->name);
1736                                 skb_reset_network_header(skb2);
1737                         }
1738
1739                         skb2->transport_header = skb2->network_header;
1740                         skb2->pkt_type = PACKET_OUTGOING;
1741                         pt_prev = ptype;
1742                 }
1743         }
1744         if (pt_prev)
1745                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1746         rcu_read_unlock();
1747 }
1748
1749 /**
1750  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1751  * @dev: Network device
1752  * @txq: number of queues available
1753  *
1754  * If real_num_tx_queues is changed the tc mappings may no longer be
1755  * valid. To resolve this verify the tc mapping remains valid and if
1756  * not NULL the mapping. With no priorities mapping to this
1757  * offset/count pair it will no longer be used. In the worst case TC0
1758  * is invalid nothing can be done so disable priority mappings. If is
1759  * expected that drivers will fix this mapping if they can before
1760  * calling netif_set_real_num_tx_queues.
1761  */
1762 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1763 {
1764         int i;
1765         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1766
1767         /* If TC0 is invalidated disable TC mapping */
1768         if (tc->offset + tc->count > txq) {
1769                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1770                 dev->num_tc = 0;
1771                 return;
1772         }
1773
1774         /* Invalidated prio to tc mappings set to TC0 */
1775         for (i = 1; i < TC_BITMASK + 1; i++) {
1776                 int q = netdev_get_prio_tc_map(dev, i);
1777
1778                 tc = &dev->tc_to_txq[q];
1779                 if (tc->offset + tc->count > txq) {
1780                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1781                                 i, q);
1782                         netdev_set_prio_tc_map(dev, i, 0);
1783                 }
1784         }
1785 }
1786
1787 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS map pointer on the update side; the lockdep
 * expression documents that xps_map_mutex is expected to be held.
 */
#define xmap_dereference(P)             \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1791
1792 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1793                                         int cpu, u16 index)
1794 {
1795         struct xps_map *map = NULL;
1796         int pos;
1797
1798         if (dev_maps)
1799                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1800
1801         for (pos = 0; map && pos < map->len; pos++) {
1802                 if (map->queues[pos] == index) {
1803                         if (map->len > 1) {
1804                                 map->queues[pos] = map->queues[--map->len];
1805                         } else {
1806                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1807                                 kfree_rcu(map, rcu);
1808                                 map = NULL;
1809                         }
1810                         break;
1811                 }
1812         }
1813
1814         return map;
1815 }
1816
1817 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1818 {
1819         struct xps_dev_maps *dev_maps;
1820         int cpu, i;
1821         bool active = false;
1822
1823         mutex_lock(&xps_map_mutex);
1824         dev_maps = xmap_dereference(dev->xps_maps);
1825
1826         if (!dev_maps)
1827                 goto out_no_maps;
1828
1829         for_each_possible_cpu(cpu) {
1830                 for (i = index; i < dev->num_tx_queues; i++) {
1831                         if (!remove_xps_queue(dev_maps, cpu, i))
1832                                 break;
1833                 }
1834                 if (i == dev->num_tx_queues)
1835                         active = true;
1836         }
1837
1838         if (!active) {
1839                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1840                 kfree_rcu(dev_maps, rcu);
1841         }
1842
1843         for (i = index; i < dev->num_tx_queues; i++)
1844                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1845                                              NUMA_NO_NODE);
1846
1847 out_no_maps:
1848         mutex_unlock(&xps_map_mutex);
1849 }
1850
1851 static struct xps_map *expand_xps_map(struct xps_map *map,
1852                                       int cpu, u16 index)
1853 {
1854         struct xps_map *new_map;
1855         int alloc_len = XPS_MIN_MAP_ALLOC;
1856         int i, pos;
1857
1858         for (pos = 0; map && pos < map->len; pos++) {
1859                 if (map->queues[pos] != index)
1860                         continue;
1861                 return map;
1862         }
1863
1864         /* Need to add queue to this CPU's existing map */
1865         if (map) {
1866                 if (pos < map->alloc_len)
1867                         return map;
1868
1869                 alloc_len = map->alloc_len * 2;
1870         }
1871
1872         /* Need to allocate new map to store queue on this CPU's map */
1873         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1874                                cpu_to_node(cpu));
1875         if (!new_map)
1876                 return NULL;
1877
1878         for (i = 0; i < pos; i++)
1879                 new_map->queues[i] = map->queues[i];
1880         new_map->alloc_len = alloc_len;
1881         new_map->len = pos;
1882
1883         return new_map;
1884 }
1885
/*
 * netif_set_xps_queue - make tx queue @index usable from the CPUs in @mask
 *
 * Builds a new device-level XPS map in which every online CPU of @mask
 * lists @index, publishes it in dev->xps_maps, releases the superseded
 * maps via kfree_rcu(), and removes @index from every CPU that is not
 * both in @mask and online.  The NUMA node of the tx queue is set to the
 * node shared by all selected CPUs, or NUMA_NO_NODE when they differ
 * (or when CONFIG_NUMA is off / no CPU was selected).
 *
 * The whole update runs under xps_map_mutex.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails; on failure the
 * maps allocated by this call are freed and dev->xps_maps is unchanged.
 */
int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
        struct xps_map *map, *new_map;
        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
        /* -2: no CPU seen yet, -1: selected CPUs span several nodes */
        int cpu, numa_node_id = -2;
        bool active = false;

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps);

        /* allocate memory for queue storage */
        for_each_online_cpu(cpu) {
                if (!cpumask_test_cpu(cpu, mask))
                        continue;

                /* The device-level map is allocated lazily, on the first
                 * selected CPU.
                 */
                if (!new_dev_maps)
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
                        mutex_unlock(&xps_map_mutex);
                        return -ENOMEM;
                }

                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;

                /* May hand back the existing map when it has room. */
                map = expand_xps_map(map, cpu, index);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
        }

        /* No online CPU in @mask: nothing to add, only removals remain. */
        if (!new_dev_maps)
                goto out_no_new_maps;

        for_each_possible_cpu(cpu) {
                /* NOTE(review): this re-tests cpu_online() and then uses
                 * new_dev_maps->cpu_map[cpu] without a NULL check; that
                 * relies on the online set being the same as in the
                 * allocation loop above.  Nothing visible here prevents
                 * CPU hotplug in between - confirm.
                 */
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
                        /* add queue to CPU maps */
                        int pos = 0;

                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (numa_node_id == -2)
                                numa_node_id = cpu_to_node(cpu);
                        else if (numa_node_id != cpu_to_node(cpu))
                                numa_node_id = -1;
#endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
                }

        }

        /* Publish the fully populated map. */
        rcu_assign_pointer(dev->xps_maps, new_dev_maps);

        /* Cleanup old maps */
        if (dev_maps) {
                for_each_possible_cpu(cpu) {
                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        /* Per-CPU maps carried over unchanged stay alive. */
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }

                kfree_rcu(dev_maps, rcu);
        }

        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        /* update Tx queue numa node */
        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                     (numa_node_id >= 0) ? numa_node_id :
                                     NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes queue from unused CPUs */
        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
                        continue;

                if (remove_xps_queue(dev_maps, cpu, index))
                        active = true;
        }

        /* free map if not active */
        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for_each_possible_cpu(cpu) {
                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;
                /* Only maps newly allocated by expand_xps_map() are ours;
                 * they were never published, so plain kfree() is used.
                 */
                if (new_map && new_map != map)
                        kfree(new_map);
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2009
2010 #endif
2011 /*
2012  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2013  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2014  */
2015 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2016 {
2017         int rc;
2018
2019         if (txq < 1 || txq > dev->num_tx_queues)
2020                 return -EINVAL;
2021
2022         if (dev->reg_state == NETREG_REGISTERED ||
2023             dev->reg_state == NETREG_UNREGISTERING) {
2024                 ASSERT_RTNL();
2025
2026                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2027                                                   txq);
2028                 if (rc)
2029                         return rc;
2030
2031                 if (dev->num_tc)
2032                         netif_setup_tc(dev, txq);
2033
2034                 if (txq < dev->real_num_tx_queues) {
2035                         qdisc_reset_all_tx_gt(dev, txq);
2036 #ifdef CONFIG_XPS
2037                         netif_reset_xps_queues_gt(dev, txq);
2038 #endif
2039                 }
2040         }
2041
2042         dev->real_num_tx_queues = txq;
2043         return 0;
2044 }
2045 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2046
2047 #ifdef CONFIG_RPS
2048 /**
2049  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2050  *      @dev: Network device
2051  *      @rxq: Actual number of RX queues
2052  *
2053  *      This must be called either with the rtnl_lock held or before
2054  *      registration of the net device.  Returns 0 on success, or a
2055  *      negative error code.  If called before registration, it always
2056  *      succeeds.
2057  */
2058 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2059 {
2060         int rc;
2061
2062         if (rxq < 1 || rxq > dev->num_rx_queues)
2063                 return -EINVAL;
2064
2065         if (dev->reg_state == NETREG_REGISTERED) {
2066                 ASSERT_RTNL();
2067
2068                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2069                                                   rxq);
2070                 if (rc)
2071                         return rc;
2072         }
2073
2074         dev->real_num_rx_queues = rxq;
2075         return 0;
2076 }
2077 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2078 #endif
2079
2080 /**
2081  * netif_get_num_default_rss_queues - default number of RSS queues
2082  *
2083  * This routine should set an upper limit on the number of RSS queues
2084  * used by default by multiqueue devices.
2085  */
2086 int netif_get_num_default_rss_queues(void)
2087 {
2088         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2089 }
2090 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2091
2092 static inline void __netif_reschedule(struct Qdisc *q)
2093 {
2094         struct softnet_data *sd;
2095         unsigned long flags;
2096
2097         local_irq_save(flags);
2098         sd = &__get_cpu_var(softnet_data);
2099         q->next_sched = NULL;
2100         *sd->output_queue_tailp = q;
2101         sd->output_queue_tailp = &q->next_sched;
2102         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2103         local_irq_restore(flags);
2104 }
2105
2106 void __netif_schedule(struct Qdisc *q)
2107 {
2108         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2109                 __netif_reschedule(q);
2110 }
2111 EXPORT_SYMBOL(__netif_schedule);
2112
2113 void dev_kfree_skb_irq(struct sk_buff *skb)
2114 {
2115         if (atomic_dec_and_test(&skb->users)) {
2116                 struct softnet_data *sd;
2117                 unsigned long flags;
2118
2119                 local_irq_save(flags);
2120                 sd = &__get_cpu_var(softnet_data);
2121                 skb->next = sd->completion_queue;
2122                 sd->completion_queue = skb;
2123                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2124                 local_irq_restore(flags);
2125         }
2126 }
2127 EXPORT_SYMBOL(dev_kfree_skb_irq);
2128
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard-irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2137
2138
2139 /**
2140  * netif_device_detach - mark device as removed
2141  * @dev: network device
2142  *
2143  * Mark device as removed from system and therefore no longer available.
2144  */
2145 void netif_device_detach(struct net_device *dev)
2146 {
2147         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2148             netif_running(dev)) {
2149                 netif_tx_stop_all_queues(dev);
2150         }
2151 }
2152 EXPORT_SYMBOL(netif_device_detach);
2153
2154 /**
2155  * netif_device_attach - mark device as attached
2156  * @dev: network device
2157  *
2158  * Mark device as attached from system and restart if needed.
2159  */
2160 void netif_device_attach(struct net_device *dev)
2161 {
2162         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2163             netif_running(dev)) {
2164                 netif_tx_wake_all_queues(dev);
2165                 __netdev_watchdog_up(dev);
2166         }
2167 }
2168 EXPORT_SYMBOL(netif_device_attach);
2169
2170 static void skb_warn_bad_offload(const struct sk_buff *skb)
2171 {
2172         static const netdev_features_t null_features = 0;
2173         struct net_device *dev = skb->dev;
2174         const char *driver = "";
2175
2176         if (!net_ratelimit())
2177                 return;
2178
2179         if (dev && dev->dev.parent)
2180                 driver = dev_driver_string(dev->dev.parent);
2181
2182         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2183              "gso_type=%d ip_summed=%d\n",
2184              driver, dev ? &dev->features : &null_features,
2185              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2186              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2187              skb_shinfo(skb)->gso_type, skb->ip_summed);
2188 }
2189
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For CHECKSUM_COMPLETE the skb is only re-marked CHECKSUM_NONE.
 * Otherwise the checksum is computed in software from the checksum
 * start offset to the end of the packet and its folded value is stored
 * csum_offset bytes after that start, before marking CHECKSUM_NONE.
 *
 * Returns 0 on success, -EINVAL (after a warning) when the skb has a
 * non-zero gso_size, or the error from __skb_linearize() /
 * pskb_expand_head(); in the error cases ip_summed is left unchanged.
 */
int skb_checksum_help(struct sk_buff *skb)
{
        __wsum csum;
        int ret = 0, offset;

        if (skb->ip_summed == CHECKSUM_COMPLETE)
                goto out_set_summed;

        /* A GSO packet cannot be checksummed as a single unit here. */
        if (unlikely(skb_shinfo(skb)->gso_size)) {
                skb_warn_bad_offload(skb);
                return -EINVAL;
        }

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (skb_has_shared_frag(skb)) {
                ret = __skb_linearize(skb);
                if (ret)
                        goto out;
        }

        /* The checksummed region must start inside the linear area. */
        offset = skb_checksum_start_offset(skb);
        BUG_ON(offset >= skb_headlen(skb));
        csum = skb_checksum(skb, offset, skb->len - offset, 0);

        /* From here on, offset is where the checksum field lives. */
        offset += skb->csum_offset;
        BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

        /* Get a private copy of the header if the cloned data is not
         * writable up to and including the checksum field.
         */
        if (skb_cloned(skb) &&
            !skb_clone_writable(skb, offset + sizeof(__sum16))) {
                ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
                if (ret)
                        goto out;
        }

        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
        skb->ip_summed = CHECKSUM_NONE;
out:
        return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
2237
2238 __be16 skb_network_protocol(struct sk_buff *skb)
2239 {
2240         __be16 type = skb->protocol;
2241         int vlan_depth = ETH_HLEN;
2242
2243         /* Tunnel gso handlers can set protocol to ethernet. */
2244         if (type == htons(ETH_P_TEB)) {
2245                 struct ethhdr *eth;
2246
2247                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2248                         return 0;
2249
2250                 eth = (struct ethhdr *)skb_mac_header(skb);
2251                 type = eth->h_proto;
2252         }
2253
2254         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2255                 struct vlan_hdr *vh;
2256
2257                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2258                         return 0;
2259
2260                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2261                 type = vh->h_vlan_encapsulated_proto;
2262                 vlan_depth += VLAN_HLEN;
2263         }
2264
2265         return type;
2266 }
2267
2268 /**
2269  *      skb_mac_gso_segment - mac layer segmentation handler.
2270  *      @skb: buffer to segment
2271  *      @features: features for the output path (see dev->features)
2272  */
2273 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2274                                     netdev_features_t features)
2275 {
2276         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2277         struct packet_offload *ptype;
2278         __be16 type = skb_network_protocol(skb);
2279
2280         if (unlikely(!type))
2281                 return ERR_PTR(-EINVAL);
2282
2283         __skb_pull(skb, skb->mac_len);
2284
2285         rcu_read_lock();
2286         list_for_each_entry_rcu(ptype, &offload_base, list) {
2287                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2288                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2289                                 int err;
2290
2291                                 err = ptype->callbacks.gso_send_check(skb);
2292                                 segs = ERR_PTR(err);
2293                                 if (err || skb_gso_ok(skb, features))
2294                                         break;
2295                                 __skb_push(skb, (skb->data -
2296                                                  skb_network_header(skb)));
2297                         }
2298                         segs = ptype->callbacks.gso_segment(skb, features);
2299                         break;
2300                 }
2301         }
2302         rcu_read_unlock();
2303
2304         __skb_push(skb, skb->data - skb_mac_header(skb));
2305
2306         return segs;
2307 }
2308 EXPORT_SYMBOL(skb_mac_gso_segment);
2309
2310
2311 /* openvswitch calls this on rx path, so we need a different check.
2312  */
2313 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2314 {
2315         if (tx_path)
2316                 return skb->ip_summed != CHECKSUM_PARTIAL;
2317         else
2318                 return skb->ip_summed == CHECKSUM_NONE;
2319 }
2320
2321 /**
2322  *      __skb_gso_segment - Perform segmentation on skb.
2323  *      @skb: buffer to segment
2324  *      @features: features for the output path (see dev->features)
2325  *      @tx_path: whether it is called in TX path
2326  *
2327  *      This function segments the given skb and returns a list of segments.
2328  *
2329  *      It may return NULL if the skb requires no segmentation.  This is
2330  *      only possible when GSO is used for verifying header integrity.
2331  */
2332 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2333                                   netdev_features_t features, bool tx_path)
2334 {
2335         if (unlikely(skb_needs_check(skb, tx_path))) {
2336                 int err;
2337
2338                 skb_warn_bad_offload(skb);
2339
2340                 if (skb_header_cloned(skb) &&
2341                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2342                         return ERR_PTR(err);
2343         }
2344
2345         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2346         skb_reset_mac_header(skb);
2347         skb_reset_mac_len(skb);
2348
2349         return skb_mac_gso_segment(skb, features);
2350 }
2351 EXPORT_SYMBOL(__skb_gso_segment);
2352
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>") and dump the stack, rate-limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        if (!net_ratelimit())
                return;

        pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2364
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a page fragment that @dev must not be handed
 * (a highmem page without NETIF_F_HIGHDMA, or a page outside the parent
 * device's DMA mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        struct device *parent;
        int n;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (n = 0; n < shinfo->nr_frags; n++) {
                        if (PageHighMem(skb_frag_page(&shinfo->frags[n])))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                parent = dev->dev.parent;
                if (!parent)
                        return 0;

                for (n = 0; n < shinfo->nr_frags; n++) {
                        dma_addr_t addr;

                        addr = page_to_phys(skb_frag_page(&shinfo->frags[n]));
                        if (!parent->dma_mask ||
                            addr + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
2397
/* Private control-block layout used while a software-segmented skb is
 * in flight: keeps the skb's original destructor while
 * dev_gso_skb_destructor is installed in its place.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the skb control buffer (skb->cb) as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2403
2404 static void dev_gso_skb_destructor(struct sk_buff *skb)
2405 {
2406         struct dev_gso_cb *cb;
2407
2408         do {
2409                 struct sk_buff *nskb = skb->next;
2410
2411                 skb->next = nskb->next;
2412                 nskb->next = NULL;
2413                 kfree_skb(nskb);
2414         } while (skb->next);
2415
2416         cb = DEV_GSO_CB(skb);
2417         if (cb->destructor)
2418                 cb->destructor(skb);
2419 }
2420
2421 /**
2422  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2423  *      @skb: buffer to segment
2424  *      @features: device features as applicable to this skb
2425  *
2426  *      This function segments the given skb and stores the list of segments
2427  *      in skb->next.
2428  */
2429 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2430 {
2431         struct sk_buff *segs;
2432
2433         segs = skb_gso_segment(skb, features);
2434
2435         /* Verifying header integrity only. */
2436         if (!segs)
2437                 return 0;
2438
2439         if (IS_ERR(segs))
2440                 return PTR_ERR(segs);
2441
2442         skb->next = segs;
2443         DEV_GSO_CB(skb)->destructor = skb->destructor;
2444         skb->destructor = dev_gso_skb_destructor;
2445
2446         return 0;
2447 }
2448
2449 static netdev_features_t harmonize_features(struct sk_buff *skb,
2450         __be16 protocol, netdev_features_t features)
2451 {
2452         if (skb->ip_summed != CHECKSUM_NONE &&
2453             !can_checksum_protocol(features, protocol)) {
2454                 features &= ~NETIF_F_ALL_CSUM;
2455         } else if (illegal_highdma(skb->dev, skb)) {
2456                 features &= ~NETIF_F_SG;
2457         }
2458
2459         return features;
2460 }
2461
2462 netdev_features_t netif_skb_features(struct sk_buff *skb)
2463 {
2464         __be16 protocol = skb->protocol;
2465         netdev_features_t features = skb->dev->features;
2466
2467         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2468                 features &= ~NETIF_F_GSO_MASK;
2469
2470         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2471                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2472                 protocol = veh->h_vlan_encapsulated_proto;
2473         } else if (!vlan_tx_tag_present(skb)) {
2474                 return harmonize_features(skb, protocol, features);
2475         }
2476
2477         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2478                                                NETIF_F_HW_VLAN_STAG_TX);
2479
2480         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2481                 return harmonize_features(skb, protocol, features);
2482         } else {
2483                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2484                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2485                                 NETIF_F_HW_VLAN_STAG_TX;
2486                 return harmonize_features(skb, protocol, features);
2487         }
2488 }
2489 EXPORT_SYMBOL(netif_skb_features);
2490
2491 /*
2492  * Returns true if either:
2493  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2494  *      2. skb is fragmented and the device does not support SG.
2495  */
2496 static inline int skb_needs_linearize(struct sk_buff *skb,
2497                                       netdev_features_t features)
2498 {
2499         return skb_is_nonlinear(skb) &&
2500                         ((skb_has_frag_list(skb) &&
2501                                 !(features & NETIF_F_FRAGLIST)) ||
2502                         (skb_shinfo(skb)->nr_frags &&
2503                                 !(features & NETIF_F_SG)));
2504 }
2505
2506 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2507                         struct netdev_queue *txq)
2508 {
2509         const struct net_device_ops *ops = dev->netdev_ops;
2510         int rc = NETDEV_TX_OK;
2511         unsigned int skb_len;
2512
2513         if (likely(!skb->next)) {
2514                 netdev_features_t features;
2515
2516                 /*
2517                  * If device doesn't need skb->dst, release it right now while
2518                  * its hot in this cpu cache
2519                  */
2520                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2521                         skb_dst_drop(skb);
2522
2523                 features = netif_skb_features(skb);
2524
2525                 if (vlan_tx_tag_present(skb) &&
2526                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2527                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2528                                              vlan_tx_tag_get(skb));
2529                         if (unlikely(!skb))
2530                                 goto out;
2531
2532                         skb->vlan_tci = 0;
2533                 }
2534
2535                 /* If encapsulation offload request, verify we are testing
2536                  * hardware encapsulation features instead of standard
2537                  * features for the netdev
2538                  */
2539                 if (skb->encapsulation)
2540                         features &= dev->hw_enc_features;
2541
2542                 if (netif_needs_gso(skb, features)) {
2543                         if (unlikely(dev_gso_segment(skb, features)))
2544                                 goto out_kfree_skb;
2545                         if (skb->next)
2546                                 goto gso;
2547                 } else {
2548                         if (skb_needs_linearize(skb, features) &&
2549                             __skb_linearize(skb))
2550                                 goto out_kfree_skb;
2551
2552                         /* If packet is not checksummed and device does not
2553                          * support checksumming for this protocol, complete
2554                          * checksumming here.
2555                          */
2556                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2557                                 if (skb->encapsulation)
2558                                         skb_set_inner_transport_header(skb,
2559                                                 skb_checksum_start_offset(skb));
2560                                 else
2561                                         skb_set_transport_header(skb,
2562                                                 skb_checksum_start_offset(skb));
2563                                 if (!(features & NETIF_F_ALL_CSUM) &&
2564                                      skb_checksum_help(skb))
2565                                         goto out_kfree_skb;
2566                         }
2567                 }
2568
2569                 if (!list_empty(&ptype_all))
2570                         dev_queue_xmit_nit(skb, dev);
2571
2572                 skb_len = skb->len;
2573                 rc = ops->ndo_start_xmit(skb, dev);
2574                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2575                 if (rc == NETDEV_TX_OK)
2576                         txq_trans_update(txq);
2577                 return rc;
2578         }
2579
2580 gso:
2581         do {
2582                 struct sk_buff *nskb = skb->next;
2583
2584                 skb->next = nskb->next;
2585                 nskb->next = NULL;
2586
2587                 if (!list_empty(&ptype_all))
2588                         dev_queue_xmit_nit(nskb, dev);
2589
2590                 skb_len = nskb->len;
2591                 rc = ops->ndo_start_xmit(nskb, dev);
2592                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2593                 if (unlikely(rc != NETDEV_TX_OK)) {
2594                         if (rc & ~NETDEV_TX_MASK)
2595                                 goto out_kfree_gso_skb;
2596                         nskb->next = skb->next;
2597                         skb->next = nskb;
2598                         return rc;
2599                 }
2600                 txq_trans_update(txq);
2601                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2602                         return NETDEV_TX_BUSY;
2603         } while (skb->next);
2604
2605 out_kfree_gso_skb:
2606         if (likely(skb->next == NULL)) {
2607                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2608                 consume_skb(skb);
2609                 return rc;
2610         }
2611 out_kfree_skb:
2612         kfree_skb(skb);
2613 out:
2614         return rc;
2615 }
2616
2617 static void qdisc_pkt_len_init(struct sk_buff *skb)
2618 {
2619         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2620
2621         qdisc_skb_cb(skb)->pkt_len = skb->len;
2622
2623         /* To get more precise estimation of bytes sent on wire,
2624          * we add to pkt_len the headers size of all segments
2625          */
2626         if (shinfo->gso_size)  {
2627                 unsigned int hdr_len;
2628                 u16 gso_segs = shinfo->gso_segs;
2629
2630                 /* mac layer + network layer */
2631                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2632
2633                 /* + transport layer */
2634                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2635                         hdr_len += tcp_hdrlen(skb);
2636                 else
2637                         hdr_len += sizeof(struct udphdr);
2638
2639                 if (shinfo->gso_type & SKB_GSO_DODGY)
2640                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2641                                                 shinfo->gso_size);
2642
2643                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2644         }
2645 }
2646
2647 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2648                                  struct net_device *dev,
2649                                  struct netdev_queue *txq)
2650 {
2651         spinlock_t *root_lock = qdisc_lock(q);
2652         bool contended;
2653         int rc;
2654
2655         qdisc_pkt_len_init(skb);
2656         qdisc_calculate_pkt_len(skb, q);
2657         /*
2658          * Heuristic to force contended enqueues to serialize on a
2659          * separate lock before trying to get qdisc main lock.
2660          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2661          * and dequeue packets faster.
2662          */
2663         contended = qdisc_is_running(q);
2664         if (unlikely(contended))
2665                 spin_lock(&q->busylock);
2666
2667         spin_lock(root_lock);
2668         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2669                 kfree_skb(skb);
2670                 rc = NET_XMIT_DROP;
2671         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2672                    qdisc_run_begin(q)) {
2673                 /*
2674                  * This is a work-conserving queue; there are no old skbs
2675                  * waiting to be sent out; and the qdisc is not running -
2676                  * xmit the skb directly.
2677                  */
2678                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2679                         skb_dst_force(skb);
2680
2681                 qdisc_bstats_update(q, skb);
2682
2683                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2684                         if (unlikely(contended)) {
2685                                 spin_unlock(&q->busylock);
2686                                 contended = false;
2687                         }
2688                         __qdisc_run(q);
2689                 } else
2690                         qdisc_run_end(q);
2691
2692                 rc = NET_XMIT_SUCCESS;
2693         } else {
2694                 skb_dst_force(skb);
2695                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2696                 if (qdisc_run_begin(q)) {
2697                         if (unlikely(contended)) {
2698                                 spin_unlock(&q->busylock);
2699                                 contended = false;
2700                         }
2701                         __qdisc_run(q);
2702                 }
2703         }
2704         spin_unlock(root_lock);
2705         if (unlikely(contended))
2706                 spin_unlock(&q->busylock);
2707         return rc;
2708 }
2709
2710 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2711 static void skb_update_prio(struct sk_buff *skb)
2712 {
2713         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2714
2715         if (!skb->priority && skb->sk && map) {
2716                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2717
2718                 if (prioidx < map->priomap_len)
2719                         skb->priority = map->priomap[prioidx];
2720         }
2721 }
2722 #else
2723 #define skb_update_prio(skb)
2724 #endif
2725
2726 static DEFINE_PER_CPU(int, xmit_recursion);
2727 #define RECURSION_LIMIT 10
2728
2729 /**
2730  *      dev_loopback_xmit - loop back @skb
2731  *      @skb: buffer to transmit
2732  */
2733 int dev_loopback_xmit(struct sk_buff *skb)
2734 {
2735         skb_reset_mac_header(skb);
2736         __skb_pull(skb, skb_network_offset(skb));
2737         skb->pkt_type = PACKET_LOOPBACK;
2738         skb->ip_summed = CHECKSUM_UNNECESSARY;
2739         WARN_ON(!skb_dst(skb));
2740         skb_dst_force(skb);
2741         netif_rx_ni(skb);
2742         return 0;
2743 }
2744 EXPORT_SYMBOL(dev_loopback_xmit);
2745
2746 /**
2747  *      dev_queue_xmit - transmit a buffer
2748  *      @skb: buffer to transmit
2749  *
2750  *      Queue a buffer for transmission to a network device. The caller must
2751  *      have set the device and priority and built the buffer before calling
2752  *      this function. The function can be called from an interrupt.
2753  *
2754  *      A negative errno code is returned on a failure. A success does not
2755  *      guarantee the frame will be transmitted as it may be dropped due
2756  *      to congestion or traffic shaping.
2757  *
2758  * -----------------------------------------------------------------------------------
2759  *      I notice this method can also return errors from the queue disciplines,
2760  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2761  *      be positive.
2762  *
2763  *      Regardless of the return value, the skb is consumed, so it is currently
2764  *      difficult to retry a send to this method.  (You can bump the ref count
2765  *      before sending to hold a reference for retry if you are careful.)
2766  *
2767  *      When calling this method, interrupts MUST be enabled.  This is because
2768  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2769  *          --BLG
2770  */
2771 int dev_queue_xmit(struct sk_buff *skb)
2772 {
2773         struct net_device *dev = skb->dev;
2774         struct netdev_queue *txq;
2775         struct Qdisc *q;
2776         int rc = -ENOMEM;
2777
2778         skb_reset_mac_header(skb);
2779
2780         /* Disable soft irqs for various locks below. Also
2781          * stops preemption for RCU.
2782          */
2783         rcu_read_lock_bh();
2784
2785         skb_update_prio(skb);
2786
2787         txq = netdev_pick_tx(dev, skb);
2788         q = rcu_dereference_bh(txq->qdisc);
2789
2790 #ifdef CONFIG_NET_CLS_ACT
2791         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2792 #endif
2793         trace_net_dev_queue(skb);
2794         if (q->enqueue) {
2795                 rc = __dev_xmit_skb(skb, q, dev, txq);
2796                 goto out;
2797         }
2798
2799         /* The device has no queue. Common case for software devices:
2800            loopback, all the sorts of tunnels...
2801
2802            Really, it is unlikely that netif_tx_lock protection is necessary
2803            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2804            counters.)
2805            However, it is possible, that they rely on protection
2806            made by us here.
2807
2808            Check this and shot the lock. It is not prone from deadlocks.
2809            Either shot noqueue qdisc, it is even simpler 8)
2810          */
2811         if (dev->flags & IFF_UP) {
2812                 int cpu = smp_processor_id(); /* ok because BHs are off */
2813
2814                 if (txq->xmit_lock_owner != cpu) {
2815
2816                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2817                                 goto recursion_alert;
2818
2819                         HARD_TX_LOCK(dev, txq, cpu);
2820
2821                         if (!netif_xmit_stopped(txq)) {
2822                                 __this_cpu_inc(xmit_recursion);
2823                                 rc = dev_hard_start_xmit(skb, dev, txq);
2824                                 __this_cpu_dec(xmit_recursion);
2825                                 if (dev_xmit_complete(rc)) {
2826                                         HARD_TX_UNLOCK(dev, txq);
2827                                         goto out;
2828                                 }
2829                         }
2830                         HARD_TX_UNLOCK(dev, txq);
2831                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2832                                              dev->name);
2833                 } else {
2834                         /* Recursion is detected! It is possible,
2835                          * unfortunately
2836                          */
2837 recursion_alert:
2838                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2839                                              dev->name);
2840                 }
2841         }
2842
2843         rc = -ENETDOWN;
2844         rcu_read_unlock_bh();
2845
2846         kfree_skb(skb);
2847         return rc;
2848 out:
2849         rcu_read_unlock_bh();
2850         return rc;
2851 }
2852 EXPORT_SYMBOL(dev_queue_xmit);
2853
2854
2855 /*=======================================================================
2856                         Receiver routines
2857   =======================================================================*/
2858
2859 int netdev_max_backlog __read_mostly = 1000;
2860 EXPORT_SYMBOL(netdev_max_backlog);
2861
2862 int netdev_tstamp_prequeue __read_mostly = 1;
2863 int netdev_budget __read_mostly = 300;
2864 int weight_p __read_mostly = 64;            /* old backlog weight */
2865
2866 /* Called with irq disabled */
2867 static inline void ____napi_schedule(struct softnet_data *sd,
2868                                      struct napi_struct *napi)
2869 {
2870         list_add_tail(&napi->poll_list, &sd->poll_list);
2871         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2872 }
2873
2874 #ifdef CONFIG_RPS
2875
2876 /* One global table that all flow-based protocols share. */
2877 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2878 EXPORT_SYMBOL(rps_sock_flow_table);
2879
2880 struct static_key rps_needed __read_mostly;
2881
2882 static struct rps_dev_flow *
2883 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2884             struct rps_dev_flow *rflow, u16 next_cpu)
2885 {
2886         if (next_cpu != RPS_NO_CPU) {
2887 #ifdef CONFIG_RFS_ACCEL
2888                 struct netdev_rx_queue *rxqueue;
2889                 struct rps_dev_flow_table *flow_table;
2890                 struct rps_dev_flow *old_rflow;
2891                 u32 flow_id;
2892                 u16 rxq_index;
2893                 int rc;
2894
2895                 /* Should we steer this flow to a different hardware queue? */
2896                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2897                     !(dev->features & NETIF_F_NTUPLE))
2898                         goto out;
2899                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2900                 if (rxq_index == skb_get_rx_queue(skb))
2901                         goto out;
2902
2903                 rxqueue = dev->_rx + rxq_index;
2904                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2905                 if (!flow_table)
2906                         goto out;
2907                 flow_id = skb->rxhash & flow_table->mask;
2908                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2909                                                         rxq_index, flow_id);
2910                 if (rc < 0)
2911                         goto out;
2912                 old_rflow = rflow;
2913                 rflow = &flow_table->flows[flow_id];
2914                 rflow->filter = rc;
2915                 if (old_rflow->filter == rflow->filter)
2916                         old_rflow->filter = RPS_NO_FILTER;
2917         out:
2918 #endif
2919                 rflow->last_qtail =
2920                         per_cpu(softnet_data, next_cpu).input_queue_head;
2921         }
2922
2923         rflow->cpu = next_cpu;
2924         return rflow;
2925 }
2926
2927 /*
2928  * get_rps_cpu is called from netif_receive_skb and returns the target
2929  * CPU from the RPS map of the receiving queue for a given skb.
2930  * rcu_read_lock must be held on entry.
2931  */
2932 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2933                        struct rps_dev_flow **rflowp)
2934 {
2935         struct netdev_rx_queue *rxqueue;
2936         struct rps_map *map;
2937         struct rps_dev_flow_table *flow_table;
2938         struct rps_sock_flow_table *sock_flow_table;
2939         int cpu = -1;
2940         u16 tcpu;
2941
2942         if (skb_rx_queue_recorded(skb)) {
2943                 u16 index = skb_get_rx_queue(skb);
2944                 if (unlikely(index >= dev->real_num_rx_queues)) {
2945                         WARN_ONCE(dev->real_num_rx_queues > 1,
2946                                   "%s received packet on queue %u, but number "
2947                                   "of RX queues is %u\n",
2948                                   dev->name, index, dev->real_num_rx_queues);
2949                         goto done;
2950                 }
2951                 rxqueue = dev->_rx + index;
2952         } else
2953                 rxqueue = dev->_rx;
2954
2955         map = rcu_dereference(rxqueue->rps_map);
2956         if (map) {
2957                 if (map->len == 1 &&
2958                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2959                         tcpu = map->cpus[0];
2960                         if (cpu_online(tcpu))
2961                                 cpu = tcpu;
2962                         goto done;
2963                 }
2964         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2965                 goto done;
2966         }
2967
2968         skb_reset_network_header(skb);
2969         if (!skb_get_rxhash(skb))
2970                 goto done;
2971
2972         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2973         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2974         if (flow_table && sock_flow_table) {
2975                 u16 next_cpu;
2976                 struct rps_dev_flow *rflow;
2977
2978                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2979                 tcpu = rflow->cpu;
2980
2981                 next_cpu = sock_flow_table->ents[skb->rxhash &
2982                     sock_flow_table->mask];
2983
2984                 /*
2985                  * If the desired CPU (where last recvmsg was done) is
2986                  * different from current CPU (one in the rx-queue flow
2987                  * table entry), switch if one of the following holds:
2988                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2989                  *   - Current CPU is offline.
2990                  *   - The current CPU's queue tail has advanced beyond the
2991                  *     last packet that was enqueued using this table entry.
2992                  *     This guarantees that all previous packets for the flow
2993                  *     have been dequeued, thus preserving in order delivery.
2994                  */
2995                 if (unlikely(tcpu != next_cpu) &&
2996                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2997                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2998                       rflow->last_qtail)) >= 0)) {
2999                         tcpu = next_cpu;
3000                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3001                 }
3002
3003                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3004                         *rflowp = rflow;
3005                         cpu = tcpu;
3006                         goto done;
3007                 }
3008         }
3009
3010         if (map) {
3011                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3012
3013                 if (cpu_online(tcpu)) {
3014                         cpu = tcpu;
3015                         goto done;
3016                 }
3017         }
3018
3019 done:
3020         return cpu;
3021 }
3022
3023 #ifdef CONFIG_RFS_ACCEL
3024
3025 /**
3026  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3027  * @dev: Device on which the filter was set
3028  * @rxq_index: RX queue index
3029  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3030  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3031  *
3032  * Drivers that implement ndo_rx_flow_steer() should periodically call
3033  * this function for each installed filter and remove the filters for
3034  * which it returns %true.
3035  */
3036 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3037                          u32 flow_id, u16 filter_id)
3038 {
3039         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3040         struct rps_dev_flow_table *flow_table;
3041         struct rps_dev_flow *rflow;
3042         bool expire = true;
3043         int cpu;
3044
3045         rcu_read_lock();
3046         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3047         if (flow_table && flow_id <= flow_table->mask) {
3048                 rflow = &flow_table->flows[flow_id];
3049                 cpu = ACCESS_ONCE(rflow->cpu);
3050                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3051                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3052                            rflow->last_qtail) <
3053                      (int)(10 * flow_table->mask)))
3054                         expire = false;
3055         }
3056         rcu_read_unlock();
3057         return expire;
3058 }
3059 EXPORT_SYMBOL(rps_may_expire_flow);
3060
3061 #endif /* CONFIG_RFS_ACCEL */
3062
3063 /* Called from hardirq (IPI) context */
3064 static void rps_trigger_softirq(void *data)
3065 {
3066         struct softnet_data *sd = data;
3067
3068         ____napi_schedule(sd, &sd->backlog);
3069         sd->received_rps++;
3070 }
3071
3072 #endif /* CONFIG_RPS */
3073
/*
 * Check whether @sd belongs to another CPU.
 * If it does, push it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = &__get_cpu_var(softnet_data);

	if (sd == mysd)
		return 0;

	sd->rps_ipi_next = mysd->rps_ipi_list;
	mysd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3094
3095 #ifdef CONFIG_NET_FLOW_LIMIT
3096 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3097 #endif
3098
3099 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3100 {
3101 #ifdef CONFIG_NET_FLOW_LIMIT
3102         struct sd_flow_limit *fl;
3103         struct softnet_data *sd;
3104         unsigned int old_flow, new_flow;
3105
3106         if (qlen < (netdev_max_backlog >> 1))
3107                 return false;
3108
3109         sd = &__get_cpu_var(softnet_data);
3110
3111         rcu_read_lock();
3112         fl = rcu_dereference(sd->flow_limit);
3113         if (fl) {
3114                 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3115                 old_flow = fl->history[fl->history_head];
3116                 fl->history[fl->history_head] = new_flow;
3117
3118                 fl->history_head++;
3119                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3120
3121                 if (likely(fl->buckets[old_flow]))
3122                         fl->buckets[old_flow]--;
3123
3124                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3125                         fl->count++;
3126                         rcu_read_unlock();
3127                         return true;
3128                 }
3129         }
3130         rcu_read_unlock();
3131 #endif
3132         return false;
3133 }
3134
3135 /*
3136  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3137  * queue (may be a remote CPU queue).
3138  */
3139 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3140                               unsigned int *qtail)
3141 {
3142         struct softnet_data *sd;
3143         unsigned long flags;
3144         unsigned int qlen;
3145
3146         sd = &per_cpu(softnet_data, cpu);
3147
3148         local_irq_save(flags);
3149
3150         rps_lock(sd);
3151         qlen = skb_queue_len(&sd->input_pkt_queue);
3152         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3153                 if (skb_queue_len(&sd->input_pkt_queue)) {
3154 enqueue:
3155                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3156                         input_queue_tail_incr_save(sd, qtail);
3157                         rps_unlock(sd);
3158                         local_irq_restore(flags);
3159                         return NET_RX_SUCCESS;
3160                 }
3161
3162                 /* Schedule NAPI for backlog device
3163                  * We can use non atomic operation since we own the queue lock
3164                  */
3165                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3166                         if (!rps_ipi_queued(sd))
3167                                 ____napi_schedule(sd, &sd->backlog);
3168                 }
3169                 goto enqueue;
3170         }
3171
3172         sd->dropped++;
3173         rps_unlock(sd);
3174
3175         local_irq_restore(flags);
3176
3177         atomic_long_inc(&skb->dev->rx_dropped);
3178         kfree_skb(skb);
3179         return NET_RX_DROP;
3180 }
3181
3182 /**
3183  *      netif_rx        -       post buffer to the network code
3184  *      @skb: buffer to post
3185  *
3186  *      This function receives a packet from a device driver and queues it for
3187  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3188  *      may be dropped during processing for congestion control or by the
3189  *      protocol layers.
3190  *
3191  *      return values:
3192  *      NET_RX_SUCCESS  (no congestion)
3193  *      NET_RX_DROP     (packet was dropped)
3194  *
3195  */
3196
3197 int netif_rx(struct sk_buff *skb)
3198 {
3199         int ret;
3200
3201         /* if netpoll wants it, pretend we never saw it */
3202         if (netpoll_rx(skb))
3203                 return NET_RX_DROP;
3204
3205         net_timestamp_check(netdev_tstamp_prequeue, skb);
3206
3207         trace_netif_rx(skb);
3208 #ifdef CONFIG_RPS
3209         if (static_key_false(&rps_needed)) {
3210                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3211                 int cpu;
3212
3213                 preempt_disable();
3214                 rcu_read_lock();
3215
3216                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3217                 if (cpu < 0)
3218                         cpu = smp_processor_id();
3219
3220                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3221
3222                 rcu_read_unlock();
3223                 preempt_enable();
3224         } else
3225 #endif
3226         {
3227                 unsigned int qtail;
3228                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3229                 put_cpu();
3230         }
3231         return ret;
3232 }
3233 EXPORT_SYMBOL(netif_rx);
3234
/*
 * Post @skb through netif_rx() with preemption disabled, then run any
 * softirq that is pending on this CPU before preemption is re-enabled.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3248
3249 static void net_tx_action(struct softirq_action *h)
3250 {
3251         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3252
3253         if (sd->completion_queue) {
3254                 struct sk_buff *clist;
3255
3256                 local_irq_disable();
3257                 clist = sd->completion_queue;
3258                 sd->completion_queue = NULL;
3259                 local_irq_enable();
3260
3261                 while (clist) {
3262                         struct sk_buff *skb = clist;
3263                         clist = clist->next;
3264
3265                         WARN_ON(atomic_read(&skb->users));
3266                         trace_kfree_skb(skb, net_tx_action);
3267                         __kfree_skb(skb);
3268                 }
3269         }
3270
3271         if (sd->output_queue) {
3272                 struct Qdisc *head;
3273
3274                 local_irq_disable();
3275                 head = sd->output_queue;
3276                 sd->output_queue = NULL;
3277                 sd->output_queue_tailp = &sd->output_queue;
3278                 local_irq_enable();
3279
3280                 while (head) {
3281                         struct Qdisc *q = head;
3282                         spinlock_t *root_lock;
3283
3284                         head = head->next_sched;
3285
3286                         root_lock = qdisc_lock(q);
3287                         if (spin_trylock(root_lock)) {
3288                                 smp_mb__before_clear_bit();
3289                                 clear_bit(__QDISC_STATE_SCHED,
3290                                           &q->state);
3291                                 qdisc_run(q);
3292                                 spin_unlock(root_lock);
3293                         } else {
3294                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3295                                               &q->state)) {
3296                                         __netif_reschedule(q);
3297                                 } else {
3298                                         smp_mb__before_clear_bit();
3299                                         clear_bit(__QDISC_STATE_SCHED,
3300                                                   &q->state);
3301                                 }
3302                         }
3303                 }
3304         }
3305 }
3306
3307 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3308     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3309 /* This hook is defined here for ATM LANE */
3310 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3311                              unsigned char *addr) __read_mostly;
3312 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3313 #endif
3314
3315 #ifdef CONFIG_NET_CLS_ACT
3316 /* TODO: Maybe we should just force sch_ingress to be compiled in
3317  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3318  * a compare and 2 stores extra right now if we dont have it on
3319  * but have CONFIG_NET_CLS_ACT
3320  * NOTE: This doesn't stop any functionality; if you dont have
3321  * the ingress scheduler, you just can't add policies on ingress.
3322  *
3323  */
3324 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3325 {
3326         struct net_device *dev = skb->dev;
3327         u32 ttl = G_TC_RTTL(skb->tc_verd);
3328         int result = TC_ACT_OK;
3329         struct Qdisc *q;
3330
3331         if (unlikely(MAX_RED_LOOP < ttl++)) {
3332                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3333                                      skb->skb_iif, dev->ifindex);
3334                 return TC_ACT_SHOT;
3335         }
3336
3337         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3338         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3339
3340         q = rxq->qdisc;
3341         if (q != &noop_qdisc) {
3342                 spin_lock(qdisc_lock(q));
3343                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3344                         result = qdisc_enqueue_root(skb, q);
3345                 spin_unlock(qdisc_lock(q));
3346         }
3347
3348         return result;
3349 }
3350
3351 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3352                                          struct packet_type **pt_prev,
3353                                          int *ret, struct net_device *orig_dev)
3354 {
3355         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3356
3357         if (!rxq || rxq->qdisc == &noop_qdisc)
3358                 goto out;
3359
3360         if (*pt_prev) {
3361                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3362                 *pt_prev = NULL;
3363         }
3364
3365         switch (ing_filter(skb, rxq)) {
3366         case TC_ACT_SHOT:
3367         case TC_ACT_STOLEN:
3368                 kfree_skb(skb);
3369                 return NULL;
3370         }
3371
3372 out:
3373         skb->tc_verd = 0;
3374         return skb;
3375 }
3376 #endif
3377
3378 /**
3379  *      netdev_rx_handler_register - register receive handler
3380  *      @dev: device to register a handler for
3381  *      @rx_handler: receive handler to register
3382  *      @rx_handler_data: data pointer that is used by rx handler
3383  *
3384  *      Register a receive hander for a device. This handler will then be
3385  *      called from __netif_receive_skb. A negative errno code is returned
3386  *      on a failure.
3387  *
3388  *      The caller must hold the rtnl_mutex.
3389  *
3390  *      For a general description of rx_handler, see enum rx_handler_result.
3391  */
3392 int netdev_rx_handler_register(struct net_device *dev,
3393                                rx_handler_func_t *rx_handler,
3394                                void *rx_handler_data)
3395 {
3396         ASSERT_RTNL();
3397
3398         if (dev->rx_handler)
3399                 return -EBUSY;
3400
3401         /* Note: rx_handler_data must be set before rx_handler */
3402         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3403         rcu_assign_pointer(dev->rx_handler, rx_handler);
3404
3405         return 0;
3406 }
3407 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3408
3409 /**
3410  *      netdev_rx_handler_unregister - unregister receive handler
3411  *      @dev: device to unregister a handler from
3412  *
3413  *      Unregister a receive handler from a device.
3414  *
3415  *      The caller must hold the rtnl_mutex.
3416  */
3417 void netdev_rx_handler_unregister(struct net_device *dev)
3418 {
3419
3420         ASSERT_RTNL();
3421         RCU_INIT_POINTER(dev->rx_handler, NULL);
3422         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3423          * section has a guarantee to see a non NULL rx_handler_data
3424          * as well.
3425          */
3426         synchronize_net();
3427         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3428 }
3429 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3430
3431 /*
3432  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3433  * the special handling of PFMEMALLOC skbs.
3434  */
3435 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3436 {
3437         switch (skb->protocol) {
3438         case __constant_htons(ETH_P_ARP):
3439         case __constant_htons(ETH_P_IP):
3440         case __constant_htons(ETH_P_IPV6):
3441         case __constant_htons(ETH_P_8021Q):
3442         case __constant_htons(ETH_P_8021AD):
3443                 return true;
3444         default:
3445                 return false;
3446         }
3447 }
3448
/*
 * __netif_receive_skb_core - run one skb through the receive path.
 * @skb: buffer to process
 * @pfmemalloc: when true, the ptype_all handlers are skipped and the skb is
 *              dropped unless skb_pfmemalloc_protocol() accepts its protocol
 *
 * Inside an rcu_read_lock() section the skb is, in order: untagged with
 * vlan_untag() if its protocol is 802.1Q/802.1AD, offered to the ptype_all
 * handlers, passed to handle_ing() (CONFIG_NET_CLS_ACT only), given to
 * vlan_do_receive() if a VLAN tag is present, offered to the rx_handler of
 * skb->dev, and finally matched against the ptype_base hash bucket of its
 * protocol.
 *
 * Delivery is deferred by one handler: pt_prev remembers the most recent
 * match, earlier matches go through deliver_skb(), and the last one is
 * called directly at the end of the function.
 *
 * Returns the last value stored in ret: NET_RX_DROP initially and on the
 * drop path, NET_RX_SUCCESS when the rx_handler reports
 * RX_HANDLER_CONSUMED, otherwise whatever the most recent handler returned.
 */
3449 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3450 {
3451         struct packet_type *ptype, *pt_prev;
3452         rx_handler_func_t *rx_handler;
3453         struct net_device *orig_dev;
3454         struct net_device *null_or_dev;
3455         bool deliver_exact = false;
3456         int ret = NET_RX_DROP;
3457         __be16 type;
3458
3459         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3460
3461         trace_netif_receive_skb(skb);
3462
3463         /* if we've gotten here through NAPI, check netpoll */
3464         if (netpoll_receive_skb(skb))
3465                 goto out;
3466
             /* Remember the arrival device; skb->dev may change below. */
3467         orig_dev = skb->dev;
3468
3469         skb_reset_network_header(skb);
3470         if (!skb_transport_header_was_set(skb))
3471                 skb_reset_transport_header(skb);
3472         skb_reset_mac_len(skb);
3473
3474         pt_prev = NULL;
3475
3476         rcu_read_lock();
3477
             /*
              * Restart point: jumped to again when vlan_do_receive() returns
              * true or the rx_handler returns RX_HANDLER_ANOTHER.
              */
3478 another_round:
3479         skb->skb_iif = skb->dev->ifindex;
3480
3481         __this_cpu_inc(softnet_data.processed);
3482
3483         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3484             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3485                 skb = vlan_untag(skb);
3486                 if (unlikely(!skb))
3487                         goto unlock;
3488         }
3489
3490 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and bypass both the taps and handle_ing(). */
3491         if (skb->tc_verd & TC_NCLS) {
3492                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3493                 goto ncls;
3494         }
3495 #endif
3496
             /* pfmemalloc skbs are not shown to the ptype_all handlers. */
3497         if (pfmemalloc)
3498                 goto skip_taps;
3499
3500         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3501                 if (!ptype->dev || ptype->dev == skb->dev) {
3502                         if (pt_prev)
3503                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3504                         pt_prev = ptype;
3505                 }
3506         }
3507
3508 skip_taps:
3509 #ifdef CONFIG_NET_CLS_ACT
             /* A NULL return means handle_ing() already freed the skb. */
3510         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3511         if (!skb)
3512                 goto unlock;
3513 ncls:
3514 #endif
3515
3516         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3517                 goto drop;
3518
3519         if (vlan_tx_tag_present(skb)) {
3520                 if (pt_prev) {
3521                         ret = deliver_skb(skb, pt_prev, orig_dev);
3522                         pt_prev = NULL;
3523                 }
3524                 if (vlan_do_receive(&skb))
3525                         goto another_round;
3526                 else if (unlikely(!skb))
3527                         goto unlock;
3528         }
3529
3530         rx_handler = rcu_dereference(skb->dev->rx_handler);
3531         if (rx_handler) {
                     /* Flush the deferred delivery before the handler may take the skb. */
3532                 if (pt_prev) {
3533                         ret = deliver_skb(skb, pt_prev, orig_dev);
3534                         pt_prev = NULL;
3535                 }
3536                 switch (rx_handler(&skb)) {
3537                 case RX_HANDLER_CONSUMED:
3538                         ret = NET_RX_SUCCESS;
3539                         goto unlock;
3540                 case RX_HANDLER_ANOTHER:
3541                         goto another_round;
3542                 case RX_HANDLER_EXACT:
3543                         deliver_exact = true;
                             /* fall through */
3544                 case RX_HANDLER_PASS:
3545                         break;
3546                 default:
3547                         BUG();
3548                 }
3549         }
3550
3551         if (vlan_tx_nonzero_tag_present(skb))
3552                 skb->pkt_type = PACKET_OTHERHOST;
3553
3554         /* deliver only exact match when indicated */
3555         null_or_dev = deliver_exact ? skb->dev : NULL;
3556
3557         type = skb->protocol;
3558         list_for_each_entry_rcu(ptype,
3559                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3560                 if (ptype->type == type &&
3561                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3562                      ptype->dev == orig_dev)) {
3563                         if (pt_prev)
3564                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3565                         pt_prev = ptype;
3566                 }
3567         }
3568
             /*
              * The last matching handler is called directly rather than via
              * deliver_skb(); with no handler at all the skb is dropped.
              */
3569         if (pt_prev) {
3570                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3571                         goto drop;
3572                 else
3573                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3574         } else {
3575 drop:
3576                 atomic_long_inc(&skb->dev->rx_dropped);
3577                 kfree_skb(skb);
3578                 /* Jamal, now you will not able to escape explaining
3579                  * me how you were going to use this. :-)
3580                  */
3581                 ret = NET_RX_DROP;
3582         }
3583
3584 unlock:
3585         rcu_read_unlock();
3586 out:
3587         return ret;
3588 }
3589
3590 static int __netif_receive_skb(struct sk_buff *skb)
3591 {
3592         int ret;
3593
3594         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3595                 unsigned long pflags = current->flags;
3596
3597                 /*
3598                  * PFMEMALLOC skbs are special, they should
3599                  * - be delivered to SOCK_MEMALLOC sockets only
3600                  * - stay away from userspace
3601                  * - have bounded memory usage
3602                  *
3603                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3604                  * context down to all allocation sites.
3605                  */
3606                 current->flags |= PF_MEMALLOC;
3607                 ret = __netif_receive_skb_core(skb, true);
3608                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3609         } else
3610                 ret = __netif_receive_skb_core(skb, false);
3611
3612         return ret;
3613 }
3614
3615 /**
3616  *      netif_receive_skb - process receive buffer from network
3617  *      @skb: buffer to process
3618  *
3619  *      netif_receive_skb() is the main receive data processing function.
3620  *      It always succeeds. The buffer may be dropped during processing
3621  *      for congestion control or by the protocol layers.
3622  *
3623  *      This function may only be called from softirq context and interrupts
3624  *      should be enabled.
3625  *
3626  *      Return values (usually ignored):
3627  *      NET_RX_SUCCESS: no congestion
3628  *      NET_RX_DROP: packet was dropped
3629  */
3630 int netif_receive_skb(struct sk_buff *skb)
3631 {
3632         net_timestamp_check(netdev_tstamp_prequeue, skb);
3633
3634         if (skb_defer_rx_timestamp(skb))
3635                 return NET_RX_SUCCESS;
3636
3637 #ifdef CONFIG_RPS
3638         if (static_key_false(&rps_needed)) {
3639                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3640                 int cpu, ret;
3641
3642                 rcu_read_lock();
3643
3644                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3645
3646                 if (cpu >= 0) {
3647                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3648                         rcu_read_unlock();
3649                         return ret;
3650                 }
3651                 rcu_read_unlock();
3652         }
3653 #endif
3654         return __netif_receive_skb(skb);
3655 }
3656 EXPORT_SYMBOL(netif_receive_skb);
3657
3658 /* Network device is going away, flush any packets still pending
3659  * Called with irqs disabled.
3660  */
3661 static void flush_backlog(void *arg)
3662 {
3663         struct net_device *dev = arg;
3664         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3665         struct sk_buff *skb, *tmp;
3666
3667         rps_lock(sd);
3668         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3669                 if (skb->dev == dev) {
3670                         __skb_unlink(skb, &sd->input_pkt_queue);
3671                         kfree_skb(skb);
3672                         input_queue_head_incr(sd);
3673                 }
3674         }
3675         rps_unlock(sd);
3676
3677         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3678                 if (skb->dev == dev) {
3679                         __skb_unlink(skb, &sd->process_queue);
3680                         kfree_skb(skb);
3681                         input_queue_head_incr(sd);
3682                 }
3683         }
3684 }
3685
3686 static int napi_gro_complete(struct sk_buff *skb)
3687 {
3688         struct packet_offload *ptype;
3689         __be16 type = skb->protocol;
3690         struct list_head *head = &offload_base;
3691         int err = -ENOENT;
3692
3693         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3694
3695         if (NAPI_GRO_CB(skb)->count == 1) {
3696                 skb_shinfo(skb)->gso_size = 0;
3697                 goto out;
3698         }
3699
3700         rcu_read_lock();
3701         list_for_each_entry_rcu(ptype, head, list) {
3702                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3703                         continue;
3704
3705                 err = ptype->callbacks.gro_complete(skb);
3706                 break;
3707         }
3708         rcu_read_unlock();
3709
3710         if (err) {
3711                 WARN_ON(&ptype->list == head);
3712                 kfree_skb(skb);
3713                 return NET_RX_SUCCESS;
3714         }
3715
3716 out:
3717         return netif_receive_skb(skb);
3718 }
3719
3720 /* napi->gro_list contains packets ordered by age.
3721  * youngest packets at the head of it.
3722  * Complete skbs in reverse order to reduce latencies.
3723  */
3724 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3725 {
3726         struct sk_buff *skb, *prev = NULL;
3727
3728         /* scan list and build reverse chain */
3729         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3730                 skb->prev = prev;
3731                 prev = skb;
3732         }
3733
3734         for (skb = prev; skb; skb = prev) {
3735                 skb->next = NULL;
3736
3737                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3738                         return;
3739
3740                 prev = skb->prev;
3741                 napi_gro_complete(skb);
3742                 napi->gro_count--;
3743         }
3744
3745         napi->gro_list = NULL;
3746 }
3747 EXPORT_SYMBOL(napi_gro_flush);
3748
3749 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3750 {
3751         struct sk_buff *p;
3752         unsigned int maclen = skb->dev->hard_header_len;
3753
3754         for (p = napi->gro_list; p; p = p->next) {
3755                 unsigned long diffs;
3756
3757                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3758                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3759                 if (maclen == ETH_HLEN)
3760                         diffs |= compare_ether_header(skb_mac_header(p),
3761                                                       skb_gro_mac_header(skb));
3762                 else if (!diffs)
3763                         diffs = memcmp(skb_mac_header(p),
3764                                        skb_gro_mac_header(skb),
3765                                        maclen);
3766                 NAPI_GRO_CB(p)->same_flow = !diffs;
3767                 NAPI_GRO_CB(p)->flush = 0;
3768         }
3769 }
3770
/*
 * dev_gro_receive - try to merge @skb into a packet held on napi->gro_list.
 * @napi: NAPI instance owning the GRO list
 * @skb: newly received buffer
 *
 * GRO is bypassed (GRO_NORMAL) when the device lacks NETIF_F_GRO,
 * netpoll_rx_on() is true, the skb is already GSO or carries a frag list,
 * or no packet_offload entry with a gro_receive callback matches
 * skb->protocol.  Otherwise the matching callback processes the skb against
 * the held list.
 *
 * Returns GRO_MERGED or GRO_MERGED_FREE when the callback marked the skb as
 * belonging to an existing flow, GRO_HELD when the skb was put at the head
 * of the list as a new flow, and GRO_NORMAL when the caller should pass the
 * skb up without GRO.  On the GRO_HELD and GRO_NORMAL paths, header bytes
 * that were read from frag0 past the linear area are first copied into the
 * linear area (see the "pull" label).
 */
3771 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3772 {
3773         struct sk_buff **pp = NULL;
3774         struct packet_offload *ptype;
3775         __be16 type = skb->protocol;
3776         struct list_head *head = &offload_base;
3777         int same_flow;
3778         enum gro_result ret;
3779
3780         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3781                 goto normal;
3782
3783         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3784                 goto normal;
3785
3786         gro_list_prepare(napi, skb);
3787
3788         rcu_read_lock();
3789         list_for_each_entry_rcu(ptype, head, list) {
3790                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3791                         continue;
3792
3793                 skb_set_network_header(skb, skb_gro_offset(skb));
3794                 skb_reset_mac_len(skb);
3795                 NAPI_GRO_CB(skb)->same_flow = 0;
3796                 NAPI_GRO_CB(skb)->flush = 0;
3797                 NAPI_GRO_CB(skb)->free = 0;
3798
3799                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3800                 break;
3801         }
3802         rcu_read_unlock();
3803
             /* Cursor back at the list head: no offload handler matched. */
3804         if (&ptype->list == head)
3805                 goto normal;
3806
3807         same_flow = NAPI_GRO_CB(skb)->same_flow;
3808         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3809
             /*
              * A non-NULL pp designates a held packet: unlink it from the
              * list and complete it now.
              */
3810         if (pp) {
3811                 struct sk_buff *nskb = *pp;
3812
3813                 *pp = nskb->next;
3814                 nskb->next = NULL;
3815                 napi_gro_complete(nskb);
3816                 napi->gro_count--;
3817         }
3818
3819         if (same_flow)
3820                 goto ok;
3821
3822         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3823                 goto normal;
3824
             /* Start a new flow: hold the skb at the head of the GRO list. */
3825         napi->gro_count++;
3826         NAPI_GRO_CB(skb)->count = 1;
3827         NAPI_GRO_CB(skb)->age = jiffies;
3828         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3829         skb->next = napi->gro_list;
3830         napi->gro_list = skb;
3831         ret = GRO_HELD;
3832
3833 pull:
             /*
              * The GRO offset lies beyond the linear area: copy the missing
              * bytes from frag0 to the tail of the linear area and shrink
              * the first fragment by the same amount, releasing it if it
              * becomes empty.
              */
3834         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3835                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3836
3837                 BUG_ON(skb->end - skb->tail < grow);
3838
3839                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3840
3841                 skb->tail += grow;
3842                 skb->data_len -= grow;
3843
3844                 skb_shinfo(skb)->frags[0].page_offset += grow;
3845                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3846
3847                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3848                         skb_frag_unref(skb, 0);
3849                         memmove(skb_shinfo(skb)->frags,
3850                                 skb_shinfo(skb)->frags + 1,
3851                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3852                 }
3853         }
3854
3855 ok:
3856         return ret;
3857
             /* GRO not applicable: still run the pull step before returning. */
3858 normal:
3859         ret = GRO_NORMAL;
3860         goto pull;
3861 }
3862
3863
3864 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3865 {
3866         switch (ret) {
3867         case GRO_NORMAL:
3868                 if (netif_receive_skb(skb))
3869                         ret = GRO_DROP;
3870                 break;
3871
3872         case GRO_DROP:
3873                 kfree_skb(skb);
3874                 break;
3875
3876         case GRO_MERGED_FREE:
3877                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3878                         kmem_cache_free(skbuff_head_cache, skb);
3879                 else
3880                         __kfree_skb(skb);
3881                 break;
3882
3883         case GRO_HELD:
3884         case GRO_MERGED:
3885                 break;
3886         }
3887
3888         return ret;
3889 }
3890
3891 static void skb_gro_reset_offset(struct sk_buff *skb)
3892 {
3893         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3894         const skb_frag_t *frag0 = &pinfo->frags[0];
3895
3896         NAPI_GRO_CB(skb)->data_offset = 0;
3897         NAPI_GRO_CB(skb)->frag0 = NULL;
3898         NAPI_GRO_CB(skb)->frag0_len = 0;
3899
3900         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3901             pinfo->nr_frags &&
3902             !PageHighMem(skb_frag_page(frag0))) {
3903                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3904                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3905         }
3906 }
3907
3908 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3909 {
3910         skb_gro_reset_offset(skb);
3911
3912         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3913 }
3914 EXPORT_SYMBOL(napi_gro_receive);
3915
3916 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3917 {
3918         __skb_pull(skb, skb_headlen(skb));
3919         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3920         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3921         skb->vlan_tci = 0;
3922         skb->dev = napi->dev;
3923         skb->skb_iif = 0;
3924
3925         napi->skb = skb;
3926 }
3927
3928 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3929 {
3930         struct sk_buff *skb = napi->skb;
3931
3932         if (!skb) {
3933                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3934                 if (skb)
3935                         napi->skb = skb;
3936         }
3937         return skb;
3938 }
3939 EXPORT_SYMBOL(napi_get_frags);
3940
3941 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3942                                gro_result_t ret)
3943 {
3944         switch (ret) {
3945         case GRO_NORMAL:
3946         case GRO_HELD:
3947                 skb->protocol = eth_type_trans(skb, skb->dev);
3948
3949                 if (ret == GRO_HELD)
3950                         skb_gro_pull(skb, -ETH_HLEN);
3951                 else if (netif_receive_skb(skb))
3952                         ret = GRO_DROP;
3953                 break;
3954
3955         case GRO_DROP:
3956         case GRO_MERGED_FREE:
3957                 napi_reuse_skb(napi, skb);
3958                 break;
3959
3960         case GRO_MERGED:
3961                 break;
3962         }
3963
3964         return ret;
3965 }
3966
3967 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3968 {
3969         struct sk_buff *skb = napi->skb;
3970         struct ethhdr *eth;
3971         unsigned int hlen;
3972         unsigned int off;
3973
3974         napi->skb = NULL;
3975
3976         skb_reset_mac_header(skb);
3977         skb_gro_reset_offset(skb);
3978
3979         off = skb_gro_offset(skb);
3980         hlen = off + sizeof(*eth);
3981         eth = skb_gro_header_fast(skb, off);
3982         if (skb_gro_header_hard(skb, hlen)) {
3983                 eth = skb_gro_header_slow(skb, hlen, off);
3984                 if (unlikely(!eth)) {
3985                         napi_reuse_skb(napi, skb);
3986                         skb = NULL;
3987                         goto out;
3988                 }
3989         }
3990
3991         skb_gro_pull(skb, sizeof(*eth));
3992
3993         /*
3994          * This works because the only protocols we care about don't require
3995          * special handling.  We'll fix it up properly at the end.
3996          */
3997         skb->protocol = eth->h_proto;
3998
3999 out:
4000         return skb;
4001 }
4002
4003 gro_result_t napi_gro_frags(struct napi_struct *napi)
4004 {
4005         struct sk_buff *skb = napi_frags_skb(napi);
4006
4007         if (!skb)
4008                 return GRO_DROP;
4009
4010         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4011 }
4012 EXPORT_SYMBOL(napi_gro_frags);
4013
/*
 * net_rps_action_and_irq_enable - send any pending IPIs for RPS.
 *
 * With CONFIG_RPS, the list at sd->rps_ipi_list is detached, local irqs are
 * enabled, and each softnet_data on the detached list whose CPU is online
 * gets an IPI.  Without CONFIG_RPS, or when the list is empty, only the
 * irq enable happens.
 *
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        /* Detach the list while irqs are still off. */
        if (pending)
                sd->rps_ipi_list = NULL;
#endif

        local_irq_enable();

#ifdef CONFIG_RPS
        /* Send pending IPI's to kick RPS processing on remote cpus. */
        while (pending) {
                struct softnet_data *following = pending->rps_ipi_next;

                if (cpu_online(pending->cpu))
                        __smp_call_function_single(pending->cpu,
                                                   &pending->csd, 0);
                pending = following;
        }
#endif
}
4041
/*
 * process_backlog - drain the backlog queues of one softnet_data.
 * @napi: the backlog napi_struct embedded in a softnet_data
 * @quota: maximum number of skbs to process in this call
 *
 * Dequeues skbs from sd->process_queue and passes each to
 * __netif_receive_skb() with local irqs enabled.  When process_queue runs
 * empty it is refilled from sd->input_pkt_queue under rps_lock.  If fewer
 * skbs were waiting than the remaining quota, the napi is removed from the
 * poll list and its state cleared, and the quota is trimmed so the loop
 * ends once those skbs are done.
 *
 * Returns the number of skbs processed.  Local irqs are enabled on return.
 */
4042 static int process_backlog(struct napi_struct *napi, int quota)
4043 {
4044         int work = 0;
4045         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4046
4047 #ifdef CONFIG_RPS
4048         /* Check if we have pending ipi, it's better to send them now,
4049          * not waiting net_rx_action() end.
4050          */
4051         if (sd->rps_ipi_list) {
4052                 local_irq_disable();
4053                 net_rps_action_and_irq_enable(sd);
4054         }
4055 #endif
4056         napi->weight = weight_p;
4057         local_irq_disable();
4058         while (work < quota) {
4059                 struct sk_buff *skb;
4060                 unsigned int qlen;
4061
                     /*
                      * process_queue is dequeued with irqs off; irqs are
                      * enabled only around the actual packet processing.
                      */
4062                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4063                         local_irq_enable();
4064                         __netif_receive_skb(skb);
4065                         local_irq_disable();
4066                         input_queue_head_incr(sd);
4067                         if (++work >= quota) {
4068                                 local_irq_enable();
4069                                 return work;
4070                         }
4071                 }
4072
                     /* Move everything queued on input_pkt_queue over in one go. */
4073                 rps_lock(sd);
4074                 qlen = skb_queue_len(&sd->input_pkt_queue);
4075                 if (qlen)
4076                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4077                                                    &sd->process_queue);
4078
4079                 if (qlen < quota - work) {
4080                         /*
4081                          * Inline a custom version of __napi_complete().
4082                          * only current cpu owns and manipulates this napi,
4083                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4084                          * we can use a plain write instead of clear_bit(),
4085                          * and we dont need an smp_mb() memory barrier.
4086                          */
4087                         list_del(&napi->poll_list);
4088                         napi->state = 0;
4089
                             /* Stop once the skbs just spliced have been processed. */
4090                         quota = work + qlen;
4091                 }
4092                 rps_unlock(sd);
4093         }
4094         local_irq_enable();
4095
4096         return work;
4097 }
4098
4099 /**
4100  * __napi_schedule - schedule for receive
4101  * @n: entry to schedule
4102  *
4103  * The entry's receive function will be scheduled to run
4104  */
4105 void __napi_schedule(struct napi_struct *n)
4106 {
4107         unsigned long flags;
4108
4109         local_irq_save(flags);
4110         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4111         local_irq_restore(flags);
4112 }
4113 EXPORT_SYMBOL(__napi_schedule);
4114
4115 void __napi_complete(struct napi_struct *n)
4116 {
4117         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4118         BUG_ON(n->gro_list);
4119
4120         list_del(&n->poll_list);
4121         smp_mb__before_clear_bit();
4122         clear_bit(NAPI_STATE_SCHED, &n->state);
4123 }
4124 EXPORT_SYMBOL(__napi_complete);
4125
4126 void napi_complete(struct napi_struct *n)
4127 {
4128         unsigned long flags;
4129
4130         /*
4131          * don't let napi dequeue from the cpu poll list
4132          * just in case its running on a different cpu
4133          */
4134         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4135                 return;
4136
4137         napi_gro_flush(n, false);
4138         local_irq_save(flags);
4139         __napi_complete(n);
4140         local_irq_restore(flags);
4141 }
4142 EXPORT_SYMBOL(napi_complete);
4143
4144 /* must be called under rcu_read_lock(), as we dont take a reference */
4145 struct napi_struct *napi_by_id(unsigned int napi_id)
4146 {
4147         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4148         struct napi_struct *napi;
4149
4150         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4151                 if (napi->napi_id == napi_id)
4152                         return napi;
4153
4154         return NULL;
4155 }
4156 EXPORT_SYMBOL_GPL(napi_by_id);
4157
4158 void napi_hash_add(struct napi_struct *napi)
4159 {
4160         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4161
4162                 spin_lock(&napi_hash_lock);
4163
4164                 /* 0 is not a valid id, we also skip an id that is taken
4165                  * we expect both events to be extremely rare
4166                  */
4167                 napi->napi_id = 0;
4168                 while (!napi->napi_id) {
4169                         napi->napi_id = ++napi_gen_id;
4170                         if (napi_by_id(napi->napi_id))
4171                                 napi->napi_id = 0;
4172                 }
4173
4174                 hlist_add_head_rcu(&napi->napi_hash_node,
4175                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4176
4177                 spin_unlock(&napi_hash_lock);
4178         }
4179 }
4180 EXPORT_SYMBOL_GPL(napi_hash_add);
4181
4182 /* Warning : caller is responsible to make sure rcu grace period
4183  * is respected before freeing memory containing @napi
4184  */
4185 void napi_hash_del(struct napi_struct *napi)
4186 {
4187         spin_lock(&napi_hash_lock);
4188
4189         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4190                 hlist_del_rcu(&napi->napi_hash_node);
4191
4192         spin_unlock(&napi_hash_lock);
4193 }
4194 EXPORT_SYMBOL_GPL(napi_hash_del);
4195
4196 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4197                     int (*poll)(struct napi_struct *, int), int weight)
4198 {
4199         INIT_LIST_HEAD(&napi->poll_list);
4200         napi->gro_count = 0;
4201         napi->gro_list = NULL;
4202         napi->skb = NULL;
4203         napi->poll = poll;
4204         if (weight > NAPI_POLL_WEIGHT)
4205                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4206                             weight, dev->name);
4207         napi->weight = weight;
4208         list_add(&napi->dev_list, &dev->napi_list);
4209         napi->dev = dev;
4210 #ifdef CONFIG_NETPOLL
4211         spin_lock_init(&napi->poll_lock);
4212         napi->poll_owner = -1;
4213 #endif
4214         set_bit(NAPI_STATE_SCHED, &napi->state);
4215 }
4216 EXPORT_SYMBOL(netif_napi_add);
4217
4218 void netif_napi_del(struct napi_struct *napi)
4219 {
4220         struct sk_buff *skb, *next;
4221
4222         list_del_init(&napi->dev_list);
4223         napi_free_frags(napi);
4224
4225         for (skb = napi->gro_list; skb; skb = next) {
4226                 next = skb->next;
4227                 skb->next = NULL;
4228                 kfree_skb(skb);
4229         }
4230
4231         napi->gro_list = NULL;
4232         napi->gro_count = 0;
4233 }
4234 EXPORT_SYMBOL(netif_napi_del);
4235
/*
 * net_rx_action - poll the NAPI instances queued on this CPU's poll list.
 * @h: not used by this function
 *
 * Repeatedly polls the entry at the head of sd->poll_list until the list is
 * empty, the budget (netdev_budget) is used up, or the 2-jiffy time limit
 * is reached.  In the latter two cases sd->time_squeeze is incremented and
 * NET_RX_SOFTIRQ is raised again so that polling resumes later.
 *
 * An entry that consumes its full weight stays owned by this function: it
 * is either completed (when a disable is pending) or moved to the tail of
 * the poll list after old GRO packets have been flushed.
 *
 * NOTE(review): presumably installed as the NET_RX_SOFTIRQ handler; the
 * registration is outside this excerpt.
 */
4236 static void net_rx_action(struct softirq_action *h)
4237 {
4238         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4239         unsigned long time_limit = jiffies + 2;
4240         int budget = netdev_budget;
4241         void *have;
4242
4243         local_irq_disable();
4244
4245         while (!list_empty(&sd->poll_list)) {
4246                 struct napi_struct *n;
4247                 int work, weight;
4248
4249                 /* If softirq window is exhausted then punt.
4250                  * Allow this to run for 2 jiffies, which will allow
4251                  * an average latency of 1.5/HZ.
4252                  */
4253                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4254                         goto softnet_break;
4255
4256                 local_irq_enable();
4257
4258                 /* Even though interrupts have been re-enabled, this
4259                  * access is safe because interrupts can only add new
4260                  * entries to the tail of this list, and only ->poll()
4261                  * calls can remove this head entry from the list.
4262                  */
4263                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4264
4265                 have = netpoll_poll_lock(n);
4266
4267                 weight = n->weight;
4268
4269                 /* This NAPI_STATE_SCHED test is for avoiding a race
4270                  * with netpoll's poll_napi().  Only the entity which
4271                  * obtains the lock and sees NAPI_STATE_SCHED set will
4272                  * actually make the ->poll() call.  Therefore we avoid
4273                  * accidentally calling ->poll() when NAPI is not scheduled.
4274                  */
4275                 work = 0;
4276                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4277                         work = n->poll(n, weight);
4278                         trace_napi_poll(n);
4279                 }
4280
                     /* A poll routine must never report more work than its weight. */
4281                 WARN_ON_ONCE(work > weight);
4282
4283                 budget -= work;
4284
4285                 local_irq_disable();
4286
4287                 /* Drivers must not modify the NAPI state if they
4288                  * consume the entire weight.  In such cases this code
4289                  * still "owns" the NAPI instance and therefore can
4290                  * move the instance around on the list at-will.
4291                  */
4292                 if (unlikely(work == weight)) {
4293                         if (unlikely(napi_disable_pending(n))) {
4294                                 local_irq_enable();
4295                                 napi_complete(n);
4296                                 local_irq_disable();
4297                         } else {
4298                                 if (n->gro_list) {
4299                                         /* flush too old packets
4300                                          * If HZ < 1000, flush all packets.
4301                                          */
4302                                         local_irq_enable();
4303                                         napi_gro_flush(n, HZ >= 1000);
4304                                         local_irq_disable();
4305                                 }
4306                                 list_move_tail(&n->poll_list, &sd->poll_list);
4307                         }
4308                 }
4309
4310                 netpoll_poll_unlock(have);
4311         }
             /* Reached with irqs disabled; the call below re-enables them. */
4312 out:
4313         net_rps_action_and_irq_enable(sd);
4314
4315 #ifdef CONFIG_NET_DMA
4316         /*
4317          * There may not be any more sk_buffs coming right now, so push
4318          * any pending DMA copies to hardware
4319          */
4320         dma_issue_pending_all();
4321 #endif
4322
4323         return;
4324
             /* Budget or time limit hit: account for it and ask to be run again. */
4325 softnet_break:
4326         sd->time_squeeze++;
4327         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4328         goto out;
4329 }
4330
4331 struct netdev_upper {
4332         struct net_device *dev;
4333         bool master;
4334         struct list_head list;
4335         struct rcu_head rcu;
4336         struct list_head search_list;
4337 };
4338
4339 static void __append_search_uppers(struct list_head *search_list,
4340                                    struct net_device *dev)
4341 {
4342         struct netdev_upper *upper;
4343
4344         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4345                 /* check if this upper is not already in search list */
4346                 if (list_empty(&upper->search_list))
4347                         list_add_tail(&upper->search_list, search_list);
4348         }
4349 }
4350
4351 static bool __netdev_search_upper_dev(struct net_device *dev,
4352                                       struct net_device *upper_dev)
4353 {
4354         LIST_HEAD(search_list);
4355         struct netdev_upper *upper;
4356         struct netdev_upper *tmp;
4357         bool ret = false;
4358
4359         __append_search_uppers(&search_list, dev);
4360         list_for_each_entry(upper, &search_list, search_list) {
4361                 if (upper->dev == upper_dev) {
4362                         ret = true;
4363                         break;
4364                 }
4365                 __append_search_uppers(&search_list, upper->dev);
4366         }
4367         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4368                 INIT_LIST_HEAD(&upper->search_list);
4369         return ret;
4370 }
4371
4372 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4373                                                 struct net_device *upper_dev)
4374 {
4375         struct netdev_upper *upper;
4376
4377         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4378                 if (upper->dev == upper_dev)
4379                         return upper;
4380         }
4381         return NULL;
4382 }
4383
/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device whose upper links are inspected
 * @upper_dev: upper device to look for
 *
 * Returns true when @upper_dev is linked directly above @dev. Only the
 * immediate upper links are examined, not the complete stack of devices.
 * The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
                          struct net_device *upper_dev)
{
        struct netdev_upper *link;

        ASSERT_RTNL();

        link = __netdev_find_upper(dev, upper_dev);
        return link != NULL;
}
EXPORT_SYMBOL(netdev_has_upper_dev);
4401
4402 /**
4403  * netdev_has_any_upper_dev - Check if device is linked to some device
4404  * @dev: device
4405  *
4406  * Find out if a device is linked to an upper device and return true in case
4407  * it is. The caller must hold the RTNL lock.
4408  */
4409 bool netdev_has_any_upper_dev(struct net_device *dev)
4410 {
4411         ASSERT_RTNL();
4412
4413         return !list_empty(&dev->upper_dev_list);
4414 }
4415 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4416
4417 /**
4418  * netdev_master_upper_dev_get - Get master upper device
4419  * @dev: device
4420  *
4421  * Find a master upper device and return pointer to it or NULL in case
4422  * it's not there. The caller must hold the RTNL lock.
4423  */
4424 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4425 {
4426         struct netdev_upper *upper;
4427
4428         ASSERT_RTNL();
4429
4430         if (list_empty(&dev->upper_dev_list))
4431                 return NULL;
4432
4433         upper = list_first_entry(&dev->upper_dev_list,
4434                                  struct netdev_upper, list);
4435         if (likely(upper->master))
4436                 return upper->dev;
4437         return NULL;
4438 }
4439 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4440
4441 /**
4442  * netdev_master_upper_dev_get_rcu - Get master upper device
4443  * @dev: device
4444  *
4445  * Find a master upper device and return pointer to it or NULL in case
4446  * it's not there. The caller must hold the RCU read lock.
4447  */
4448 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4449 {
4450         struct netdev_upper *upper;
4451
4452         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4453                                        struct netdev_upper, list);
4454         if (upper && likely(upper->master))
4455                 return upper->dev;
4456         return NULL;
4457 }
4458 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4459
4460 static int __netdev_upper_dev_link(struct net_device *dev,
4461                                    struct net_device *upper_dev, bool master)
4462 {
4463         struct netdev_upper *upper;
4464
4465         ASSERT_RTNL();
4466
4467         if (dev == upper_dev)
4468                 return -EBUSY;
4469
4470         /* To prevent loops, check if dev is not upper device to upper_dev. */
4471         if (__netdev_search_upper_dev(upper_dev, dev))
4472                 return -EBUSY;
4473
4474         if (__netdev_find_upper(dev, upper_dev))
4475                 return -EEXIST;
4476
4477         if (master && netdev_master_upper_dev_get(dev))
4478                 return -EBUSY;
4479
4480         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4481         if (!upper)
4482                 return -ENOMEM;
4483
4484         upper->dev = upper_dev;
4485         upper->master = master;
4486         INIT_LIST_HEAD(&upper->search_list);
4487
4488         /* Ensure that master upper link is always the first item in list. */
4489         if (master)
4490                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4491         else
4492                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4493         dev_hold(upper_dev);
4494         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4495         return 0;
4496 }
4497
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Links @upper_dev above @dev as a regular (non-master) upper device.
 * The caller must hold the RTNL lock. Returns zero on success, in which
 * case a reference on @upper_dev has been taken, or a negative errno
 * code on failure.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev)
{
        int err;

        err = __netdev_upper_dev_link(dev, upper_dev, false);
        return err;
}
EXPORT_SYMBOL(netdev_upper_dev_link);
4514
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Links @upper_dev above @dev as its master upper device. A device can
 * have only one master upper device, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock. Returns
 * zero on success, in which case a reference on @upper_dev has been
 * taken, or a negative errno code on failure.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev)
{
        int err;

        err = __netdev_upper_dev_link(dev, upper_dev, true);
        return err;
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
4532
4533 /**
4534  * netdev_upper_dev_unlink - Removes a link to upper device
4535  * @dev: device
4536  * @upper_dev: new upper device
4537  *
4538  * Removes a link to device which is upper to this one. The caller must hold
4539  * the RTNL lock.
4540  */
4541 void netdev_upper_dev_unlink(struct net_device *dev,
4542                              struct net_device *upper_dev)
4543 {
4544         struct netdev_upper *upper;
4545
4546         ASSERT_RTNL();
4547
4548         upper = __netdev_find_upper(dev, upper_dev);
4549         if (!upper)
4550                 return;
4551         list_del_rcu(&upper->list);
4552         dev_put(upper_dev);
4553         kfree_rcu(upper, rcu);
4554         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4555 }
4556 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4557
4558 static void dev_change_rx_flags(struct net_device *dev, int flags)
4559 {
4560         const struct net_device_ops *ops = dev->netdev_ops;
4561
4562         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4563                 ops->ndo_change_rx_flags(dev, flags);
4564 }
4565
4566 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4567 {
4568         unsigned int old_flags = dev->flags;
4569         kuid_t uid;
4570         kgid_t gid;
4571
4572         ASSERT_RTNL();
4573
4574         dev->flags |= IFF_PROMISC;
4575         dev->promiscuity += inc;
4576         if (dev->promiscuity == 0) {
4577                 /*
4578                  * Avoid overflow.
4579                  * If inc causes overflow, untouch promisc and return error.
4580                  */
4581                 if (inc < 0)
4582                         dev->flags &= ~IFF_PROMISC;
4583                 else {
4584                         dev->promiscuity -= inc;
4585                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4586                                 dev->name);
4587                         return -EOVERFLOW;
4588                 }
4589         }
4590         if (dev->flags != old_flags) {
4591                 pr_info("device %s %s promiscuous mode\n",
4592                         dev->name,
4593                         dev->flags & IFF_PROMISC ? "entered" : "left");
4594                 if (audit_enabled) {
4595                         current_uid_gid(&uid, &gid);
4596                         audit_log(current->audit_context, GFP_ATOMIC,
4597                                 AUDIT_ANOM_PROMISCUOUS,
4598                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4599                                 dev->name, (dev->flags & IFF_PROMISC),
4600                                 (old_flags & IFF_PROMISC),
4601                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4602                                 from_kuid(&init_user_ns, uid),
4603                                 from_kgid(&init_user_ns, gid),
4604                                 audit_get_sessionid(current));
4605                 }
4606
4607                 dev_change_rx_flags(dev, IFF_PROMISC);
4608         }
4609         return 0;
4610 }
4611
4612 /**
4613  *      dev_set_promiscuity     - update promiscuity count on a device
4614  *      @dev: device
4615  *      @inc: modifier
4616  *
4617  *      Add or remove promiscuity from a device. While the count in the device
4618  *      remains above zero the interface remains promiscuous. Once it hits zero
4619  *      the device reverts back to normal filtering operation. A negative inc
4620  *      value is used to drop promiscuity on the device.
4621  *      Return 0 if successful or a negative errno code on error.
4622  */
4623 int dev_set_promiscuity(struct net_device *dev, int inc)
4624 {
4625         unsigned int old_flags = dev->flags;
4626         int err;
4627
4628         err = __dev_set_promiscuity(dev, inc);
4629         if (err < 0)
4630                 return err;
4631         if (dev->flags != old_flags)
4632                 dev_set_rx_mode(dev);
4633         return err;
4634 }
4635 EXPORT_SYMBOL(dev_set_promiscuity);
4636
4637 /**
4638  *      dev_set_allmulti        - update allmulti count on a device
4639  *      @dev: device
4640  *      @inc: modifier
4641  *
4642  *      Add or remove reception of all multicast frames to a device. While the
4643  *      count in the device remains above zero the interface remains listening
4644  *      to all interfaces. Once it hits zero the device reverts back to normal
4645  *      filtering operation. A negative @inc value is used to drop the counter
4646  *      when releasing a resource needing all multicasts.
4647  *      Return 0 if successful or a negative errno code on error.
4648  */
4649
4650 int dev_set_allmulti(struct net_device *dev, int inc)
4651 {
4652         unsigned int old_flags = dev->flags;
4653
4654         ASSERT_RTNL();
4655
4656         dev->flags |= IFF_ALLMULTI;
4657         dev->allmulti += inc;
4658         if (dev->allmulti == 0) {
4659                 /*
4660                  * Avoid overflow.
4661                  * If inc causes overflow, untouch allmulti and return error.
4662                  */
4663                 if (inc < 0)
4664                         dev->flags &= ~IFF_ALLMULTI;
4665                 else {
4666                         dev->allmulti -= inc;
4667                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4668                                 dev->name);
4669                         return -EOVERFLOW;
4670                 }
4671         }
4672         if (dev->flags ^ old_flags) {
4673                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4674                 dev_set_rx_mode(dev);
4675         }
4676         return 0;
4677 }
4678 EXPORT_SYMBOL(dev_set_allmulti);
4679
4680 /*
4681  *      Upload unicast and multicast address lists to device and
4682  *      configure RX filtering. When the device doesn't support unicast
4683  *      filtering it is put in promiscuous mode while unicast addresses
4684  *      are present.
4685  */
4686 void __dev_set_rx_mode(struct net_device *dev)
4687 {
4688         const struct net_device_ops *ops = dev->netdev_ops;
4689
4690         /* dev_open will call this function so the list will stay sane. */
4691         if (!(dev->flags&IFF_UP))
4692                 return;
4693
4694         if (!netif_device_present(dev))
4695                 return;
4696
4697         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4698                 /* Unicast addresses changes may only happen under the rtnl,
4699                  * therefore calling __dev_set_promiscuity here is safe.
4700                  */
4701                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4702                         __dev_set_promiscuity(dev, 1);
4703                         dev->uc_promisc = true;
4704                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4705                         __dev_set_promiscuity(dev, -1);
4706                         dev->uc_promisc = false;
4707                 }
4708         }
4709
4710         if (ops->ndo_set_rx_mode)
4711                 ops->ndo_set_rx_mode(dev);
4712 }
4713
/*
 *      Locked variant of __dev_set_rx_mode(): the call is made between
 *      netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
4720
4721 /**
4722  *      dev_get_flags - get flags reported to userspace
4723  *      @dev: device
4724  *
4725  *      Get the combination of flag bits exported through APIs to userspace.
4726  */
4727 unsigned int dev_get_flags(const struct net_device *dev)
4728 {
4729         unsigned int flags;
4730
4731         flags = (dev->flags & ~(IFF_PROMISC |
4732                                 IFF_ALLMULTI |
4733                                 IFF_RUNNING |
4734                                 IFF_LOWER_UP |
4735                                 IFF_DORMANT)) |
4736                 (dev->gflags & (IFF_PROMISC |
4737                                 IFF_ALLMULTI));
4738
4739         if (netif_running(dev)) {
4740                 if (netif_oper_up(dev))
4741                         flags |= IFF_RUNNING;
4742                 if (netif_carrier_ok(dev))
4743                         flags |= IFF_LOWER_UP;
4744                 if (netif_dormant(dev))
4745                         flags |= IFF_DORMANT;
4746         }
4747
4748         return flags;
4749 }
4750 EXPORT_SYMBOL(dev_get_flags);
4751
4752 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4753 {
4754         unsigned int old_flags = dev->flags;
4755         int ret;
4756
4757         ASSERT_RTNL();
4758
4759         /*
4760          *      Set the flags on our device.
4761          */
4762
4763         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4764                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4765                                IFF_AUTOMEDIA)) |
4766                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4767                                     IFF_ALLMULTI));
4768
4769         /*
4770          *      Load in the correct multicast list now the flags have changed.
4771          */
4772
4773         if ((old_flags ^ flags) & IFF_MULTICAST)
4774                 dev_change_rx_flags(dev, IFF_MULTICAST);
4775
4776         dev_set_rx_mode(dev);
4777
4778         /*
4779          *      Have we downed the interface. We handle IFF_UP ourselves
4780          *      according to user attempts to set it, rather than blindly
4781          *      setting it.
4782          */
4783
4784         ret = 0;
4785         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4786                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4787
4788                 if (!ret)
4789                         dev_set_rx_mode(dev);
4790         }
4791
4792         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4793                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4794
4795                 dev->gflags ^= IFF_PROMISC;
4796                 dev_set_promiscuity(dev, inc);
4797         }
4798
4799         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4800            is important. Some (broken) drivers set IFF_PROMISC, when
4801            IFF_ALLMULTI is requested not asking us and not reporting.
4802          */
4803         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4804                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4805
4806                 dev->gflags ^= IFF_ALLMULTI;
4807                 dev_set_allmulti(dev, inc);
4808         }
4809
4810         return ret;
4811 }
4812
4813 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4814 {
4815         unsigned int changes = dev->flags ^ old_flags;
4816
4817         if (changes & IFF_UP) {
4818                 if (dev->flags & IFF_UP)
4819                         call_netdevice_notifiers(NETDEV_UP, dev);
4820                 else
4821                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4822         }
4823
4824         if (dev->flags & IFF_UP &&
4825             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4826                 struct netdev_notifier_change_info change_info;
4827
4828                 change_info.flags_changed = changes;
4829                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4830                                               &change_info.info);
4831         }
4832 }
4833
4834 /**
4835  *      dev_change_flags - change device settings
4836  *      @dev: device
4837  *      @flags: device state flags
4838  *
4839  *      Change settings on device based state flags. The flags are
4840  *      in the userspace exported format.
4841  */
4842 int dev_change_flags(struct net_device *dev, unsigned int flags)
4843 {
4844         int ret;
4845         unsigned int changes, old_flags = dev->flags;
4846
4847         ret = __dev_change_flags(dev, flags);
4848         if (ret < 0)
4849                 return ret;
4850
4851         changes = old_flags ^ dev->flags;
4852         if (changes)
4853                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4854
4855         __dev_notify_flags(dev, old_flags);
4856         return ret;
4857 }
4858 EXPORT_SYMBOL(dev_change_flags);
4859
4860 /**
4861  *      dev_set_mtu - Change maximum transfer unit
4862  *      @dev: device
4863  *      @new_mtu: new transfer unit
4864  *
4865  *      Change the maximum transfer size of the network device.
4866  */
4867 int dev_set_mtu(struct net_device *dev, int new_mtu)
4868 {
4869         const struct net_device_ops *ops = dev->netdev_ops;
4870         int err;
4871
4872         if (new_mtu == dev->mtu)
4873                 return 0;
4874
4875         /*      MTU must be positive.    */
4876         if (new_mtu < 0)
4877                 return -EINVAL;
4878
4879         if (!netif_device_present(dev))
4880                 return -ENODEV;
4881
4882         err = 0;
4883         if (ops->ndo_change_mtu)
4884                 err = ops->ndo_change_mtu(dev, new_mtu);
4885         else
4886                 dev->mtu = new_mtu;
4887
4888         if (!err)
4889                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4890         return err;
4891 }
4892 EXPORT_SYMBOL(dev_set_mtu);
4893
4894 /**
4895  *      dev_set_group - Change group this device belongs to
4896  *      @dev: device
4897  *      @new_group: group this device should belong to
4898  */
4899 void dev_set_group(struct net_device *dev, int new_group)
4900 {
4901         dev->group = new_group;
4902 }
4903 EXPORT_SYMBOL(dev_set_group);
4904
4905 /**
4906  *      dev_set_mac_address - Change Media Access Control Address
4907  *      @dev: device
4908  *      @sa: new address
4909  *
4910  *      Change the hardware (MAC) address of the device
4911  */
4912 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4913 {
4914         const struct net_device_ops *ops = dev->netdev_ops;
4915         int err;
4916
4917         if (!ops->ndo_set_mac_address)
4918                 return -EOPNOTSUPP;
4919         if (sa->sa_family != dev->type)
4920                 return -EINVAL;
4921         if (!netif_device_present(dev))
4922                 return -ENODEV;
4923         err = ops->ndo_set_mac_address(dev, sa);
4924         if (err)
4925                 return err;
4926         dev->addr_assign_type = NET_ADDR_SET;
4927         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4928         add_device_randomness(dev->dev_addr, dev->addr_len);
4929         return 0;
4930 }
4931 EXPORT_SYMBOL(dev_set_mac_address);
4932
4933 /**
4934  *      dev_change_carrier - Change device carrier
4935  *      @dev: device
4936  *      @new_carrier: new value
4937  *
4938  *      Change device carrier
4939  */
4940 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4941 {
4942         const struct net_device_ops *ops = dev->netdev_ops;
4943
4944         if (!ops->ndo_change_carrier)
4945                 return -EOPNOTSUPP;
4946         if (!netif_device_present(dev))
4947                 return -ENODEV;
4948         return ops->ndo_change_carrier(dev, new_carrier);
4949 }
4950 EXPORT_SYMBOL(dev_change_carrier);
4951
4952 /**
4953  *      dev_new_index   -       allocate an ifindex
4954  *      @net: the applicable net namespace
4955  *
4956  *      Returns a suitable unique value for a new device interface
4957  *      number.  The caller must hold the rtnl semaphore or the
4958  *      dev_base_lock to be sure it remains unique.
4959  */
4960 static int dev_new_index(struct net *net)
4961 {
4962         int ifindex = net->ifindex;
4963         for (;;) {
4964                 if (++ifindex <= 0)
4965                         ifindex = 1;
4966                 if (!__dev_get_by_index(net, ifindex))
4967                         return net->ifindex = ifindex;
4968         }
4969 }
4970
4971 /* Delayed registration/unregisteration */
4972 static LIST_HEAD(net_todo_list);
4973
4974 static void net_set_todo(struct net_device *dev)
4975 {
4976         list_add_tail(&dev->todo_list, &net_todo_list);
4977 }
4978
4979 static void rollback_registered_many(struct list_head *head)
4980 {
4981         struct net_device *dev, *tmp;
4982
4983         BUG_ON(dev_boot_phase);
4984         ASSERT_RTNL();
4985
4986         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4987                 /* Some devices call without registering
4988                  * for initialization unwind. Remove those
4989                  * devices and proceed with the remaining.
4990                  */
4991                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4992                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4993                                  dev->name, dev);
4994
4995                         WARN_ON(1);
4996                         list_del(&dev->unreg_list);
4997                         continue;
4998                 }
4999                 dev->dismantle = true;
5000                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5001         }
5002
5003         /* If device is running, close it first. */
5004         dev_close_many(head);
5005
5006         list_for_each_entry(dev, head, unreg_list) {
5007                 /* And unlink it from device chain. */
5008                 unlist_netdevice(dev);
5009
5010                 dev->reg_state = NETREG_UNREGISTERING;
5011         }
5012
5013         synchronize_net();
5014
5015         list_for_each_entry(dev, head, unreg_list) {
5016                 /* Shutdown queueing discipline. */
5017                 dev_shutdown(dev);
5018
5019
5020                 /* Notify protocols, that we are about to destroy
5021                    this device. They should clean all the things.
5022                 */
5023                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5024
5025                 if (!dev->rtnl_link_ops ||
5026                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5027                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5028
5029                 /*
5030                  *      Flush the unicast and multicast chains
5031                  */
5032                 dev_uc_flush(dev);
5033                 dev_mc_flush(dev);
5034
5035                 if (dev->netdev_ops->ndo_uninit)
5036                         dev->netdev_ops->ndo_uninit(dev);
5037
5038                 /* Notifier chain MUST detach us all upper devices. */
5039                 WARN_ON(netdev_has_any_upper_dev(dev));
5040
5041                 /* Remove entries from kobject tree */
5042                 netdev_unregister_kobject(dev);
5043 #ifdef CONFIG_XPS
5044                 /* Remove XPS queueing entries */
5045                 netif_reset_xps_queues_gt(dev, 0);
5046 #endif
5047         }
5048
5049         synchronize_net();
5050
5051         list_for_each_entry(dev, head, unreg_list)
5052                 dev_put(dev);
5053 }
5054
5055 static void rollback_registered(struct net_device *dev)
5056 {
5057         LIST_HEAD(single);
5058
5059         list_add(&dev->unreg_list, &single);
5060         rollback_registered_many(&single);
5061         list_del(&single);
5062 }
5063
5064 static netdev_features_t netdev_fix_features(struct net_device *dev,
5065         netdev_features_t features)
5066 {
5067         /* Fix illegal checksum combinations */
5068         if ((features & NETIF_F_HW_CSUM) &&
5069             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5070                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5071                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5072         }
5073
5074         /* TSO requires that SG is present as well. */
5075         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5076                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5077                 features &= ~NETIF_F_ALL_TSO;
5078         }
5079
5080         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5081                                         !(features & NETIF_F_IP_CSUM)) {
5082                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5083                 features &= ~NETIF_F_TSO;
5084                 features &= ~NETIF_F_TSO_ECN;
5085         }
5086
5087         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5088                                          !(features & NETIF_F_IPV6_CSUM)) {
5089                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5090                 features &= ~NETIF_F_TSO6;
5091         }
5092
5093         /* TSO ECN requires that TSO is present as well. */
5094         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5095                 features &= ~NETIF_F_TSO_ECN;
5096
5097         /* Software GSO depends on SG. */
5098         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5099                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5100                 features &= ~NETIF_F_GSO;
5101         }
5102
5103         /* UFO needs SG and checksumming */
5104         if (features & NETIF_F_UFO) {
5105                 /* maybe split UFO into V4 and V6? */
5106                 if (!((features & NETIF_F_GEN_CSUM) ||
5107                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5108                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5109                         netdev_dbg(dev,
5110                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5111                         features &= ~NETIF_F_UFO;
5112                 }
5113
5114                 if (!(features & NETIF_F_SG)) {
5115                         netdev_dbg(dev,
5116                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5117                         features &= ~NETIF_F_UFO;
5118                 }
5119         }
5120
5121         return features;
5122 }
5123
5124 int __netdev_update_features(struct net_device *dev)
5125 {
5126         netdev_features_t features;
5127         int err = 0;
5128
5129         ASSERT_RTNL();
5130
5131         features = netdev_get_wanted_features(dev);
5132
5133         if (dev->netdev_ops->ndo_fix_features)
5134                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5135
5136         /* driver might be less strict about feature dependencies */
5137         features = netdev_fix_features(dev, features);
5138
5139         if (dev->features == features)
5140                 return 0;
5141
5142         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5143                 &dev->features, &features);
5144
5145         if (dev->netdev_ops->ndo_set_features)
5146                 err = dev->netdev_ops->ndo_set_features(dev, features);
5147
5148         if (unlikely(err < 0)) {
5149                 netdev_err(dev,
5150                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5151                         err, &features, &dev->features);
5152                 return -1;
5153         }
5154
5155         if (!err)
5156                 dev->features = features;
5157
5158         return 1;
5159 }
5160
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and, whenever
 *	__netdev_update_features() reports a non-zero result, send the
 *	feature change notification. Meant to be called after driver or
 *	hardware dependent conditions that influence the features might
 *	have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5175
/**
 *	netdev_change_features - recalculate device features and always notify
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and send the feature change
 *	notification unconditionally, i.e. even when the set did not
 *	change. Use this rather than netdev_update_features() when
 *	dev->vlan_features might have changed as well, so that the change
 *	can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: listeners are always told. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5192
5193 /**
5194  *      netif_stacked_transfer_operstate -      transfer operstate
5195  *      @rootdev: the root or lower level device to transfer state from
5196  *      @dev: the device to transfer operstate to
5197  *
5198  *      Transfer operational state from root to device. This is normally
5199  *      called when a stacking relationship exists between the root
5200  *      device and the device(a leaf device).
5201  */
5202 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5203                                         struct net_device *dev)
5204 {
5205         if (rootdev->operstate == IF_OPER_DORMANT)
5206                 netif_dormant_on(dev);
5207         else
5208                 netif_dormant_off(dev);
5209
5210         if (netif_carrier_ok(rootdev)) {
5211                 if (!netif_carrier_ok(dev))
5212                         netif_carrier_on(dev);
5213         } else {
5214                 if (netif_carrier_ok(dev))
5215                         netif_carrier_off(dev);
5216         }
5217 }
5218 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5219
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 * A queue count of zero triggers BUG_ON().
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = queues; q < queues + nr_queues; q++)
		q->dev = dev;

	return 0;
}
#endif
5239
5240 static void netdev_init_one_queue(struct net_device *dev,
5241                                   struct netdev_queue *queue, void *_unused)
5242 {
5243         /* Initialize queue lock */
5244         spin_lock_init(&queue->_xmit_lock);
5245         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5246         queue->xmit_lock_owner = -1;
5247         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5248         queue->dev = dev;
5249 #ifdef CONFIG_BQL
5250         dql_init(&queue->dql, HZ);
5251 #endif
5252 }
5253
5254 static void netif_free_tx_queues(struct net_device *dev)
5255 {
5256         if (is_vmalloc_addr(dev->_tx))
5257                 vfree(dev->_tx);
5258         else
5259                 kfree(dev->_tx);
5260 }
5261
5262 static int netif_alloc_netdev_queues(struct net_device *dev)
5263 {
5264         unsigned int count = dev->num_tx_queues;
5265         struct netdev_queue *tx;
5266         size_t sz = count * sizeof(*tx);
5267
5268         BUG_ON(count < 1 || count > 0xffff);
5269
5270         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5271         if (!tx) {
5272                 tx = vzalloc(sz);
5273                 if (!tx)
5274                         return -ENOMEM;
5275         }
5276         dev->_tx = tx;
5277
5278         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5279         spin_lock_init(&dev->tx_global_lock);
5280
5281         return 0;
5282 }
5283
5284 /**
5285  *      register_netdevice      - register a network device
5286  *      @dev: device to register
5287  *
5288  *      Take a completed network device structure and add it to the kernel
5289  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5290  *      chain. 0 is returned on success. A negative errno code is returned
5291  *      on a failure to set up the device, or if the name is a duplicate.
5292  *
5293  *      Callers must hold the rtnl semaphore. You may want
5294  *      register_netdev() instead of this.
5295  *
5296  *      BUGS:
5297  *      The locking appears insufficient to guarantee two parallel registers
5298  *      will not get the same name.
5299  */
5300
5301 int register_netdevice(struct net_device *dev)
5302 {
5303         int ret;
5304         struct net *net = dev_net(dev);
5305
5306         BUG_ON(dev_boot_phase);
5307         ASSERT_RTNL();
5308
5309         might_sleep();
5310
5311         /* When net_device's are persistent, this will be fatal. */
5312         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5313         BUG_ON(!net);
5314
5315         spin_lock_init(&dev->addr_list_lock);
5316         netdev_set_addr_lockdep_class(dev);
5317
5318         dev->iflink = -1;
5319
5320         ret = dev_get_valid_name(net, dev, dev->name);
5321         if (ret < 0)
5322                 goto out;
5323
5324         /* Init, if this function is available */
5325         if (dev->netdev_ops->ndo_init) {
5326                 ret = dev->netdev_ops->ndo_init(dev);
5327                 if (ret) {
5328                         if (ret > 0)
5329                                 ret = -EIO;
5330                         goto out;
5331                 }
5332         }
5333
5334         if (((dev->hw_features | dev->features) &
5335              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5336             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5337              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5338                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5339                 ret = -EINVAL;
5340                 goto err_uninit;
5341         }
5342
5343         ret = -EBUSY;
5344         if (!dev->ifindex)
5345                 dev->ifindex = dev_new_index(net);
5346         else if (__dev_get_by_index(net, dev->ifindex))
5347                 goto err_uninit;
5348
5349         if (dev->iflink == -1)
5350                 dev->iflink = dev->ifindex;
5351
5352         /* Transfer changeable features to wanted_features and enable
5353          * software offloads (GSO and GRO).
5354          */
5355         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5356         dev->features |= NETIF_F_SOFT_FEATURES;
5357         dev->wanted_features = dev->features & dev->hw_features;
5358
5359         /* Turn on no cache copy if HW is doing checksum */
5360         if (!(dev->flags & IFF_LOOPBACK)) {
5361                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5362                 if (dev->features & NETIF_F_ALL_CSUM) {
5363                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5364                         dev->features |= NETIF_F_NOCACHE_COPY;
5365                 }
5366         }
5367
5368         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5369          */
5370         dev->vlan_features |= NETIF_F_HIGHDMA;
5371
5372         /* Make NETIF_F_SG inheritable to tunnel devices.
5373          */
5374         dev->hw_enc_features |= NETIF_F_SG;
5375
5376         /* Make NETIF_F_SG inheritable to MPLS.
5377          */
5378         dev->mpls_features |= NETIF_F_SG;
5379
5380         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5381         ret = notifier_to_errno(ret);
5382         if (ret)
5383                 goto err_uninit;
5384
5385         ret = netdev_register_kobject(dev);
5386         if (ret)
5387                 goto err_uninit;
5388         dev->reg_state = NETREG_REGISTERED;
5389
5390         __netdev_update_features(dev);
5391
5392         /*
5393          *      Default initial state at registry is that the
5394          *      device is present.
5395          */
5396
5397         set_bit(__LINK_STATE_PRESENT, &dev->state);
5398
5399         linkwatch_init_dev(dev);
5400
5401         dev_init_scheduler(dev);
5402         dev_hold(dev);
5403         list_netdevice(dev);
5404         add_device_randomness(dev->dev_addr, dev->addr_len);
5405
5406         /* If the device has permanent device address, driver should
5407          * set dev_addr and also addr_assign_type should be set to
5408          * NET_ADDR_PERM (default value).
5409          */
5410         if (dev->addr_assign_type == NET_ADDR_PERM)
5411                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5412
5413         /* Notify protocols, that a new device appeared. */
5414         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5415         ret = notifier_to_errno(ret);
5416         if (ret) {
5417                 rollback_registered(dev);
5418                 dev->reg_state = NETREG_UNREGISTERED;
5419         }
5420         /*
5421          *      Prevent userspace races by waiting until the network
5422          *      device is fully setup before sending notifications.
5423          */
5424         if (!dev->rtnl_link_ops ||
5425             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5426                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5427
5428 out:
5429         return ret;
5430
5431 err_uninit:
5432         if (dev->netdev_ops->ndo_uninit)
5433                 dev->netdev_ops->ndo_uninit(dev);
5434         goto out;
5435 }
5436 EXPORT_SYMBOL(register_netdevice);
5437
5438 /**
5439  *      init_dummy_netdev       - init a dummy network device for NAPI
5440  *      @dev: device to init
5441  *
5442  *      This takes a network device structure and initialize the minimum
5443  *      amount of fields so it can be used to schedule NAPI polls without
5444  *      registering a full blown interface. This is to be used by drivers
5445  *      that need to tie several hardware interfaces to a single NAPI
5446  *      poll scheduler due to HW limitations.
5447  */
5448 int init_dummy_netdev(struct net_device *dev)
5449 {
5450         /* Clear everything. Note we don't initialize spinlocks
5451          * are they aren't supposed to be taken by any of the
5452          * NAPI code and this dummy netdev is supposed to be
5453          * only ever used for NAPI polls
5454          */
5455         memset(dev, 0, sizeof(struct net_device));
5456
5457         /* make sure we BUG if trying to hit standard
5458          * register/unregister code path
5459          */
5460         dev->reg_state = NETREG_DUMMY;
5461
5462         /* NAPI wants this */
5463         INIT_LIST_HEAD(&dev->napi_list);
5464
5465         /* a dummy interface is started by default */
5466         set_bit(__LINK_STATE_PRESENT, &dev->state);
5467         set_bit(__LINK_STATE_START, &dev->state);
5468
5469         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5470          * because users of this 'device' dont need to change
5471          * its refcount.
5472          */
5473
5474         return 0;
5475 }
5476 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5477
5478
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	the device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5502
5503 int netdev_refcnt_read(const struct net_device *dev)
5504 {
5505         int i, refcnt = 0;
5506
5507         for_each_possible_cpu(i)
5508                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5509         return refcnt;
5510 }
5511 EXPORT_SYMBOL(netdev_refcnt_read);
5512
5513 /**
5514  * netdev_wait_allrefs - wait until all references are gone.
5515  * @dev: target net_device
5516  *
5517  * This is called when unregistering network devices.
5518  *
5519  * Any protocol or device that holds a reference should register
5520  * for netdevice notification, and cleanup and put back the
5521  * reference if they receive an UNREGISTER event.
5522  * We can get stuck here if buggy protocols don't correctly
5523  * call dev_put.
5524  */
5525 static void netdev_wait_allrefs(struct net_device *dev)
5526 {
5527         unsigned long rebroadcast_time, warning_time;
5528         int refcnt;
5529
5530         linkwatch_forget_dev(dev);
5531
5532         rebroadcast_time = warning_time = jiffies;
5533         refcnt = netdev_refcnt_read(dev);
5534
5535         while (refcnt != 0) {
5536                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5537                         rtnl_lock();
5538
5539                         /* Rebroadcast unregister notification */
5540                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5541
5542                         __rtnl_unlock();
5543                         rcu_barrier();
5544                         rtnl_lock();
5545
5546                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5547                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5548                                      &dev->state)) {
5549                                 /* We must not have linkwatch events
5550                                  * pending on unregister. If this
5551                                  * happens, we simply run the queue
5552                                  * unscheduled, resulting in a noop
5553                                  * for this device.
5554                                  */
5555                                 linkwatch_run_queue();
5556                         }
5557
5558                         __rtnl_unlock();
5559
5560                         rebroadcast_time = jiffies;
5561                 }
5562
5563                 msleep(250);
5564
5565                 refcnt = netdev_refcnt_read(dev);
5566
5567                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5568                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5569                                  dev->name, refcnt);
5570                         warning_time = jiffies;
5571                 }
5572         }
5573 }
5574
5575 /* The sequence is:
5576  *
5577  *      rtnl_lock();
5578  *      ...
5579  *      register_netdevice(x1);
5580  *      register_netdevice(x2);
5581  *      ...
5582  *      unregister_netdevice(y1);
5583  *      unregister_netdevice(y2);
5584  *      ...
5585  *      rtnl_unlock();
5586  *      free_netdev(y1);
5587  *      free_netdev(y2);
5588  *
5589  * We are invoked by rtnl_unlock().
5590  * This allows us to deal with problems:
5591  * 1) We can delete sysfs objects which invoke hotplug
5592  *    without deadlocking with linkwatch via keventd.
5593  * 2) Since we run with the RTNL semaphore not held, we can sleep
5594  *    safely in order to wait for the netdev refcnt to drop to zero.
5595  *
5596  * We must not return until all unregister events added during
5597  * the interval the lock was held have been completed.
5598  */
5599 void netdev_run_todo(void)
5600 {
5601         struct list_head list;
5602
5603         /* Snapshot list, allow later requests */
5604         list_replace_init(&net_todo_list, &list);
5605
5606         __rtnl_unlock();
5607
5608
5609         /* Wait for rcu callbacks to finish before next phase */
5610         if (!list_empty(&list))
5611                 rcu_barrier();
5612
5613         while (!list_empty(&list)) {
5614                 struct net_device *dev
5615                         = list_first_entry(&list, struct net_device, todo_list);
5616                 list_del(&dev->todo_list);
5617
5618                 rtnl_lock();
5619                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5620                 __rtnl_unlock();
5621
5622                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5623                         pr_err("network todo '%s' but state %d\n",
5624                                dev->name, dev->reg_state);
5625                         dump_stack();
5626                         continue;
5627                 }
5628
5629                 dev->reg_state = NETREG_UNREGISTERED;
5630
5631                 on_each_cpu(flush_backlog, dev, 1);
5632
5633                 netdev_wait_allrefs(dev);
5634
5635                 /* paranoia */
5636                 BUG_ON(netdev_refcnt_read(dev));
5637                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5638                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5639                 WARN_ON(dev->dn_ptr);
5640
5641                 if (dev->destructor)
5642                         dev->destructor(dev);
5643
5644                 /* Free network device */
5645                 kobject_put(&dev->dev.kobj);
5646         }
5647 }
5648
5649 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5650  * fields in the same order, with only the type differing.
5651  */
5652 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5653                              const struct net_device_stats *netdev_stats)
5654 {
5655 #if BITS_PER_LONG == 64
5656         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5657         memcpy(stats64, netdev_stats, sizeof(*stats64));
5658 #else
5659         size_t i, n = sizeof(*stats64) / sizeof(u64);
5660         const unsigned long *src = (const unsigned long *)netdev_stats;
5661         u64 *dst = (u64 *)stats64;
5662
5663         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5664                      sizeof(*stats64) / sizeof(u64));
5665         for (i = 0; i < n; i++)
5666                 dst[i] = src[i];
5667 #endif
5668 }
5669 EXPORT_SYMBOL(netdev_stats_to_stats64);
5670
5671 /**
5672  *      dev_get_stats   - get network device statistics
5673  *      @dev: device to get statistics from
5674  *      @storage: place to store stats
5675  *
5676  *      Get network statistics from device. Return @storage.
5677  *      The device driver may provide its own method by setting
5678  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5679  *      otherwise the internal statistics structure is used.
5680  */
5681 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5682                                         struct rtnl_link_stats64 *storage)
5683 {
5684         const struct net_device_ops *ops = dev->netdev_ops;
5685
5686         if (ops->ndo_get_stats64) {
5687                 memset(storage, 0, sizeof(*storage));
5688                 ops->ndo_get_stats64(dev, storage);
5689         } else if (ops->ndo_get_stats) {
5690                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5691         } else {
5692                 netdev_stats_to_stats64(storage, &dev->stats);
5693         }
5694         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5695         return storage;
5696 }
5697 EXPORT_SYMBOL(dev_get_stats);
5698
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with the noop qdisc and published
 * through dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5716
5717 static const struct ethtool_ops default_ethtool_ops;
5718
5719 void netdev_set_default_ethtool_ops(struct net_device *dev,
5720                                     const struct ethtool_ops *ops)
5721 {
5722         if (dev->ethtool_ops == &default_ethtool_ops)
5723                 dev->ethtool_ops = ops;
5724 }
5725 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5726
5727 /**
5728  *      alloc_netdev_mqs - allocate network device
5729  *      @sizeof_priv:   size of private data to allocate space for
5730  *      @name:          device name format string
5731  *      @setup:         callback to initialize device
5732  *      @txqs:          the number of TX subqueues to allocate
5733  *      @rxqs:          the number of RX subqueues to allocate
5734  *
5735  *      Allocates a struct net_device with private data area for driver use
5736  *      and performs basic initialization.  Also allocates subquue structs
5737  *      for each queue on the device.
5738  */
5739 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5740                 void (*setup)(struct net_device *),
5741                 unsigned int txqs, unsigned int rxqs)
5742 {
5743         struct net_device *dev;
5744         size_t alloc_size;
5745         struct net_device *p;
5746
5747         BUG_ON(strlen(name) >= sizeof(dev->name));
5748
5749         if (txqs < 1) {
5750                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5751                 return NULL;
5752         }
5753
5754 #ifdef CONFIG_RPS
5755         if (rxqs < 1) {
5756                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5757                 return NULL;
5758         }
5759 #endif
5760
5761         alloc_size = sizeof(struct net_device);
5762         if (sizeof_priv) {
5763                 /* ensure 32-byte alignment of private area */
5764                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5765                 alloc_size += sizeof_priv;
5766         }
5767         /* ensure 32-byte alignment of whole construct */
5768         alloc_size += NETDEV_ALIGN - 1;
5769
5770         p = kzalloc(alloc_size, GFP_KERNEL);
5771         if (!p)
5772                 return NULL;
5773
5774         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5775         dev->padded = (char *)dev - (char *)p;
5776
5777         dev->pcpu_refcnt = alloc_percpu(int);
5778         if (!dev->pcpu_refcnt)
5779                 goto free_p;
5780
5781         if (dev_addr_init(dev))
5782                 goto free_pcpu;
5783
5784         dev_mc_init(dev);
5785         dev_uc_init(dev);
5786
5787         dev_net_set(dev, &init_net);
5788
5789         dev->gso_max_size = GSO_MAX_SIZE;
5790         dev->gso_max_segs = GSO_MAX_SEGS;
5791
5792         INIT_LIST_HEAD(&dev->napi_list);
5793         INIT_LIST_HEAD(&dev->unreg_list);
5794         INIT_LIST_HEAD(&dev->link_watch_list);
5795         INIT_LIST_HEAD(&dev->upper_dev_list);
5796         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5797         setup(dev);
5798
5799         dev->num_tx_queues = txqs;
5800         dev->real_num_tx_queues = txqs;
5801         if (netif_alloc_netdev_queues(dev))
5802                 goto free_all;
5803
5804 #ifdef CONFIG_RPS
5805         dev->num_rx_queues = rxqs;
5806         dev->real_num_rx_queues = rxqs;
5807         if (netif_alloc_rx_queues(dev))
5808                 goto free_all;
5809 #endif
5810
5811         strcpy(dev->name, name);
5812         dev->group = INIT_NETDEV_GROUP;
5813         if (!dev->ethtool_ops)
5814                 dev->ethtool_ops = &default_ethtool_ops;
5815         return dev;
5816
5817 free_all:
5818         free_netdev(dev);
5819         return NULL;
5820
5821 free_pcpu:
5822         free_percpu(dev->pcpu_refcnt);
5823         netif_free_tx_queues(dev);
5824 #ifdef CONFIG_RPS
5825         kfree(dev->_rx);
5826 #endif
5827
5828 free_p:
5829         kfree(p);
5830         return NULL;
5831 }
5832 EXPORT_SYMBOL(alloc_netdev_mqs);
5833
5834 /**
5835  *      free_netdev - free network device
5836  *      @dev: device
5837  *
5838  *      This function does the last stage of destroying an allocated device
5839  *      interface. The reference to the device object is released.
5840  *      If this is the last reference then it will be freed.
5841  */
5842 void free_netdev(struct net_device *dev)
5843 {
5844         struct napi_struct *p, *n;
5845
5846         release_net(dev_net(dev));
5847
5848         netif_free_tx_queues(dev);
5849 #ifdef CONFIG_RPS
5850         kfree(dev->_rx);
5851 #endif
5852
5853         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5854
5855         /* Flush device addresses */
5856         dev_addr_flush(dev);
5857
5858         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5859                 netif_napi_del(p);
5860
5861         free_percpu(dev->pcpu_refcnt);
5862         dev->pcpu_refcnt = NULL;
5863
5864         /*  Compatibility with error handling in drivers */
5865         if (dev->reg_state == NETREG_UNINITIALIZED) {
5866                 kfree((char *)dev - dev->padded);
5867                 return;
5868         }
5869
5870         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5871         dev->reg_state = NETREG_RELEASED;
5872
5873         /* will free via device release */
5874         put_device(&dev->dev);
5875 }
5876 EXPORT_SYMBOL(free_netdev);
5877
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep. The expedited RCU grace period is used when the rtnl
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5893
5894 /**
5895  *      unregister_netdevice_queue - remove device from the kernel
5896  *      @dev: device
5897  *      @head: list
5898  *
5899  *      This function shuts down a device interface and removes it
5900  *      from the kernel tables.
5901  *      If head not NULL, device is queued to be unregistered later.
5902  *
5903  *      Callers must hold the rtnl semaphore.  You may want
5904  *      unregister_netdev() instead of this.
5905  */
5906
5907 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5908 {
5909         ASSERT_RTNL();
5910
5911         if (head) {
5912                 list_move_tail(&dev->unreg_list, head);
5913         } else {
5914                 rollback_registered(dev);
5915                 /* Finish processing unregister after unlock */
5916                 net_set_todo(dev);
5917         }
5918 }
5919 EXPORT_SYMBOL(unregister_netdevice_queue);
5920
5921 /**
5922  *      unregister_netdevice_many - unregister many devices
5923  *      @head: list of devices
5924  */
5925 void unregister_netdevice_many(struct list_head *head)
5926 {
5927         struct net_device *dev;
5928
5929         if (!list_empty(head)) {
5930                 rollback_registered_many(head);
5931                 list_for_each_entry(dev, head, unreg_list)
5932                         net_set_todo(dev);
5933         }
5934 }
5935 EXPORT_SYMBOL(unregister_netdevice_many);
5936
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	This is unregister_netdevice() wrapped in rtnl_lock() and
 *	rtnl_unlock().  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5955
5956 /**
5957  *      dev_change_net_namespace - move device to different nethost namespace
5958  *      @dev: device
5959  *      @net: network namespace
5960  *      @pat: If not NULL name pattern to try if the current device name
5961  *            is already taken in the destination network namespace.
5962  *
5963  *      This function shuts down a device interface and moves it
5964  *      to a new network namespace. On success 0 is returned, on
5965  *      a failure a netagive errno code is returned.
5966  *
5967  *      Callers must hold the rtnl semaphore.
5968  */
5969
5970 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5971 {
5972         int err;
5973
5974         ASSERT_RTNL();
5975
5976         /* Don't allow namespace local devices to be moved. */
5977         err = -EINVAL;
5978         if (dev->features & NETIF_F_NETNS_LOCAL)
5979                 goto out;
5980
5981         /* Ensure the device has been registrered */
5982         if (dev->reg_state != NETREG_REGISTERED)
5983                 goto out;
5984
5985         /* Get out if there is nothing todo */
5986         err = 0;
5987         if (net_eq(dev_net(dev), net))
5988                 goto out;
5989
5990         /* Pick the destination device name, and ensure
5991          * we can use it in the destination network namespace.
5992          */
5993         err = -EEXIST;
5994         if (__dev_get_by_name(net, dev->name)) {
5995                 /* We get here if we can't use the current device name */
5996                 if (!pat)
5997                         goto out;
5998                 if (dev_get_valid_name(net, dev, pat) < 0)
5999                         goto out;
6000         }
6001
6002         /*
6003          * And now a mini version of register_netdevice unregister_netdevice.
6004          */
6005
6006         /* If device is running close it first. */
6007         dev_close(dev);
6008
6009         /* And unlink it from device chain */
6010         err = -ENODEV;
6011         unlist_netdevice(dev);
6012
6013         synchronize_net();
6014
6015         /* Shutdown queueing discipline. */
6016         dev_shutdown(dev);
6017
6018         /* Notify protocols, that we are about to destroy
6019            this device. They should clean all the things.
6020
6021            Note that dev->reg_state stays at NETREG_REGISTERED.
6022            This is wanted because this way 8021q and macvlan know
6023            the device is just moving and can keep their slaves up.
6024         */
6025         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6026         rcu_barrier();
6027         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6028         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6029
6030         /*
6031          *      Flush the unicast and multicast chains
6032          */
6033         dev_uc_flush(dev);
6034         dev_mc_flush(dev);
6035
6036         /* Send a netdev-removed uevent to the old namespace */
6037         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6038
6039         /* Actually switch the network namespace */
6040         dev_net_set(dev, net);
6041
6042         /* If there is an ifindex conflict assign a new one */
6043         if (__dev_get_by_index(net, dev->ifindex)) {
6044                 int iflink = (dev->iflink == dev->ifindex);
6045                 dev->ifindex = dev_new_index(net);
6046                 if (iflink)
6047                         dev->iflink = dev->ifindex;
6048         }
6049
6050         /* Send a netdev-add uevent to the new namespace */
6051         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6052
6053         /* Fixup kobjects */
6054         err = device_rename(&dev->dev, dev->name);
6055         WARN_ON(err);
6056
6057         /* Add the device back in the hashes */
6058         list_netdevice(dev);
6059
6060         /* Notify protocols, that a new device appeared. */
6061         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6062
6063         /*
6064          *      Prevent userspace races by waiting until the network
6065          *      device is fully setup before sending notifications.
6066          */
6067         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6068
6069         synchronize_net();
6070         err = 0;
6071 out:
6072         return err;
6073 }
6074 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6075
6076 static int dev_cpu_callback(struct notifier_block *nfb,
6077                             unsigned long action,
6078                             void *ocpu)
6079 {
6080         struct sk_buff **list_skb;
6081         struct sk_buff *skb;
6082         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6083         struct softnet_data *sd, *oldsd;
6084
6085         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6086                 return NOTIFY_OK;
6087
6088         local_irq_disable();
6089         cpu = smp_processor_id();
6090         sd = &per_cpu(softnet_data, cpu);
6091         oldsd = &per_cpu(softnet_data, oldcpu);
6092
6093         /* Find end of our completion_queue. */
6094         list_skb = &sd->completion_queue;
6095         while (*list_skb)
6096                 list_skb = &(*list_skb)->next;
6097         /* Append completion queue from offline CPU. */
6098         *list_skb = oldsd->completion_queue;
6099         oldsd->completion_queue = NULL;
6100
6101         /* Append output queue from offline CPU. */
6102         if (oldsd->output_queue) {
6103                 *sd->output_queue_tailp = oldsd->output_queue;
6104                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6105                 oldsd->output_queue = NULL;
6106                 oldsd->output_queue_tailp = &oldsd->output_queue;
6107         }
6108         /* Append NAPI poll list from offline CPU. */
6109         if (!list_empty(&oldsd->poll_list)) {
6110                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6111                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6112         }
6113
6114         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6115         local_irq_enable();
6116
6117         /* Process offline CPU's input_pkt_queue */
6118         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6119                 netif_rx(skb);
6120                 input_queue_head_incr(oldsd);
6121         }
6122         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6123                 netif_rx(skb);
6124                 input_queue_head_incr(oldsd);
6125         }
6126
6127         return NOTIFY_OK;
6128 }
6129
6130
6131 /**
6132  *      netdev_increment_features - increment feature set by one
6133  *      @all: current feature set
6134  *      @one: new feature set
6135  *      @mask: mask feature set
6136  *
6137  *      Computes a new feature set after adding a device with feature set
6138  *      @one to the master device with current feature set @all.  Will not
6139  *      enable anything that is off in @mask. Returns the new feature set.
6140  */
6141 netdev_features_t netdev_increment_features(netdev_features_t all,
6142         netdev_features_t one, netdev_features_t mask)
6143 {
6144         if (mask & NETIF_F_GEN_CSUM)
6145                 mask |= NETIF_F_ALL_CSUM;
6146         mask |= NETIF_F_VLAN_CHALLENGED;
6147
6148         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6149         all &= one | ~NETIF_F_ALL_FOR_ALL;
6150
6151         /* If one device supports hw checksumming, set for all. */
6152         if (all & NETIF_F_GEN_CSUM)
6153                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6154
6155         return all;
6156 }
6157 EXPORT_SYMBOL(netdev_increment_features);
6158
6159 static struct hlist_head * __net_init netdev_create_hash(void)
6160 {
6161         int i;
6162         struct hlist_head *hash;
6163
6164         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6165         if (hash != NULL)
6166                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6167                         INIT_HLIST_HEAD(&hash[i]);
6168
6169         return hash;
6170 }
6171
6172 /* Initialize per network namespace state */
6173 static int __net_init netdev_init(struct net *net)
6174 {
6175         if (net != &init_net)
6176                 INIT_LIST_HEAD(&net->dev_base_head);
6177
6178         net->dev_name_head = netdev_create_hash();
6179         if (net->dev_name_head == NULL)
6180                 goto err_name;
6181
6182         net->dev_index_head = netdev_create_hash();
6183         if (net->dev_index_head == NULL)
6184                 goto err_idx;
6185
6186         return 0;
6187
6188 err_idx:
6189         kfree(net->dev_name_head);
6190 err_name:
6191         return -ENOMEM;
6192 }
6193
6194 /**
6195  *      netdev_drivername - network driver for the device
6196  *      @dev: network device
6197  *
6198  *      Determine network driver for device.
6199  */
6200 const char *netdev_drivername(const struct net_device *dev)
6201 {
6202         const struct device_driver *driver;
6203         const struct device *parent;
6204         const char *empty = "";
6205
6206         parent = dev->dev.parent;
6207         if (!parent)
6208                 return empty;
6209
6210         driver = parent->driver;
6211         if (driver && driver->name)
6212                 return driver->name;
6213         return empty;
6214 }
6215
6216 static int __netdev_printk(const char *level, const struct net_device *dev,
6217                            struct va_format *vaf)
6218 {
6219         int r;
6220
6221         if (dev && dev->dev.parent) {
6222                 r = dev_printk_emit(level[1] - '0',
6223                                     dev->dev.parent,
6224                                     "%s %s %s: %pV",
6225                                     dev_driver_string(dev->dev.parent),
6226                                     dev_name(dev->dev.parent),
6227                                     netdev_name(dev), vaf);
6228         } else if (dev) {
6229                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6230         } else {
6231                 r = printk("%s(NULL net_device): %pV", level, vaf);
6232         }
6233
6234         return r;
6235 }
6236
6237 int netdev_printk(const char *level, const struct net_device *dev,
6238                   const char *format, ...)
6239 {
6240         struct va_format vaf;
6241         va_list args;
6242         int r;
6243
6244         va_start(args, format);
6245
6246         vaf.fmt = format;
6247         vaf.va = &args;
6248
6249         r = __netdev_printk(level, dev, &vaf);
6250
6251         va_end(args);
6252
6253         return r;
6254 }
6255 EXPORT_SYMBOL(netdev_printk);
6256
6257 #define define_netdev_printk_level(func, level)                 \
6258 int func(const struct net_device *dev, const char *fmt, ...)    \
6259 {                                                               \
6260         int r;                                                  \
6261         struct va_format vaf;                                   \
6262         va_list args;                                           \
6263                                                                 \
6264         va_start(args, fmt);                                    \
6265                                                                 \
6266         vaf.fmt = fmt;                                          \
6267         vaf.va = &args;                                         \
6268                                                                 \
6269         r = __netdev_printk(level, dev, &vaf);                  \
6270                                                                 \
6271         va_end(args);                                           \
6272                                                                 \
6273         return r;                                               \
6274 }                                                               \
6275 EXPORT_SYMBOL(func);
6276
6277 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6278 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6279 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6280 define_netdev_printk_level(netdev_err, KERN_ERR);
6281 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6282 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6283 define_netdev_printk_level(netdev_info, KERN_INFO);
6284
6285 static void __net_exit netdev_exit(struct net *net)
6286 {
6287         kfree(net->dev_name_head);
6288         kfree(net->dev_index_head);
6289 }
6290
6291 static struct pernet_operations __net_initdata netdev_net_ops = {
6292         .init = netdev_init,
6293         .exit = netdev_exit,
6294 };
6295
6296 static void __net_exit default_device_exit(struct net *net)
6297 {
6298         struct net_device *dev, *aux;
6299         /*
6300          * Push all migratable network devices back to the
6301          * initial network namespace
6302          */
6303         rtnl_lock();
6304         for_each_netdev_safe(net, dev, aux) {
6305                 int err;
6306                 char fb_name[IFNAMSIZ];
6307
6308                 /* Ignore unmoveable devices (i.e. loopback) */
6309                 if (dev->features & NETIF_F_NETNS_LOCAL)
6310                         continue;
6311
6312                 /* Leave virtual devices for the generic cleanup */
6313                 if (dev->rtnl_link_ops)
6314                         continue;
6315
6316                 /* Push remaining network devices to init_net */
6317                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6318                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6319                 if (err) {
6320                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6321                                  __func__, dev->name, err);
6322                         BUG();
6323                 }
6324         }
6325         rtnl_unlock();
6326 }
6327
6328 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6329 {
6330         /* At exit all network devices most be removed from a network
6331          * namespace.  Do this in the reverse order of registration.
6332          * Do this across as many network namespaces as possible to
6333          * improve batching efficiency.
6334          */
6335         struct net_device *dev;
6336         struct net *net;
6337         LIST_HEAD(dev_kill_list);
6338
6339         rtnl_lock();
6340         list_for_each_entry(net, net_list, exit_list) {
6341                 for_each_netdev_reverse(net, dev) {
6342                         if (dev->rtnl_link_ops)
6343                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6344                         else
6345                                 unregister_netdevice_queue(dev, &dev_kill_list);
6346                 }
6347         }
6348         unregister_netdevice_many(&dev_kill_list);
6349         list_del(&dev_kill_list);
6350         rtnl_unlock();
6351 }
6352
6353 static struct pernet_operations __net_initdata default_device_ops = {
6354         .exit = default_device_exit,
6355         .exit_batch = default_device_exit_batch,
6356 };
6357
6358 /*
6359  *      Initialize the DEV module. At boot time this walks the device list and
6360  *      unhooks any devices that fail to initialise (normally hardware not
6361  *      present) and leaves us with a valid list of present and active devices.
6362  *
6363  */
6364
6365 /*
6366  *       This is called single threaded during boot, so no need
6367  *       to take the rtnl semaphore.
6368  */
6369 static int __init net_dev_init(void)
6370 {
6371         int i, rc = -ENOMEM;
6372
6373         BUG_ON(!dev_boot_phase);
6374
6375         if (dev_proc_init())
6376                 goto out;
6377
6378         if (netdev_kobject_init())
6379                 goto out;
6380
6381         INIT_LIST_HEAD(&ptype_all);
6382         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6383                 INIT_LIST_HEAD(&ptype_base[i]);
6384
6385         INIT_LIST_HEAD(&offload_base);
6386
6387         if (register_pernet_subsys(&netdev_net_ops))
6388                 goto out;
6389
6390         /*
6391          *      Initialise the packet receive queues.
6392          */
6393
6394         for_each_possible_cpu(i) {
6395                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6396
6397                 memset(sd, 0, sizeof(*sd));
6398                 skb_queue_head_init(&sd->input_pkt_queue);
6399                 skb_queue_head_init(&sd->process_queue);
6400                 sd->completion_queue = NULL;
6401                 INIT_LIST_HEAD(&sd->poll_list);
6402                 sd->output_queue = NULL;
6403                 sd->output_queue_tailp = &sd->output_queue;
6404 #ifdef CONFIG_RPS
6405                 sd->csd.func = rps_trigger_softirq;
6406                 sd->csd.info = sd;
6407                 sd->csd.flags = 0;
6408                 sd->cpu = i;
6409 #endif
6410
6411                 sd->backlog.poll = process_backlog;
6412                 sd->backlog.weight = weight_p;
6413                 sd->backlog.gro_list = NULL;
6414                 sd->backlog.gro_count = 0;
6415
6416 #ifdef CONFIG_NET_FLOW_LIMIT
6417                 sd->flow_limit = NULL;
6418 #endif
6419         }
6420
6421         dev_boot_phase = 0;
6422
6423         /* The loopback device is special if any other network devices
6424          * is present in a network namespace the loopback device must
6425          * be present. Since we now dynamically allocate and free the
6426          * loopback device ensure this invariant is maintained by
6427          * keeping the loopback device as the first device on the
6428          * list of network devices.  Ensuring the loopback devices
6429          * is the first device that appears and the last network device
6430          * that disappears.
6431          */
6432         if (register_pernet_device(&loopback_net_ops))
6433                 goto out;
6434
6435         if (register_pernet_device(&default_device_ops))
6436                 goto out;
6437
6438         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6439         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6440
6441         hotcpu_notifier(dev_cpu_callback, 0);
6442         dst_init();
6443         rc = 0;
6444 out:
6445         return rc;
6446 }
6447
6448 subsys_initcall(net_dev_init);