net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <[email protected]>
  12  *                              Mark Evans, <[email protected]>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <[email protected]>
  16  *              Alan Cox <[email protected]>
  17  *              David Hinds <[email protected]>
  18  *              Alexey Kuznetsov <[email protected]>
  19  *              Adam Sulmicki <[email protected]>
  20  *              Pekka Riikonen <[email protected]>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);
 148 static DEFINE_SPINLOCK(offload_lock);
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150 struct list_head ptype_all __read_mostly;       /* Taps */
 151 static struct list_head offload_base __read_mostly;
 152
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);
 185
 186 static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
 537 /* Boot time configuration table */
 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      __dev_get_by_name       - find a device by its name
 664  *      @net: the applicable net namespace
 665  *      @name: name to find
 666  *
 667  *      Find an interface by name. Must be called under RTNL semaphore
 668  *      or @dev_base_lock. If the name is found a pointer to the device
 669  *      is returned. If the name is not found then %NULL is returned. The
 670  *      reference counters are not incremented so the caller must be
 671  *      careful with locks.
 672  */
 673
 674 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 675 {
 676         struct net_device *dev;
 677         struct hlist_head *head = dev_name_hash(net, name);
 678
 679         hlist_for_each_entry(dev, head, name_hlist)
 680                 if (!strncmp(dev->name, name, IFNAMSIZ))
 681                         return dev;
 682
 683         return NULL;
 684 }
 685 EXPORT_SYMBOL(__dev_get_by_name);
 686
 687 /**
 688  *      dev_get_by_name_rcu     - find a device by its name
 689  *      @net: the applicable net namespace
 690  *      @name: name to find
 691  *
 692  *      Find an interface by name.
 693  *      If the name is found a pointer to the device is returned.
 694  *      If the name is not found then %NULL is returned.
 695  *      The reference counters are not incremented so the caller must be
 696  *      careful with locks. The caller must hold RCU lock.
 697  */
 698
 699 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 700 {
 701         struct net_device *dev;
 702         struct hlist_head *head = dev_name_hash(net, name);
 703
 704         hlist_for_each_entry_rcu(dev, head, name_hlist)
 705                 if (!strncmp(dev->name, name, IFNAMSIZ))
 706                         return dev;
 707
 708         return NULL;
 709 }
 710 EXPORT_SYMBOL(dev_get_by_name_rcu);
 711
 712 /**
 713  *      dev_get_by_name         - find a device by its name
 714  *      @net: the applicable net namespace
 715  *      @name: name to find
 716  *
 717  *      Find an interface by name. This can be called from any
 718  *      context and does its own locking. The returned handle has
 719  *      the usage count incremented and the caller must use dev_put() to
 720  *      release it when it is no longer needed. %NULL is returned if no
 721  *      matching device is found.
 722  */
 723
 724 struct net_device *dev_get_by_name(struct net *net, const char *name)
 725 {
 726         struct net_device *dev;
 727
 728         rcu_read_lock();
 729         dev = dev_get_by_name_rcu(net, name);
 730         if (dev)
 731                 dev_hold(dev);
 732         rcu_read_unlock();
 733         return dev;
 734 }
 735 EXPORT_SYMBOL(dev_get_by_name);
 736
 737 /**
 738  *      __dev_get_by_index - find a device by its ifindex
 739  *      @net: the applicable net namespace
 740  *      @ifindex: index of device
 741  *
 742  *      Search for an interface by index. Returns %NULL if the device
 743  *      is not found or a pointer to the device. The device has not
 744  *      had its reference counter increased so the caller must be careful
 745  *      about locking. The caller must hold either the RTNL semaphore
 746  *      or @dev_base_lock.
 747  */
 748
 749 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 750 {
 751         struct net_device *dev;
 752         struct hlist_head *head = dev_index_hash(net, ifindex);
 753
 754         hlist_for_each_entry(dev, head, index_hlist)
 755                 if (dev->ifindex == ifindex)
 756                         return dev;
 757
 758         return NULL;
 759 }
 760 EXPORT_SYMBOL(__dev_get_by_index);
 761
 762 /**
 763  *      dev_get_by_index_rcu - find a device by its ifindex
 764  *      @net: the applicable net namespace
 765  *      @ifindex: index of device
 766  *
 767  *      Search for an interface by index. Returns %NULL if the device
 768  *      is not found or a pointer to the device. The device has not
 769  *      had its reference counter increased so the caller must be careful
 770  *      about locking. The caller must hold RCU lock.
 771  */
 772
 773 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 774 {
 775         struct net_device *dev;
 776         struct hlist_head *head = dev_index_hash(net, ifindex);
 777
 778         hlist_for_each_entry_rcu(dev, head, index_hlist)
 779                 if (dev->ifindex == ifindex)
 780                         return dev;
 781
 782         return NULL;
 783 }
 784 EXPORT_SYMBOL(dev_get_by_index_rcu);
 785
 786
 787 /**
 788  *      dev_get_by_index - find a device by its ifindex
 789  *      @net: the applicable net namespace
 790  *      @ifindex: index of device
 791  *
 792  *      Search for an interface by index. Returns NULL if the device
 793  *      is not found or a pointer to the device. The device returned has
 794  *      had a reference added and the pointer is safe until the user calls
 795  *      dev_put to indicate they have finished with it.
 796  */
 797
 798 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 799 {
 800         struct net_device *dev;
 801
 802         rcu_read_lock();
 803         dev = dev_get_by_index_rcu(net, ifindex);
 804         if (dev)
 805                 dev_hold(dev);
 806         rcu_read_unlock();
 807         return dev;
 808 }
 809 EXPORT_SYMBOL(dev_get_by_index);
 810
 811 /**
 812  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 813  *      @net: network namespace
 814  *      @name: a pointer to the buffer where the name will be stored.
 815  *      @ifindex: the ifindex of the interface to get the name from.
 816  *
 817  *      The use of raw_seqcount_begin() and cond_resched() before
 818  *      retrying is required as we want to give the writers a chance
 819  *      to complete when CONFIG_PREEMPT is not set.
 820  */
 821 int netdev_get_name(struct net *net, char *name, int ifindex)
 822 {
 823         struct net_device *dev;
 824         unsigned int seq;
 825
 826 retry:
 827         seq = raw_seqcount_begin(&devnet_rename_seq);
 828         rcu_read_lock();
 829         dev = dev_get_by_index_rcu(net, ifindex);
 830         if (!dev) {
 831                 rcu_read_unlock();
 832                 return -ENODEV;
 833         }
 834
 835         strcpy(name, dev->name);
 836         rcu_read_unlock();
 837         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 838                 cond_resched();
 839                 goto retry;
 840         }
 841
 842         return 0;
 843 }
 844
 845 /**
 846  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 847  *      @net: the applicable net namespace
 848  *      @type: media type of device
 849  *      @ha: hardware address
 850  *
 851  *      Search for an interface by MAC address. Returns NULL if the device
 852  *      is not found or a pointer to the device.
 853  *      The caller must hold RCU or RTNL.
 854  *      The returned device has not had its ref count increased
 855  *      and the caller must therefore be careful about locking
 856  *
 857  */
 858
 859 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 860                                        const char *ha)
 861 {
 862         struct net_device *dev;
 863
 864         for_each_netdev_rcu(net, dev)
 865                 if (dev->type == type &&
 866                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 867                         return dev;
 868
 869         return NULL;
 870 }
 871 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 872
 873 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 874 {
 875         struct net_device *dev;
 876
 877         ASSERT_RTNL();
 878         for_each_netdev(net, dev)
 879                 if (dev->type == type)
 880                         return dev;
 881
 882         return NULL;
 883 }
 884 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 885
 886 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 887 {
 888         struct net_device *dev, *ret = NULL;
 889
 890         rcu_read_lock();
 891         for_each_netdev_rcu(net, dev)
 892                 if (dev->type == type) {
 893                         dev_hold(dev);
 894                         ret = dev;
 895                         break;
 896                 }
 897         rcu_read_unlock();
 898         return ret;
 899 }
 900 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 901
 902 /**
 903  *      __dev_get_by_flags - find any device with given flags
 904  *      @net: the applicable net namespace
 905  *      @if_flags: IFF_* values
 906  *      @mask: bitmask of bits in if_flags to check
 907  *
 908  *      Search for any interface with the given flags. Returns NULL if a device
 909  *      is not found or a pointer to the device. Must be called inside
 910  *      rtnl_lock(), and result refcount is unchanged.
 911  */
 912
 913 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 914                                       unsigned short mask)
 915 {
 916         struct net_device *dev, *ret;
 917
 918         ASSERT_RTNL();
 919
 920         ret = NULL;
 921         for_each_netdev(net, dev) {
 922                 if (((dev->flags ^ if_flags) & mask) == 0) {
 923                         ret = dev;
 924                         break;
 925                 }
 926         }
 927         return ret;
 928 }
 929 EXPORT_SYMBOL(__dev_get_by_flags);
 930
 931 /**
 932  *      dev_valid_name - check if name is okay for network device
 933  *      @name: name string
 934  *
 935  *      Network device names need to be valid file names to
 936  *      to allow sysfs to work.  We also disallow any kind of
 937  *      whitespace.
 938  */
 939 bool dev_valid_name(const char *name)
 940 {
 941         if (*name == '\0')
 942                 return false;
 943         if (strlen(name) >= IFNAMSIZ)
 944                 return false;
 945         if (!strcmp(name, ".") || !strcmp(name, ".."))
 946                 return false;
 947
 948         while (*name) {
 949                 if (*name == '/' || *name == ':' || isspace(*name))
 950                         return false;
 951                 name++;
 952         }
 953         return true;
 954 }
 955 EXPORT_SYMBOL(dev_valid_name);
 956
 957 /**
 958  *      __dev_alloc_name - allocate a name for a device
 959  *      @net: network namespace to allocate the device name in
 960  *      @name: name format string
 961  *      @buf:  scratch buffer and result name string
 962  *
 963  *      Passed a format string - eg "lt%d" it will try and find a suitable
 964  *      id. It scans list of devices to build up a free map, then chooses
 965  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 966  *      while allocating the name and adding the device in order to avoid
 967  *      duplicates.
 968  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 969  *      Returns the number of the unit assigned or a negative errno code.
 970  */
 971
 972 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 973 {
 974         int i = 0;
 975         const char *p;
 976         const int max_netdevices = 8*PAGE_SIZE;
 977         unsigned long *inuse;
 978         struct net_device *d;
 979
 980         p = strnchr(name, IFNAMSIZ-1, '%');
 981         if (p) {
 982                 /*
 983                  * Verify the string as this thing may have come from
 984                  * the user.  There must be either one "%d" and no other "%"
 985                  * characters.
 986                  */
 987                 if (p[1] != 'd' || strchr(p + 2, '%'))
 988                         return -EINVAL;
 989
 990                 /* Use one page as a bit array of possible slots */
 991                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 992                 if (!inuse)
 993                         return -ENOMEM;
 994
 995                 for_each_netdev(net, d) {
 996                         if (!sscanf(d->name, name, &i))
 997                                 continue;
 998                         if (i < 0 || i >= max_netdevices)
 999                                 continue;
1000
1001                         /*  avoid cases where sscanf is not exact inverse of printf */
1002                         snprintf(buf, IFNAMSIZ, name, i);
1003                         if (!strncmp(buf, d->name, IFNAMSIZ))
1004                                 set_bit(i, inuse);
1005                 }
1006
1007                 i = find_first_zero_bit(inuse, max_netdevices);
1008                 free_page((unsigned long) inuse);
1009         }
1010
1011         if (buf != name)
1012                 snprintf(buf, IFNAMSIZ, name, i);
1013         if (!__dev_get_by_name(net, buf))
1014                 return i;
1015
1016         /* It is possible to run out of possible slots
1017          * when the name is long and there isn't enough space left
1018          * for the digits, or if all bits are used.
1019          */
1020         return -ENFILE;
1021 }
1022
1023 /**
1024  *      dev_alloc_name - allocate a name for a device
1025  *      @dev: device
1026  *      @name: name format string
1027  *
1028  *      Passed a format string - eg "lt%d" it will try and find a suitable
1029  *      id. It scans list of devices to build up a free map, then chooses
1030  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1031  *      while allocating the name and adding the device in order to avoid
1032  *      duplicates.
1033  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1034  *      Returns the number of the unit assigned or a negative errno code.
1035  */
1036
1037 int dev_alloc_name(struct net_device *dev, const char *name)
1038 {
1039         char buf[IFNAMSIZ];
1040         struct net *net;
1041         int ret;
1042
1043         BUG_ON(!dev_net(dev));
1044         net = dev_net(dev);
1045         ret = __dev_alloc_name(net, name, buf);
1046         if (ret >= 0)
1047                 strlcpy(dev->name, buf, IFNAMSIZ);
1048         return ret;
1049 }
1050 EXPORT_SYMBOL(dev_alloc_name);
1051
1052 static int dev_alloc_name_ns(struct net *net,
1053                              struct net_device *dev,
1054                              const char *name)
1055 {
1056         char buf[IFNAMSIZ];
1057         int ret;
1058
1059         ret = __dev_alloc_name(net, name, buf);
1060         if (ret >= 0)
1061                 strlcpy(dev->name, buf, IFNAMSIZ);
1062         return ret;
1063 }
1064
1065 static int dev_get_valid_name(struct net *net,
1066                               struct net_device *dev,
1067                               const char *name)
1068 {
1069         BUG_ON(!net);
1070
1071         if (!dev_valid_name(name))
1072                 return -EINVAL;
1073
1074         if (strchr(name, '%'))
1075                 return dev_alloc_name_ns(net, dev, name);
1076         else if (__dev_get_by_name(net, name))
1077                 return -EEXIST;
1078         else if (dev->name != name)
1079                 strlcpy(dev->name, name, IFNAMSIZ);
1080
1081         return 0;
1082 }
1083
1084 /**
1085  *      dev_change_name - change name of a device
1086  *      @dev: device
1087  *      @newname: name (or format string) must be at least IFNAMSIZ
1088  *
1089  *      Change name of a device, can pass format strings "eth%d".
1090  *      for wildcarding.
1091  */
1092 int dev_change_name(struct net_device *dev, const char *newname)
1093 {
1094         unsigned char old_assign_type;
1095         char oldname[IFNAMSIZ];
1096         int err = 0;
1097         int ret;
1098         struct net *net;
1099
1100         ASSERT_RTNL();
1101         BUG_ON(!dev_net(dev));
1102
1103         net = dev_net(dev);
1104         if (dev->flags & IFF_UP)
1105                 return -EBUSY;
1106
1107         write_seqcount_begin(&devnet_rename_seq);
1108
1109         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1110                 write_seqcount_end(&devnet_rename_seq);
1111                 return 0;
1112         }
1113
1114         memcpy(oldname, dev->name, IFNAMSIZ);
1115
1116         err = dev_get_valid_name(net, dev, newname);
1117         if (err < 0) {
1118                 write_seqcount_end(&devnet_rename_seq);
1119                 return err;
1120         }
1121
1122         if (oldname[0] && !strchr(oldname, '%'))
1123                 netdev_info(dev, "renamed from %s\n", oldname);
1124
1125         old_assign_type = dev->name_assign_type;
1126         dev->name_assign_type = NET_NAME_RENAMED;
1127
1128 rollback:
1129         ret = device_rename(&dev->dev, dev->name);
1130         if (ret) {
1131                 memcpy(dev->name, oldname, IFNAMSIZ);
1132                 dev->name_assign_type = old_assign_type;
1133                 write_seqcount_end(&devnet_rename_seq);
1134                 return ret;
1135         }
1136
1137         write_seqcount_end(&devnet_rename_seq);
1138
1139         netdev_adjacent_rename_links(dev, oldname);
1140
1141         write_lock_bh(&dev_base_lock);
1142         hlist_del_rcu(&dev->name_hlist);
1143         write_unlock_bh(&dev_base_lock);
1144
1145         synchronize_rcu();
1146
1147         write_lock_bh(&dev_base_lock);
1148         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1149         write_unlock_bh(&dev_base_lock);
1150
1151         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1152         ret = notifier_to_errno(ret);
1153
1154         if (ret) {
1155                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1156                 if (err >= 0) {
1157                         err = ret;
1158                         write_seqcount_begin(&devnet_rename_seq);
1159                         memcpy(dev->name, oldname, IFNAMSIZ);
1160                         memcpy(oldname, newname, IFNAMSIZ);
1161                         dev->name_assign_type = old_assign_type;
1162                         old_assign_type = NET_NAME_RENAMED;
1163                         goto rollback;
1164                 } else {
1165                         pr_err("%s: name change rollback failed: %d\n",
1166                                dev->name, ret);
1167                 }
1168         }
1169
1170         return err;
1171 }
1172
1173 /**
1174  *      dev_set_alias - change ifalias of a device
1175  *      @dev: device
1176  *      @alias: name up to IFALIASZ
1177  *      @len: limit of bytes to copy from info
1178  *
1179  *      Set ifalias for a device,
1180  */
1181 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1182 {
1183         char *new_ifalias;
1184
1185         ASSERT_RTNL();
1186
1187         if (len >= IFALIASZ)
1188                 return -EINVAL;
1189
1190         if (!len) {
1191                 kfree(dev->ifalias);
1192                 dev->ifalias = NULL;
1193                 return 0;
1194         }
1195
1196         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1197         if (!new_ifalias)
1198                 return -ENOMEM;
1199         dev->ifalias = new_ifalias;
1200
1201         strlcpy(dev->ifalias, alias, len+1);
1202         return len;
1203 }
1204
1205
1206 /**
1207  *      netdev_features_change - device changes features
1208  *      @dev: device to cause notification
1209  *
1210  *      Called to indicate a device has changed features.
1211  */
1212 void netdev_features_change(struct net_device *dev)
1213 {
1214         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1215 }
1216 EXPORT_SYMBOL(netdev_features_change);
1217
1218 /**
1219  *      netdev_state_change - device changes state
1220  *      @dev: device to cause notification
1221  *
1222  *      Called to indicate a device has changed state. This function calls
1223  *      the notifier chains for netdev_chain and sends a NEWLINK message
1224  *      to the routing socket.
1225  */
1226 void netdev_state_change(struct net_device *dev)
1227 {
1228         if (dev->flags & IFF_UP) {
1229                 struct netdev_notifier_change_info change_info;
1230
1231                 change_info.flags_changed = 0;
1232                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1233                                               &change_info.info);
1234                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1235         }
1236 }
1237 EXPORT_SYMBOL(netdev_state_change);
1238
1239 /**
1240  *      netdev_notify_peers - notify network peers about existence of @dev
1241  *      @dev: network device
1242  *
1243  * Generate traffic such that interested network peers are aware of
1244  * @dev, such as by generating a gratuitous ARP. This may be used when
1245  * a device wants to inform the rest of the network about some sort of
1246  * reconfiguration such as a failover event or virtual machine
1247  * migration.
1248  */
1249 void netdev_notify_peers(struct net_device *dev)
1250 {
1251         rtnl_lock();
1252         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1253         rtnl_unlock();
1254 }
1255 EXPORT_SYMBOL(netdev_notify_peers);
1256
1257 static int __dev_open(struct net_device *dev)
1258 {
1259         const struct net_device_ops *ops = dev->netdev_ops;
1260         int ret;
1261
1262         ASSERT_RTNL();
1263
1264         if (!netif_device_present(dev))
1265                 return -ENODEV;
1266
1267         /* Block netpoll from trying to do any rx path servicing.
1268          * If we don't do this there is a chance ndo_poll_controller
1269          * or ndo_poll may be running while we open the device
1270          */
1271         netpoll_poll_disable(dev);
1272
1273         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1274         ret = notifier_to_errno(ret);
1275         if (ret)
1276                 return ret;
1277
1278         set_bit(__LINK_STATE_START, &dev->state);
1279
1280         if (ops->ndo_validate_addr)
1281                 ret = ops->ndo_validate_addr(dev);
1282
1283         if (!ret && ops->ndo_open)
1284                 ret = ops->ndo_open(dev);
1285
1286         netpoll_poll_enable(dev);
1287
1288         if (ret)
1289                 clear_bit(__LINK_STATE_START, &dev->state);
1290         else {
1291                 dev->flags |= IFF_UP;
1292                 dev_set_rx_mode(dev);
1293                 dev_activate(dev);
1294                 add_device_randomness(dev->dev_addr, dev->addr_len);
1295         }
1296
1297         return ret;
1298 }
1299
1300 /**
1301  *      dev_open        - prepare an interface for use.
1302  *      @dev:   device to open
1303  *
1304  *      Takes a device from down to up state. The device's private open
1305  *      function is invoked and then the multicast lists are loaded. Finally
1306  *      the device is moved into the up state and a %NETDEV_UP message is
1307  *      sent to the netdev notifier chain.
1308  *
1309  *      Calling this function on an active interface is a nop. On a failure
1310  *      a negative errno code is returned.
1311  */
1312 int dev_open(struct net_device *dev)
1313 {
1314         int ret;
1315
1316         if (dev->flags & IFF_UP)
1317                 return 0;
1318
1319         ret = __dev_open(dev);
1320         if (ret < 0)
1321                 return ret;
1322
1323         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1324         call_netdevice_notifiers(NETDEV_UP, dev);
1325
1326         return ret;
1327 }
1328 EXPORT_SYMBOL(dev_open);
1329
1330 static int __dev_close_many(struct list_head *head)
1331 {
1332         struct net_device *dev;
1333
1334         ASSERT_RTNL();
1335         might_sleep();
1336
1337         list_for_each_entry(dev, head, close_list) {
1338                 /* Temporarily disable netpoll until the interface is down */
1339                 netpoll_poll_disable(dev);
1340
1341                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1342
1343                 clear_bit(__LINK_STATE_START, &dev->state);
1344
1345                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1346                  * can be even on different cpu. So just clear netif_running().
1347                  *
1348                  * dev->stop() will invoke napi_disable() on all of it's
1349                  * napi_struct instances on this device.
1350                  */
1351                 smp_mb__after_atomic(); /* Commit netif_running(). */
1352         }
1353
1354         dev_deactivate_many(head);
1355
1356         list_for_each_entry(dev, head, close_list) {
1357                 const struct net_device_ops *ops = dev->netdev_ops;
1358
1359                 /*
1360                  *      Call the device specific close. This cannot fail.
1361                  *      Only if device is UP
1362                  *
1363                  *      We allow it to be called even after a DETACH hot-plug
1364                  *      event.
1365                  */
1366                 if (ops->ndo_stop)
1367                         ops->ndo_stop(dev);
1368
1369                 dev->flags &= ~IFF_UP;
1370                 netpoll_poll_enable(dev);
1371         }
1372
1373         return 0;
1374 }
1375
1376 static int __dev_close(struct net_device *dev)
1377 {
1378         int retval;
1379         LIST_HEAD(single);
1380
1381         list_add(&dev->close_list, &single);
1382         retval = __dev_close_many(&single);
1383         list_del(&single);
1384
1385         return retval;
1386 }
1387
1388 int dev_close_many(struct list_head *head, bool unlink)
1389 {
1390         struct net_device *dev, *tmp;
1391
1392         /* Remove the devices that don't need to be closed */
1393         list_for_each_entry_safe(dev, tmp, head, close_list)
1394                 if (!(dev->flags & IFF_UP))
1395                         list_del_init(&dev->close_list);
1396
1397         __dev_close_many(head);
1398
1399         list_for_each_entry_safe(dev, tmp, head, close_list) {
1400                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1401                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1402                 if (unlink)
1403                         list_del_init(&dev->close_list);
1404         }
1405
1406         return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close_many);
1409
1410 /**
1411  *      dev_close - shutdown an interface.
1412  *      @dev: device to shutdown
1413  *
1414  *      This function moves an active device into down state. A
1415  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1416  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1417  *      chain.
1418  */
1419 int dev_close(struct net_device *dev)
1420 {
1421         if (dev->flags & IFF_UP) {
1422                 LIST_HEAD(single);
1423
1424                 list_add(&dev->close_list, &single);
1425                 dev_close_many(&single, true);
1426                 list_del(&single);
1427         }
1428         return 0;
1429 }
1430 EXPORT_SYMBOL(dev_close);
1431
1432
1433 /**
1434  *      dev_disable_lro - disable Large Receive Offload on a device
1435  *      @dev: device
1436  *
1437  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1438  *      called under RTNL.  This is needed if received packets may be
1439  *      forwarded to another interface.
1440  */
1441 void dev_disable_lro(struct net_device *dev)
1442 {
1443         struct net_device *lower_dev;
1444         struct list_head *iter;
1445
1446         dev->wanted_features &= ~NETIF_F_LRO;
1447         netdev_update_features(dev);
1448
1449         if (unlikely(dev->features & NETIF_F_LRO))
1450                 netdev_WARN(dev, "failed to disable LRO!\n");
1451
1452         netdev_for_each_lower_dev(dev, lower_dev, iter)
1453                 dev_disable_lro(lower_dev);
1454 }
1455 EXPORT_SYMBOL(dev_disable_lro);
1456
1457 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1458                                    struct net_device *dev)
1459 {
1460         struct netdev_notifier_info info;
1461
1462         netdev_notifier_info_init(&info, dev);
1463         return nb->notifier_call(nb, val, &info);
1464 }
1465
1466 static int dev_boot_phase = 1;
1467
1468 /**
1469  *      register_netdevice_notifier - register a network notifier block
1470  *      @nb: notifier
1471  *
1472  *      Register a notifier to be called when network device events occur.
1473  *      The notifier passed is linked into the kernel structures and must
1474  *      not be reused until it has been unregistered. A negative errno code
1475  *      is returned on a failure.
1476  *
1477  *      When registered all registration and up events are replayed
1478  *      to the new notifier to allow device to have a race free
1479  *      view of the network device list.
1480  */
1481
1482 int register_netdevice_notifier(struct notifier_block *nb)
1483 {
1484         struct net_device *dev;
1485         struct net_device *last;
1486         struct net *net;
1487         int err;
1488
1489         rtnl_lock();
1490         err = raw_notifier_chain_register(&netdev_chain, nb);
1491         if (err)
1492                 goto unlock;
1493         if (dev_boot_phase)
1494                 goto unlock;
1495         for_each_net(net) {
1496                 for_each_netdev(net, dev) {
1497                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1498                         err = notifier_to_errno(err);
1499                         if (err)
1500                                 goto rollback;
1501
1502                         if (!(dev->flags & IFF_UP))
1503                                 continue;
1504
1505                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1506                 }
1507         }
1508
1509 unlock:
1510         rtnl_unlock();
1511         return err;
1512
1513 rollback:
1514         last = dev;
1515         for_each_net(net) {
1516                 for_each_netdev(net, dev) {
1517                         if (dev == last)
1518                                 goto outroll;
1519
1520                         if (dev->flags & IFF_UP) {
1521                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1522                                                         dev);
1523                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1524                         }
1525                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1526                 }
1527         }
1528
1529 outroll:
1530         raw_notifier_chain_unregister(&netdev_chain, nb);
1531         goto unlock;
1532 }
1533 EXPORT_SYMBOL(register_netdevice_notifier);
1534
1535 /**
1536  *      unregister_netdevice_notifier - unregister a network notifier block
1537  *      @nb: notifier
1538  *
1539  *      Unregister a notifier previously registered by
1540  *      register_netdevice_notifier(). The notifier is unlinked into the
1541  *      kernel structures and may then be reused. A negative errno code
1542  *      is returned on a failure.
1543  *
1544  *      After unregistering unregister and down device events are synthesized
1545  *      for all devices on the device list to the removed notifier to remove
1546  *      the need for special case cleanup code.
1547  */
1548
1549 int unregister_netdevice_notifier(struct notifier_block *nb)
1550 {
1551         struct net_device *dev;
1552         struct net *net;
1553         int err;
1554
1555         rtnl_lock();
1556         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1557         if (err)
1558                 goto unlock;
1559
1560         for_each_net(net) {
1561                 for_each_netdev(net, dev) {
1562                         if (dev->flags & IFF_UP) {
1563                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1564                                                         dev);
1565                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1566                         }
1567                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1568                 }
1569         }
1570 unlock:
1571         rtnl_unlock();
1572         return err;
1573 }
1574 EXPORT_SYMBOL(unregister_netdevice_notifier);
1575
1576 /**
1577  *      call_netdevice_notifiers_info - call all network notifier blocks
1578  *      @val: value passed unmodified to notifier function
1579  *      @dev: net_device pointer passed unmodified to notifier function
1580  *      @info: notifier information data
1581  *
1582  *      Call all network notifier blocks.  Parameters and return value
1583  *      are as for raw_notifier_call_chain().
1584  */
1585
1586 static int call_netdevice_notifiers_info(unsigned long val,
1587                                          struct net_device *dev,
1588                                          struct netdev_notifier_info *info)
1589 {
1590         ASSERT_RTNL();
1591         netdev_notifier_info_init(info, dev);
1592         return raw_notifier_call_chain(&netdev_chain, val, info);
1593 }
1594
1595 /**
1596  *      call_netdevice_notifiers - call all network notifier blocks
1597  *      @val: value passed unmodified to notifier function
1598  *      @dev: net_device pointer passed unmodified to notifier function
1599  *
1600  *      Call all network notifier blocks.  Parameters and return value
1601  *      are as for raw_notifier_call_chain().
1602  */
1603
1604 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1605 {
1606         struct netdev_notifier_info info;
1607
1608         return call_netdevice_notifiers_info(val, dev, &info);
1609 }
1610 EXPORT_SYMBOL(call_netdevice_notifiers);
1611
1612 static struct static_key netstamp_needed __read_mostly;
1613 #ifdef HAVE_JUMP_LABEL
1614 /* We are not allowed to call static_key_slow_dec() from irq context
1615  * If net_disable_timestamp() is called from irq context, defer the
1616  * static_key_slow_dec() calls.
1617  */
1618 static atomic_t netstamp_needed_deferred;
1619 #endif
1620
1621 void net_enable_timestamp(void)
1622 {
1623 #ifdef HAVE_JUMP_LABEL
1624         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1625
1626         if (deferred) {
1627                 while (--deferred)
1628                         static_key_slow_dec(&netstamp_needed);
1629                 return;
1630         }
1631 #endif
1632         static_key_slow_inc(&netstamp_needed);
1633 }
1634 EXPORT_SYMBOL(net_enable_timestamp);
1635
1636 void net_disable_timestamp(void)
1637 {
1638 #ifdef HAVE_JUMP_LABEL
1639         if (in_interrupt()) {
1640                 atomic_inc(&netstamp_needed_deferred);
1641                 return;
1642         }
1643 #endif
1644         static_key_slow_dec(&netstamp_needed);
1645 }
1646 EXPORT_SYMBOL(net_disable_timestamp);
1647
1648 static inline void net_timestamp_set(struct sk_buff *skb)
1649 {
1650         skb->tstamp.tv64 = 0;
1651         if (static_key_false(&netstamp_needed))
1652                 __net_timestamp(skb);
1653 }
1654
1655 #define net_timestamp_check(COND, SKB)                  \
1656         if (static_key_false(&netstamp_needed)) {               \
1657                 if ((COND) && !(SKB)->tstamp.tv64)      \
1658                         __net_timestamp(SKB);           \
1659         }                                               \
1660
1661 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1662 {
1663         unsigned int len;
1664
1665         if (!(dev->flags & IFF_UP))
1666                 return false;
1667
1668         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1669         if (skb->len <= len)
1670                 return true;
1671
1672         /* if TSO is enabled, we don't care about the length as the packet
1673          * could be forwarded without being segmented before
1674          */
1675         if (skb_is_gso(skb))
1676                 return true;
1677
1678         return false;
1679 }
1680 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1681
1682 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1683 {
1684         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1685                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1686                         atomic_long_inc(&dev->rx_dropped);
1687                         kfree_skb(skb);
1688                         return NET_RX_DROP;
1689                 }
1690         }
1691
1692         if (unlikely(!is_skb_forwardable(dev, skb))) {
1693                 atomic_long_inc(&dev->rx_dropped);
1694                 kfree_skb(skb);
1695                 return NET_RX_DROP;
1696         }
1697
1698         skb_scrub_packet(skb, true);
1699         skb->protocol = eth_type_trans(skb, dev);
1700         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1701
1702         return 0;
1703 }
1704 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1705
1706 /**
1707  * dev_forward_skb - loopback an skb to another netif
1708  *
1709  * @dev: destination network device
1710  * @skb: buffer to forward
1711  *
1712  * return values:
1713  *      NET_RX_SUCCESS  (no congestion)
1714  *      NET_RX_DROP     (packet was dropped, but freed)
1715  *
1716  * dev_forward_skb can be used for injecting an skb from the
1717  * start_xmit function of one device into the receive queue
1718  * of another device.
1719  *
1720  * The receiving device may be in another namespace, so
1721  * we have to clear all information in the skb that could
1722  * impact namespace isolation.
1723  */
1724 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1725 {
1726         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1727 }
1728 EXPORT_SYMBOL_GPL(dev_forward_skb);
1729
1730 static inline int deliver_skb(struct sk_buff *skb,
1731                               struct packet_type *pt_prev,
1732                               struct net_device *orig_dev)
1733 {
1734         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1735                 return -ENOMEM;
1736         atomic_inc(&skb->users);
1737         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1738 }
1739
1740 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1741                                           struct packet_type **pt,
1742                                           struct net_device *dev, __be16 type,
1743                                           struct list_head *ptype_list)
1744 {
1745         struct packet_type *ptype, *pt_prev = *pt;
1746
1747         list_for_each_entry_rcu(ptype, ptype_list, list) {
1748                 if (ptype->type != type)
1749                         continue;
1750                 if (pt_prev)
1751                         deliver_skb(skb, pt_prev, dev);
1752                 pt_prev = ptype;
1753         }
1754         *pt = pt_prev;
1755 }
1756
1757 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1758 {
1759         if (!ptype->af_packet_priv || !skb->sk)
1760                 return false;
1761
1762         if (ptype->id_match)
1763                 return ptype->id_match(ptype, skb->sk);
1764         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1765                 return true;
1766
1767         return false;
1768 }
1769
1770 /*
1771  *      Support routine. Sends outgoing frames to any network
1772  *      taps currently in use.
1773  */
1774
1775 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1776 {
1777         struct packet_type *ptype;
1778         struct sk_buff *skb2 = NULL;
1779         struct packet_type *pt_prev = NULL;
1780         struct list_head *ptype_list = &ptype_all;
1781
1782         rcu_read_lock();
1783 again:
1784         list_for_each_entry_rcu(ptype, ptype_list, list) {
1785                 /* Never send packets back to the socket
1786                  * they originated from - MvS ([email protected])
1787                  */
1788                 if (skb_loop_sk(ptype, skb))
1789                         continue;
1790
1791                 if (pt_prev) {
1792                         deliver_skb(skb2, pt_prev, skb->dev);
1793                         pt_prev = ptype;
1794                         continue;
1795                 }
1796
1797                 /* need to clone skb, done only once */
1798                 skb2 = skb_clone(skb, GFP_ATOMIC);
1799                 if (!skb2)
1800                         goto out_unlock;
1801
1802                 net_timestamp_set(skb2);
1803
1804                 /* skb->nh should be correctly
1805                  * set by sender, so that the second statement is
1806                  * just protection against buggy protocols.
1807                  */
1808                 skb_reset_mac_header(skb2);
1809
1810                 if (skb_network_header(skb2) < skb2->data ||
1811                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1812                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1813                                              ntohs(skb2->protocol),
1814                                              dev->name);
1815                         skb_reset_network_header(skb2);
1816                 }
1817
1818                 skb2->transport_header = skb2->network_header;
1819                 skb2->pkt_type = PACKET_OUTGOING;
1820                 pt_prev = ptype;
1821         }
1822
1823         if (ptype_list == &ptype_all) {
1824                 ptype_list = &dev->ptype_all;
1825                 goto again;
1826         }
1827 out_unlock:
1828         if (pt_prev)
1829                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1830         rcu_read_unlock();
1831 }
1832
1833 /**
1834  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1835  * @dev: Network device
1836  * @txq: number of queues available
1837  *
1838  * If real_num_tx_queues is changed the tc mappings may no longer be
1839  * valid. To resolve this verify the tc mapping remains valid and if
1840  * not NULL the mapping. With no priorities mapping to this
1841  * offset/count pair it will no longer be used. In the worst case TC0
1842  * is invalid nothing can be done so disable priority mappings. If is
1843  * expected that drivers will fix this mapping if they can before
1844  * calling netif_set_real_num_tx_queues.
1845  */
1846 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1847 {
1848         int i;
1849         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1850
1851         /* If TC0 is invalidated disable TC mapping */
1852         if (tc->offset + tc->count > txq) {
1853                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1854                 dev->num_tc = 0;
1855                 return;
1856         }
1857
1858         /* Invalidated prio to tc mappings set to TC0 */
1859         for (i = 1; i < TC_BITMASK + 1; i++) {
1860                 int q = netdev_get_prio_tc_map(dev, i);
1861
1862                 tc = &dev->tc_to_txq[q];
1863                 if (tc->offset + tc->count > txq) {
1864                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1865                                 i, q);
1866                         netdev_set_prio_tc_map(dev, i, 0);
1867                 }
1868         }
1869 }
1870
1871 #ifdef CONFIG_XPS
1872 static DEFINE_MUTEX(xps_map_mutex);
1873 #define xmap_dereference(P)             \
1874         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1875
1876 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1877                                         int cpu, u16 index)
1878 {
1879         struct xps_map *map = NULL;
1880         int pos;
1881
1882         if (dev_maps)
1883                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1884
1885         for (pos = 0; map && pos < map->len; pos++) {
1886                 if (map->queues[pos] == index) {
1887                         if (map->len > 1) {
1888                                 map->queues[pos] = map->queues[--map->len];
1889                         } else {
1890                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1891                                 kfree_rcu(map, rcu);
1892                                 map = NULL;
1893                         }
1894                         break;
1895                 }
1896         }
1897
1898         return map;
1899 }
1900
1901 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1902 {
1903         struct xps_dev_maps *dev_maps;
1904         int cpu, i;
1905         bool active = false;
1906
1907         mutex_lock(&xps_map_mutex);
1908         dev_maps = xmap_dereference(dev->xps_maps);
1909
1910         if (!dev_maps)
1911                 goto out_no_maps;
1912
1913         for_each_possible_cpu(cpu) {
1914                 for (i = index; i < dev->num_tx_queues; i++) {
1915                         if (!remove_xps_queue(dev_maps, cpu, i))
1916                                 break;
1917                 }
1918                 if (i == dev->num_tx_queues)
1919                         active = true;
1920         }
1921
1922         if (!active) {
1923                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1924                 kfree_rcu(dev_maps, rcu);
1925         }
1926
1927         for (i = index; i < dev->num_tx_queues; i++)
1928                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1929                                              NUMA_NO_NODE);
1930
1931 out_no_maps:
1932         mutex_unlock(&xps_map_mutex);
1933 }
1934
1935 static struct xps_map *expand_xps_map(struct xps_map *map,
1936                                       int cpu, u16 index)
1937 {
1938         struct xps_map *new_map;
1939         int alloc_len = XPS_MIN_MAP_ALLOC;
1940         int i, pos;
1941
1942         for (pos = 0; map && pos < map->len; pos++) {
1943                 if (map->queues[pos] != index)
1944                         continue;
1945                 return map;
1946         }
1947
1948         /* Need to add queue to this CPU's existing map */
1949         if (map) {
1950                 if (pos < map->alloc_len)
1951                         return map;
1952
1953                 alloc_len = map->alloc_len * 2;
1954         }
1955
1956         /* Need to allocate new map to store queue on this CPU's map */
1957         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1958                                cpu_to_node(cpu));
1959         if (!new_map)
1960                 return NULL;
1961
1962         for (i = 0; i < pos; i++)
1963                 new_map->queues[i] = map->queues[i];
1964         new_map->alloc_len = alloc_len;
1965         new_map->len = pos;
1966
1967         return new_map;
1968 }
1969
1970 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1971                         u16 index)
1972 {
1973         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1974         struct xps_map *map, *new_map;
1975         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1976         int cpu, numa_node_id = -2;
1977         bool active = false;
1978
1979         mutex_lock(&xps_map_mutex);
1980
1981         dev_maps = xmap_dereference(dev->xps_maps);
1982
1983         /* allocate memory for queue storage */
1984         for_each_online_cpu(cpu) {
1985                 if (!cpumask_test_cpu(cpu, mask))
1986                         continue;
1987
1988                 if (!new_dev_maps)
1989                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1990                 if (!new_dev_maps) {
1991                         mutex_unlock(&xps_map_mutex);
1992                         return -ENOMEM;
1993                 }
1994
1995                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1996                                  NULL;
1997
1998                 map = expand_xps_map(map, cpu, index);
1999                 if (!map)
2000                         goto error;
2001
2002                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2003         }
2004
2005         if (!new_dev_maps)
2006                 goto out_no_new_maps;
2007
2008         for_each_possible_cpu(cpu) {
2009                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2010                         /* add queue to CPU maps */
2011                         int pos = 0;
2012
2013                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2014                         while ((pos < map->len) && (map->queues[pos] != index))
2015                                 pos++;
2016
2017                         if (pos == map->len)
2018                                 map->queues[map->len++] = index;
2019 #ifdef CONFIG_NUMA
2020                         if (numa_node_id == -2)
2021                                 numa_node_id = cpu_to_node(cpu);
2022                         else if (numa_node_id != cpu_to_node(cpu))
2023                                 numa_node_id = -1;
2024 #endif
2025                 } else if (dev_maps) {
2026                         /* fill in the new device map from the old device map */
2027                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2028                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2029                 }
2030
2031         }
2032
2033         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2034
2035         /* Cleanup old maps */
2036         if (dev_maps) {
2037                 for_each_possible_cpu(cpu) {
2038                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2039                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2040                         if (map && map != new_map)
2041                                 kfree_rcu(map, rcu);
2042                 }
2043
2044                 kfree_rcu(dev_maps, rcu);
2045         }
2046
2047         dev_maps = new_dev_maps;
2048         active = true;
2049
2050 out_no_new_maps:
2051         /* update Tx queue numa node */
2052         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2053                                      (numa_node_id >= 0) ? numa_node_id :
2054                                      NUMA_NO_NODE);
2055
2056         if (!dev_maps)
2057                 goto out_no_maps;
2058
2059         /* removes queue from unused CPUs */
2060         for_each_possible_cpu(cpu) {
2061                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2062                         continue;
2063
2064                 if (remove_xps_queue(dev_maps, cpu, index))
2065                         active = true;
2066         }
2067
2068         /* free map if not active */
2069         if (!active) {
2070                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2071                 kfree_rcu(dev_maps, rcu);
2072         }
2073
2074 out_no_maps:
2075         mutex_unlock(&xps_map_mutex);
2076
2077         return 0;
2078 error:
2079         /* remove any maps that we added */
2080         for_each_possible_cpu(cpu) {
2081                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2082                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2083                                  NULL;
2084                 if (new_map && new_map != map)
2085                         kfree(new_map);
2086         }
2087
2088         mutex_unlock(&xps_map_mutex);
2089
2090         kfree(new_dev_maps);
2091         return -ENOMEM;
2092 }
2093 EXPORT_SYMBOL(netif_set_xps_queue);
2094
2095 #endif
2096 /*
2097  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2098  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2099  */
2100 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2101 {
2102         int rc;
2103
2104         if (txq < 1 || txq > dev->num_tx_queues)
2105                 return -EINVAL;
2106
2107         if (dev->reg_state == NETREG_REGISTERED ||
2108             dev->reg_state == NETREG_UNREGISTERING) {
2109                 ASSERT_RTNL();
2110
2111                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2112                                                   txq);
2113                 if (rc)
2114                         return rc;
2115
2116                 if (dev->num_tc)
2117                         netif_setup_tc(dev, txq);
2118
2119                 if (txq < dev->real_num_tx_queues) {
2120                         qdisc_reset_all_tx_gt(dev, txq);
2121 #ifdef CONFIG_XPS
2122                         netif_reset_xps_queues_gt(dev, txq);
2123 #endif
2124                 }
2125         }
2126
2127         dev->real_num_tx_queues = txq;
2128         return 0;
2129 }
2130 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2131
2132 #ifdef CONFIG_SYSFS
2133 /**
2134  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2135  *      @dev: Network device
2136  *      @rxq: Actual number of RX queues
2137  *
2138  *      This must be called either with the rtnl_lock held or before
2139  *      registration of the net device.  Returns 0 on success, or a
2140  *      negative error code.  If called before registration, it always
2141  *      succeeds.
2142  */
2143 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2144 {
2145         int rc;
2146
2147         if (rxq < 1 || rxq > dev->num_rx_queues)
2148                 return -EINVAL;
2149
2150         if (dev->reg_state == NETREG_REGISTERED) {
2151                 ASSERT_RTNL();
2152
2153                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2154                                                   rxq);
2155                 if (rc)
2156                         return rc;
2157         }
2158
2159         dev->real_num_rx_queues = rxq;
2160         return 0;
2161 }
2162 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2163 #endif
2164
2165 /**
2166  * netif_get_num_default_rss_queues - default number of RSS queues
2167  *
2168  * This routine should set an upper limit on the number of RSS queues
2169  * used by default by multiqueue devices.
2170  */
2171 int netif_get_num_default_rss_queues(void)
2172 {
2173         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2174 }
2175 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2176
2177 static inline void __netif_reschedule(struct Qdisc *q)
2178 {
2179         struct softnet_data *sd;
2180         unsigned long flags;
2181
2182         local_irq_save(flags);
2183         sd = this_cpu_ptr(&softnet_data);
2184         q->next_sched = NULL;
2185         *sd->output_queue_tailp = q;
2186         sd->output_queue_tailp = &q->next_sched;
2187         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2188         local_irq_restore(flags);
2189 }
2190
2191 void __netif_schedule(struct Qdisc *q)
2192 {
2193         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2194                 __netif_reschedule(q);
2195 }
2196 EXPORT_SYMBOL(__netif_schedule);
2197
2198 struct dev_kfree_skb_cb {
2199         enum skb_free_reason reason;
2200 };
2201
2202 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2203 {
2204         return (struct dev_kfree_skb_cb *)skb->cb;
2205 }
2206
2207 void netif_schedule_queue(struct netdev_queue *txq)
2208 {
2209         rcu_read_lock();
2210         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2211                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2212
2213                 __netif_schedule(q);
2214         }
2215         rcu_read_unlock();
2216 }
2217 EXPORT_SYMBOL(netif_schedule_queue);
2218
2219 /**
2220  *      netif_wake_subqueue - allow sending packets on subqueue
2221  *      @dev: network device
2222  *      @queue_index: sub queue index
2223  *
2224  * Resume individual transmit queue of a device with multiple transmit queues.
2225  */
2226 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2227 {
2228         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2229
2230         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2231                 struct Qdisc *q;
2232
2233                 rcu_read_lock();
2234                 q = rcu_dereference(txq->qdisc);
2235                 __netif_schedule(q);
2236                 rcu_read_unlock();
2237         }
2238 }
2239 EXPORT_SYMBOL(netif_wake_subqueue);
2240
2241 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2242 {
2243         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2244                 struct Qdisc *q;
2245
2246                 rcu_read_lock();
2247                 q = rcu_dereference(dev_queue->qdisc);
2248                 __netif_schedule(q);
2249                 rcu_read_unlock();
2250         }
2251 }
2252 EXPORT_SYMBOL(netif_tx_wake_queue);
2253
2254 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2255 {
2256         unsigned long flags;
2257
2258         if (likely(atomic_read(&skb->users) == 1)) {
2259                 smp_rmb();
2260                 atomic_set(&skb->users, 0);
2261         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2262                 return;
2263         }
2264         get_kfree_skb_cb(skb)->reason = reason;
2265         local_irq_save(flags);
2266         skb->next = __this_cpu_read(softnet_data.completion_queue);
2267         __this_cpu_write(softnet_data.completion_queue, skb);
2268         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2269         local_irq_restore(flags);
2270 }
2271 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2272
2273 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2274 {
2275         if (in_irq() || irqs_disabled())
2276                 __dev_kfree_skb_irq(skb, reason);
2277         else
2278                 dev_kfree_skb(skb);
2279 }
2280 EXPORT_SYMBOL(__dev_kfree_skb_any);
2281
2282
2283 /**
2284  * netif_device_detach - mark device as removed
2285  * @dev: network device
2286  *
2287  * Mark device as removed from system and therefore no longer available.
2288  */
2289 void netif_device_detach(struct net_device *dev)
2290 {
2291         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2292             netif_running(dev)) {
2293                 netif_tx_stop_all_queues(dev);
2294         }
2295 }
2296 EXPORT_SYMBOL(netif_device_detach);
2297
2298 /**
2299  * netif_device_attach - mark device as attached
2300  * @dev: network device
2301  *
2302  * Mark device as attached from system and restart if needed.
2303  */
2304 void netif_device_attach(struct net_device *dev)
2305 {
2306         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2307             netif_running(dev)) {
2308                 netif_tx_wake_all_queues(dev);
2309                 __netdev_watchdog_up(dev);
2310         }
2311 }
2312 EXPORT_SYMBOL(netif_device_attach);
2313
2314 static void skb_warn_bad_offload(const struct sk_buff *skb)
2315 {
2316         static const netdev_features_t null_features = 0;
2317         struct net_device *dev = skb->dev;
2318         const char *driver = "";
2319
2320         if (!net_ratelimit())
2321                 return;
2322
2323         if (dev && dev->dev.parent)
2324                 driver = dev_driver_string(dev->dev.parent);
2325
2326         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2327              "gso_type=%d ip_summed=%d\n",
2328              driver, dev ? &dev->features : &null_features,
2329              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2330              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2331              skb_shinfo(skb)->gso_type, skb->ip_summed);
2332 }
2333
2334 /*
2335  * Invalidate hardware checksum when packet is to be mangled, and
2336  * complete checksum manually on outgoing path.
2337  */
2338 int skb_checksum_help(struct sk_buff *skb)
2339 {
2340         __wsum csum;
2341         int ret = 0, offset;
2342
2343         if (skb->ip_summed == CHECKSUM_COMPLETE)
2344                 goto out_set_summed;
2345
2346         if (unlikely(skb_shinfo(skb)->gso_size)) {
2347                 skb_warn_bad_offload(skb);
2348                 return -EINVAL;
2349         }
2350
2351         /* Before computing a checksum, we should make sure no frag could
2352          * be modified by an external entity : checksum could be wrong.
2353          */
2354         if (skb_has_shared_frag(skb)) {
2355                 ret = __skb_linearize(skb);
2356                 if (ret)
2357                         goto out;
2358         }
2359
2360         offset = skb_checksum_start_offset(skb);
2361         BUG_ON(offset >= skb_headlen(skb));
2362         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2363
2364         offset += skb->csum_offset;
2365         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2366
2367         if (skb_cloned(skb) &&
2368             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2369                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2370                 if (ret)
2371                         goto out;
2372         }
2373
2374         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2375 out_set_summed:
2376         skb->ip_summed = CHECKSUM_NONE;
2377 out:
2378         return ret;
2379 }
2380 EXPORT_SYMBOL(skb_checksum_help);
2381
2382 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2383 {
2384         __be16 type = skb->protocol;
2385
2386         /* Tunnel gso handlers can set protocol to ethernet. */
2387         if (type == htons(ETH_P_TEB)) {
2388                 struct ethhdr *eth;
2389
2390                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2391                         return 0;
2392
2393                 eth = (struct ethhdr *)skb_mac_header(skb);
2394                 type = eth->h_proto;
2395         }
2396
2397         return __vlan_get_protocol(skb, type, depth);
2398 }
2399
2400 /**
2401  *      skb_mac_gso_segment - mac layer segmentation handler.
2402  *      @skb: buffer to segment
2403  *      @features: features for the output path (see dev->features)
2404  */
2405 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2406                                     netdev_features_t features)
2407 {
2408         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2409         struct packet_offload *ptype;
2410         int vlan_depth = skb->mac_len;
2411         __be16 type = skb_network_protocol(skb, &vlan_depth);
2412
2413         if (unlikely(!type))
2414                 return ERR_PTR(-EINVAL);
2415
2416         __skb_pull(skb, vlan_depth);
2417
2418         rcu_read_lock();
2419         list_for_each_entry_rcu(ptype, &offload_base, list) {
2420                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2421                         segs = ptype->callbacks.gso_segment(skb, features);
2422                         break;
2423                 }
2424         }
2425         rcu_read_unlock();
2426
2427         __skb_push(skb, skb->data - skb_mac_header(skb));
2428
2429         return segs;
2430 }
2431 EXPORT_SYMBOL(skb_mac_gso_segment);
2432
2433
2434 /* openvswitch calls this on rx path, so we need a different check.
2435  */
2436 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2437 {
2438         if (tx_path)
2439                 return skb->ip_summed != CHECKSUM_PARTIAL;
2440         else
2441                 return skb->ip_summed == CHECKSUM_NONE;
2442 }
2443
2444 /**
2445  *      __skb_gso_segment - Perform segmentation on skb.
2446  *      @skb: buffer to segment
2447  *      @features: features for the output path (see dev->features)
2448  *      @tx_path: whether it is called in TX path
2449  *
2450  *      This function segments the given skb and returns a list of segments.
2451  *
2452  *      It may return NULL if the skb requires no segmentation.  This is
2453  *      only possible when GSO is used for verifying header integrity.
2454  */
2455 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2456                                   netdev_features_t features, bool tx_path)
2457 {
2458         if (unlikely(skb_needs_check(skb, tx_path))) {
2459                 int err;
2460
2461                 skb_warn_bad_offload(skb);
2462
2463                 err = skb_cow_head(skb, 0);
2464                 if (err < 0)
2465                         return ERR_PTR(err);
2466         }
2467
2468         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2469         SKB_GSO_CB(skb)->encap_level = 0;
2470
2471         skb_reset_mac_header(skb);
2472         skb_reset_mac_len(skb);
2473
2474         return skb_mac_gso_segment(skb, features);
2475 }
2476 EXPORT_SYMBOL(__skb_gso_segment);
2477
2478 /* Take action when hardware reception checksum errors are detected. */
2479 #ifdef CONFIG_BUG
2480 void netdev_rx_csum_fault(struct net_device *dev)
2481 {
2482         if (net_ratelimit()) {
2483                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2484                 dump_stack();
2485         }
2486 }
2487 EXPORT_SYMBOL(netdev_rx_csum_fault);
2488 #endif
2489
2490 /* Actually, we should eliminate this check as soon as we know, that:
2491  * 1. IOMMU is present and allows to map all the memory.
2492  * 2. No high memory really exists on this machine.
2493  */
2494
2495 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2496 {
2497 #ifdef CONFIG_HIGHMEM
2498         int i;
2499         if (!(dev->features & NETIF_F_HIGHDMA)) {
2500                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2501                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2502                         if (PageHighMem(skb_frag_page(frag)))
2503                                 return 1;
2504                 }
2505         }
2506
2507         if (PCI_DMA_BUS_IS_PHYS) {
2508                 struct device *pdev = dev->dev.parent;
2509
2510                 if (!pdev)
2511                         return 0;
2512                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2513                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2514                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2515                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2516                                 return 1;
2517                 }
2518         }
2519 #endif
2520         return 0;
2521 }
2522
2523 /* If MPLS offload request, verify we are testing hardware MPLS features
2524  * instead of standard features for the netdev.
2525  */
2526 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2527 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528                                            netdev_features_t features,
2529                                            __be16 type)
2530 {
2531         if (eth_p_mpls(type))
2532                 features &= skb->dev->mpls_features;
2533
2534         return features;
2535 }
2536 #else
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538                                            netdev_features_t features,
2539                                            __be16 type)
2540 {
2541         return features;
2542 }
2543 #endif
2544
2545 static netdev_features_t harmonize_features(struct sk_buff *skb,
2546         netdev_features_t features)
2547 {
2548         int tmp;
2549         __be16 type;
2550
2551         type = skb_network_protocol(skb, &tmp);
2552         features = net_mpls_features(skb, features, type);
2553
2554         if (skb->ip_summed != CHECKSUM_NONE &&
2555             !can_checksum_protocol(features, type)) {
2556                 features &= ~NETIF_F_ALL_CSUM;
2557         } else if (illegal_highdma(skb->dev, skb)) {
2558                 features &= ~NETIF_F_SG;
2559         }
2560
2561         return features;
2562 }
2563
2564 netdev_features_t netif_skb_features(struct sk_buff *skb)
2565 {
2566         struct net_device *dev = skb->dev;
2567         netdev_features_t features = dev->features;
2568         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2569         __be16 protocol = skb->protocol;
2570
2571         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2572                 features &= ~NETIF_F_GSO_MASK;
2573
2574         /* If encapsulation offload request, verify we are testing
2575          * hardware encapsulation features instead of standard
2576          * features for the netdev
2577          */
2578         if (skb->encapsulation)
2579                 features &= dev->hw_enc_features;
2580
2581         if (!skb_vlan_tag_present(skb)) {
2582                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2583                              protocol == htons(ETH_P_8021AD))) {
2584                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2585                         protocol = veh->h_vlan_encapsulated_proto;
2586                 } else {
2587                         goto finalize;
2588                 }
2589         }
2590
2591         features = netdev_intersect_features(features,
2592                                              dev->vlan_features |
2593                                              NETIF_F_HW_VLAN_CTAG_TX |
2594                                              NETIF_F_HW_VLAN_STAG_TX);
2595
2596         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2597                 features = netdev_intersect_features(features,
2598                                                      NETIF_F_SG |
2599                                                      NETIF_F_HIGHDMA |
2600                                                      NETIF_F_FRAGLIST |
2601                                                      NETIF_F_GEN_CSUM |
2602                                                      NETIF_F_HW_VLAN_CTAG_TX |
2603                                                      NETIF_F_HW_VLAN_STAG_TX);
2604
2605 finalize:
2606         if (dev->netdev_ops->ndo_features_check)
2607                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2608                                                                 features);
2609
2610         return harmonize_features(skb, features);
2611 }
2612 EXPORT_SYMBOL(netif_skb_features);
2613
2614 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2615                     struct netdev_queue *txq, bool more)
2616 {
2617         unsigned int len;
2618         int rc;
2619
2620         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2621                 dev_queue_xmit_nit(skb, dev);
2622
2623         len = skb->len;
2624         trace_net_dev_start_xmit(skb, dev);
2625         rc = netdev_start_xmit(skb, dev, txq, more);
2626         trace_net_dev_xmit(skb, rc, dev, len);
2627
2628         return rc;
2629 }
2630
2631 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2632                                     struct netdev_queue *txq, int *ret)
2633 {
2634         struct sk_buff *skb = first;
2635         int rc = NETDEV_TX_OK;
2636
2637         while (skb) {
2638                 struct sk_buff *next = skb->next;
2639
2640                 skb->next = NULL;
2641                 rc = xmit_one(skb, dev, txq, next != NULL);
2642                 if (unlikely(!dev_xmit_complete(rc))) {
2643                         skb->next = next;
2644                         goto out;
2645                 }
2646
2647                 skb = next;
2648                 if (netif_xmit_stopped(txq) && skb) {
2649                         rc = NETDEV_TX_BUSY;
2650                         break;
2651                 }
2652         }
2653
2654 out:
2655         *ret = rc;
2656         return skb;
2657 }
2658
2659 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2660                                           netdev_features_t features)
2661 {
2662         if (skb_vlan_tag_present(skb) &&
2663             !vlan_hw_offload_capable(features, skb->vlan_proto))
2664                 skb = __vlan_hwaccel_push_inside(skb);
2665         return skb;
2666 }
2667
2668 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2669 {
2670         netdev_features_t features;
2671
2672         if (skb->next)
2673                 return skb;
2674
2675         features = netif_skb_features(skb);
2676         skb = validate_xmit_vlan(skb, features);
2677         if (unlikely(!skb))
2678                 goto out_null;
2679
2680         if (netif_needs_gso(dev, skb, features)) {
2681                 struct sk_buff *segs;
2682
2683                 segs = skb_gso_segment(skb, features);
2684                 if (IS_ERR(segs)) {
2685                         goto out_kfree_skb;
2686                 } else if (segs) {
2687                         consume_skb(skb);
2688                         skb = segs;
2689                 }
2690         } else {
2691                 if (skb_needs_linearize(skb, features) &&
2692                     __skb_linearize(skb))
2693                         goto out_kfree_skb;
2694
2695                 /* If packet is not checksummed and device does not
2696                  * support checksumming for this protocol, complete
2697                  * checksumming here.
2698                  */
2699                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2700                         if (skb->encapsulation)
2701                                 skb_set_inner_transport_header(skb,
2702                                                                skb_checksum_start_offset(skb));
2703                         else
2704                                 skb_set_transport_header(skb,
2705                                                          skb_checksum_start_offset(skb));
2706                         if (!(features & NETIF_F_ALL_CSUM) &&
2707                             skb_checksum_help(skb))
2708                                 goto out_kfree_skb;
2709                 }
2710         }
2711
2712         return skb;
2713
2714 out_kfree_skb:
2715         kfree_skb(skb);
2716 out_null:
2717         return NULL;
2718 }
2719
2720 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2721 {
2722         struct sk_buff *next, *head = NULL, *tail;
2723
2724         for (; skb != NULL; skb = next) {
2725                 next = skb->next;
2726                 skb->next = NULL;
2727
2728                 /* in case skb wont be segmented, point to itself */
2729                 skb->prev = skb;
2730
2731                 skb = validate_xmit_skb(skb, dev);
2732                 if (!skb)
2733                         continue;
2734
2735                 if (!head)
2736                         head = skb;
2737                 else
2738                         tail->next = skb;
2739                 /* If skb was segmented, skb->prev points to
2740                  * the last segment. If not, it still contains skb.
2741                  */
2742                 tail = skb->prev;
2743         }
2744         return head;
2745 }
2746
2747 static void qdisc_pkt_len_init(struct sk_buff *skb)
2748 {
2749         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2750
2751         qdisc_skb_cb(skb)->pkt_len = skb->len;
2752
2753         /* To get more precise estimation of bytes sent on wire,
2754          * we add to pkt_len the headers size of all segments
2755          */
2756         if (shinfo->gso_size)  {
2757                 unsigned int hdr_len;
2758                 u16 gso_segs = shinfo->gso_segs;
2759
2760                 /* mac layer + network layer */
2761                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2762
2763                 /* + transport layer */
2764                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2765                         hdr_len += tcp_hdrlen(skb);
2766                 else
2767                         hdr_len += sizeof(struct udphdr);
2768
2769                 if (shinfo->gso_type & SKB_GSO_DODGY)
2770                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2771                                                 shinfo->gso_size);
2772
2773                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2774         }
2775 }
2776
2777 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2778                                  struct net_device *dev,
2779                                  struct netdev_queue *txq)
2780 {
2781         spinlock_t *root_lock = qdisc_lock(q);
2782         bool contended;
2783         int rc;
2784
2785         qdisc_pkt_len_init(skb);
2786         qdisc_calculate_pkt_len(skb, q);
2787         /*
2788          * Heuristic to force contended enqueues to serialize on a
2789          * separate lock before trying to get qdisc main lock.
2790          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2791          * often and dequeue packets faster.
2792          */
2793         contended = qdisc_is_running(q);
2794         if (unlikely(contended))
2795                 spin_lock(&q->busylock);
2796
2797         spin_lock(root_lock);
2798         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2799                 kfree_skb(skb);
2800                 rc = NET_XMIT_DROP;
2801         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2802                    qdisc_run_begin(q)) {
2803                 /*
2804                  * This is a work-conserving queue; there are no old skbs
2805                  * waiting to be sent out; and the qdisc is not running -
2806                  * xmit the skb directly.
2807                  */
2808
2809                 qdisc_bstats_update(q, skb);
2810
2811                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2812                         if (unlikely(contended)) {
2813                                 spin_unlock(&q->busylock);
2814                                 contended = false;
2815                         }
2816                         __qdisc_run(q);
2817                 } else
2818                         qdisc_run_end(q);
2819
2820                 rc = NET_XMIT_SUCCESS;
2821         } else {
2822                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2823                 if (qdisc_run_begin(q)) {
2824                         if (unlikely(contended)) {
2825                                 spin_unlock(&q->busylock);
2826                                 contended = false;
2827                         }
2828                         __qdisc_run(q);
2829                 }
2830         }
2831         spin_unlock(root_lock);
2832         if (unlikely(contended))
2833                 spin_unlock(&q->busylock);
2834         return rc;
2835 }
2836
2837 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2838 static void skb_update_prio(struct sk_buff *skb)
2839 {
2840         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2841
2842         if (!skb->priority && skb->sk && map) {
2843                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2844
2845                 if (prioidx < map->priomap_len)
2846                         skb->priority = map->priomap[prioidx];
2847         }
2848 }
2849 #else
2850 #define skb_update_prio(skb)
2851 #endif
2852
2853 static DEFINE_PER_CPU(int, xmit_recursion);
2854 #define RECURSION_LIMIT 10
2855
2856 /**
2857  *      dev_loopback_xmit - loop back @skb
2858  *      @skb: buffer to transmit
2859  */
2860 int dev_loopback_xmit(struct sk_buff *skb)
2861 {
2862         skb_reset_mac_header(skb);
2863         __skb_pull(skb, skb_network_offset(skb));
2864         skb->pkt_type = PACKET_LOOPBACK;
2865         skb->ip_summed = CHECKSUM_UNNECESSARY;
2866         WARN_ON(!skb_dst(skb));
2867         skb_dst_force(skb);
2868         netif_rx_ni(skb);
2869         return 0;
2870 }
2871 EXPORT_SYMBOL(dev_loopback_xmit);
2872
2873 /**
2874  *      __dev_queue_xmit - transmit a buffer
2875  *      @skb: buffer to transmit
2876  *      @accel_priv: private data used for L2 forwarding offload
2877  *
2878  *      Queue a buffer for transmission to a network device. The caller must
2879  *      have set the device and priority and built the buffer before calling
2880  *      this function. The function can be called from an interrupt.
2881  *
2882  *      A negative errno code is returned on a failure. A success does not
2883  *      guarantee the frame will be transmitted as it may be dropped due
2884  *      to congestion or traffic shaping.
2885  *
2886  * -----------------------------------------------------------------------------------
2887  *      I notice this method can also return errors from the queue disciplines,
2888  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2889  *      be positive.
2890  *
2891  *      Regardless of the return value, the skb is consumed, so it is currently
2892  *      difficult to retry a send to this method.  (You can bump the ref count
2893  *      before sending to hold a reference for retry if you are careful.)
2894  *
2895  *      When calling this method, interrupts MUST be enabled.  This is because
2896  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2897  *          --BLG
2898  */
2899 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2900 {
2901         struct net_device *dev = skb->dev;
2902         struct netdev_queue *txq;
2903         struct Qdisc *q;
2904         int rc = -ENOMEM;
2905
2906         skb_reset_mac_header(skb);
2907
2908         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2909                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2910
2911         /* Disable soft irqs for various locks below. Also
2912          * stops preemption for RCU.
2913          */
2914         rcu_read_lock_bh();
2915
2916         skb_update_prio(skb);
2917
2918         /* If device/qdisc don't need skb->dst, release it right now while
2919          * its hot in this cpu cache.
2920          */
2921         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2922                 skb_dst_drop(skb);
2923         else
2924                 skb_dst_force(skb);
2925
2926         txq = netdev_pick_tx(dev, skb, accel_priv);
2927         q = rcu_dereference_bh(txq->qdisc);
2928
2929 #ifdef CONFIG_NET_CLS_ACT
2930         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2931 #endif
2932         trace_net_dev_queue(skb);
2933         if (q->enqueue) {
2934                 rc = __dev_xmit_skb(skb, q, dev, txq);
2935                 goto out;
2936         }
2937
2938         /* The device has no queue. Common case for software devices:
2939            loopback, all the sorts of tunnels...
2940
2941            Really, it is unlikely that netif_tx_lock protection is necessary
2942            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2943            counters.)
2944            However, it is possible, that they rely on protection
2945            made by us here.
2946
2947            Check this and shot the lock. It is not prone from deadlocks.
2948            Either shot noqueue qdisc, it is even simpler 8)
2949          */
2950         if (dev->flags & IFF_UP) {
2951                 int cpu = smp_processor_id(); /* ok because BHs are off */
2952
2953                 if (txq->xmit_lock_owner != cpu) {
2954
2955                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2956                                 goto recursion_alert;
2957
2958                         skb = validate_xmit_skb(skb, dev);
2959                         if (!skb)
2960                                 goto drop;
2961
2962                         HARD_TX_LOCK(dev, txq, cpu);
2963
2964                         if (!netif_xmit_stopped(txq)) {
2965                                 __this_cpu_inc(xmit_recursion);
2966                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2967                                 __this_cpu_dec(xmit_recursion);
2968                                 if (dev_xmit_complete(rc)) {
2969                                         HARD_TX_UNLOCK(dev, txq);
2970                                         goto out;
2971                                 }
2972                         }
2973                         HARD_TX_UNLOCK(dev, txq);
2974                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2975                                              dev->name);
2976                 } else {
2977                         /* Recursion is detected! It is possible,
2978                          * unfortunately
2979                          */
2980 recursion_alert:
2981                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2982                                              dev->name);
2983                 }
2984         }
2985
2986         rc = -ENETDOWN;
2987 drop:
2988         rcu_read_unlock_bh();
2989
2990         atomic_long_inc(&dev->tx_dropped);
2991         kfree_skb_list(skb);
2992         return rc;
2993 out:
2994         rcu_read_unlock_bh();
2995         return rc;
2996 }
2997
2998 int dev_queue_xmit(struct sk_buff *skb)
2999 {
3000         return __dev_queue_xmit(skb, NULL);
3001 }
3002 EXPORT_SYMBOL(dev_queue_xmit);
3003
3004 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3005 {
3006         return __dev_queue_xmit(skb, accel_priv);
3007 }
3008 EXPORT_SYMBOL(dev_queue_xmit_accel);
3009
3010
3011 /*=======================================================================
3012                         Receiver routines
3013   =======================================================================*/
3014
3015 int netdev_max_backlog __read_mostly = 1000;
3016 EXPORT_SYMBOL(netdev_max_backlog);
3017
3018 int netdev_tstamp_prequeue __read_mostly = 1;
3019 int netdev_budget __read_mostly = 300;
3020 int weight_p __read_mostly = 64;            /* old backlog weight */
3021
3022 /* Called with irq disabled */
3023 static inline void ____napi_schedule(struct softnet_data *sd,
3024                                      struct napi_struct *napi)
3025 {
3026         list_add_tail(&napi->poll_list, &sd->poll_list);
3027         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3028 }
3029
3030 #ifdef CONFIG_RPS
3031
3032 /* One global table that all flow-based protocols share. */
3033 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3034 EXPORT_SYMBOL(rps_sock_flow_table);
3035 u32 rps_cpu_mask __read_mostly;
3036 EXPORT_SYMBOL(rps_cpu_mask);
3037
3038 struct static_key rps_needed __read_mostly;
3039
3040 static struct rps_dev_flow *
3041 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3042             struct rps_dev_flow *rflow, u16 next_cpu)
3043 {
3044         if (next_cpu != RPS_NO_CPU) {
3045 #ifdef CONFIG_RFS_ACCEL
3046                 struct netdev_rx_queue *rxqueue;
3047                 struct rps_dev_flow_table *flow_table;
3048                 struct rps_dev_flow *old_rflow;
3049                 u32 flow_id;
3050                 u16 rxq_index;
3051                 int rc;
3052
3053                 /* Should we steer this flow to a different hardware queue? */
3054                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3055                     !(dev->features & NETIF_F_NTUPLE))
3056                         goto out;
3057                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3058                 if (rxq_index == skb_get_rx_queue(skb))
3059                         goto out;
3060
3061                 rxqueue = dev->_rx + rxq_index;
3062                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3063                 if (!flow_table)
3064                         goto out;
3065                 flow_id = skb_get_hash(skb) & flow_table->mask;
3066                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3067                                                         rxq_index, flow_id);
3068                 if (rc < 0)
3069                         goto out;
3070                 old_rflow = rflow;
3071                 rflow = &flow_table->flows[flow_id];
3072                 rflow->filter = rc;
3073                 if (old_rflow->filter == rflow->filter)
3074                         old_rflow->filter = RPS_NO_FILTER;
3075         out:
3076 #endif
3077                 rflow->last_qtail =
3078                         per_cpu(softnet_data, next_cpu).input_queue_head;
3079         }
3080
3081         rflow->cpu = next_cpu;
3082         return rflow;
3083 }
3084
3085 /*
3086  * get_rps_cpu is called from netif_receive_skb and returns the target
3087  * CPU from the RPS map of the receiving queue for a given skb.
3088  * rcu_read_lock must be held on entry.
3089  */
3090 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3091                        struct rps_dev_flow **rflowp)
3092 {
3093         const struct rps_sock_flow_table *sock_flow_table;
3094         struct netdev_rx_queue *rxqueue = dev->_rx;
3095         struct rps_dev_flow_table *flow_table;
3096         struct rps_map *map;
3097         int cpu = -1;
3098         u32 tcpu;
3099         u32 hash;
3100
3101         if (skb_rx_queue_recorded(skb)) {
3102                 u16 index = skb_get_rx_queue(skb);
3103
3104                 if (unlikely(index >= dev->real_num_rx_queues)) {
3105                         WARN_ONCE(dev->real_num_rx_queues > 1,
3106                                   "%s received packet on queue %u, but number "
3107                                   "of RX queues is %u\n",
3108                                   dev->name, index, dev->real_num_rx_queues);
3109                         goto done;
3110                 }
3111                 rxqueue += index;
3112         }
3113
3114         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3115
3116         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3117         map = rcu_dereference(rxqueue->rps_map);
3118         if (!flow_table && !map)
3119                 goto done;
3120
3121         skb_reset_network_header(skb);
3122         hash = skb_get_hash(skb);
3123         if (!hash)
3124                 goto done;
3125
3126         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3127         if (flow_table && sock_flow_table) {
3128                 struct rps_dev_flow *rflow;
3129                 u32 next_cpu;
3130                 u32 ident;
3131
3132                 /* First check into global flow table if there is a match */
3133                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3134                 if ((ident ^ hash) & ~rps_cpu_mask)
3135                         goto try_rps;
3136
3137                 next_cpu = ident & rps_cpu_mask;
3138
3139                 /* OK, now we know there is a match,
3140                  * we can look at the local (per receive queue) flow table
3141                  */
3142                 rflow = &flow_table->flows[hash & flow_table->mask];
3143                 tcpu = rflow->cpu;
3144
3145                 /*
3146                  * If the desired CPU (where last recvmsg was done) is
3147                  * different from current CPU (one in the rx-queue flow
3148                  * table entry), switch if one of the following holds:
3149                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3150                  *   - Current CPU is offline.
3151                  *   - The current CPU's queue tail has advanced beyond the
3152                  *     last packet that was enqueued using this table entry.
3153                  *     This guarantees that all previous packets for the flow
3154                  *     have been dequeued, thus preserving in order delivery.
3155                  */
3156                 if (unlikely(tcpu != next_cpu) &&
3157                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3158                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3159                       rflow->last_qtail)) >= 0)) {
3160                         tcpu = next_cpu;
3161                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3162                 }
3163
3164                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3165                         *rflowp = rflow;
3166                         cpu = tcpu;
3167                         goto done;
3168                 }
3169         }
3170
3171 try_rps:
3172
3173         if (map) {
3174                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3175                 if (cpu_online(tcpu)) {
3176                         cpu = tcpu;
3177                         goto done;
3178                 }
3179         }
3180
3181 done:
3182         return cpu;
3183 }
3184
3185 #ifdef CONFIG_RFS_ACCEL
3186
3187 /**
3188  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3189  * @dev: Device on which the filter was set
3190  * @rxq_index: RX queue index
3191  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3192  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3193  *
3194  * Drivers that implement ndo_rx_flow_steer() should periodically call
3195  * this function for each installed filter and remove the filters for
3196  * which it returns %true.
3197  */
3198 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3199                          u32 flow_id, u16 filter_id)
3200 {
3201         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3202         struct rps_dev_flow_table *flow_table;
3203         struct rps_dev_flow *rflow;
3204         bool expire = true;
3205         int cpu;
3206
3207         rcu_read_lock();
3208         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3209         if (flow_table && flow_id <= flow_table->mask) {
3210                 rflow = &flow_table->flows[flow_id];
3211                 cpu = ACCESS_ONCE(rflow->cpu);
3212                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3213                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3214                            rflow->last_qtail) <
3215                      (int)(10 * flow_table->mask)))
3216                         expire = false;
3217         }
3218         rcu_read_unlock();
3219         return expire;
3220 }
3221 EXPORT_SYMBOL(rps_may_expire_flow);
3222
3223 #endif /* CONFIG_RFS_ACCEL */
3224
3225 /* Called from hardirq (IPI) context */
3226 static void rps_trigger_softirq(void *data)
3227 {
3228         struct softnet_data *sd = data;
3229
3230         ____napi_schedule(sd, &sd->backlog);
3231         sd->received_rps++;
3232 }
3233
3234 #endif /* CONFIG_RPS */
3235
3236 /*
3237  * Check if this softnet_data structure is another cpu one
3238  * If yes, queue it to our IPI list and return 1
3239  * If no, return 0
3240  */
3241 static int rps_ipi_queued(struct softnet_data *sd)
3242 {
3243 #ifdef CONFIG_RPS
3244         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3245
3246         if (sd != mysd) {
3247                 sd->rps_ipi_next = mysd->rps_ipi_list;
3248                 mysd->rps_ipi_list = sd;
3249
3250                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3251                 return 1;
3252         }
3253 #endif /* CONFIG_RPS */
3254         return 0;
3255 }
3256
3257 #ifdef CONFIG_NET_FLOW_LIMIT
3258 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3259 #endif
3260
3261 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3262 {
3263 #ifdef CONFIG_NET_FLOW_LIMIT
3264         struct sd_flow_limit *fl;
3265         struct softnet_data *sd;
3266         unsigned int old_flow, new_flow;
3267
3268         if (qlen < (netdev_max_backlog >> 1))
3269                 return false;
3270
3271         sd = this_cpu_ptr(&softnet_data);
3272
3273         rcu_read_lock();
3274         fl = rcu_dereference(sd->flow_limit);
3275         if (fl) {
3276                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3277                 old_flow = fl->history[fl->history_head];
3278                 fl->history[fl->history_head] = new_flow;
3279
3280                 fl->history_head++;
3281                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3282
3283                 if (likely(fl->buckets[old_flow]))
3284                         fl->buckets[old_flow]--;
3285
3286                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3287                         fl->count++;
3288                         rcu_read_unlock();
3289                         return true;
3290                 }
3291         }
3292         rcu_read_unlock();
3293 #endif
3294         return false;
3295 }
3296
3297 /*
3298  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3299  * queue (may be a remote CPU queue).
3300  */
3301 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3302                               unsigned int *qtail)
3303 {
3304         struct softnet_data *sd;
3305         unsigned long flags;
3306         unsigned int qlen;
3307
3308         sd = &per_cpu(softnet_data, cpu);
3309
3310         local_irq_save(flags);
3311
3312         rps_lock(sd);
3313         qlen = skb_queue_len(&sd->input_pkt_queue);
3314         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3315                 if (qlen) {
3316 enqueue:
3317                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3318                         input_queue_tail_incr_save(sd, qtail);
3319                         rps_unlock(sd);
3320                         local_irq_restore(flags);
3321                         return NET_RX_SUCCESS;
3322                 }
3323
3324                 /* Schedule NAPI for backlog device
3325                  * We can use non atomic operation since we own the queue lock
3326                  */
3327                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3328                         if (!rps_ipi_queued(sd))
3329                                 ____napi_schedule(sd, &sd->backlog);
3330                 }
3331                 goto enqueue;
3332         }
3333
3334         sd->dropped++;
3335         rps_unlock(sd);
3336
3337         local_irq_restore(flags);
3338
3339         atomic_long_inc(&skb->dev->rx_dropped);
3340         kfree_skb(skb);
3341         return NET_RX_DROP;
3342 }
3343
3344 static int netif_rx_internal(struct sk_buff *skb)
3345 {
3346         int ret;
3347
3348         net_timestamp_check(netdev_tstamp_prequeue, skb);
3349
3350         trace_netif_rx(skb);
3351 #ifdef CONFIG_RPS
3352         if (static_key_false(&rps_needed)) {
3353                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3354                 int cpu;
3355
3356                 preempt_disable();
3357                 rcu_read_lock();
3358
3359                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3360                 if (cpu < 0)
3361                         cpu = smp_processor_id();
3362
3363                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3364
3365                 rcu_read_unlock();
3366                 preempt_enable();
3367         } else
3368 #endif
3369         {
3370                 unsigned int qtail;
3371                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3372                 put_cpu();
3373         }
3374         return ret;
3375 }
3376
3377 /**
3378  *      netif_rx        -       post buffer to the network code
3379  *      @skb: buffer to post
3380  *
3381  *      This function receives a packet from a device driver and queues it for
3382  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3383  *      may be dropped during processing for congestion control or by the
3384  *      protocol layers.
3385  *
3386  *      return values:
3387  *      NET_RX_SUCCESS  (no congestion)
3388  *      NET_RX_DROP     (packet was dropped)
3389  *
3390  */
3391
3392 int netif_rx(struct sk_buff *skb)
3393 {
3394         trace_netif_rx_entry(skb);
3395
3396         return netif_rx_internal(skb);
3397 }
3398 EXPORT_SYMBOL(netif_rx);
3399
3400 int netif_rx_ni(struct sk_buff *skb)
3401 {
3402         int err;
3403
3404         trace_netif_rx_ni_entry(skb);
3405
3406         preempt_disable();
3407         err = netif_rx_internal(skb);
3408         if (local_softirq_pending())
3409                 do_softirq();
3410         preempt_enable();
3411
3412         return err;
3413 }
3414 EXPORT_SYMBOL(netif_rx_ni);
3415
3416 static void net_tx_action(struct softirq_action *h)
3417 {
3418         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3419
3420         if (sd->completion_queue) {
3421                 struct sk_buff *clist;
3422
3423                 local_irq_disable();
3424                 clist = sd->completion_queue;
3425                 sd->completion_queue = NULL;
3426                 local_irq_enable();
3427
3428                 while (clist) {
3429                         struct sk_buff *skb = clist;
3430                         clist = clist->next;
3431
3432                         WARN_ON(atomic_read(&skb->users));
3433                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3434                                 trace_consume_skb(skb);
3435                         else
3436                                 trace_kfree_skb(skb, net_tx_action);
3437                         __kfree_skb(skb);
3438                 }
3439         }
3440
3441         if (sd->output_queue) {
3442                 struct Qdisc *head;
3443
3444                 local_irq_disable();
3445                 head = sd->output_queue;
3446                 sd->output_queue = NULL;
3447                 sd->output_queue_tailp = &sd->output_queue;
3448                 local_irq_enable();
3449
3450                 while (head) {
3451                         struct Qdisc *q = head;
3452                         spinlock_t *root_lock;
3453
3454                         head = head->next_sched;
3455
3456                         root_lock = qdisc_lock(q);
3457                         if (spin_trylock(root_lock)) {
3458                                 smp_mb__before_atomic();
3459                                 clear_bit(__QDISC_STATE_SCHED,
3460                                           &q->state);
3461                                 qdisc_run(q);
3462                                 spin_unlock(root_lock);
3463                         } else {
3464                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3465                                               &q->state)) {
3466                                         __netif_reschedule(q);
3467                                 } else {
3468                                         smp_mb__before_atomic();
3469                                         clear_bit(__QDISC_STATE_SCHED,
3470                                                   &q->state);
3471                                 }
3472                         }
3473                 }
3474         }
3475 }
3476
3477 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3478     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3479 /* This hook is defined here for ATM LANE */
3480 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3481                              unsigned char *addr) __read_mostly;
3482 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3483 #endif
3484
3485 #ifdef CONFIG_NET_CLS_ACT
3486 /* TODO: Maybe we should just force sch_ingress to be compiled in
3487  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3488  * a compare and 2 stores extra right now if we dont have it on
3489  * but have CONFIG_NET_CLS_ACT
3490  * NOTE: This doesn't stop any functionality; if you dont have
3491  * the ingress scheduler, you just can't add policies on ingress.
3492  *
3493  */
3494 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3495 {
3496         struct net_device *dev = skb->dev;
3497         u32 ttl = G_TC_RTTL(skb->tc_verd);
3498         int result = TC_ACT_OK;
3499         struct Qdisc *q;
3500
3501         if (unlikely(MAX_RED_LOOP < ttl++)) {
3502                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3503                                      skb->skb_iif, dev->ifindex);
3504                 return TC_ACT_SHOT;
3505         }
3506
3507         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3508         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3509
3510         q = rcu_dereference(rxq->qdisc);
3511         if (q != &noop_qdisc) {
3512                 spin_lock(qdisc_lock(q));
3513                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3514                         result = qdisc_enqueue_root(skb, q);
3515                 spin_unlock(qdisc_lock(q));
3516         }
3517
3518         return result;
3519 }
3520
3521 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3522                                          struct packet_type **pt_prev,
3523                                          int *ret, struct net_device *orig_dev)
3524 {
3525         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3526
3527         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3528                 goto out;
3529
3530         if (*pt_prev) {
3531                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3532                 *pt_prev = NULL;
3533         }
3534
3535         switch (ing_filter(skb, rxq)) {
3536         case TC_ACT_SHOT:
3537         case TC_ACT_STOLEN:
3538                 kfree_skb(skb);
3539                 return NULL;
3540         }
3541
3542 out:
3543         skb->tc_verd = 0;
3544         return skb;
3545 }
3546 #endif
3547
3548 /**
3549  *      netdev_rx_handler_register - register receive handler
3550  *      @dev: device to register a handler for
3551  *      @rx_handler: receive handler to register
3552  *      @rx_handler_data: data pointer that is used by rx handler
3553  *
3554  *      Register a receive handler for a device. This handler will then be
3555  *      called from __netif_receive_skb. A negative errno code is returned
3556  *      on a failure.
3557  *
3558  *      The caller must hold the rtnl_mutex.
3559  *
3560  *      For a general description of rx_handler, see enum rx_handler_result.
3561  */
3562 int netdev_rx_handler_register(struct net_device *dev,
3563                                rx_handler_func_t *rx_handler,
3564                                void *rx_handler_data)
3565 {
3566         ASSERT_RTNL();
3567
3568         if (dev->rx_handler)
3569                 return -EBUSY;
3570
3571         /* Note: rx_handler_data must be set before rx_handler */
3572         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3573         rcu_assign_pointer(dev->rx_handler, rx_handler);
3574
3575         return 0;
3576 }
3577 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3578
3579 /**
3580  *      netdev_rx_handler_unregister - unregister receive handler
3581  *      @dev: device to unregister a handler from
3582  *
3583  *      Unregister a receive handler from a device.
3584  *
3585  *      The caller must hold the rtnl_mutex.
3586  */
3587 void netdev_rx_handler_unregister(struct net_device *dev)
3588 {
3589
3590         ASSERT_RTNL();
3591         RCU_INIT_POINTER(dev->rx_handler, NULL);
3592         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3593          * section has a guarantee to see a non NULL rx_handler_data
3594          * as well.
3595          */
3596         synchronize_net();
3597         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3598 }
3599 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3600
3601 /*
3602  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3603  * the special handling of PFMEMALLOC skbs.
3604  */
3605 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3606 {
3607         switch (skb->protocol) {
3608         case htons(ETH_P_ARP):
3609         case htons(ETH_P_IP):
3610         case htons(ETH_P_IPV6):
3611         case htons(ETH_P_8021Q):
3612         case htons(ETH_P_8021AD):
3613                 return true;
3614         default:
3615                 return false;
3616         }
3617 }
3618
3619 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3620 {
3621         struct packet_type *ptype, *pt_prev;
3622         rx_handler_func_t *rx_handler;
3623         struct net_device *orig_dev;
3624         bool deliver_exact = false;
3625         int ret = NET_RX_DROP;
3626         __be16 type;
3627
3628         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3629
3630         trace_netif_receive_skb(skb);
3631
3632         orig_dev = skb->dev;
3633
3634         skb_reset_network_header(skb);
3635         if (!skb_transport_header_was_set(skb))
3636                 skb_reset_transport_header(skb);
3637         skb_reset_mac_len(skb);
3638
3639         pt_prev = NULL;
3640
3641         rcu_read_lock();
3642
3643 another_round:
3644         skb->skb_iif = skb->dev->ifindex;
3645
3646         __this_cpu_inc(softnet_data.processed);
3647
3648         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3649             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3650                 skb = skb_vlan_untag(skb);
3651                 if (unlikely(!skb))
3652                         goto unlock;
3653         }
3654
3655 #ifdef CONFIG_NET_CLS_ACT
3656         if (skb->tc_verd & TC_NCLS) {
3657                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3658                 goto ncls;
3659         }
3660 #endif
3661
3662         if (pfmemalloc)
3663                 goto skip_taps;
3664
3665         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3666                 if (pt_prev)
3667                         ret = deliver_skb(skb, pt_prev, orig_dev);
3668                 pt_prev = ptype;
3669         }
3670
3671         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3672                 if (pt_prev)
3673                         ret = deliver_skb(skb, pt_prev, orig_dev);
3674                 pt_prev = ptype;
3675         }
3676
3677 skip_taps:
3678 #ifdef CONFIG_NET_CLS_ACT
3679         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3680         if (!skb)
3681                 goto unlock;
3682 ncls:
3683 #endif
3684
3685         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3686                 goto drop;
3687
3688         if (skb_vlan_tag_present(skb)) {
3689                 if (pt_prev) {
3690                         ret = deliver_skb(skb, pt_prev, orig_dev);
3691                         pt_prev = NULL;
3692                 }
3693                 if (vlan_do_receive(&skb))
3694                         goto another_round;
3695                 else if (unlikely(!skb))
3696                         goto unlock;
3697         }
3698
3699         rx_handler = rcu_dereference(skb->dev->rx_handler);
3700         if (rx_handler) {
3701                 if (pt_prev) {
3702                         ret = deliver_skb(skb, pt_prev, orig_dev);
3703                         pt_prev = NULL;
3704                 }
3705                 switch (rx_handler(&skb)) {
3706                 case RX_HANDLER_CONSUMED:
3707                         ret = NET_RX_SUCCESS;
3708                         goto unlock;
3709                 case RX_HANDLER_ANOTHER:
3710                         goto another_round;
3711                 case RX_HANDLER_EXACT:
3712                         deliver_exact = true;
3713                 case RX_HANDLER_PASS:
3714                         break;
3715                 default:
3716                         BUG();
3717                 }
3718         }
3719
3720         if (unlikely(skb_vlan_tag_present(skb))) {
3721                 if (skb_vlan_tag_get_id(skb))
3722                         skb->pkt_type = PACKET_OTHERHOST;
3723                 /* Note: we might in the future use prio bits
3724                  * and set skb->priority like in vlan_do_receive()
3725                  * For the time being, just ignore Priority Code Point
3726                  */
3727                 skb->vlan_tci = 0;
3728         }
3729
3730         type = skb->protocol;
3731
3732         /* deliver only exact match when indicated */
3733         if (likely(!deliver_exact)) {
3734                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3735                                        &ptype_base[ntohs(type) &
3736                                                    PTYPE_HASH_MASK]);
3737         }
3738
3739         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3740                                &orig_dev->ptype_specific);
3741
3742         if (unlikely(skb->dev != orig_dev)) {
3743                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3744                                        &skb->dev->ptype_specific);
3745         }
3746
3747         if (pt_prev) {
3748                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3749                         goto drop;
3750                 else
3751                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3752         } else {
3753 drop:
3754                 atomic_long_inc(&skb->dev->rx_dropped);
3755                 kfree_skb(skb);
3756                 /* Jamal, now you will not able to escape explaining
3757                  * me how you were going to use this. :-)
3758                  */
3759                 ret = NET_RX_DROP;
3760         }
3761
3762 unlock:
3763         rcu_read_unlock();
3764         return ret;
3765 }
3766
3767 static int __netif_receive_skb(struct sk_buff *skb)
3768 {
3769         int ret;
3770
3771         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3772                 unsigned long pflags = current->flags;
3773
3774                 /*
3775                  * PFMEMALLOC skbs are special, they should
3776                  * - be delivered to SOCK_MEMALLOC sockets only
3777                  * - stay away from userspace
3778                  * - have bounded memory usage
3779                  *
3780                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3781                  * context down to all allocation sites.
3782                  */
3783                 current->flags |= PF_MEMALLOC;
3784                 ret = __netif_receive_skb_core(skb, true);
3785                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3786         } else
3787                 ret = __netif_receive_skb_core(skb, false);
3788
3789         return ret;
3790 }
3791
3792 static int netif_receive_skb_internal(struct sk_buff *skb)
3793 {
3794         net_timestamp_check(netdev_tstamp_prequeue, skb);
3795
3796         if (skb_defer_rx_timestamp(skb))
3797                 return NET_RX_SUCCESS;
3798
3799 #ifdef CONFIG_RPS
3800         if (static_key_false(&rps_needed)) {
3801                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3802                 int cpu, ret;
3803
3804                 rcu_read_lock();
3805
3806                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3807
3808                 if (cpu >= 0) {
3809                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3810                         rcu_read_unlock();
3811                         return ret;
3812                 }
3813                 rcu_read_unlock();
3814         }
3815 #endif
3816         return __netif_receive_skb(skb);
3817 }
3818
3819 /**
3820  *      netif_receive_skb - process receive buffer from network
3821  *      @skb: buffer to process
3822  *
3823  *      netif_receive_skb() is the main receive data processing function.
3824  *      It always succeeds. The buffer may be dropped during processing
3825  *      for congestion control or by the protocol layers.
3826  *
3827  *      This function may only be called from softirq context and interrupts
3828  *      should be enabled.
3829  *
3830  *      Return values (usually ignored):
3831  *      NET_RX_SUCCESS: no congestion
3832  *      NET_RX_DROP: packet was dropped
3833  */
3834 int netif_receive_skb(struct sk_buff *skb)
3835 {
3836         trace_netif_receive_skb_entry(skb);
3837
3838         return netif_receive_skb_internal(skb);
3839 }
3840 EXPORT_SYMBOL(netif_receive_skb);
3841
3842 /* Network device is going away, flush any packets still pending
3843  * Called with irqs disabled.
3844  */
3845 static void flush_backlog(void *arg)
3846 {
3847         struct net_device *dev = arg;
3848         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3849         struct sk_buff *skb, *tmp;
3850
3851         rps_lock(sd);
3852         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3853                 if (skb->dev == dev) {
3854                         __skb_unlink(skb, &sd->input_pkt_queue);
3855                         kfree_skb(skb);
3856                         input_queue_head_incr(sd);
3857                 }
3858         }
3859         rps_unlock(sd);
3860
3861         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3862                 if (skb->dev == dev) {
3863                         __skb_unlink(skb, &sd->process_queue);
3864                         kfree_skb(skb);
3865                         input_queue_head_incr(sd);
3866                 }
3867         }
3868 }
3869
3870 static int napi_gro_complete(struct sk_buff *skb)
3871 {
3872         struct packet_offload *ptype;
3873         __be16 type = skb->protocol;
3874         struct list_head *head = &offload_base;
3875         int err = -ENOENT;
3876
3877         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3878
3879         if (NAPI_GRO_CB(skb)->count == 1) {
3880                 skb_shinfo(skb)->gso_size = 0;
3881                 goto out;
3882         }
3883
3884         rcu_read_lock();
3885         list_for_each_entry_rcu(ptype, head, list) {
3886                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3887                         continue;
3888
3889                 err = ptype->callbacks.gro_complete(skb, 0);
3890                 break;
3891         }
3892         rcu_read_unlock();
3893
3894         if (err) {
3895                 WARN_ON(&ptype->list == head);
3896                 kfree_skb(skb);
3897                 return NET_RX_SUCCESS;
3898         }
3899
3900 out:
3901         return netif_receive_skb_internal(skb);
3902 }
3903
3904 /* napi->gro_list contains packets ordered by age.
3905  * youngest packets at the head of it.
3906  * Complete skbs in reverse order to reduce latencies.
3907  */
3908 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3909 {
3910         struct sk_buff *skb, *prev = NULL;
3911
3912         /* scan list and build reverse chain */
3913         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3914                 skb->prev = prev;
3915                 prev = skb;
3916         }
3917
3918         for (skb = prev; skb; skb = prev) {
3919                 skb->next = NULL;
3920
3921                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3922                         return;
3923
3924                 prev = skb->prev;
3925                 napi_gro_complete(skb);
3926                 napi->gro_count--;
3927         }
3928
3929         napi->gro_list = NULL;
3930 }
3931 EXPORT_SYMBOL(napi_gro_flush);
3932
3933 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3934 {
3935         struct sk_buff *p;
3936         unsigned int maclen = skb->dev->hard_header_len;
3937         u32 hash = skb_get_hash_raw(skb);
3938
3939         for (p = napi->gro_list; p; p = p->next) {
3940                 unsigned long diffs;
3941
3942                 NAPI_GRO_CB(p)->flush = 0;
3943
3944                 if (hash != skb_get_hash_raw(p)) {
3945                         NAPI_GRO_CB(p)->same_flow = 0;
3946                         continue;
3947                 }
3948
3949                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3950                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3951                 if (maclen == ETH_HLEN)
3952                         diffs |= compare_ether_header(skb_mac_header(p),
3953                                                       skb_mac_header(skb));
3954                 else if (!diffs)
3955                         diffs = memcmp(skb_mac_header(p),
3956                                        skb_mac_header(skb),
3957                                        maclen);
3958                 NAPI_GRO_CB(p)->same_flow = !diffs;
3959         }
3960 }
3961
3962 static void skb_gro_reset_offset(struct sk_buff *skb)
3963 {
3964         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3965         const skb_frag_t *frag0 = &pinfo->frags[0];
3966
3967         NAPI_GRO_CB(skb)->data_offset = 0;
3968         NAPI_GRO_CB(skb)->frag0 = NULL;
3969         NAPI_GRO_CB(skb)->frag0_len = 0;
3970
3971         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3972             pinfo->nr_frags &&
3973             !PageHighMem(skb_frag_page(frag0))) {
3974                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3975                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3976         }
3977 }
3978
3979 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3980 {
3981         struct skb_shared_info *pinfo = skb_shinfo(skb);
3982
3983         BUG_ON(skb->end - skb->tail < grow);
3984
3985         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3986
3987         skb->data_len -= grow;
3988         skb->tail += grow;
3989
3990         pinfo->frags[0].page_offset += grow;
3991         skb_frag_size_sub(&pinfo->frags[0], grow);
3992
3993         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3994                 skb_frag_unref(skb, 0);
3995                 memmove(pinfo->frags, pinfo->frags + 1,
3996                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3997         }
3998 }
3999
4000 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4001 {
4002         struct sk_buff **pp = NULL;
4003         struct packet_offload *ptype;
4004         __be16 type = skb->protocol;
4005         struct list_head *head = &offload_base;
4006         int same_flow;
4007         enum gro_result ret;
4008         int grow;
4009
4010         if (!(skb->dev->features & NETIF_F_GRO))
4011                 goto normal;
4012
4013         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4014                 goto normal;
4015
4016         gro_list_prepare(napi, skb);
4017
4018         rcu_read_lock();
4019         list_for_each_entry_rcu(ptype, head, list) {
4020                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4021                         continue;
4022
4023                 skb_set_network_header(skb, skb_gro_offset(skb));
4024                 skb_reset_mac_len(skb);
4025                 NAPI_GRO_CB(skb)->same_flow = 0;
4026                 NAPI_GRO_CB(skb)->flush = 0;
4027                 NAPI_GRO_CB(skb)->free = 0;
4028                 NAPI_GRO_CB(skb)->udp_mark = 0;
4029                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4030
4031                 /* Setup for GRO checksum validation */
4032                 switch (skb->ip_summed) {
4033                 case CHECKSUM_COMPLETE:
4034                         NAPI_GRO_CB(skb)->csum = skb->csum;
4035                         NAPI_GRO_CB(skb)->csum_valid = 1;
4036                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4037                         break;
4038                 case CHECKSUM_UNNECESSARY:
4039                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4040                         NAPI_GRO_CB(skb)->csum_valid = 0;
4041                         break;
4042                 default:
4043                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4044                         NAPI_GRO_CB(skb)->csum_valid = 0;
4045                 }
4046
4047                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4048                 break;
4049         }
4050         rcu_read_unlock();
4051
4052         if (&ptype->list == head)
4053                 goto normal;
4054
4055         same_flow = NAPI_GRO_CB(skb)->same_flow;
4056         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4057
4058         if (pp) {
4059                 struct sk_buff *nskb = *pp;
4060
4061                 *pp = nskb->next;
4062                 nskb->next = NULL;
4063                 napi_gro_complete(nskb);
4064                 napi->gro_count--;
4065         }
4066
4067         if (same_flow)
4068                 goto ok;
4069
4070         if (NAPI_GRO_CB(skb)->flush)
4071                 goto normal;
4072
4073         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4074                 struct sk_buff *nskb = napi->gro_list;
4075
4076                 /* locate the end of the list to select the 'oldest' flow */
4077                 while (nskb->next) {
4078                         pp = &nskb->next;
4079                         nskb = *pp;
4080                 }
4081                 *pp = NULL;
4082                 nskb->next = NULL;
4083                 napi_gro_complete(nskb);
4084         } else {
4085                 napi->gro_count++;
4086         }
4087         NAPI_GRO_CB(skb)->count = 1;
4088         NAPI_GRO_CB(skb)->age = jiffies;
4089         NAPI_GRO_CB(skb)->last = skb;
4090         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4091         skb->next = napi->gro_list;
4092         napi->gro_list = skb;
4093         ret = GRO_HELD;
4094
4095 pull:
4096         grow = skb_gro_offset(skb) - skb_headlen(skb);
4097         if (grow > 0)
4098                 gro_pull_from_frag0(skb, grow);
4099 ok:
4100         return ret;
4101
4102 normal:
4103         ret = GRO_NORMAL;
4104         goto pull;
4105 }
4106
4107 struct packet_offload *gro_find_receive_by_type(__be16 type)
4108 {
4109         struct list_head *offload_head = &offload_base;
4110         struct packet_offload *ptype;
4111
4112         list_for_each_entry_rcu(ptype, offload_head, list) {
4113                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4114                         continue;
4115                 return ptype;
4116         }
4117         return NULL;
4118 }
4119 EXPORT_SYMBOL(gro_find_receive_by_type);
4120
4121 struct packet_offload *gro_find_complete_by_type(__be16 type)
4122 {
4123         struct list_head *offload_head = &offload_base;
4124         struct packet_offload *ptype;
4125
4126         list_for_each_entry_rcu(ptype, offload_head, list) {
4127                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4128                         continue;
4129                 return ptype;
4130         }
4131         return NULL;
4132 }
4133 EXPORT_SYMBOL(gro_find_complete_by_type);
4134
4135 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4136 {
4137         switch (ret) {
4138         case GRO_NORMAL:
4139                 if (netif_receive_skb_internal(skb))
4140                         ret = GRO_DROP;
4141                 break;
4142
4143         case GRO_DROP:
4144                 kfree_skb(skb);
4145                 break;
4146
4147         case GRO_MERGED_FREE:
4148                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4149                         kmem_cache_free(skbuff_head_cache, skb);
4150                 else
4151                         __kfree_skb(skb);
4152                 break;
4153
4154         case GRO_HELD:
4155         case GRO_MERGED:
4156                 break;
4157         }
4158
4159         return ret;
4160 }
4161
4162 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4163 {
4164         trace_napi_gro_receive_entry(skb);
4165
4166         skb_gro_reset_offset(skb);
4167
4168         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4169 }
4170 EXPORT_SYMBOL(napi_gro_receive);
4171
4172 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4173 {
4174         if (unlikely(skb->pfmemalloc)) {
4175                 consume_skb(skb);
4176                 return;
4177         }
4178         __skb_pull(skb, skb_headlen(skb));
4179         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4180         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4181         skb->vlan_tci = 0;
4182         skb->dev = napi->dev;
4183         skb->skb_iif = 0;
4184         skb->encapsulation = 0;
4185         skb_shinfo(skb)->gso_type = 0;
4186         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4187
4188         napi->skb = skb;
4189 }
4190
4191 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4192 {
4193         struct sk_buff *skb = napi->skb;
4194
4195         if (!skb) {
4196                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4197                 napi->skb = skb;
4198         }
4199         return skb;
4200 }
4201 EXPORT_SYMBOL(napi_get_frags);
4202
4203 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4204                                       struct sk_buff *skb,
4205                                       gro_result_t ret)
4206 {
4207         switch (ret) {
4208         case GRO_NORMAL:
4209         case GRO_HELD:
4210                 __skb_push(skb, ETH_HLEN);
4211                 skb->protocol = eth_type_trans(skb, skb->dev);
4212                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4213                         ret = GRO_DROP;
4214                 break;
4215
4216         case GRO_DROP:
4217         case GRO_MERGED_FREE:
4218                 napi_reuse_skb(napi, skb);
4219                 break;
4220
4221         case GRO_MERGED:
4222                 break;
4223         }
4224
4225         return ret;
4226 }
4227
4228 /* Upper GRO stack assumes network header starts at gro_offset=0
4229  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4230  * We copy ethernet header into skb->data to have a common layout.
4231  */
4232 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4233 {
4234         struct sk_buff *skb = napi->skb;
4235         const struct ethhdr *eth;
4236         unsigned int hlen = sizeof(*eth);
4237
4238         napi->skb = NULL;
4239
4240         skb_reset_mac_header(skb);
4241         skb_gro_reset_offset(skb);
4242
4243         eth = skb_gro_header_fast(skb, 0);
4244         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4245                 eth = skb_gro_header_slow(skb, hlen, 0);
4246                 if (unlikely(!eth)) {
4247                         napi_reuse_skb(napi, skb);
4248                         return NULL;
4249                 }
4250         } else {
4251                 gro_pull_from_frag0(skb, hlen);
4252                 NAPI_GRO_CB(skb)->frag0 += hlen;
4253                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4254         }
4255         __skb_pull(skb, hlen);
4256
4257         /*
4258          * This works because the only protocols we care about don't require
4259          * special handling.
4260          * We'll fix it up properly in napi_frags_finish()
4261          */
4262         skb->protocol = eth->h_proto;
4263
4264         return skb;
4265 }
4266
4267 gro_result_t napi_gro_frags(struct napi_struct *napi)
4268 {
4269         struct sk_buff *skb = napi_frags_skb(napi);
4270
4271         if (!skb)
4272                 return GRO_DROP;
4273
4274         trace_napi_gro_frags_entry(skb);
4275
4276         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4277 }
4278 EXPORT_SYMBOL(napi_gro_frags);
4279
4280 /* Compute the checksum from gro_offset and return the folded value
4281  * after adding in any pseudo checksum.
4282  */
4283 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4284 {
4285         __wsum wsum;
4286         __sum16 sum;
4287
4288         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4289
4290         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4291         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4292         if (likely(!sum)) {
4293                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4294                     !skb->csum_complete_sw)
4295                         netdev_rx_csum_fault(skb->dev);
4296         }
4297
4298         NAPI_GRO_CB(skb)->csum = wsum;
4299         NAPI_GRO_CB(skb)->csum_valid = 1;
4300
4301         return sum;
4302 }
4303 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4304
4305 /*
4306  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4307  * Note: called with local irq disabled, but exits with local irq enabled.
4308  */
4309 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4310 {
4311 #ifdef CONFIG_RPS
4312         struct softnet_data *remsd = sd->rps_ipi_list;
4313
4314         if (remsd) {
4315                 sd->rps_ipi_list = NULL;
4316
4317                 local_irq_enable();
4318
4319                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4320                 while (remsd) {
4321                         struct softnet_data *next = remsd->rps_ipi_next;
4322
4323                         if (cpu_online(remsd->cpu))
4324                                 smp_call_function_single_async(remsd->cpu,
4325                                                            &remsd->csd);
4326                         remsd = next;
4327                 }
4328         } else
4329 #endif
4330                 local_irq_enable();
4331 }
4332
4333 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4334 {
4335 #ifdef CONFIG_RPS
4336         return sd->rps_ipi_list != NULL;
4337 #else
4338         return false;
4339 #endif
4340 }
4341
4342 static int process_backlog(struct napi_struct *napi, int quota)
4343 {
4344         int work = 0;
4345         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4346
4347         /* Check if we have pending ipi, its better to send them now,
4348          * not waiting net_rx_action() end.
4349          */
4350         if (sd_has_rps_ipi_waiting(sd)) {
4351                 local_irq_disable();
4352                 net_rps_action_and_irq_enable(sd);
4353         }
4354
4355         napi->weight = weight_p;
4356         local_irq_disable();
4357         while (1) {
4358                 struct sk_buff *skb;
4359
4360                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4361                         local_irq_enable();
4362                         __netif_receive_skb(skb);
4363                         local_irq_disable();
4364                         input_queue_head_incr(sd);
4365                         if (++work >= quota) {
4366                                 local_irq_enable();
4367                                 return work;
4368                         }
4369                 }
4370
4371                 rps_lock(sd);
4372                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4373                         /*
4374                          * Inline a custom version of __napi_complete().
4375                          * only current cpu owns and manipulates this napi,
4376                          * and NAPI_STATE_SCHED is the only possible flag set
4377                          * on backlog.
4378                          * We can use a plain write instead of clear_bit(),
4379                          * and we dont need an smp_mb() memory barrier.
4380                          */
4381                         napi->state = 0;
4382                         rps_unlock(sd);
4383
4384                         break;
4385                 }
4386
4387                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4388                                            &sd->process_queue);
4389                 rps_unlock(sd);
4390         }
4391         local_irq_enable();
4392
4393         return work;
4394 }
4395
4396 /**
4397  * __napi_schedule - schedule for receive
4398  * @n: entry to schedule
4399  *
4400  * The entry's receive function will be scheduled to run.
4401  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4402  */
4403 void __napi_schedule(struct napi_struct *n)
4404 {
4405         unsigned long flags;
4406
4407         local_irq_save(flags);
4408         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4409         local_irq_restore(flags);
4410 }
4411 EXPORT_SYMBOL(__napi_schedule);
4412
4413 /**
4414  * __napi_schedule_irqoff - schedule for receive
4415  * @n: entry to schedule
4416  *
4417  * Variant of __napi_schedule() assuming hard irqs are masked
4418  */
4419 void __napi_schedule_irqoff(struct napi_struct *n)
4420 {
4421         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4422 }
4423 EXPORT_SYMBOL(__napi_schedule_irqoff);
4424
4425 void __napi_complete(struct napi_struct *n)
4426 {
4427         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4428
4429         list_del_init(&n->poll_list);
4430         smp_mb__before_atomic();
4431         clear_bit(NAPI_STATE_SCHED, &n->state);
4432 }
4433 EXPORT_SYMBOL(__napi_complete);
4434
4435 void napi_complete_done(struct napi_struct *n, int work_done)
4436 {
4437         unsigned long flags;
4438
4439         /*
4440          * don't let napi dequeue from the cpu poll list
4441          * just in case its running on a different cpu
4442          */
4443         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4444                 return;
4445
4446         if (n->gro_list) {
4447                 unsigned long timeout = 0;
4448
4449                 if (work_done)
4450                         timeout = n->dev->gro_flush_timeout;
4451
4452                 if (timeout)
4453                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4454                                       HRTIMER_MODE_REL_PINNED);
4455                 else
4456                         napi_gro_flush(n, false);
4457         }
4458         if (likely(list_empty(&n->poll_list))) {
4459                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4460         } else {
4461                 /* If n->poll_list is not empty, we need to mask irqs */
4462                 local_irq_save(flags);
4463                 __napi_complete(n);
4464                 local_irq_restore(flags);
4465         }
4466 }
4467 EXPORT_SYMBOL(napi_complete_done);
4468
4469 /* must be called under rcu_read_lock(), as we dont take a reference */
4470 struct napi_struct *napi_by_id(unsigned int napi_id)
4471 {
4472         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4473         struct napi_struct *napi;
4474
4475         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4476                 if (napi->napi_id == napi_id)
4477                         return napi;
4478
4479         return NULL;
4480 }
4481 EXPORT_SYMBOL_GPL(napi_by_id);
4482
4483 void napi_hash_add(struct napi_struct *napi)
4484 {
4485         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4486
4487                 spin_lock(&napi_hash_lock);
4488
4489                 /* 0 is not a valid id, we also skip an id that is taken
4490                  * we expect both events to be extremely rare
4491                  */
4492                 napi->napi_id = 0;
4493                 while (!napi->napi_id) {
4494                         napi->napi_id = ++napi_gen_id;
4495                         if (napi_by_id(napi->napi_id))
4496                                 napi->napi_id = 0;
4497                 }
4498
4499                 hlist_add_head_rcu(&napi->napi_hash_node,
4500                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4501
4502                 spin_unlock(&napi_hash_lock);
4503         }
4504 }
4505 EXPORT_SYMBOL_GPL(napi_hash_add);
4506
4507 /* Warning : caller is responsible to make sure rcu grace period
4508  * is respected before freeing memory containing @napi
4509  */
4510 void napi_hash_del(struct napi_struct *napi)
4511 {
4512         spin_lock(&napi_hash_lock);
4513
4514         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4515                 hlist_del_rcu(&napi->napi_hash_node);
4516
4517         spin_unlock(&napi_hash_lock);
4518 }
4519 EXPORT_SYMBOL_GPL(napi_hash_del);
4520
4521 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4522 {
4523         struct napi_struct *napi;
4524
4525         napi = container_of(timer, struct napi_struct, timer);
4526         if (napi->gro_list)
4527                 napi_schedule(napi);
4528
4529         return HRTIMER_NORESTART;
4530 }
4531
4532 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4533                     int (*poll)(struct napi_struct *, int), int weight)
4534 {
4535         INIT_LIST_HEAD(&napi->poll_list);
4536         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4537         napi->timer.function = napi_watchdog;
4538         napi->gro_count = 0;
4539         napi->gro_list = NULL;
4540         napi->skb = NULL;
4541         napi->poll = poll;
4542         if (weight > NAPI_POLL_WEIGHT)
4543                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4544                             weight, dev->name);
4545         napi->weight = weight;
4546         list_add(&napi->dev_list, &dev->napi_list);
4547         napi->dev = dev;
4548 #ifdef CONFIG_NETPOLL
4549         spin_lock_init(&napi->poll_lock);
4550         napi->poll_owner = -1;
4551 #endif
4552         set_bit(NAPI_STATE_SCHED, &napi->state);
4553 }
4554 EXPORT_SYMBOL(netif_napi_add);
4555
4556 void napi_disable(struct napi_struct *n)
4557 {
4558         might_sleep();
4559         set_bit(NAPI_STATE_DISABLE, &n->state);
4560
4561         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4562                 msleep(1);
4563
4564         hrtimer_cancel(&n->timer);
4565
4566         clear_bit(NAPI_STATE_DISABLE, &n->state);
4567 }
4568 EXPORT_SYMBOL(napi_disable);
4569
4570 void netif_napi_del(struct napi_struct *napi)
4571 {
4572         list_del_init(&napi->dev_list);
4573         napi_free_frags(napi);
4574
4575         kfree_skb_list(napi->gro_list);
4576         napi->gro_list = NULL;
4577         napi->gro_count = 0;
4578 }
4579 EXPORT_SYMBOL(netif_napi_del);
4580
4581 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4582 {
4583         void *have;
4584         int work, weight;
4585
4586         list_del_init(&n->poll_list);
4587
4588         have = netpoll_poll_lock(n);
4589
4590         weight = n->weight;
4591
4592         /* This NAPI_STATE_SCHED test is for avoiding a race
4593          * with netpoll's poll_napi().  Only the entity which
4594          * obtains the lock and sees NAPI_STATE_SCHED set will
4595          * actually make the ->poll() call.  Therefore we avoid
4596          * accidentally calling ->poll() when NAPI is not scheduled.
4597          */
4598         work = 0;
4599         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4600                 work = n->poll(n, weight);
4601                 trace_napi_poll(n);
4602         }
4603
4604         WARN_ON_ONCE(work > weight);
4605
4606         if (likely(work < weight))
4607                 goto out_unlock;
4608
4609         /* Drivers must not modify the NAPI state if they
4610          * consume the entire weight.  In such cases this code
4611          * still "owns" the NAPI instance and therefore can
4612          * move the instance around on the list at-will.
4613          */
4614         if (unlikely(napi_disable_pending(n))) {
4615                 napi_complete(n);
4616                 goto out_unlock;
4617         }
4618
4619         if (n->gro_list) {
4620                 /* flush too old packets
4621                  * If HZ < 1000, flush all packets.
4622                  */
4623                 napi_gro_flush(n, HZ >= 1000);
4624         }
4625
4626         /* Some drivers may have called napi_schedule
4627          * prior to exhausting their budget.
4628          */
4629         if (unlikely(!list_empty(&n->poll_list))) {
4630                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4631                              n->dev ? n->dev->name : "backlog");
4632                 goto out_unlock;
4633         }
4634
4635         list_add_tail(&n->poll_list, repoll);
4636
4637 out_unlock:
4638         netpoll_poll_unlock(have);
4639
4640         return work;
4641 }
4642
4643 static void net_rx_action(struct softirq_action *h)
4644 {
4645         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4646         unsigned long time_limit = jiffies + 2;
4647         int budget = netdev_budget;
4648         LIST_HEAD(list);
4649         LIST_HEAD(repoll);
4650
4651         local_irq_disable();
4652         list_splice_init(&sd->poll_list, &list);
4653         local_irq_enable();
4654
4655         for (;;) {
4656                 struct napi_struct *n;
4657
4658                 if (list_empty(&list)) {
4659                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4660                                 return;
4661                         break;
4662                 }
4663
4664                 n = list_first_entry(&list, struct napi_struct, poll_list);
4665                 budget -= napi_poll(n, &repoll);
4666
4667                 /* If softirq window is exhausted then punt.
4668                  * Allow this to run for 2 jiffies since which will allow
4669                  * an average latency of 1.5/HZ.
4670                  */
4671                 if (unlikely(budget <= 0 ||
4672                              time_after_eq(jiffies, time_limit))) {
4673                         sd->time_squeeze++;
4674                         break;
4675                 }
4676         }
4677
4678         local_irq_disable();
4679
4680         list_splice_tail_init(&sd->poll_list, &list);
4681         list_splice_tail(&repoll, &list);
4682         list_splice(&list, &sd->poll_list);
4683         if (!list_empty(&sd->poll_list))
4684                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4685
4686         net_rps_action_and_irq_enable(sd);
4687 }
4688
4689 struct netdev_adjacent {
4690         struct net_device *dev;
4691
4692         /* upper master flag, there can only be one master device per list */
4693         bool master;
4694
4695         /* counter for the number of times this device was added to us */
4696         u16 ref_nr;
4697
4698         /* private field for the users */
4699         void *private;
4700
4701         struct list_head list;
4702         struct rcu_head rcu;
4703 };
4704
4705 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4706                                                  struct net_device *adj_dev,
4707                                                  struct list_head *adj_list)
4708 {
4709         struct netdev_adjacent *adj;
4710
4711         list_for_each_entry(adj, adj_list, list) {
4712                 if (adj->dev == adj_dev)
4713                         return adj;
4714         }
4715         return NULL;
4716 }
4717
4718 /**
4719  * netdev_has_upper_dev - Check if device is linked to an upper device
4720  * @dev: device
4721  * @upper_dev: upper device to check
4722  *
4723  * Find out if a device is linked to specified upper device and return true
4724  * in case it is. Note that this checks only immediate upper device,
4725  * not through a complete stack of devices. The caller must hold the RTNL lock.
4726  */
4727 bool netdev_has_upper_dev(struct net_device *dev,
4728                           struct net_device *upper_dev)
4729 {
4730         ASSERT_RTNL();
4731
4732         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4733 }
4734 EXPORT_SYMBOL(netdev_has_upper_dev);
4735
4736 /**
4737  * netdev_has_any_upper_dev - Check if device is linked to some device
4738  * @dev: device
4739  *
4740  * Find out if a device is linked to an upper device and return true in case
4741  * it is. The caller must hold the RTNL lock.
4742  */
4743 static bool netdev_has_any_upper_dev(struct net_device *dev)
4744 {
4745         ASSERT_RTNL();
4746
4747         return !list_empty(&dev->all_adj_list.upper);
4748 }
4749
4750 /**
4751  * netdev_master_upper_dev_get - Get master upper device
4752  * @dev: device
4753  *
4754  * Find a master upper device and return pointer to it or NULL in case
4755  * it's not there. The caller must hold the RTNL lock.
4756  */
4757 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4758 {
4759         struct netdev_adjacent *upper;
4760
4761         ASSERT_RTNL();
4762
4763         if (list_empty(&dev->adj_list.upper))
4764                 return NULL;
4765
4766         upper = list_first_entry(&dev->adj_list.upper,
4767                                  struct netdev_adjacent, list);
4768         if (likely(upper->master))
4769                 return upper->dev;
4770         return NULL;
4771 }
4772 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4773
4774 void *netdev_adjacent_get_private(struct list_head *adj_list)
4775 {
4776         struct netdev_adjacent *adj;
4777
4778         adj = list_entry(adj_list, struct netdev_adjacent, list);
4779
4780         return adj->private;
4781 }
4782 EXPORT_SYMBOL(netdev_adjacent_get_private);
4783
4784 /**
4785  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4786  * @dev: device
4787  * @iter: list_head ** of the current position
4788  *
4789  * Gets the next device from the dev's upper list, starting from iter
4790  * position. The caller must hold RCU read lock.
4791  */
4792 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4793                                                  struct list_head **iter)
4794 {
4795         struct netdev_adjacent *upper;
4796
4797         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4798
4799         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4800
4801         if (&upper->list == &dev->adj_list.upper)
4802                 return NULL;
4803
4804         *iter = &upper->list;
4805
4806         return upper->dev;
4807 }
4808 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4809
4810 /**
4811  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4812  * @dev: device
4813  * @iter: list_head ** of the current position
4814  *
4815  * Gets the next device from the dev's upper list, starting from iter
4816  * position. The caller must hold RCU read lock.
4817  */
4818 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4819                                                      struct list_head **iter)
4820 {
4821         struct netdev_adjacent *upper;
4822
4823         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4824
4825         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4826
4827         if (&upper->list == &dev->all_adj_list.upper)
4828                 return NULL;
4829
4830         *iter = &upper->list;
4831
4832         return upper->dev;
4833 }
4834 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4835
4836 /**
4837  * netdev_lower_get_next_private - Get the next ->private from the
4838  *                                 lower neighbour list
4839  * @dev: device
4840  * @iter: list_head ** of the current position
4841  *
4842  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4843  * list, starting from iter position. The caller must hold either hold the
4844  * RTNL lock or its own locking that guarantees that the neighbour lower
4845  * list will remain unchainged.
4846  */
4847 void *netdev_lower_get_next_private(struct net_device *dev,
4848                                     struct list_head **iter)
4849 {
4850         struct netdev_adjacent *lower;
4851
4852         lower = list_entry(*iter, struct netdev_adjacent, list);
4853
4854         if (&lower->list == &dev->adj_list.lower)
4855                 return NULL;
4856
4857         *iter = lower->list.next;
4858
4859         return lower->private;
4860 }
4861 EXPORT_SYMBOL(netdev_lower_get_next_private);
4862
4863 /**
4864  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4865  *                                     lower neighbour list, RCU
4866  *                                     variant
4867  * @dev: device
4868  * @iter: list_head ** of the current position
4869  *
4870  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4871  * list, starting from iter position. The caller must hold RCU read lock.
4872  */
4873 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4874                                         struct list_head **iter)
4875 {
4876         struct netdev_adjacent *lower;
4877
4878         WARN_ON_ONCE(!rcu_read_lock_held());
4879
4880         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4881
4882         if (&lower->list == &dev->adj_list.lower)
4883                 return NULL;
4884
4885         *iter = &lower->list;
4886
4887         return lower->private;
4888 }
4889 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4890
4891 /**
4892  * netdev_lower_get_next - Get the next device from the lower neighbour
4893  *                         list
4894  * @dev: device
4895  * @iter: list_head ** of the current position
4896  *
4897  * Gets the next netdev_adjacent from the dev's lower neighbour
4898  * list, starting from iter position. The caller must hold RTNL lock or
4899  * its own locking that guarantees that the neighbour lower
4900  * list will remain unchainged.
4901  */
4902 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4903 {
4904         struct netdev_adjacent *lower;
4905
4906         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4907
4908         if (&lower->list == &dev->adj_list.lower)
4909                 return NULL;
4910
4911         *iter = &lower->list;
4912
4913         return lower->dev;
4914 }
4915 EXPORT_SYMBOL(netdev_lower_get_next);
4916
4917 /**
4918  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4919  *                                     lower neighbour list, RCU
4920  *                                     variant
4921  * @dev: device
4922  *
4923  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4924  * list. The caller must hold RCU read lock.
4925  */
4926 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4927 {
4928         struct netdev_adjacent *lower;
4929
4930         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4931                         struct netdev_adjacent, list);
4932         if (lower)
4933                 return lower->private;
4934         return NULL;
4935 }
4936 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4937
4938 /**
4939  * netdev_master_upper_dev_get_rcu - Get master upper device
4940  * @dev: device
4941  *
4942  * Find a master upper device and return pointer to it or NULL in case
4943  * it's not there. The caller must hold the RCU read lock.
4944  */
4945 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4946 {
4947         struct netdev_adjacent *upper;
4948
4949         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4950                                        struct netdev_adjacent, list);
4951         if (upper && likely(upper->master))
4952                 return upper->dev;
4953         return NULL;
4954 }
4955 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4956
4957 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4958                               struct net_device *adj_dev,
4959                               struct list_head *dev_list)
4960 {
4961         char linkname[IFNAMSIZ+7];
4962         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4963                 "upper_%s" : "lower_%s", adj_dev->name);
4964         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4965                                  linkname);
4966 }
4967 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4968                                char *name,
4969                                struct list_head *dev_list)
4970 {
4971         char linkname[IFNAMSIZ+7];
4972         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4973                 "upper_%s" : "lower_%s", name);
4974         sysfs_remove_link(&(dev->dev.kobj), linkname);
4975 }
4976
4977 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4978                                                  struct net_device *adj_dev,
4979                                                  struct list_head *dev_list)
4980 {
4981         return (dev_list == &dev->adj_list.upper ||
4982                 dev_list == &dev->adj_list.lower) &&
4983                 net_eq(dev_net(dev), dev_net(adj_dev));
4984 }
4985
4986 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4987                                         struct net_device *adj_dev,
4988                                         struct list_head *dev_list,
4989                                         void *private, bool master)
4990 {
4991         struct netdev_adjacent *adj;
4992         int ret;
4993
4994         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4995
4996         if (adj) {
4997                 adj->ref_nr++;
4998                 return 0;
4999         }
5000
5001         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5002         if (!adj)
5003                 return -ENOMEM;
5004
5005         adj->dev = adj_dev;
5006         adj->master = master;
5007         adj->ref_nr = 1;
5008         adj->private = private;
5009         dev_hold(adj_dev);
5010
5011         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5012                  adj_dev->name, dev->name, adj_dev->name);
5013
5014         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5015                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5016                 if (ret)
5017                         goto free_adj;
5018         }
5019
5020         /* Ensure that master link is always the first item in list. */
5021         if (master) {
5022                 ret = sysfs_create_link(&(dev->dev.kobj),
5023                                         &(adj_dev->dev.kobj), "master");
5024                 if (ret)
5025                         goto remove_symlinks;
5026
5027                 list_add_rcu(&adj->list, dev_list);
5028         } else {
5029                 list_add_tail_rcu(&adj->list, dev_list);
5030         }
5031
5032         return 0;
5033
5034 remove_symlinks:
5035         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5036                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5037 free_adj:
5038         kfree(adj);
5039         dev_put(adj_dev);
5040
5041         return ret;
5042 }
5043
5044 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5045                                          struct net_device *adj_dev,
5046                                          struct list_head *dev_list)
5047 {
5048         struct netdev_adjacent *adj;
5049
5050         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5051
5052         if (!adj) {
5053                 pr_err("tried to remove device %s from %s\n",
5054                        dev->name, adj_dev->name);
5055                 BUG();
5056         }
5057
5058         if (adj->ref_nr > 1) {
5059                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5060                          adj->ref_nr-1);
5061                 adj->ref_nr--;
5062                 return;
5063         }
5064
5065         if (adj->master)
5066                 sysfs_remove_link(&(dev->dev.kobj), "master");
5067
5068         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5069                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5070
5071         list_del_rcu(&adj->list);
5072         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5073                  adj_dev->name, dev->name, adj_dev->name);
5074         dev_put(adj_dev);
5075         kfree_rcu(adj, rcu);
5076 }
5077
5078 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5079                                             struct net_device *upper_dev,
5080                                             struct list_head *up_list,
5081                                             struct list_head *down_list,
5082                                             void *private, bool master)
5083 {
5084         int ret;
5085
5086         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5087                                            master);
5088         if (ret)
5089                 return ret;
5090
5091         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5092                                            false);
5093         if (ret) {
5094                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5095                 return ret;
5096         }
5097
5098         return 0;
5099 }
5100
5101 static int __netdev_adjacent_dev_link(struct net_device *dev,
5102                                       struct net_device *upper_dev)
5103 {
5104         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5105                                                 &dev->all_adj_list.upper,
5106                                                 &upper_dev->all_adj_list.lower,
5107                                                 NULL, false);
5108 }
5109
5110 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5111                                                struct net_device *upper_dev,
5112                                                struct list_head *up_list,
5113                                                struct list_head *down_list)
5114 {
5115         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5116         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5117 }
5118
5119 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5120                                          struct net_device *upper_dev)
5121 {
5122         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5123                                            &dev->all_adj_list.upper,
5124                                            &upper_dev->all_adj_list.lower);
5125 }
5126
5127 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5128                                                 struct net_device *upper_dev,
5129                                                 void *private, bool master)
5130 {
5131         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5132
5133         if (ret)
5134                 return ret;
5135
5136         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5137                                                &dev->adj_list.upper,
5138                                                &upper_dev->adj_list.lower,
5139                                                private, master);
5140         if (ret) {
5141                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5142                 return ret;
5143         }
5144
5145         return 0;
5146 }
5147
5148 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5149                                                    struct net_device *upper_dev)
5150 {
5151         __netdev_adjacent_dev_unlink(dev, upper_dev);
5152         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5153                                            &dev->adj_list.upper,
5154                                            &upper_dev->adj_list.lower);
5155 }
5156
5157 static int __netdev_upper_dev_link(struct net_device *dev,
5158                                    struct net_device *upper_dev, bool master,
5159                                    void *private)
5160 {
5161         struct netdev_adjacent *i, *j, *to_i, *to_j;
5162         int ret = 0;
5163
5164         ASSERT_RTNL();
5165
5166         if (dev == upper_dev)
5167                 return -EBUSY;
5168
5169         /* To prevent loops, check if dev is not upper device to upper_dev. */
5170         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5171                 return -EBUSY;
5172
5173         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5174                 return -EEXIST;
5175
5176         if (master && netdev_master_upper_dev_get(dev))
5177                 return -EBUSY;
5178
5179         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5180                                                    master);
5181         if (ret)
5182                 return ret;
5183
5184         /* Now that we linked these devs, make all the upper_dev's
5185          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5186          * versa, and don't forget the devices itself. All of these
5187          * links are non-neighbours.
5188          */
5189         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5190                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5191                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5192                                  i->dev->name, j->dev->name);
5193                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5194                         if (ret)
5195                                 goto rollback_mesh;
5196                 }
5197         }
5198
5199         /* add dev to every upper_dev's upper device */
5200         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5201                 pr_debug("linking %s's upper device %s with %s\n",
5202                          upper_dev->name, i->dev->name, dev->name);
5203                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5204                 if (ret)
5205                         goto rollback_upper_mesh;
5206         }
5207
5208         /* add upper_dev to every dev's lower device */
5209         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5210                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5211                          i->dev->name, upper_dev->name);
5212                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5213                 if (ret)
5214                         goto rollback_lower_mesh;
5215         }
5216
5217         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5218         return 0;
5219
5220 rollback_lower_mesh:
5221         to_i = i;
5222         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5223                 if (i == to_i)
5224                         break;
5225                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5226         }
5227
5228         i = NULL;
5229
5230 rollback_upper_mesh:
5231         to_i = i;
5232         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5233                 if (i == to_i)
5234                         break;
5235                 __netdev_adjacent_dev_unlink(dev, i->dev);
5236         }
5237
5238         i = j = NULL;
5239
5240 rollback_mesh:
5241         to_i = i;
5242         to_j = j;
5243         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5244                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5245                         if (i == to_i && j == to_j)
5246                                 break;
5247                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5248                 }
5249                 if (i == to_i)
5250                         break;
5251         }
5252
5253         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5254
5255         return ret;
5256 }
5257
5258 /**
5259  * netdev_upper_dev_link - Add a link to the upper device
5260  * @dev: device
5261  * @upper_dev: new upper device
5262  *
5263  * Adds a link to device which is upper to this one. The caller must hold
5264  * the RTNL lock. On a failure a negative errno code is returned.
5265  * On success the reference counts are adjusted and the function
5266  * returns zero.
5267  */
5268 int netdev_upper_dev_link(struct net_device *dev,
5269                           struct net_device *upper_dev)
5270 {
5271         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5272 }
5273 EXPORT_SYMBOL(netdev_upper_dev_link);
5274
5275 /**
5276  * netdev_master_upper_dev_link - Add a master link to the upper device
5277  * @dev: device
5278  * @upper_dev: new upper device
5279  *
5280  * Adds a link to device which is upper to this one. In this case, only
5281  * one master upper device can be linked, although other non-master devices
5282  * might be linked as well. The caller must hold the RTNL lock.
5283  * On a failure a negative errno code is returned. On success the reference
5284  * counts are adjusted and the function returns zero.
5285  */
5286 int netdev_master_upper_dev_link(struct net_device *dev,
5287                                  struct net_device *upper_dev)
5288 {
5289         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5290 }
5291 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5292
5293 int netdev_master_upper_dev_link_private(struct net_device *dev,
5294                                          struct net_device *upper_dev,
5295                                          void *private)
5296 {
5297         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5298 }
5299 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5300
5301 /**
5302  * netdev_upper_dev_unlink - Removes a link to upper device
5303  * @dev: device
5304  * @upper_dev: new upper device
5305  *
5306  * Removes a link to device which is upper to this one. The caller must hold
5307  * the RTNL lock.
5308  */
5309 void netdev_upper_dev_unlink(struct net_device *dev,
5310                              struct net_device *upper_dev)
5311 {
5312         struct netdev_adjacent *i, *j;
5313         ASSERT_RTNL();
5314
5315         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5316
5317         /* Here is the tricky part. We must remove all dev's lower
5318          * devices from all upper_dev's upper devices and vice
5319          * versa, to maintain the graph relationship.
5320          */
5321         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5322                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5323                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5324
5325         /* remove also the devices itself from lower/upper device
5326          * list
5327          */
5328         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5329                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5330
5331         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5332                 __netdev_adjacent_dev_unlink(dev, i->dev);
5333
5334         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5335 }
5336 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5337
5338 /**
5339  * netdev_bonding_info_change - Dispatch event about slave change
5340  * @dev: device
5341  * @bonding_info: info to dispatch
5342  *
5343  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5344  * The caller must hold the RTNL lock.
5345  */
5346 void netdev_bonding_info_change(struct net_device *dev,
5347                                 struct netdev_bonding_info *bonding_info)
5348 {
5349         struct netdev_notifier_bonding_info     info;
5350
5351         memcpy(&info.bonding_info, bonding_info,
5352                sizeof(struct netdev_bonding_info));
5353         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5354                                       &info.info);
5355 }
5356 EXPORT_SYMBOL(netdev_bonding_info_change);
5357
5358 static void netdev_adjacent_add_links(struct net_device *dev)
5359 {
5360         struct netdev_adjacent *iter;
5361
5362         struct net *net = dev_net(dev);
5363
5364         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5365                 if (!net_eq(net,dev_net(iter->dev)))
5366                         continue;
5367                 netdev_adjacent_sysfs_add(iter->dev, dev,
5368                                           &iter->dev->adj_list.lower);
5369                 netdev_adjacent_sysfs_add(dev, iter->dev,
5370                                           &dev->adj_list.upper);
5371         }
5372
5373         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5374                 if (!net_eq(net,dev_net(iter->dev)))
5375                         continue;
5376                 netdev_adjacent_sysfs_add(iter->dev, dev,
5377                                           &iter->dev->adj_list.upper);
5378                 netdev_adjacent_sysfs_add(dev, iter->dev,
5379                                           &dev->adj_list.lower);
5380         }
5381 }
5382
5383 static void netdev_adjacent_del_links(struct net_device *dev)
5384 {
5385         struct netdev_adjacent *iter;
5386
5387         struct net *net = dev_net(dev);
5388
5389         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5390                 if (!net_eq(net,dev_net(iter->dev)))
5391                         continue;
5392                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5393                                           &iter->dev->adj_list.lower);
5394                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5395                                           &dev->adj_list.upper);
5396         }
5397
5398         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5399                 if (!net_eq(net,dev_net(iter->dev)))
5400                         continue;
5401                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5402                                           &iter->dev->adj_list.upper);
5403                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5404                                           &dev->adj_list.lower);
5405         }
5406 }
5407
5408 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5409 {
5410         struct netdev_adjacent *iter;
5411
5412         struct net *net = dev_net(dev);
5413
5414         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5415                 if (!net_eq(net,dev_net(iter->dev)))
5416                         continue;
5417                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5418                                           &iter->dev->adj_list.lower);
5419                 netdev_adjacent_sysfs_add(iter->dev, dev,
5420                                           &iter->dev->adj_list.lower);
5421         }
5422
5423         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5424                 if (!net_eq(net,dev_net(iter->dev)))
5425                         continue;
5426                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5427                                           &iter->dev->adj_list.upper);
5428                 netdev_adjacent_sysfs_add(iter->dev, dev,
5429                                           &iter->dev->adj_list.upper);
5430         }
5431 }
5432
5433 void *netdev_lower_dev_get_private(struct net_device *dev,
5434                                    struct net_device *lower_dev)
5435 {
5436         struct netdev_adjacent *lower;
5437
5438         if (!lower_dev)
5439                 return NULL;
5440         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5441         if (!lower)
5442                 return NULL;
5443
5444         return lower->private;
5445 }
5446 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5447
5448
5449 int dev_get_nest_level(struct net_device *dev,
5450                        bool (*type_check)(struct net_device *dev))
5451 {
5452         struct net_device *lower = NULL;
5453         struct list_head *iter;
5454         int max_nest = -1;
5455         int nest;
5456
5457         ASSERT_RTNL();
5458
5459         netdev_for_each_lower_dev(dev, lower, iter) {
5460                 nest = dev_get_nest_level(lower, type_check);
5461                 if (max_nest < nest)
5462                         max_nest = nest;
5463         }
5464
5465         if (type_check(dev))
5466                 max_nest++;
5467
5468         return max_nest;
5469 }
5470 EXPORT_SYMBOL(dev_get_nest_level);
5471
5472 static void dev_change_rx_flags(struct net_device *dev, int flags)
5473 {
5474         const struct net_device_ops *ops = dev->netdev_ops;
5475
5476         if (ops->ndo_change_rx_flags)
5477                 ops->ndo_change_rx_flags(dev, flags);
5478 }
5479
5480 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5481 {
5482         unsigned int old_flags = dev->flags;
5483         kuid_t uid;
5484         kgid_t gid;
5485
5486         ASSERT_RTNL();
5487
5488         dev->flags |= IFF_PROMISC;
5489         dev->promiscuity += inc;
5490         if (dev->promiscuity == 0) {
5491                 /*
5492                  * Avoid overflow.
5493                  * If inc causes overflow, untouch promisc and return error.
5494                  */
5495                 if (inc < 0)
5496                         dev->flags &= ~IFF_PROMISC;
5497                 else {
5498                         dev->promiscuity -= inc;
5499                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5500                                 dev->name);
5501                         return -EOVERFLOW;
5502                 }
5503         }
5504         if (dev->flags != old_flags) {
5505                 pr_info("device %s %s promiscuous mode\n",
5506                         dev->name,
5507                         dev->flags & IFF_PROMISC ? "entered" : "left");
5508                 if (audit_enabled) {
5509                         current_uid_gid(&uid, &gid);
5510                         audit_log(current->audit_context, GFP_ATOMIC,
5511                                 AUDIT_ANOM_PROMISCUOUS,
5512                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5513                                 dev->name, (dev->flags & IFF_PROMISC),
5514                                 (old_flags & IFF_PROMISC),
5515                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5516                                 from_kuid(&init_user_ns, uid),
5517                                 from_kgid(&init_user_ns, gid),
5518                                 audit_get_sessionid(current));
5519                 }
5520
5521                 dev_change_rx_flags(dev, IFF_PROMISC);
5522         }
5523         if (notify)
5524                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5525         return 0;
5526 }
5527
5528 /**
5529  *      dev_set_promiscuity     - update promiscuity count on a device
5530  *      @dev: device
5531  *      @inc: modifier
5532  *
5533  *      Add or remove promiscuity from a device. While the count in the device
5534  *      remains above zero the interface remains promiscuous. Once it hits zero
5535  *      the device reverts back to normal filtering operation. A negative inc
5536  *      value is used to drop promiscuity on the device.
5537  *      Return 0 if successful or a negative errno code on error.
5538  */
5539 int dev_set_promiscuity(struct net_device *dev, int inc)
5540 {
5541         unsigned int old_flags = dev->flags;
5542         int err;
5543
5544         err = __dev_set_promiscuity(dev, inc, true);
5545         if (err < 0)
5546                 return err;
5547         if (dev->flags != old_flags)
5548                 dev_set_rx_mode(dev);
5549         return err;
5550 }
5551 EXPORT_SYMBOL(dev_set_promiscuity);
5552
5553 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5554 {
5555         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5556
5557         ASSERT_RTNL();
5558
5559         dev->flags |= IFF_ALLMULTI;
5560         dev->allmulti += inc;
5561         if (dev->allmulti == 0) {
5562                 /*
5563                  * Avoid overflow.
5564                  * If inc causes overflow, untouch allmulti and return error.
5565                  */
5566                 if (inc < 0)
5567                         dev->flags &= ~IFF_ALLMULTI;
5568                 else {
5569                         dev->allmulti -= inc;
5570                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5571                                 dev->name);
5572                         return -EOVERFLOW;
5573                 }
5574         }
5575         if (dev->flags ^ old_flags) {
5576                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5577                 dev_set_rx_mode(dev);
5578                 if (notify)
5579                         __dev_notify_flags(dev, old_flags,
5580                                            dev->gflags ^ old_gflags);
5581         }
5582         return 0;
5583 }
5584
5585 /**
5586  *      dev_set_allmulti        - update allmulti count on a device
5587  *      @dev: device
5588  *      @inc: modifier
5589  *
5590  *      Add or remove reception of all multicast frames to a device. While the
5591  *      count in the device remains above zero the interface remains listening
5592  *      to all interfaces. Once it hits zero the device reverts back to normal
5593  *      filtering operation. A negative @inc value is used to drop the counter
5594  *      when releasing a resource needing all multicasts.
5595  *      Return 0 if successful or a negative errno code on error.
5596  */
5597
5598 int dev_set_allmulti(struct net_device *dev, int inc)
5599 {
5600         return __dev_set_allmulti(dev, inc, true);
5601 }
5602 EXPORT_SYMBOL(dev_set_allmulti);
5603
5604 /*
5605  *      Upload unicast and multicast address lists to device and
5606  *      configure RX filtering. When the device doesn't support unicast
5607  *      filtering it is put in promiscuous mode while unicast addresses
5608  *      are present.
5609  */
5610 void __dev_set_rx_mode(struct net_device *dev)
5611 {
5612         const struct net_device_ops *ops = dev->netdev_ops;
5613
5614         /* dev_open will call this function so the list will stay sane. */
5615         if (!(dev->flags&IFF_UP))
5616                 return;
5617
5618         if (!netif_device_present(dev))
5619                 return;
5620
5621         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5622                 /* Unicast addresses changes may only happen under the rtnl,
5623                  * therefore calling __dev_set_promiscuity here is safe.
5624                  */
5625                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5626                         __dev_set_promiscuity(dev, 1, false);
5627                         dev->uc_promisc = true;
5628                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5629                         __dev_set_promiscuity(dev, -1, false);
5630                         dev->uc_promisc = false;
5631                 }
5632         }
5633
5634         if (ops->ndo_set_rx_mode)
5635                 ops->ndo_set_rx_mode(dev);
5636 }
5637
5638 void dev_set_rx_mode(struct net_device *dev)
5639 {
5640         netif_addr_lock_bh(dev);
5641         __dev_set_rx_mode(dev);
5642         netif_addr_unlock_bh(dev);
5643 }
5644
5645 /**
5646  *      dev_get_flags - get flags reported to userspace
5647  *      @dev: device
5648  *
5649  *      Get the combination of flag bits exported through APIs to userspace.
5650  */
5651 unsigned int dev_get_flags(const struct net_device *dev)
5652 {
5653         unsigned int flags;
5654
5655         flags = (dev->flags & ~(IFF_PROMISC |
5656                                 IFF_ALLMULTI |
5657                                 IFF_RUNNING |
5658                                 IFF_LOWER_UP |
5659                                 IFF_DORMANT)) |
5660                 (dev->gflags & (IFF_PROMISC |
5661                                 IFF_ALLMULTI));
5662
5663         if (netif_running(dev)) {
5664                 if (netif_oper_up(dev))
5665                         flags |= IFF_RUNNING;
5666                 if (netif_carrier_ok(dev))
5667                         flags |= IFF_LOWER_UP;
5668                 if (netif_dormant(dev))
5669                         flags |= IFF_DORMANT;
5670         }
5671
5672         return flags;
5673 }
5674 EXPORT_SYMBOL(dev_get_flags);
5675
5676 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5677 {
5678         unsigned int old_flags = dev->flags;
5679         int ret;
5680
5681         ASSERT_RTNL();
5682
5683         /*
5684          *      Set the flags on our device.
5685          */
5686
5687         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5688                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5689                                IFF_AUTOMEDIA)) |
5690                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5691                                     IFF_ALLMULTI));
5692
5693         /*
5694          *      Load in the correct multicast list now the flags have changed.
5695          */
5696
5697         if ((old_flags ^ flags) & IFF_MULTICAST)
5698                 dev_change_rx_flags(dev, IFF_MULTICAST);
5699
5700         dev_set_rx_mode(dev);
5701
5702         /*
5703          *      Have we downed the interface. We handle IFF_UP ourselves
5704          *      according to user attempts to set it, rather than blindly
5705          *      setting it.
5706          */
5707
5708         ret = 0;
5709         if ((old_flags ^ flags) & IFF_UP)
5710                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5711
5712         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5713                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5714                 unsigned int old_flags = dev->flags;
5715
5716                 dev->gflags ^= IFF_PROMISC;
5717
5718                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5719                         if (dev->flags != old_flags)
5720                                 dev_set_rx_mode(dev);
5721         }
5722
5723         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5724            is important. Some (broken) drivers set IFF_PROMISC, when
5725            IFF_ALLMULTI is requested not asking us and not reporting.
5726          */
5727         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5728                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5729
5730                 dev->gflags ^= IFF_ALLMULTI;
5731                 __dev_set_allmulti(dev, inc, false);
5732         }
5733
5734         return ret;
5735 }
5736
5737 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5738                         unsigned int gchanges)
5739 {
5740         unsigned int changes = dev->flags ^ old_flags;
5741
5742         if (gchanges)
5743                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5744
5745         if (changes & IFF_UP) {
5746                 if (dev->flags & IFF_UP)
5747                         call_netdevice_notifiers(NETDEV_UP, dev);
5748                 else
5749                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5750         }
5751
5752         if (dev->flags & IFF_UP &&
5753             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5754                 struct netdev_notifier_change_info change_info;
5755
5756                 change_info.flags_changed = changes;
5757                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5758                                               &change_info.info);
5759         }
5760 }
5761
5762 /**
5763  *      dev_change_flags - change device settings
5764  *      @dev: device
5765  *      @flags: device state flags
5766  *
5767  *      Change settings on device based state flags. The flags are
5768  *      in the userspace exported format.
5769  */
5770 int dev_change_flags(struct net_device *dev, unsigned int flags)
5771 {
5772         int ret;
5773         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5774
5775         ret = __dev_change_flags(dev, flags);
5776         if (ret < 0)
5777                 return ret;
5778
5779         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5780         __dev_notify_flags(dev, old_flags, changes);
5781         return ret;
5782 }
5783 EXPORT_SYMBOL(dev_change_flags);
5784
5785 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5786 {
5787         const struct net_device_ops *ops = dev->netdev_ops;
5788
5789         if (ops->ndo_change_mtu)
5790                 return ops->ndo_change_mtu(dev, new_mtu);
5791
5792         dev->mtu = new_mtu;
5793         return 0;
5794 }
5795
5796 /**
5797  *      dev_set_mtu - Change maximum transfer unit
5798  *      @dev: device
5799  *      @new_mtu: new transfer unit
5800  *
5801  *      Change the maximum transfer size of the network device.
5802  */
5803 int dev_set_mtu(struct net_device *dev, int new_mtu)
5804 {
5805         int err, orig_mtu;
5806
5807         if (new_mtu == dev->mtu)
5808                 return 0;
5809
5810         /*      MTU must be positive.    */
5811         if (new_mtu < 0)
5812                 return -EINVAL;
5813
5814         if (!netif_device_present(dev))
5815                 return -ENODEV;
5816
5817         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5818         err = notifier_to_errno(err);
5819         if (err)
5820                 return err;
5821
5822         orig_mtu = dev->mtu;
5823         err = __dev_set_mtu(dev, new_mtu);
5824
5825         if (!err) {
5826                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5827                 err = notifier_to_errno(err);
5828                 if (err) {
5829                         /* setting mtu back and notifying everyone again,
5830                          * so that they have a chance to revert changes.
5831                          */
5832                         __dev_set_mtu(dev, orig_mtu);
5833                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5834                 }
5835         }
5836         return err;
5837 }
5838 EXPORT_SYMBOL(dev_set_mtu);
5839
5840 /**
5841  *      dev_set_group - Change group this device belongs to
5842  *      @dev: device
5843  *      @new_group: group this device should belong to
5844  */
5845 void dev_set_group(struct net_device *dev, int new_group)
5846 {
5847         dev->group = new_group;
5848 }
5849 EXPORT_SYMBOL(dev_set_group);
5850
5851 /**
5852  *      dev_set_mac_address - Change Media Access Control Address
5853  *      @dev: device
5854  *      @sa: new address
5855  *
5856  *      Change the hardware (MAC) address of the device
5857  */
5858 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5859 {
5860         const struct net_device_ops *ops = dev->netdev_ops;
5861         int err;
5862
5863         if (!ops->ndo_set_mac_address)
5864                 return -EOPNOTSUPP;
5865         if (sa->sa_family != dev->type)
5866                 return -EINVAL;
5867         if (!netif_device_present(dev))
5868                 return -ENODEV;
5869         err = ops->ndo_set_mac_address(dev, sa);
5870         if (err)
5871                 return err;
5872         dev->addr_assign_type = NET_ADDR_SET;
5873         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5874         add_device_randomness(dev->dev_addr, dev->addr_len);
5875         return 0;
5876 }
5877 EXPORT_SYMBOL(dev_set_mac_address);
5878
5879 /**
5880  *      dev_change_carrier - Change device carrier
5881  *      @dev: device
5882  *      @new_carrier: new value
5883  *
5884  *      Change device carrier
5885  */
5886 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5887 {
5888         const struct net_device_ops *ops = dev->netdev_ops;
5889
5890         if (!ops->ndo_change_carrier)
5891                 return -EOPNOTSUPP;
5892         if (!netif_device_present(dev))
5893                 return -ENODEV;
5894         return ops->ndo_change_carrier(dev, new_carrier);
5895 }
5896 EXPORT_SYMBOL(dev_change_carrier);
5897
5898 /**
5899  *      dev_get_phys_port_id - Get device physical port ID
5900  *      @dev: device
5901  *      @ppid: port ID
5902  *
5903  *      Get device physical port ID
5904  */
5905 int dev_get_phys_port_id(struct net_device *dev,
5906                          struct netdev_phys_item_id *ppid)
5907 {
5908         const struct net_device_ops *ops = dev->netdev_ops;
5909
5910         if (!ops->ndo_get_phys_port_id)
5911                 return -EOPNOTSUPP;
5912         return ops->ndo_get_phys_port_id(dev, ppid);
5913 }
5914 EXPORT_SYMBOL(dev_get_phys_port_id);
5915
5916 /**
5917  *      dev_get_phys_port_name - Get device physical port name
5918  *      @dev: device
5919  *      @name: port name
5920  *
5921  *      Get device physical port name
5922  */
5923 int dev_get_phys_port_name(struct net_device *dev,
5924                            char *name, size_t len)
5925 {
5926         const struct net_device_ops *ops = dev->netdev_ops;
5927
5928         if (!ops->ndo_get_phys_port_name)
5929                 return -EOPNOTSUPP;
5930         return ops->ndo_get_phys_port_name(dev, name, len);
5931 }
5932 EXPORT_SYMBOL(dev_get_phys_port_name);
5933
5934 /**
5935  *      dev_new_index   -       allocate an ifindex
5936  *      @net: the applicable net namespace
5937  *
5938  *      Returns a suitable unique value for a new device interface
5939  *      number.  The caller must hold the rtnl semaphore or the
5940  *      dev_base_lock to be sure it remains unique.
5941  */
5942 static int dev_new_index(struct net *net)
5943 {
5944         int ifindex = net->ifindex;
5945         for (;;) {
5946                 if (++ifindex <= 0)
5947                         ifindex = 1;
5948                 if (!__dev_get_by_index(net, ifindex))
5949                         return net->ifindex = ifindex;
5950         }
5951 }
5952
5953 /* Delayed registration/unregisteration */
5954 static LIST_HEAD(net_todo_list);
5955 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5956
5957 static void net_set_todo(struct net_device *dev)
5958 {
5959         list_add_tail(&dev->todo_list, &net_todo_list);
5960         dev_net(dev)->dev_unreg_count++;
5961 }
5962
5963 static void rollback_registered_many(struct list_head *head)
5964 {
5965         struct net_device *dev, *tmp;
5966         LIST_HEAD(close_head);
5967
5968         BUG_ON(dev_boot_phase);
5969         ASSERT_RTNL();
5970
5971         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5972                 /* Some devices call without registering
5973                  * for initialization unwind. Remove those
5974                  * devices and proceed with the remaining.
5975                  */
5976                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5977                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5978                                  dev->name, dev);
5979
5980                         WARN_ON(1);
5981                         list_del(&dev->unreg_list);
5982                         continue;
5983                 }
5984                 dev->dismantle = true;
5985                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5986         }
5987
5988         /* If device is running, close it first. */
5989         list_for_each_entry(dev, head, unreg_list)
5990                 list_add_tail(&dev->close_list, &close_head);
5991         dev_close_many(&close_head, true);
5992
5993         list_for_each_entry(dev, head, unreg_list) {
5994                 /* And unlink it from device chain. */
5995                 unlist_netdevice(dev);
5996
5997                 dev->reg_state = NETREG_UNREGISTERING;
5998         }
5999
6000         synchronize_net();
6001
6002         list_for_each_entry(dev, head, unreg_list) {
6003                 struct sk_buff *skb = NULL;
6004
6005                 /* Shutdown queueing discipline. */
6006                 dev_shutdown(dev);
6007
6008
6009                 /* Notify protocols, that we are about to destroy
6010                    this device. They should clean all the things.
6011                 */
6012                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6013
6014                 if (!dev->rtnl_link_ops ||
6015                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6016                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6017                                                      GFP_KERNEL);
6018
6019                 /*
6020                  *      Flush the unicast and multicast chains
6021                  */
6022                 dev_uc_flush(dev);
6023                 dev_mc_flush(dev);
6024
6025                 if (dev->netdev_ops->ndo_uninit)
6026                         dev->netdev_ops->ndo_uninit(dev);
6027
6028                 if (skb)
6029                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6030
6031                 /* Notifier chain MUST detach us all upper devices. */
6032                 WARN_ON(netdev_has_any_upper_dev(dev));
6033
6034                 /* Remove entries from kobject tree */
6035                 netdev_unregister_kobject(dev);
6036 #ifdef CONFIG_XPS
6037                 /* Remove XPS queueing entries */
6038                 netif_reset_xps_queues_gt(dev, 0);
6039 #endif
6040         }
6041
6042         synchronize_net();
6043
6044         list_for_each_entry(dev, head, unreg_list)
6045                 dev_put(dev);
6046 }
6047
6048 static void rollback_registered(struct net_device *dev)
6049 {
6050         LIST_HEAD(single);
6051
6052         list_add(&dev->unreg_list, &single);
6053         rollback_registered_many(&single);
6054         list_del(&single);
6055 }
6056
6057 static netdev_features_t netdev_fix_features(struct net_device *dev,
6058         netdev_features_t features)
6059 {
6060         /* Fix illegal checksum combinations */
6061         if ((features & NETIF_F_HW_CSUM) &&
6062             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6063                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6064                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6065         }
6066
6067         /* TSO requires that SG is present as well. */
6068         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6069                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6070                 features &= ~NETIF_F_ALL_TSO;
6071         }
6072
6073         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6074                                         !(features & NETIF_F_IP_CSUM)) {
6075                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6076                 features &= ~NETIF_F_TSO;
6077                 features &= ~NETIF_F_TSO_ECN;
6078         }
6079
6080         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6081                                          !(features & NETIF_F_IPV6_CSUM)) {
6082                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6083                 features &= ~NETIF_F_TSO6;
6084         }
6085
6086         /* TSO ECN requires that TSO is present as well. */
6087         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6088                 features &= ~NETIF_F_TSO_ECN;
6089
6090         /* Software GSO depends on SG. */
6091         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6092                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6093                 features &= ~NETIF_F_GSO;
6094         }
6095
6096         /* UFO needs SG and checksumming */
6097         if (features & NETIF_F_UFO) {
6098                 /* maybe split UFO into V4 and V6? */
6099                 if (!((features & NETIF_F_GEN_CSUM) ||
6100                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6101                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6102                         netdev_dbg(dev,
6103                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6104                         features &= ~NETIF_F_UFO;
6105                 }
6106
6107                 if (!(features & NETIF_F_SG)) {
6108                         netdev_dbg(dev,
6109                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6110                         features &= ~NETIF_F_UFO;
6111                 }
6112         }
6113
6114 #ifdef CONFIG_NET_RX_BUSY_POLL
6115         if (dev->netdev_ops->ndo_busy_poll)
6116                 features |= NETIF_F_BUSY_POLL;
6117         else
6118 #endif
6119                 features &= ~NETIF_F_BUSY_POLL;
6120
6121         return features;
6122 }
6123
6124 int __netdev_update_features(struct net_device *dev)
6125 {
6126         netdev_features_t features;
6127         int err = 0;
6128
6129         ASSERT_RTNL();
6130
6131         features = netdev_get_wanted_features(dev);
6132
6133         if (dev->netdev_ops->ndo_fix_features)
6134                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6135
6136         /* driver might be less strict about feature dependencies */
6137         features = netdev_fix_features(dev, features);
6138
6139         if (dev->features == features)
6140                 return 0;
6141
6142         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6143                 &dev->features, &features);
6144
6145         if (dev->netdev_ops->ndo_set_features)
6146                 err = dev->netdev_ops->ndo_set_features(dev, features);
6147
6148         if (unlikely(err < 0)) {
6149                 netdev_err(dev,
6150                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6151                         err, &features, &dev->features);
6152                 return -1;
6153         }
6154
6155         if (!err)
6156                 dev->features = features;
6157
6158         return 1;
6159 }
6160
6161 /**
6162  *      netdev_update_features - recalculate device features
6163  *      @dev: the device to check
6164  *
6165  *      Recalculate dev->features set and send notifications if it
6166  *      has changed. Should be called after driver or hardware dependent
6167  *      conditions might have changed that influence the features.
6168  */
6169 void netdev_update_features(struct net_device *dev)
6170 {
6171         if (__netdev_update_features(dev))
6172                 netdev_features_change(dev);
6173 }
6174 EXPORT_SYMBOL(netdev_update_features);
6175
6176 /**
6177  *      netdev_change_features - recalculate device features
6178  *      @dev: the device to check
6179  *
6180  *      Recalculate dev->features set and send notifications even
6181  *      if they have not changed. Should be called instead of
6182  *      netdev_update_features() if also dev->vlan_features might
6183  *      have changed to allow the changes to be propagated to stacked
6184  *      VLAN devices.
6185  */
6186 void netdev_change_features(struct net_device *dev)
6187 {
6188         __netdev_update_features(dev);
6189         netdev_features_change(dev);
6190 }
6191 EXPORT_SYMBOL(netdev_change_features);
6192
6193 /**
6194  *      netif_stacked_transfer_operstate -      transfer operstate
6195  *      @rootdev: the root or lower level device to transfer state from
6196  *      @dev: the device to transfer operstate to
6197  *
6198  *      Transfer operational state from root to device. This is normally
6199  *      called when a stacking relationship exists between the root
6200  *      device and the device(a leaf device).
6201  */
6202 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6203                                         struct net_device *dev)
6204 {
6205         if (rootdev->operstate == IF_OPER_DORMANT)
6206                 netif_dormant_on(dev);
6207         else
6208                 netif_dormant_off(dev);
6209
6210         if (netif_carrier_ok(rootdev)) {
6211                 if (!netif_carrier_ok(dev))
6212                         netif_carrier_on(dev);
6213         } else {
6214                 if (netif_carrier_ok(dev))
6215                         netif_carrier_off(dev);
6216         }
6217 }
6218 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6219
6220 #ifdef CONFIG_SYSFS
6221 static int netif_alloc_rx_queues(struct net_device *dev)
6222 {
6223         unsigned int i, count = dev->num_rx_queues;
6224         struct netdev_rx_queue *rx;
6225         size_t sz = count * sizeof(*rx);
6226
6227         BUG_ON(count < 1);
6228
6229         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6230         if (!rx) {
6231                 rx = vzalloc(sz);
6232                 if (!rx)
6233                         return -ENOMEM;
6234         }
6235         dev->_rx = rx;
6236
6237         for (i = 0; i < count; i++)
6238                 rx[i].dev = dev;
6239         return 0;
6240 }
6241 #endif
6242
6243 static void netdev_init_one_queue(struct net_device *dev,
6244                                   struct netdev_queue *queue, void *_unused)
6245 {
6246         /* Initialize queue lock */
6247         spin_lock_init(&queue->_xmit_lock);
6248         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6249         queue->xmit_lock_owner = -1;
6250         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6251         queue->dev = dev;
6252 #ifdef CONFIG_BQL
6253         dql_init(&queue->dql, HZ);
6254 #endif
6255 }
6256
6257 static void netif_free_tx_queues(struct net_device *dev)
6258 {
6259         kvfree(dev->_tx);
6260 }
6261
6262 static int netif_alloc_netdev_queues(struct net_device *dev)
6263 {
6264         unsigned int count = dev->num_tx_queues;
6265         struct netdev_queue *tx;
6266         size_t sz = count * sizeof(*tx);
6267
6268         BUG_ON(count < 1 || count > 0xffff);
6269
6270         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6271         if (!tx) {
6272                 tx = vzalloc(sz);
6273                 if (!tx)
6274                         return -ENOMEM;
6275         }
6276         dev->_tx = tx;
6277
6278         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6279         spin_lock_init(&dev->tx_global_lock);
6280
6281         return 0;
6282 }
6283
6284 /**
6285  *      register_netdevice      - register a network device
6286  *      @dev: device to register
6287  *
6288  *      Take a completed network device structure and add it to the kernel
6289  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6290  *      chain. 0 is returned on success. A negative errno code is returned
6291  *      on a failure to set up the device, or if the name is a duplicate.
6292  *
6293  *      Callers must hold the rtnl semaphore. You may want
6294  *      register_netdev() instead of this.
6295  *
6296  *      BUGS:
6297  *      The locking appears insufficient to guarantee two parallel registers
6298  *      will not get the same name.
6299  */
6300
6301 int register_netdevice(struct net_device *dev)
6302 {
6303         int ret;
6304         struct net *net = dev_net(dev);
6305
6306         BUG_ON(dev_boot_phase);
6307         ASSERT_RTNL();
6308
6309         might_sleep();
6310
6311         /* When net_device's are persistent, this will be fatal. */
6312         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6313         BUG_ON(!net);
6314
6315         spin_lock_init(&dev->addr_list_lock);
6316         netdev_set_addr_lockdep_class(dev);
6317
6318         dev->iflink = -1;
6319
6320         ret = dev_get_valid_name(net, dev, dev->name);
6321         if (ret < 0)
6322                 goto out;
6323
6324         /* Init, if this function is available */
6325         if (dev->netdev_ops->ndo_init) {
6326                 ret = dev->netdev_ops->ndo_init(dev);
6327                 if (ret) {
6328                         if (ret > 0)
6329                                 ret = -EIO;
6330                         goto out;
6331                 }
6332         }
6333
6334         if (((dev->hw_features | dev->features) &
6335              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6336             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6337              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6338                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6339                 ret = -EINVAL;
6340                 goto err_uninit;
6341         }
6342
6343         ret = -EBUSY;
6344         if (!dev->ifindex)
6345                 dev->ifindex = dev_new_index(net);
6346         else if (__dev_get_by_index(net, dev->ifindex))
6347                 goto err_uninit;
6348
6349         if (dev->iflink == -1)
6350                 dev->iflink = dev->ifindex;
6351
6352         /* Transfer changeable features to wanted_features and enable
6353          * software offloads (GSO and GRO).
6354          */
6355         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6356         dev->features |= NETIF_F_SOFT_FEATURES;
6357         dev->wanted_features = dev->features & dev->hw_features;
6358
6359         if (!(dev->flags & IFF_LOOPBACK)) {
6360                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6361         }
6362
6363         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6364          */
6365         dev->vlan_features |= NETIF_F_HIGHDMA;
6366
6367         /* Make NETIF_F_SG inheritable to tunnel devices.
6368          */
6369         dev->hw_enc_features |= NETIF_F_SG;
6370
6371         /* Make NETIF_F_SG inheritable to MPLS.
6372          */
6373         dev->mpls_features |= NETIF_F_SG;
6374
6375         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6376         ret = notifier_to_errno(ret);
6377         if (ret)
6378                 goto err_uninit;
6379
6380         ret = netdev_register_kobject(dev);
6381         if (ret)
6382                 goto err_uninit;
6383         dev->reg_state = NETREG_REGISTERED;
6384
6385         __netdev_update_features(dev);
6386
6387         /*
6388          *      Default initial state at registry is that the
6389          *      device is present.
6390          */
6391
6392         set_bit(__LINK_STATE_PRESENT, &dev->state);
6393
6394         linkwatch_init_dev(dev);
6395
6396         dev_init_scheduler(dev);
6397         dev_hold(dev);
6398         list_netdevice(dev);
6399         add_device_randomness(dev->dev_addr, dev->addr_len);
6400
6401         /* If the device has permanent device address, driver should
6402          * set dev_addr and also addr_assign_type should be set to
6403          * NET_ADDR_PERM (default value).
6404          */
6405         if (dev->addr_assign_type == NET_ADDR_PERM)
6406                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6407
6408         /* Notify protocols, that a new device appeared. */
6409         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6410         ret = notifier_to_errno(ret);
6411         if (ret) {
6412                 rollback_registered(dev);
6413                 dev->reg_state = NETREG_UNREGISTERED;
6414         }
6415         /*
6416          *      Prevent userspace races by waiting until the network
6417          *      device is fully setup before sending notifications.
6418          */
6419         if (!dev->rtnl_link_ops ||
6420             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6421                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6422
6423 out:
6424         return ret;
6425
6426 err_uninit:
6427         if (dev->netdev_ops->ndo_uninit)
6428                 dev->netdev_ops->ndo_uninit(dev);
6429         goto out;
6430 }
6431 EXPORT_SYMBOL(register_netdevice);
6432
6433 /**
6434  *      init_dummy_netdev       - init a dummy network device for NAPI
6435  *      @dev: device to init
6436  *
6437  *      This takes a network device structure and initialize the minimum
6438  *      amount of fields so it can be used to schedule NAPI polls without
6439  *      registering a full blown interface. This is to be used by drivers
6440  *      that need to tie several hardware interfaces to a single NAPI
6441  *      poll scheduler due to HW limitations.
6442  */
6443 int init_dummy_netdev(struct net_device *dev)
6444 {
6445         /* Clear everything. Note we don't initialize spinlocks
6446          * are they aren't supposed to be taken by any of the
6447          * NAPI code and this dummy netdev is supposed to be
6448          * only ever used for NAPI polls
6449          */
6450         memset(dev, 0, sizeof(struct net_device));
6451
6452         /* make sure we BUG if trying to hit standard
6453          * register/unregister code path
6454          */
6455         dev->reg_state = NETREG_DUMMY;
6456
6457         /* NAPI wants this */
6458         INIT_LIST_HEAD(&dev->napi_list);
6459
6460         /* a dummy interface is started by default */
6461         set_bit(__LINK_STATE_PRESENT, &dev->state);
6462         set_bit(__LINK_STATE_START, &dev->state);
6463
6464         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6465          * because users of this 'device' dont need to change
6466          * its refcount.
6467          */
6468
6469         return 0;
6470 }
6471 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6472
6473
6474 /**
6475  *      register_netdev - register a network device
6476  *      @dev: device to register
6477  *
6478  *      Take a completed network device structure and add it to the kernel
6479  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6480  *      chain. 0 is returned on success. A negative errno code is returned
6481  *      on a failure to set up the device, or if the name is a duplicate.
6482  *
6483  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6484  *      and expands the device name if you passed a format string to
6485  *      alloc_netdev.
6486  */
6487 int register_netdev(struct net_device *dev)
6488 {
6489         int err;
6490
6491         rtnl_lock();
6492         err = register_netdevice(dev);
6493         rtnl_unlock();
6494         return err;
6495 }
6496 EXPORT_SYMBOL(register_netdev);
6497
6498 int netdev_refcnt_read(const struct net_device *dev)
6499 {
6500         int i, refcnt = 0;
6501
6502         for_each_possible_cpu(i)
6503                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6504         return refcnt;
6505 }
6506 EXPORT_SYMBOL(netdev_refcnt_read);
6507
6508 /**
6509  * netdev_wait_allrefs - wait until all references are gone.
6510  * @dev: target net_device
6511  *
6512  * This is called when unregistering network devices.
6513  *
6514  * Any protocol or device that holds a reference should register
6515  * for netdevice notification, and cleanup and put back the
6516  * reference if they receive an UNREGISTER event.
6517  * We can get stuck here if buggy protocols don't correctly
6518  * call dev_put.
6519  */
6520 static void netdev_wait_allrefs(struct net_device *dev)
6521 {
6522         unsigned long rebroadcast_time, warning_time;
6523         int refcnt;
6524
6525         linkwatch_forget_dev(dev);
6526
6527         rebroadcast_time = warning_time = jiffies;
6528         refcnt = netdev_refcnt_read(dev);
6529
6530         while (refcnt != 0) {
6531                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6532                         rtnl_lock();
6533
6534                         /* Rebroadcast unregister notification */
6535                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6536
6537                         __rtnl_unlock();
6538                         rcu_barrier();
6539                         rtnl_lock();
6540
6541                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6542                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6543                                      &dev->state)) {
6544                                 /* We must not have linkwatch events
6545                                  * pending on unregister. If this
6546                                  * happens, we simply run the queue
6547                                  * unscheduled, resulting in a noop
6548                                  * for this device.
6549                                  */
6550                                 linkwatch_run_queue();
6551                         }
6552
6553                         __rtnl_unlock();
6554
6555                         rebroadcast_time = jiffies;
6556                 }
6557
6558                 msleep(250);
6559
6560                 refcnt = netdev_refcnt_read(dev);
6561
6562                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6563                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6564                                  dev->name, refcnt);
6565                         warning_time = jiffies;
6566                 }
6567         }
6568 }
6569
6570 /* The sequence is:
6571  *
6572  *      rtnl_lock();
6573  *      ...
6574  *      register_netdevice(x1);
6575  *      register_netdevice(x2);
6576  *      ...
6577  *      unregister_netdevice(y1);
6578  *      unregister_netdevice(y2);
6579  *      ...
6580  *      rtnl_unlock();
6581  *      free_netdev(y1);
6582  *      free_netdev(y2);
6583  *
6584  * We are invoked by rtnl_unlock().
6585  * This allows us to deal with problems:
6586  * 1) We can delete sysfs objects which invoke hotplug
6587  *    without deadlocking with linkwatch via keventd.
6588  * 2) Since we run with the RTNL semaphore not held, we can sleep
6589  *    safely in order to wait for the netdev refcnt to drop to zero.
6590  *
6591  * We must not return until all unregister events added during
6592  * the interval the lock was held have been completed.
6593  */
6594 void netdev_run_todo(void)
6595 {
6596         struct list_head list;
6597
6598         /* Snapshot list, allow later requests */
6599         list_replace_init(&net_todo_list, &list);
6600
6601         __rtnl_unlock();
6602
6603
6604         /* Wait for rcu callbacks to finish before next phase */
6605         if (!list_empty(&list))
6606                 rcu_barrier();
6607
6608         while (!list_empty(&list)) {
6609                 struct net_device *dev
6610                         = list_first_entry(&list, struct net_device, todo_list);
6611                 list_del(&dev->todo_list);
6612
6613                 rtnl_lock();
6614                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6615                 __rtnl_unlock();
6616
6617                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6618                         pr_err("network todo '%s' but state %d\n",
6619                                dev->name, dev->reg_state);
6620                         dump_stack();
6621                         continue;
6622                 }
6623
6624                 dev->reg_state = NETREG_UNREGISTERED;
6625
6626                 on_each_cpu(flush_backlog, dev, 1);
6627
6628                 netdev_wait_allrefs(dev);
6629
6630                 /* paranoia */
6631                 BUG_ON(netdev_refcnt_read(dev));
6632                 BUG_ON(!list_empty(&dev->ptype_all));
6633                 BUG_ON(!list_empty(&dev->ptype_specific));
6634                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6635                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6636                 WARN_ON(dev->dn_ptr);
6637
6638                 if (dev->destructor)
6639                         dev->destructor(dev);
6640
6641                 /* Report a network device has been unregistered */
6642                 rtnl_lock();
6643                 dev_net(dev)->dev_unreg_count--;
6644                 __rtnl_unlock();
6645                 wake_up(&netdev_unregistering_wq);
6646
6647                 /* Free network device */
6648                 kobject_put(&dev->dev.kobj);
6649         }
6650 }
6651
6652 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6653  * fields in the same order, with only the type differing.
6654  */
6655 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6656                              const struct net_device_stats *netdev_stats)
6657 {
6658 #if BITS_PER_LONG == 64
6659         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6660         memcpy(stats64, netdev_stats, sizeof(*stats64));
6661 #else
6662         size_t i, n = sizeof(*stats64) / sizeof(u64);
6663         const unsigned long *src = (const unsigned long *)netdev_stats;
6664         u64 *dst = (u64 *)stats64;
6665
6666         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6667                      sizeof(*stats64) / sizeof(u64));
6668         for (i = 0; i < n; i++)
6669                 dst[i] = src[i];
6670 #endif
6671 }
6672 EXPORT_SYMBOL(netdev_stats_to_stats64);
6673
6674 /**
6675  *      dev_get_stats   - get network device statistics
6676  *      @dev: device to get statistics from
6677  *      @storage: place to store stats
6678  *
6679  *      Get network statistics from device. Return @storage.
6680  *      The device driver may provide its own method by setting
6681  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6682  *      otherwise the internal statistics structure is used.
6683  */
6684 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6685                                         struct rtnl_link_stats64 *storage)
6686 {
6687         const struct net_device_ops *ops = dev->netdev_ops;
6688
6689         if (ops->ndo_get_stats64) {
6690                 memset(storage, 0, sizeof(*storage));
6691                 ops->ndo_get_stats64(dev, storage);
6692         } else if (ops->ndo_get_stats) {
6693                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6694         } else {
6695                 netdev_stats_to_stats64(storage, &dev->stats);
6696         }
6697         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6698         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6699         return storage;
6700 }
6701 EXPORT_SYMBOL(dev_get_stats);
6702
6703 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6704 {
6705         struct netdev_queue *queue = dev_ingress_queue(dev);
6706
6707 #ifdef CONFIG_NET_CLS_ACT
6708         if (queue)
6709                 return queue;
6710         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6711         if (!queue)
6712                 return NULL;
6713         netdev_init_one_queue(dev, queue, NULL);
6714         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6715         queue->qdisc_sleeping = &noop_qdisc;
6716         rcu_assign_pointer(dev->ingress_queue, queue);
6717 #endif
6718         return queue;
6719 }
6720
6721 static const struct ethtool_ops default_ethtool_ops;
6722
6723 void netdev_set_default_ethtool_ops(struct net_device *dev,
6724                                     const struct ethtool_ops *ops)
6725 {
6726         if (dev->ethtool_ops == &default_ethtool_ops)
6727                 dev->ethtool_ops = ops;
6728 }
6729 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6730
6731 void netdev_freemem(struct net_device *dev)
6732 {
6733         char *addr = (char *)dev - dev->padded;
6734
6735         kvfree(addr);
6736 }
6737
6738 /**
6739  *      alloc_netdev_mqs - allocate network device
6740  *      @sizeof_priv:           size of private data to allocate space for
6741  *      @name:                  device name format string
6742  *      @name_assign_type:      origin of device name
6743  *      @setup:                 callback to initialize device
6744  *      @txqs:                  the number of TX subqueues to allocate
6745  *      @rxqs:                  the number of RX subqueues to allocate
6746  *
6747  *      Allocates a struct net_device with private data area for driver use
6748  *      and performs basic initialization.  Also allocates subqueue structs
6749  *      for each queue on the device.
6750  */
6751 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6752                 unsigned char name_assign_type,
6753                 void (*setup)(struct net_device *),
6754                 unsigned int txqs, unsigned int rxqs)
6755 {
6756         struct net_device *dev;
6757         size_t alloc_size;
6758         struct net_device *p;
6759
6760         BUG_ON(strlen(name) >= sizeof(dev->name));
6761
6762         if (txqs < 1) {
6763                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6764                 return NULL;
6765         }
6766
6767 #ifdef CONFIG_SYSFS
6768         if (rxqs < 1) {
6769                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6770                 return NULL;
6771         }
6772 #endif
6773
6774         alloc_size = sizeof(struct net_device);
6775         if (sizeof_priv) {
6776                 /* ensure 32-byte alignment of private area */
6777                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6778                 alloc_size += sizeof_priv;
6779         }
6780         /* ensure 32-byte alignment of whole construct */
6781         alloc_size += NETDEV_ALIGN - 1;
6782
6783         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6784         if (!p)
6785                 p = vzalloc(alloc_size);
6786         if (!p)
6787                 return NULL;
6788
6789         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6790         dev->padded = (char *)dev - (char *)p;
6791
6792         dev->pcpu_refcnt = alloc_percpu(int);
6793         if (!dev->pcpu_refcnt)
6794                 goto free_dev;
6795
6796         if (dev_addr_init(dev))
6797                 goto free_pcpu;
6798
6799         dev_mc_init(dev);
6800         dev_uc_init(dev);
6801
6802         dev_net_set(dev, &init_net);
6803
6804         dev->gso_max_size = GSO_MAX_SIZE;
6805         dev->gso_max_segs = GSO_MAX_SEGS;
6806         dev->gso_min_segs = 0;
6807
6808         INIT_LIST_HEAD(&dev->napi_list);
6809         INIT_LIST_HEAD(&dev->unreg_list);
6810         INIT_LIST_HEAD(&dev->close_list);
6811         INIT_LIST_HEAD(&dev->link_watch_list);
6812         INIT_LIST_HEAD(&dev->adj_list.upper);
6813         INIT_LIST_HEAD(&dev->adj_list.lower);
6814         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6815         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6816         INIT_LIST_HEAD(&dev->ptype_all);
6817         INIT_LIST_HEAD(&dev->ptype_specific);
6818         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6819         setup(dev);
6820
6821         dev->num_tx_queues = txqs;
6822         dev->real_num_tx_queues = txqs;
6823         if (netif_alloc_netdev_queues(dev))
6824                 goto free_all;
6825
6826 #ifdef CONFIG_SYSFS
6827         dev->num_rx_queues = rxqs;
6828         dev->real_num_rx_queues = rxqs;
6829         if (netif_alloc_rx_queues(dev))
6830                 goto free_all;
6831 #endif
6832
6833         strcpy(dev->name, name);
6834         dev->name_assign_type = name_assign_type;
6835         dev->group = INIT_NETDEV_GROUP;
6836         if (!dev->ethtool_ops)
6837                 dev->ethtool_ops = &default_ethtool_ops;
6838         return dev;
6839
6840 free_all:
6841         free_netdev(dev);
6842         return NULL;
6843
6844 free_pcpu:
6845         free_percpu(dev->pcpu_refcnt);
6846 free_dev:
6847         netdev_freemem(dev);
6848         return NULL;
6849 }
6850 EXPORT_SYMBOL(alloc_netdev_mqs);
6851
6852 /**
6853  *      free_netdev - free network device
6854  *      @dev: device
6855  *
6856  *      This function does the last stage of destroying an allocated device
6857  *      interface. The reference to the device object is released.
6858  *      If this is the last reference then it will be freed.
6859  */
6860 void free_netdev(struct net_device *dev)
6861 {
6862         struct napi_struct *p, *n;
6863
6864         netif_free_tx_queues(dev);
6865 #ifdef CONFIG_SYSFS
6866         kvfree(dev->_rx);
6867 #endif
6868
6869         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6870
6871         /* Flush device addresses */
6872         dev_addr_flush(dev);
6873
6874         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6875                 netif_napi_del(p);
6876
6877         free_percpu(dev->pcpu_refcnt);
6878         dev->pcpu_refcnt = NULL;
6879
6880         /*  Compatibility with error handling in drivers */
6881         if (dev->reg_state == NETREG_UNINITIALIZED) {
6882                 netdev_freemem(dev);
6883                 return;
6884         }
6885
6886         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6887         dev->reg_state = NETREG_RELEASED;
6888
6889         /* will free via device release */
6890         put_device(&dev->dev);
6891 }
6892 EXPORT_SYMBOL(free_netdev);
6893
6894 /**
6895  *      synchronize_net -  Synchronize with packet receive processing
6896  *
6897  *      Wait for packets currently being received to be done.
6898  *      Does not block later packets from starting.
6899  */
6900 void synchronize_net(void)
6901 {
6902         might_sleep();
6903         if (rtnl_is_locked())
6904                 synchronize_rcu_expedited();
6905         else
6906                 synchronize_rcu();
6907 }
6908 EXPORT_SYMBOL(synchronize_net);
6909
6910 /**
6911  *      unregister_netdevice_queue - remove device from the kernel
6912  *      @dev: device
6913  *      @head: list
6914  *
6915  *      This function shuts down a device interface and removes it
6916  *      from the kernel tables.
6917  *      If head not NULL, device is queued to be unregistered later.
6918  *
6919  *      Callers must hold the rtnl semaphore.  You may want
6920  *      unregister_netdev() instead of this.
6921  */
6922
6923 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6924 {
6925         ASSERT_RTNL();
6926
6927         if (head) {
6928                 list_move_tail(&dev->unreg_list, head);
6929         } else {
6930                 rollback_registered(dev);
6931                 /* Finish processing unregister after unlock */
6932                 net_set_todo(dev);
6933         }
6934 }
6935 EXPORT_SYMBOL(unregister_netdevice_queue);
6936
6937 /**
6938  *      unregister_netdevice_many - unregister many devices
6939  *      @head: list of devices
6940  *
6941  *  Note: As most callers use a stack allocated list_head,
6942  *  we force a list_del() to make sure stack wont be corrupted later.
6943  */
6944 void unregister_netdevice_many(struct list_head *head)
6945 {
6946         struct net_device *dev;
6947
6948         if (!list_empty(head)) {
6949                 rollback_registered_many(head);
6950                 list_for_each_entry(dev, head, unreg_list)
6951                         net_set_todo(dev);
6952                 list_del(head);
6953         }
6954 }
6955 EXPORT_SYMBOL(unregister_netdevice_many);
6956
6957 /**
6958  *      unregister_netdev - remove device from the kernel
6959  *      @dev: device
6960  *
6961  *      This function shuts down a device interface and removes it
6962  *      from the kernel tables.
6963  *
6964  *      This is just a wrapper for unregister_netdevice that takes
6965  *      the rtnl semaphore.  In general you want to use this and not
6966  *      unregister_netdevice.
6967  */
6968 void unregister_netdev(struct net_device *dev)
6969 {
6970         rtnl_lock();
6971         unregister_netdevice(dev);
6972         rtnl_unlock();
6973 }
6974 EXPORT_SYMBOL(unregister_netdev);
6975
6976 /**
6977  *      dev_change_net_namespace - move device to different nethost namespace
6978  *      @dev: device
6979  *      @net: network namespace
6980  *      @pat: If not NULL name pattern to try if the current device name
6981  *            is already taken in the destination network namespace.
6982  *
6983  *      This function shuts down a device interface and moves it
6984  *      to a new network namespace. On success 0 is returned, on
6985  *      a failure a netagive errno code is returned.
6986  *
6987  *      Callers must hold the rtnl semaphore.
6988  */
6989
6990 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6991 {
6992         int err;
6993
6994         ASSERT_RTNL();
6995
6996         /* Don't allow namespace local devices to be moved. */
6997         err = -EINVAL;
6998         if (dev->features & NETIF_F_NETNS_LOCAL)
6999                 goto out;
7000
7001         /* Ensure the device has been registrered */
7002         if (dev->reg_state != NETREG_REGISTERED)
7003                 goto out;
7004
7005         /* Get out if there is nothing todo */
7006         err = 0;
7007         if (net_eq(dev_net(dev), net))
7008                 goto out;
7009
7010         /* Pick the destination device name, and ensure
7011          * we can use it in the destination network namespace.
7012          */
7013         err = -EEXIST;
7014         if (__dev_get_by_name(net, dev->name)) {
7015                 /* We get here if we can't use the current device name */
7016                 if (!pat)
7017                         goto out;
7018                 if (dev_get_valid_name(net, dev, pat) < 0)
7019                         goto out;
7020         }
7021
7022         /*
7023          * And now a mini version of register_netdevice unregister_netdevice.
7024          */
7025
7026         /* If device is running close it first. */
7027         dev_close(dev);
7028
7029         /* And unlink it from device chain */
7030         err = -ENODEV;
7031         unlist_netdevice(dev);
7032
7033         synchronize_net();
7034
7035         /* Shutdown queueing discipline. */
7036         dev_shutdown(dev);
7037
7038         /* Notify protocols, that we are about to destroy
7039            this device. They should clean all the things.
7040
7041            Note that dev->reg_state stays at NETREG_REGISTERED.
7042            This is wanted because this way 8021q and macvlan know
7043            the device is just moving and can keep their slaves up.
7044         */
7045         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7046         rcu_barrier();
7047         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7048         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7049
7050         /*
7051          *      Flush the unicast and multicast chains
7052          */
7053         dev_uc_flush(dev);
7054         dev_mc_flush(dev);
7055
7056         /* Send a netdev-removed uevent to the old namespace */
7057         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7058         netdev_adjacent_del_links(dev);
7059
7060         /* Actually switch the network namespace */
7061         dev_net_set(dev, net);
7062
7063         /* If there is an ifindex conflict assign a new one */
7064         if (__dev_get_by_index(net, dev->ifindex)) {
7065                 int iflink = (dev->iflink == dev->ifindex);
7066                 dev->ifindex = dev_new_index(net);
7067                 if (iflink)
7068                         dev->iflink = dev->ifindex;
7069         }
7070
7071         /* Send a netdev-add uevent to the new namespace */
7072         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7073         netdev_adjacent_add_links(dev);
7074
7075         /* Fixup kobjects */
7076         err = device_rename(&dev->dev, dev->name);
7077         WARN_ON(err);
7078
7079         /* Add the device back in the hashes */
7080         list_netdevice(dev);
7081
7082         /* Notify protocols, that a new device appeared. */
7083         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7084
7085         /*
7086          *      Prevent userspace races by waiting until the network
7087          *      device is fully setup before sending notifications.
7088          */
7089         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7090
7091         synchronize_net();
7092         err = 0;
7093 out:
7094         return err;
7095 }
7096 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7097
7098 static int dev_cpu_callback(struct notifier_block *nfb,
7099                             unsigned long action,
7100                             void *ocpu)
7101 {
7102         struct sk_buff **list_skb;
7103         struct sk_buff *skb;
7104         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7105         struct softnet_data *sd, *oldsd;
7106
7107         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7108                 return NOTIFY_OK;
7109
7110         local_irq_disable();
7111         cpu = smp_processor_id();
7112         sd = &per_cpu(softnet_data, cpu);
7113         oldsd = &per_cpu(softnet_data, oldcpu);
7114
7115         /* Find end of our completion_queue. */
7116         list_skb = &sd->completion_queue;
7117         while (*list_skb)
7118                 list_skb = &(*list_skb)->next;
7119         /* Append completion queue from offline CPU. */
7120         *list_skb = oldsd->completion_queue;
7121         oldsd->completion_queue = NULL;
7122
7123         /* Append output queue from offline CPU. */
7124         if (oldsd->output_queue) {
7125                 *sd->output_queue_tailp = oldsd->output_queue;
7126                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7127                 oldsd->output_queue = NULL;
7128                 oldsd->output_queue_tailp = &oldsd->output_queue;
7129         }
7130         /* Append NAPI poll list from offline CPU, with one exception :
7131          * process_backlog() must be called by cpu owning percpu backlog.
7132          * We properly handle process_queue & input_pkt_queue later.
7133          */
7134         while (!list_empty(&oldsd->poll_list)) {
7135                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7136                                                             struct napi_struct,
7137                                                             poll_list);
7138
7139                 list_del_init(&napi->poll_list);
7140                 if (napi->poll == process_backlog)
7141                         napi->state = 0;
7142                 else
7143                         ____napi_schedule(sd, napi);
7144         }
7145
7146         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7147         local_irq_enable();
7148
7149         /* Process offline CPU's input_pkt_queue */
7150         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7151                 netif_rx_ni(skb);
7152                 input_queue_head_incr(oldsd);
7153         }
7154         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7155                 netif_rx_ni(skb);
7156                 input_queue_head_incr(oldsd);
7157         }
7158
7159         return NOTIFY_OK;
7160 }
7161
7162
7163 /**
7164  *      netdev_increment_features - increment feature set by one
7165  *      @all: current feature set
7166  *      @one: new feature set
7167  *      @mask: mask feature set
7168  *
7169  *      Computes a new feature set after adding a device with feature set
7170  *      @one to the master device with current feature set @all.  Will not
7171  *      enable anything that is off in @mask. Returns the new feature set.
7172  */
7173 netdev_features_t netdev_increment_features(netdev_features_t all,
7174         netdev_features_t one, netdev_features_t mask)
7175 {
7176         if (mask & NETIF_F_GEN_CSUM)
7177                 mask |= NETIF_F_ALL_CSUM;
7178         mask |= NETIF_F_VLAN_CHALLENGED;
7179
7180         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7181         all &= one | ~NETIF_F_ALL_FOR_ALL;
7182
7183         /* If one device supports hw checksumming, set for all. */
7184         if (all & NETIF_F_GEN_CSUM)
7185                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7186
7187         return all;
7188 }
7189 EXPORT_SYMBOL(netdev_increment_features);
7190
7191 static struct hlist_head * __net_init netdev_create_hash(void)
7192 {
7193         int i;
7194         struct hlist_head *hash;
7195
7196         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7197         if (hash != NULL)
7198                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7199                         INIT_HLIST_HEAD(&hash[i]);
7200
7201         return hash;
7202 }
7203
7204 /* Initialize per network namespace state */
7205 static int __net_init netdev_init(struct net *net)
7206 {
7207         if (net != &init_net)
7208                 INIT_LIST_HEAD(&net->dev_base_head);
7209
7210         net->dev_name_head = netdev_create_hash();
7211         if (net->dev_name_head == NULL)
7212                 goto err_name;
7213
7214         net->dev_index_head = netdev_create_hash();
7215         if (net->dev_index_head == NULL)
7216                 goto err_idx;
7217
7218         return 0;
7219
7220 err_idx:
7221         kfree(net->dev_name_head);
7222 err_name:
7223         return -ENOMEM;
7224 }
7225
7226 /**
7227  *      netdev_drivername - network driver for the device
7228  *      @dev: network device
7229  *
7230  *      Determine network driver for device.
7231  */
7232 const char *netdev_drivername(const struct net_device *dev)
7233 {
7234         const struct device_driver *driver;
7235         const struct device *parent;
7236         const char *empty = "";
7237
7238         parent = dev->dev.parent;
7239         if (!parent)
7240                 return empty;
7241
7242         driver = parent->driver;
7243         if (driver && driver->name)
7244                 return driver->name;
7245         return empty;
7246 }
7247
7248 static void __netdev_printk(const char *level, const struct net_device *dev,
7249                             struct va_format *vaf)
7250 {
7251         if (dev && dev->dev.parent) {
7252                 dev_printk_emit(level[1] - '0',
7253                                 dev->dev.parent,
7254                                 "%s %s %s%s: %pV",
7255                                 dev_driver_string(dev->dev.parent),
7256                                 dev_name(dev->dev.parent),
7257                                 netdev_name(dev), netdev_reg_state(dev),
7258                                 vaf);
7259         } else if (dev) {
7260                 printk("%s%s%s: %pV",
7261                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7262         } else {
7263                 printk("%s(NULL net_device): %pV", level, vaf);
7264         }
7265 }
7266
7267 void netdev_printk(const char *level, const struct net_device *dev,
7268                    const char *format, ...)
7269 {
7270         struct va_format vaf;
7271         va_list args;
7272
7273         va_start(args, format);
7274
7275         vaf.fmt = format;
7276         vaf.va = &args;
7277
7278         __netdev_printk(level, dev, &vaf);
7279
7280         va_end(args);
7281 }
7282 EXPORT_SYMBOL(netdev_printk);
7283
7284 #define define_netdev_printk_level(func, level)                 \
7285 void func(const struct net_device *dev, const char *fmt, ...)   \
7286 {                                                               \
7287         struct va_format vaf;                                   \
7288         va_list args;                                           \
7289                                                                 \
7290         va_start(args, fmt);                                    \
7291                                                                 \
7292         vaf.fmt = fmt;                                          \
7293         vaf.va = &args;                                         \
7294                                                                 \
7295         __netdev_printk(level, dev, &vaf);                      \
7296                                                                 \
7297         va_end(args);                                           \
7298 }                                                               \
7299 EXPORT_SYMBOL(func);
7300
7301 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7302 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7303 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7304 define_netdev_printk_level(netdev_err, KERN_ERR);
7305 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7306 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7307 define_netdev_printk_level(netdev_info, KERN_INFO);
7308
7309 static void __net_exit netdev_exit(struct net *net)
7310 {
7311         kfree(net->dev_name_head);
7312         kfree(net->dev_index_head);
7313 }
7314
7315 static struct pernet_operations __net_initdata netdev_net_ops = {
7316         .init = netdev_init,
7317         .exit = netdev_exit,
7318 };
7319
7320 static void __net_exit default_device_exit(struct net *net)
7321 {
7322         struct net_device *dev, *aux;
7323         /*
7324          * Push all migratable network devices back to the
7325          * initial network namespace
7326          */
7327         rtnl_lock();
7328         for_each_netdev_safe(net, dev, aux) {
7329                 int err;
7330                 char fb_name[IFNAMSIZ];
7331
7332                 /* Ignore unmoveable devices (i.e. loopback) */
7333                 if (dev->features & NETIF_F_NETNS_LOCAL)
7334                         continue;
7335
7336                 /* Leave virtual devices for the generic cleanup */
7337                 if (dev->rtnl_link_ops)
7338                         continue;
7339
7340                 /* Push remaining network devices to init_net */
7341                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7342                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7343                 if (err) {
7344                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7345                                  __func__, dev->name, err);
7346                         BUG();
7347                 }
7348         }
7349         rtnl_unlock();
7350 }
7351
7352 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7353 {
7354         /* Return with the rtnl_lock held when there are no network
7355          * devices unregistering in any network namespace in net_list.
7356          */
7357         struct net *net;
7358         bool unregistering;
7359         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7360
7361         add_wait_queue(&netdev_unregistering_wq, &wait);
7362         for (;;) {
7363                 unregistering = false;
7364                 rtnl_lock();
7365                 list_for_each_entry(net, net_list, exit_list) {
7366                         if (net->dev_unreg_count > 0) {
7367                                 unregistering = true;
7368                                 break;
7369                         }
7370                 }
7371                 if (!unregistering)
7372                         break;
7373                 __rtnl_unlock();
7374
7375                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7376         }
7377         remove_wait_queue(&netdev_unregistering_wq, &wait);
7378 }
7379
7380 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7381 {
7382         /* At exit all network devices most be removed from a network
7383          * namespace.  Do this in the reverse order of registration.
7384          * Do this across as many network namespaces as possible to
7385          * improve batching efficiency.
7386          */
7387         struct net_device *dev;
7388         struct net *net;
7389         LIST_HEAD(dev_kill_list);
7390
7391         /* To prevent network device cleanup code from dereferencing
7392          * loopback devices or network devices that have been freed
7393          * wait here for all pending unregistrations to complete,
7394          * before unregistring the loopback device and allowing the
7395          * network namespace be freed.
7396          *
7397          * The netdev todo list containing all network devices
7398          * unregistrations that happen in default_device_exit_batch
7399          * will run in the rtnl_unlock() at the end of
7400          * default_device_exit_batch.
7401          */
7402         rtnl_lock_unregistering(net_list);
7403         list_for_each_entry(net, net_list, exit_list) {
7404                 for_each_netdev_reverse(net, dev) {
7405                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7406                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7407                         else
7408                                 unregister_netdevice_queue(dev, &dev_kill_list);
7409                 }
7410         }
7411         unregister_netdevice_many(&dev_kill_list);
7412         rtnl_unlock();
7413 }
7414
7415 static struct pernet_operations __net_initdata default_device_ops = {
7416         .exit = default_device_exit,
7417         .exit_batch = default_device_exit_batch,
7418 };
7419
7420 /*
7421  *      Initialize the DEV module. At boot time this walks the device list and
7422  *      unhooks any devices that fail to initialise (normally hardware not
7423  *      present) and leaves us with a valid list of present and active devices.
7424  *
7425  */
7426
7427 /*
7428  *       This is called single threaded during boot, so no need
7429  *       to take the rtnl semaphore.
7430  */
7431 static int __init net_dev_init(void)
7432 {
7433         int i, rc = -ENOMEM;
7434
7435         BUG_ON(!dev_boot_phase);
7436
7437         if (dev_proc_init())
7438                 goto out;
7439
7440         if (netdev_kobject_init())
7441                 goto out;
7442
7443         INIT_LIST_HEAD(&ptype_all);
7444         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7445                 INIT_LIST_HEAD(&ptype_base[i]);
7446
7447         INIT_LIST_HEAD(&offload_base);
7448
7449         if (register_pernet_subsys(&netdev_net_ops))
7450                 goto out;
7451
7452         /*
7453          *      Initialise the packet receive queues.
7454          */
7455
7456         for_each_possible_cpu(i) {
7457                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7458
7459                 skb_queue_head_init(&sd->input_pkt_queue);
7460                 skb_queue_head_init(&sd->process_queue);
7461                 INIT_LIST_HEAD(&sd->poll_list);
7462                 sd->output_queue_tailp = &sd->output_queue;
7463 #ifdef CONFIG_RPS
7464                 sd->csd.func = rps_trigger_softirq;
7465                 sd->csd.info = sd;
7466                 sd->cpu = i;
7467 #endif
7468
7469                 sd->backlog.poll = process_backlog;
7470                 sd->backlog.weight = weight_p;
7471         }
7472
7473         dev_boot_phase = 0;
7474
7475         /* The loopback device is special if any other network devices
7476          * is present in a network namespace the loopback device must
7477          * be present. Since we now dynamically allocate and free the
7478          * loopback device ensure this invariant is maintained by
7479          * keeping the loopback device as the first device on the
7480          * list of network devices.  Ensuring the loopback devices
7481          * is the first device that appears and the last network device
7482          * that disappears.
7483          */
7484         if (register_pernet_device(&loopback_net_ops))
7485                 goto out;
7486
7487         if (register_pernet_device(&default_device_ops))
7488                 goto out;
7489
7490         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7491         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7492
7493         hotcpu_notifier(dev_cpu_callback, 0);
7494         dst_init();
7495         rc = 0;
7496 out:
7497         return rc;
7498 }
7499
7500 subsys_initcall(net_dev_init);