net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <[email protected]>
  12  *                              Mark Evans, <[email protected]>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <[email protected]>
  16  *              Alan Cox <[email protected]>
  17  *              David Hinds <[email protected]>
  18  *              Alexey Kuznetsov <[email protected]>
  19  *              Adam Sulmicki <[email protected]>
  20  *              Pekka Riikonen <[email protected]>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);
 148 static DEFINE_SPINLOCK(offload_lock);
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150 struct list_head ptype_all __read_mostly;       /* Taps */
 151 static struct list_head offload_base __read_mostly;
 152
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);
 185
 186 static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
 537 /* Boot time configuration table */
 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      dev_get_iflink  - get 'iflink' value of a interface
 664  *      @dev: targeted interface
 665  *
 666  *      Indicates the ifindex the interface is linked to.
 667  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 668  */
 669
 670 int dev_get_iflink(const struct net_device *dev)
 671 {
 672         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 673                 return dev->netdev_ops->ndo_get_iflink(dev);
 674
 675         /* If dev->rtnl_link_ops is set, it's a virtual interface. */
 676         if (dev->rtnl_link_ops)
 677                 return 0;
 678
 679         return dev->ifindex;
 680 }
 681 EXPORT_SYMBOL(dev_get_iflink);
 682
 683 /**
 684  *      __dev_get_by_name       - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name. Must be called under RTNL semaphore
 689  *      or @dev_base_lock. If the name is found a pointer to the device
 690  *      is returned. If the name is not found then %NULL is returned. The
 691  *      reference counters are not incremented so the caller must be
 692  *      careful with locks.
 693  */
 694
 695 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 696 {
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry(dev, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_name);
 707
 708 /**
 709  *      dev_get_by_name_rcu     - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name.
 714  *      If the name is found a pointer to the device is returned.
 715  *      If the name is not found then %NULL is returned.
 716  *      The reference counters are not incremented so the caller must be
 717  *      careful with locks. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 721 {
 722         struct net_device *dev;
 723         struct hlist_head *head = dev_name_hash(net, name);
 724
 725         hlist_for_each_entry_rcu(dev, head, name_hlist)
 726                 if (!strncmp(dev->name, name, IFNAMSIZ))
 727                         return dev;
 728
 729         return NULL;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_name_rcu);
 732
 733 /**
 734  *      dev_get_by_name         - find a device by its name
 735  *      @net: the applicable net namespace
 736  *      @name: name to find
 737  *
 738  *      Find an interface by name. This can be called from any
 739  *      context and does its own locking. The returned handle has
 740  *      the usage count incremented and the caller must use dev_put() to
 741  *      release it when it is no longer needed. %NULL is returned if no
 742  *      matching device is found.
 743  */
 744
 745 struct net_device *dev_get_by_name(struct net *net, const char *name)
 746 {
 747         struct net_device *dev;
 748
 749         rcu_read_lock();
 750         dev = dev_get_by_name_rcu(net, name);
 751         if (dev)
 752                 dev_hold(dev);
 753         rcu_read_unlock();
 754         return dev;
 755 }
 756 EXPORT_SYMBOL(dev_get_by_name);
 757
 758 /**
 759  *      __dev_get_by_index - find a device by its ifindex
 760  *      @net: the applicable net namespace
 761  *      @ifindex: index of device
 762  *
 763  *      Search for an interface by index. Returns %NULL if the device
 764  *      is not found or a pointer to the device. The device has not
 765  *      had its reference counter increased so the caller must be careful
 766  *      about locking. The caller must hold either the RTNL semaphore
 767  *      or @dev_base_lock.
 768  */
 769
 770 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 771 {
 772         struct net_device *dev;
 773         struct hlist_head *head = dev_index_hash(net, ifindex);
 774
 775         hlist_for_each_entry(dev, head, index_hlist)
 776                 if (dev->ifindex == ifindex)
 777                         return dev;
 778
 779         return NULL;
 780 }
 781 EXPORT_SYMBOL(__dev_get_by_index);
 782
 783 /**
 784  *      dev_get_by_index_rcu - find a device by its ifindex
 785  *      @net: the applicable net namespace
 786  *      @ifindex: index of device
 787  *
 788  *      Search for an interface by index. Returns %NULL if the device
 789  *      is not found or a pointer to the device. The device has not
 790  *      had its reference counter increased so the caller must be careful
 791  *      about locking. The caller must hold RCU lock.
 792  */
 793
 794 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 795 {
 796         struct net_device *dev;
 797         struct hlist_head *head = dev_index_hash(net, ifindex);
 798
 799         hlist_for_each_entry_rcu(dev, head, index_hlist)
 800                 if (dev->ifindex == ifindex)
 801                         return dev;
 802
 803         return NULL;
 804 }
 805 EXPORT_SYMBOL(dev_get_by_index_rcu);
 806
 807
 808 /**
 809  *      dev_get_by_index - find a device by its ifindex
 810  *      @net: the applicable net namespace
 811  *      @ifindex: index of device
 812  *
 813  *      Search for an interface by index. Returns NULL if the device
 814  *      is not found or a pointer to the device. The device returned has
 815  *      had a reference added and the pointer is safe until the user calls
 816  *      dev_put to indicate they have finished with it.
 817  */
 818
 819 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 820 {
 821         struct net_device *dev;
 822
 823         rcu_read_lock();
 824         dev = dev_get_by_index_rcu(net, ifindex);
 825         if (dev)
 826                 dev_hold(dev);
 827         rcu_read_unlock();
 828         return dev;
 829 }
 830 EXPORT_SYMBOL(dev_get_by_index);
 831
 832 /**
 833  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 834  *      @net: network namespace
 835  *      @name: a pointer to the buffer where the name will be stored.
 836  *      @ifindex: the ifindex of the interface to get the name from.
 837  *
 838  *      The use of raw_seqcount_begin() and cond_resched() before
 839  *      retrying is required as we want to give the writers a chance
 840  *      to complete when CONFIG_PREEMPT is not set.
 841  */
 842 int netdev_get_name(struct net *net, char *name, int ifindex)
 843 {
 844         struct net_device *dev;
 845         unsigned int seq;
 846
 847 retry:
 848         seq = raw_seqcount_begin(&devnet_rename_seq);
 849         rcu_read_lock();
 850         dev = dev_get_by_index_rcu(net, ifindex);
 851         if (!dev) {
 852                 rcu_read_unlock();
 853                 return -ENODEV;
 854         }
 855
 856         strcpy(name, dev->name);
 857         rcu_read_unlock();
 858         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 859                 cond_resched();
 860                 goto retry;
 861         }
 862
 863         return 0;
 864 }
 865
 866 /**
 867  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 868  *      @net: the applicable net namespace
 869  *      @type: media type of device
 870  *      @ha: hardware address
 871  *
 872  *      Search for an interface by MAC address. Returns NULL if the device
 873  *      is not found or a pointer to the device.
 874  *      The caller must hold RCU or RTNL.
 875  *      The returned device has not had its ref count increased
 876  *      and the caller must therefore be careful about locking
 877  *
 878  */
 879
 880 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 881                                        const char *ha)
 882 {
 883         struct net_device *dev;
 884
 885         for_each_netdev_rcu(net, dev)
 886                 if (dev->type == type &&
 887                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 888                         return dev;
 889
 890         return NULL;
 891 }
 892 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 893
 894 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 895 {
 896         struct net_device *dev;
 897
 898         ASSERT_RTNL();
 899         for_each_netdev(net, dev)
 900                 if (dev->type == type)
 901                         return dev;
 902
 903         return NULL;
 904 }
 905 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 906
 907 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 908 {
 909         struct net_device *dev, *ret = NULL;
 910
 911         rcu_read_lock();
 912         for_each_netdev_rcu(net, dev)
 913                 if (dev->type == type) {
 914                         dev_hold(dev);
 915                         ret = dev;
 916                         break;
 917                 }
 918         rcu_read_unlock();
 919         return ret;
 920 }
 921 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 922
 923 /**
 924  *      __dev_get_by_flags - find any device with given flags
 925  *      @net: the applicable net namespace
 926  *      @if_flags: IFF_* values
 927  *      @mask: bitmask of bits in if_flags to check
 928  *
 929  *      Search for any interface with the given flags. Returns NULL if a device
 930  *      is not found or a pointer to the device. Must be called inside
 931  *      rtnl_lock(), and result refcount is unchanged.
 932  */
 933
 934 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 935                                       unsigned short mask)
 936 {
 937         struct net_device *dev, *ret;
 938
 939         ASSERT_RTNL();
 940
 941         ret = NULL;
 942         for_each_netdev(net, dev) {
 943                 if (((dev->flags ^ if_flags) & mask) == 0) {
 944                         ret = dev;
 945                         break;
 946                 }
 947         }
 948         return ret;
 949 }
 950 EXPORT_SYMBOL(__dev_get_by_flags);
 951
 952 /**
 953  *      dev_valid_name - check if name is okay for network device
 954  *      @name: name string
 955  *
 956  *      Network device names need to be valid file names to
 957  *      to allow sysfs to work.  We also disallow any kind of
 958  *      whitespace.
 959  */
 960 bool dev_valid_name(const char *name)
 961 {
 962         if (*name == '\0')
 963                 return false;
 964         if (strlen(name) >= IFNAMSIZ)
 965                 return false;
 966         if (!strcmp(name, ".") || !strcmp(name, ".."))
 967                 return false;
 968
 969         while (*name) {
 970                 if (*name == '/' || *name == ':' || isspace(*name))
 971                         return false;
 972                 name++;
 973         }
 974         return true;
 975 }
 976 EXPORT_SYMBOL(dev_valid_name);
 977
 978 /**
 979  *      __dev_alloc_name - allocate a name for a device
 980  *      @net: network namespace to allocate the device name in
 981  *      @name: name format string
 982  *      @buf:  scratch buffer and result name string
 983  *
 984  *      Passed a format string - eg "lt%d" it will try and find a suitable
 985  *      id. It scans list of devices to build up a free map, then chooses
 986  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 987  *      while allocating the name and adding the device in order to avoid
 988  *      duplicates.
 989  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 990  *      Returns the number of the unit assigned or a negative errno code.
 991  */
 992
 993 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 994 {
 995         int i = 0;
 996         const char *p;
 997         const int max_netdevices = 8*PAGE_SIZE;
 998         unsigned long *inuse;
 999         struct net_device *d;
1000
1001         p = strnchr(name, IFNAMSIZ-1, '%');
1002         if (p) {
1003                 /*
1004                  * Verify the string as this thing may have come from
1005                  * the user.  There must be either one "%d" and no other "%"
1006                  * characters.
1007                  */
1008                 if (p[1] != 'd' || strchr(p + 2, '%'))
1009                         return -EINVAL;
1010
1011                 /* Use one page as a bit array of possible slots */
1012                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1013                 if (!inuse)
1014                         return -ENOMEM;
1015
1016                 for_each_netdev(net, d) {
1017                         if (!sscanf(d->name, name, &i))
1018                                 continue;
1019                         if (i < 0 || i >= max_netdevices)
1020                                 continue;
1021
1022                         /*  avoid cases where sscanf is not exact inverse of printf */
1023                         snprintf(buf, IFNAMSIZ, name, i);
1024                         if (!strncmp(buf, d->name, IFNAMSIZ))
1025                                 set_bit(i, inuse);
1026                 }
1027
1028                 i = find_first_zero_bit(inuse, max_netdevices);
1029                 free_page((unsigned long) inuse);
1030         }
1031
1032         if (buf != name)
1033                 snprintf(buf, IFNAMSIZ, name, i);
1034         if (!__dev_get_by_name(net, buf))
1035                 return i;
1036
1037         /* It is possible to run out of possible slots
1038          * when the name is long and there isn't enough space left
1039          * for the digits, or if all bits are used.
1040          */
1041         return -ENFILE;
1042 }
1043
1044 /**
1045  *      dev_alloc_name - allocate a name for a device
1046  *      @dev: device
1047  *      @name: name format string
1048  *
1049  *      Passed a format string - eg "lt%d" it will try and find a suitable
1050  *      id. It scans list of devices to build up a free map, then chooses
1051  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1052  *      while allocating the name and adding the device in order to avoid
1053  *      duplicates.
1054  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1055  *      Returns the number of the unit assigned or a negative errno code.
1056  */
1057
1058 int dev_alloc_name(struct net_device *dev, const char *name)
1059 {
1060         char buf[IFNAMSIZ];
1061         struct net *net;
1062         int ret;
1063
1064         BUG_ON(!dev_net(dev));
1065         net = dev_net(dev);
1066         ret = __dev_alloc_name(net, name, buf);
1067         if (ret >= 0)
1068                 strlcpy(dev->name, buf, IFNAMSIZ);
1069         return ret;
1070 }
1071 EXPORT_SYMBOL(dev_alloc_name);
1072
1073 static int dev_alloc_name_ns(struct net *net,
1074                              struct net_device *dev,
1075                              const char *name)
1076 {
1077         char buf[IFNAMSIZ];
1078         int ret;
1079
1080         ret = __dev_alloc_name(net, name, buf);
1081         if (ret >= 0)
1082                 strlcpy(dev->name, buf, IFNAMSIZ);
1083         return ret;
1084 }
1085
1086 static int dev_get_valid_name(struct net *net,
1087                               struct net_device *dev,
1088                               const char *name)
1089 {
1090         BUG_ON(!net);
1091
1092         if (!dev_valid_name(name))
1093                 return -EINVAL;
1094
1095         if (strchr(name, '%'))
1096                 return dev_alloc_name_ns(net, dev, name);
1097         else if (__dev_get_by_name(net, name))
1098                 return -EEXIST;
1099         else if (dev->name != name)
1100                 strlcpy(dev->name, name, IFNAMSIZ);
1101
1102         return 0;
1103 }
1104
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes because the rollback path copies
 *                IFNAMSIZ bytes out of it
 *
 *      Change name of a device, can pass format strings "eth%d".
 *      for wildcarding.
 *
 *      Asserts RTNL. Returns -EBUSY if the device is up, 0 if @newname
 *      equals the current name, a negative errno if the new name is
 *      rejected or device_rename() fails, and otherwise the
 *      non-negative result of dev_get_valid_name(). If a
 *      NETDEV_CHANGENAME notifier vetoes the change, the old name is
 *      restored and the notifier's errno is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /* Every exit below must pair this with write_seqcount_end(). */
        write_seqcount_begin(&devnet_rename_seq);

        /* Renaming to the current name is a no-op. */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the old name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success this has already written the new name to dev->name. */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Entered once for the forward rename and, if a notifier vetoes
         * it, a second time with dev->name/oldname swapped to undo it.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /* Unhash from the name table ... */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        /* ... wait for an RCU grace period before re-adding ... */
        synchronize_rcu();

        /* ... then rehash under the name now stored in dev->name. */
        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First veto: record the errno (making err
                         * negative), swap the names and assign types,
                         * and redo the rename in reverse.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The undo itself was vetoed: report, keep first errno. */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1193
1194 /**
1195  *      dev_set_alias - change ifalias of a device
1196  *      @dev: device
1197  *      @alias: name up to IFALIASZ
1198  *      @len: limit of bytes to copy from info
1199  *
1200  *      Set ifalias for a device,
1201  */
1202 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1203 {
1204         char *new_ifalias;
1205
1206         ASSERT_RTNL();
1207
1208         if (len >= IFALIASZ)
1209                 return -EINVAL;
1210
1211         if (!len) {
1212                 kfree(dev->ifalias);
1213                 dev->ifalias = NULL;
1214                 return 0;
1215         }
1216
1217         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1218         if (!new_ifalias)
1219                 return -ENOMEM;
1220         dev->ifalias = new_ifalias;
1221
1222         strlcpy(dev->ifalias, alias, len+1);
1223         return len;
1224 }
1225
1226
1227 /**
1228  *      netdev_features_change - device changes features
1229  *      @dev: device to cause notification
1230  *
1231  *      Called to indicate a device has changed features.
1232  */
1233 void netdev_features_change(struct net_device *dev)
1234 {
1235         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1236 }
1237 EXPORT_SYMBOL(netdev_features_change);
1238
1239 /**
1240  *      netdev_state_change - device changes state
1241  *      @dev: device to cause notification
1242  *
1243  *      Called to indicate a device has changed state. This function calls
1244  *      the notifier chains for netdev_chain and sends a NEWLINK message
1245  *      to the routing socket.
1246  */
1247 void netdev_state_change(struct net_device *dev)
1248 {
1249         if (dev->flags & IFF_UP) {
1250                 struct netdev_notifier_change_info change_info;
1251
1252                 change_info.flags_changed = 0;
1253                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1254                                               &change_info.info);
1255                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1256         }
1257 }
1258 EXPORT_SYMBOL(netdev_state_change);
1259
1260 /**
1261  *      netdev_notify_peers - notify network peers about existence of @dev
1262  *      @dev: network device
1263  *
1264  * Generate traffic such that interested network peers are aware of
1265  * @dev, such as by generating a gratuitous ARP. This may be used when
1266  * a device wants to inform the rest of the network about some sort of
1267  * reconfiguration such as a failover event or virtual machine
1268  * migration.
1269  */
1270 void netdev_notify_peers(struct net_device *dev)
1271 {
1272         rtnl_lock();
1273         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1274         rtnl_unlock();
1275 }
1276 EXPORT_SYMBOL(netdev_notify_peers);
1277
1278 static int __dev_open(struct net_device *dev)
1279 {
1280         const struct net_device_ops *ops = dev->netdev_ops;
1281         int ret;
1282
1283         ASSERT_RTNL();
1284
1285         if (!netif_device_present(dev))
1286                 return -ENODEV;
1287
1288         /* Block netpoll from trying to do any rx path servicing.
1289          * If we don't do this there is a chance ndo_poll_controller
1290          * or ndo_poll may be running while we open the device
1291          */
1292         netpoll_poll_disable(dev);
1293
1294         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1295         ret = notifier_to_errno(ret);
1296         if (ret)
1297                 return ret;
1298
1299         set_bit(__LINK_STATE_START, &dev->state);
1300
1301         if (ops->ndo_validate_addr)
1302                 ret = ops->ndo_validate_addr(dev);
1303
1304         if (!ret && ops->ndo_open)
1305                 ret = ops->ndo_open(dev);
1306
1307         netpoll_poll_enable(dev);
1308
1309         if (ret)
1310                 clear_bit(__LINK_STATE_START, &dev->state);
1311         else {
1312                 dev->flags |= IFF_UP;
1313                 dev_set_rx_mode(dev);
1314                 dev_activate(dev);
1315                 add_device_randomness(dev->dev_addr, dev->addr_len);
1316         }
1317
1318         return ret;
1319 }
1320
1321 /**
1322  *      dev_open        - prepare an interface for use.
1323  *      @dev:   device to open
1324  *
1325  *      Takes a device from down to up state. The device's private open
1326  *      function is invoked and then the multicast lists are loaded. Finally
1327  *      the device is moved into the up state and a %NETDEV_UP message is
1328  *      sent to the netdev notifier chain.
1329  *
1330  *      Calling this function on an active interface is a nop. On a failure
1331  *      a negative errno code is returned.
1332  */
1333 int dev_open(struct net_device *dev)
1334 {
1335         int ret;
1336
1337         if (dev->flags & IFF_UP)
1338                 return 0;
1339
1340         ret = __dev_open(dev);
1341         if (ret < 0)
1342                 return ret;
1343
1344         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1345         call_netdevice_notifiers(NETDEV_UP, dev);
1346
1347         return ret;
1348 }
1349 EXPORT_SYMBOL(dev_open);
1350
/*
 * Take every device linked on @head through close_list down, without
 * sending the RTM_NEWLINK / NETDEV_DOWN announcements.
 *
 * Runs in three passes: (1) per device, disable netpoll, send
 * NETDEV_GOING_DOWN and clear __LINK_STATE_START; (2) deactivate all
 * devices at once with dev_deactivate_many(); (3) per device, call the
 * driver's ndo_stop, clear IFF_UP and re-enable netpoll.
 *
 * Asserts RTNL and may sleep. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                /* Pairs with netpoll_poll_disable() in the first pass. */
                netpoll_poll_enable(dev);
        }

        return 0;
}
1396
1397 static int __dev_close(struct net_device *dev)
1398 {
1399         int retval;
1400         LIST_HEAD(single);
1401
1402         list_add(&dev->close_list, &single);
1403         retval = __dev_close_many(&single);
1404         list_del(&single);
1405
1406         return retval;
1407 }
1408
1409 int dev_close_many(struct list_head *head, bool unlink)
1410 {
1411         struct net_device *dev, *tmp;
1412
1413         /* Remove the devices that don't need to be closed */
1414         list_for_each_entry_safe(dev, tmp, head, close_list)
1415                 if (!(dev->flags & IFF_UP))
1416                         list_del_init(&dev->close_list);
1417
1418         __dev_close_many(head);
1419
1420         list_for_each_entry_safe(dev, tmp, head, close_list) {
1421                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1422                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1423                 if (unlink)
1424                         list_del_init(&dev->close_list);
1425         }
1426
1427         return 0;
1428 }
1429 EXPORT_SYMBOL(dev_close_many);
1430
1431 /**
1432  *      dev_close - shutdown an interface.
1433  *      @dev: device to shutdown
1434  *
1435  *      This function moves an active device into down state. A
1436  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1437  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1438  *      chain.
1439  */
1440 int dev_close(struct net_device *dev)
1441 {
1442         if (dev->flags & IFF_UP) {
1443                 LIST_HEAD(single);
1444
1445                 list_add(&dev->close_list, &single);
1446                 dev_close_many(&single, true);
1447                 list_del(&single);
1448         }
1449         return 0;
1450 }
1451 EXPORT_SYMBOL(dev_close);
1452
1453
1454 /**
1455  *      dev_disable_lro - disable Large Receive Offload on a device
1456  *      @dev: device
1457  *
1458  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1459  *      called under RTNL.  This is needed if received packets may be
1460  *      forwarded to another interface.
1461  */
1462 void dev_disable_lro(struct net_device *dev)
1463 {
1464         struct net_device *lower_dev;
1465         struct list_head *iter;
1466
1467         dev->wanted_features &= ~NETIF_F_LRO;
1468         netdev_update_features(dev);
1469
1470         if (unlikely(dev->features & NETIF_F_LRO))
1471                 netdev_WARN(dev, "failed to disable LRO!\n");
1472
1473         netdev_for_each_lower_dev(dev, lower_dev, iter)
1474                 dev_disable_lro(lower_dev);
1475 }
1476 EXPORT_SYMBOL(dev_disable_lro);
1477
1478 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1479                                    struct net_device *dev)
1480 {
1481         struct netdev_notifier_info info;
1482
1483         netdev_notifier_info_init(&info, dev);
1484         return nb->notifier_call(nb, val, &info);
1485 }
1486
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * does not replay events for existing devices to a new notifier.
 * NOTE(review): the place where this is cleared is outside this part of
 * the file.
 */
static int dev_boot_phase = 1;
1488
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 *
 *      If the notifier rejects a replayed %NETDEV_REGISTER, the devices
 *      already replayed are unwound with synthesized down/unregister
 *      events, the notifier is removed from the chain again and the
 *      notifier's errno is returned. Takes rtnl_lock() internally.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay while dev_boot_phase is still set. */
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        /* The result of the replayed UP event is ignored. */
                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /*
         * @dev is the device whose REGISTER was rejected. Walk the same
         * order again and undo every device before it.
         */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        /* err still holds the notifier's errno and is returned via unlock. */
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1555
1556 /**
1557  *      unregister_netdevice_notifier - unregister a network notifier block
1558  *      @nb: notifier
1559  *
1560  *      Unregister a notifier previously registered by
1561  *      register_netdevice_notifier(). The notifier is unlinked into the
1562  *      kernel structures and may then be reused. A negative errno code
1563  *      is returned on a failure.
1564  *
1565  *      After unregistering unregister and down device events are synthesized
1566  *      for all devices on the device list to the removed notifier to remove
1567  *      the need for special case cleanup code.
1568  */
1569
1570 int unregister_netdevice_notifier(struct notifier_block *nb)
1571 {
1572         struct net_device *dev;
1573         struct net *net;
1574         int err;
1575
1576         rtnl_lock();
1577         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1578         if (err)
1579                 goto unlock;
1580
1581         for_each_net(net) {
1582                 for_each_netdev(net, dev) {
1583                         if (dev->flags & IFF_UP) {
1584                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1585                                                         dev);
1586                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1587                         }
1588                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1589                 }
1590         }
1591 unlock:
1592         rtnl_unlock();
1593         return err;
1594 }
1595 EXPORT_SYMBOL(unregister_netdevice_notifier);
1596
1597 /**
1598  *      call_netdevice_notifiers_info - call all network notifier blocks
1599  *      @val: value passed unmodified to notifier function
1600  *      @dev: net_device pointer passed unmodified to notifier function
1601  *      @info: notifier information data
1602  *
1603  *      Call all network notifier blocks.  Parameters and return value
1604  *      are as for raw_notifier_call_chain().
1605  */
1606
1607 static int call_netdevice_notifiers_info(unsigned long val,
1608                                          struct net_device *dev,
1609                                          struct netdev_notifier_info *info)
1610 {
1611         ASSERT_RTNL();
1612         netdev_notifier_info_init(info, dev);
1613         return raw_notifier_call_chain(&netdev_chain, val, info);
1614 }
1615
1616 /**
1617  *      call_netdevice_notifiers - call all network notifier blocks
1618  *      @val: value passed unmodified to notifier function
1619  *      @dev: net_device pointer passed unmodified to notifier function
1620  *
1621  *      Call all network notifier blocks.  Parameters and return value
1622  *      are as for raw_notifier_call_chain().
1623  */
1624
1625 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1626 {
1627         struct netdev_notifier_info info;
1628
1629         return call_netdevice_notifiers_info(val, dev, &info);
1630 }
1631 EXPORT_SYMBOL(call_netdevice_notifiers);
1632
/*
 * Static key that gates packet timestamping: incremented by
 * net_enable_timestamp(), decremented by net_disable_timestamp(), and
 * tested by net_timestamp_set() and net_timestamp_check().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counts the deferred decrements; net_enable_timestamp() drains it.
 */
static atomic_t netstamp_needed_deferred;
#endif
1641
1642 void net_enable_timestamp(void)
1643 {
1644 #ifdef HAVE_JUMP_LABEL
1645         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1646
1647         if (deferred) {
1648                 while (--deferred)
1649                         static_key_slow_dec(&netstamp_needed);
1650                 return;
1651         }
1652 #endif
1653         static_key_slow_inc(&netstamp_needed);
1654 }
1655 EXPORT_SYMBOL(net_enable_timestamp);
1656
1657 void net_disable_timestamp(void)
1658 {
1659 #ifdef HAVE_JUMP_LABEL
1660         if (in_interrupt()) {
1661                 atomic_inc(&netstamp_needed_deferred);
1662                 return;
1663         }
1664 #endif
1665         static_key_slow_dec(&netstamp_needed);
1666 }
1667 EXPORT_SYMBOL(net_disable_timestamp);
1668
1669 static inline void net_timestamp_set(struct sk_buff *skb)
1670 {
1671         skb->tstamp.tv64 = 0;
1672         if (static_key_false(&netstamp_needed))
1673                 __net_timestamp(skb);
1674 }
1675
/*
 * net_timestamp_check - stamp @SKB when timestamping is enabled
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB:  buffer to stamp; evaluated more than once, so pass a plain
 *        variable
 *
 * If the netstamp_needed key is enabled, @COND is true and @SKB has no
 * timestamp yet (tstamp.tv64 is zero), calls __net_timestamp(SKB).
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and cannot capture an "else" that follows the call site.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1681
1682 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1683 {
1684         unsigned int len;
1685
1686         if (!(dev->flags & IFF_UP))
1687                 return false;
1688
1689         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1690         if (skb->len <= len)
1691                 return true;
1692
1693         /* if TSO is enabled, we don't care about the length as the packet
1694          * could be forwarded without being segmented before
1695          */
1696         if (skb_is_gso(skb))
1697                 return true;
1698
1699         return false;
1700 }
1701 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1702
1703 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1704 {
1705         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1706                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1707                         atomic_long_inc(&dev->rx_dropped);
1708                         kfree_skb(skb);
1709                         return NET_RX_DROP;
1710                 }
1711         }
1712
1713         if (unlikely(!is_skb_forwardable(dev, skb))) {
1714                 atomic_long_inc(&dev->rx_dropped);
1715                 kfree_skb(skb);
1716                 return NET_RX_DROP;
1717         }
1718
1719         skb_scrub_packet(skb, true);
1720         skb->priority = 0;
1721         skb->protocol = eth_type_trans(skb, dev);
1722         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1723
1724         return 0;
1725 }
1726 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1727
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * Runs __dev_forward_skb() on the buffer and, if that succeeds, queues it
 * for reception with netif_rx_internal().
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * Intended for injecting an skb from the start_xmit function of one
 * device into the receive queue of another device. The receiving device
 * may live in another namespace, so information in the skb that could
 * impact namespace isolation has to be cleared on the way.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = __dev_forward_skb(dev, skb);

	if (ret)
		return ret;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1751
1752 static inline int deliver_skb(struct sk_buff *skb,
1753                               struct packet_type *pt_prev,
1754                               struct net_device *orig_dev)
1755 {
1756         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1757                 return -ENOMEM;
1758         atomic_inc(&skb->users);
1759         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1760 }
1761
1762 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1763                                           struct packet_type **pt,
1764                                           struct net_device *orig_dev,
1765                                           __be16 type,
1766                                           struct list_head *ptype_list)
1767 {
1768         struct packet_type *ptype, *pt_prev = *pt;
1769
1770         list_for_each_entry_rcu(ptype, ptype_list, list) {
1771                 if (ptype->type != type)
1772                         continue;
1773                 if (pt_prev)
1774                         deliver_skb(skb, pt_prev, orig_dev);
1775                 pt_prev = ptype;
1776         }
1777         *pt = pt_prev;
1778 }
1779
1780 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1781 {
1782         if (!ptype->af_packet_priv || !skb->sk)
1783                 return false;
1784
1785         if (ptype->id_match)
1786                 return ptype->id_match(ptype, skb->sk);
1787         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1788                 return true;
1789
1790         return false;
1791 }
1792
1793 /*
1794  *      Support routine. Sends outgoing frames to any network
1795  *      taps currently in use.
1796  */
1797
1798 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1799 {
1800         struct packet_type *ptype;
1801         struct sk_buff *skb2 = NULL;
1802         struct packet_type *pt_prev = NULL;
1803         struct list_head *ptype_list = &ptype_all;
1804
1805         rcu_read_lock();
1806 again:
1807         list_for_each_entry_rcu(ptype, ptype_list, list) {
1808                 /* Never send packets back to the socket
1809                  * they originated from - MvS ([email protected])
1810                  */
1811                 if (skb_loop_sk(ptype, skb))
1812                         continue;
1813
1814                 if (pt_prev) {
1815                         deliver_skb(skb2, pt_prev, skb->dev);
1816                         pt_prev = ptype;
1817                         continue;
1818                 }
1819
1820                 /* need to clone skb, done only once */
1821                 skb2 = skb_clone(skb, GFP_ATOMIC);
1822                 if (!skb2)
1823                         goto out_unlock;
1824
1825                 net_timestamp_set(skb2);
1826
1827                 /* skb->nh should be correctly
1828                  * set by sender, so that the second statement is
1829                  * just protection against buggy protocols.
1830                  */
1831                 skb_reset_mac_header(skb2);
1832
1833                 if (skb_network_header(skb2) < skb2->data ||
1834                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1835                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1836                                              ntohs(skb2->protocol),
1837                                              dev->name);
1838                         skb_reset_network_header(skb2);
1839                 }
1840
1841                 skb2->transport_header = skb2->network_header;
1842                 skb2->pkt_type = PACKET_OUTGOING;
1843                 pt_prev = ptype;
1844         }
1845
1846         if (ptype_list == &ptype_all) {
1847                 ptype_list = &dev->ptype_all;
1848                 goto again;
1849         }
1850 out_unlock:
1851         if (pt_prev)
1852                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1853         rcu_read_unlock();
1854 }
1855
1856 /**
1857  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1858  * @dev: Network device
1859  * @txq: number of queues available
1860  *
1861  * If real_num_tx_queues is changed the tc mappings may no longer be
1862  * valid. To resolve this verify the tc mapping remains valid and if
1863  * not NULL the mapping. With no priorities mapping to this
1864  * offset/count pair it will no longer be used. In the worst case TC0
1865  * is invalid nothing can be done so disable priority mappings. If is
1866  * expected that drivers will fix this mapping if they can before
1867  * calling netif_set_real_num_tx_queues.
1868  */
1869 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1870 {
1871         int i;
1872         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1873
1874         /* If TC0 is invalidated disable TC mapping */
1875         if (tc->offset + tc->count > txq) {
1876                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1877                 dev->num_tc = 0;
1878                 return;
1879         }
1880
1881         /* Invalidated prio to tc mappings set to TC0 */
1882         for (i = 1; i < TC_BITMASK + 1; i++) {
1883                 int q = netdev_get_prio_tc_map(dev, i);
1884
1885                 tc = &dev->tc_to_txq[q];
1886                 if (tc->offset + tc->count > txq) {
1887                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1888                                 i, q);
1889                         netdev_set_prio_tc_map(dev, i, 0);
1890                 }
1891         }
1892 }
1893
1894 #ifdef CONFIG_XPS
1895 static DEFINE_MUTEX(xps_map_mutex);
1896 #define xmap_dereference(P)             \
1897         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1898
1899 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1900                                         int cpu, u16 index)
1901 {
1902         struct xps_map *map = NULL;
1903         int pos;
1904
1905         if (dev_maps)
1906                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1907
1908         for (pos = 0; map && pos < map->len; pos++) {
1909                 if (map->queues[pos] == index) {
1910                         if (map->len > 1) {
1911                                 map->queues[pos] = map->queues[--map->len];
1912                         } else {
1913                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1914                                 kfree_rcu(map, rcu);
1915                                 map = NULL;
1916                         }
1917                         break;
1918                 }
1919         }
1920
1921         return map;
1922 }
1923
1924 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1925 {
1926         struct xps_dev_maps *dev_maps;
1927         int cpu, i;
1928         bool active = false;
1929
1930         mutex_lock(&xps_map_mutex);
1931         dev_maps = xmap_dereference(dev->xps_maps);
1932
1933         if (!dev_maps)
1934                 goto out_no_maps;
1935
1936         for_each_possible_cpu(cpu) {
1937                 for (i = index; i < dev->num_tx_queues; i++) {
1938                         if (!remove_xps_queue(dev_maps, cpu, i))
1939                                 break;
1940                 }
1941                 if (i == dev->num_tx_queues)
1942                         active = true;
1943         }
1944
1945         if (!active) {
1946                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1947                 kfree_rcu(dev_maps, rcu);
1948         }
1949
1950         for (i = index; i < dev->num_tx_queues; i++)
1951                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1952                                              NUMA_NO_NODE);
1953
1954 out_no_maps:
1955         mutex_unlock(&xps_map_mutex);
1956 }
1957
1958 static struct xps_map *expand_xps_map(struct xps_map *map,
1959                                       int cpu, u16 index)
1960 {
1961         struct xps_map *new_map;
1962         int alloc_len = XPS_MIN_MAP_ALLOC;
1963         int i, pos;
1964
1965         for (pos = 0; map && pos < map->len; pos++) {
1966                 if (map->queues[pos] != index)
1967                         continue;
1968                 return map;
1969         }
1970
1971         /* Need to add queue to this CPU's existing map */
1972         if (map) {
1973                 if (pos < map->alloc_len)
1974                         return map;
1975
1976                 alloc_len = map->alloc_len * 2;
1977         }
1978
1979         /* Need to allocate new map to store queue on this CPU's map */
1980         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1981                                cpu_to_node(cpu));
1982         if (!new_map)
1983                 return NULL;
1984
1985         for (i = 0; i < pos; i++)
1986                 new_map->queues[i] = map->queues[i];
1987         new_map->alloc_len = alloc_len;
1988         new_map->len = pos;
1989
1990         return new_map;
1991 }
1992
1993 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1994                         u16 index)
1995 {
1996         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1997         struct xps_map *map, *new_map;
1998         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1999         int cpu, numa_node_id = -2;
2000         bool active = false;
2001
2002         mutex_lock(&xps_map_mutex);
2003
2004         dev_maps = xmap_dereference(dev->xps_maps);
2005
2006         /* allocate memory for queue storage */
2007         for_each_online_cpu(cpu) {
2008                 if (!cpumask_test_cpu(cpu, mask))
2009                         continue;
2010
2011                 if (!new_dev_maps)
2012                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2013                 if (!new_dev_maps) {
2014                         mutex_unlock(&xps_map_mutex);
2015                         return -ENOMEM;
2016                 }
2017
2018                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2019                                  NULL;
2020
2021                 map = expand_xps_map(map, cpu, index);
2022                 if (!map)
2023                         goto error;
2024
2025                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2026         }
2027
2028         if (!new_dev_maps)
2029                 goto out_no_new_maps;
2030
2031         for_each_possible_cpu(cpu) {
2032                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2033                         /* add queue to CPU maps */
2034                         int pos = 0;
2035
2036                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2037                         while ((pos < map->len) && (map->queues[pos] != index))
2038                                 pos++;
2039
2040                         if (pos == map->len)
2041                                 map->queues[map->len++] = index;
2042 #ifdef CONFIG_NUMA
2043                         if (numa_node_id == -2)
2044                                 numa_node_id = cpu_to_node(cpu);
2045                         else if (numa_node_id != cpu_to_node(cpu))
2046                                 numa_node_id = -1;
2047 #endif
2048                 } else if (dev_maps) {
2049                         /* fill in the new device map from the old device map */
2050                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2051                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2052                 }
2053
2054         }
2055
2056         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2057
2058         /* Cleanup old maps */
2059         if (dev_maps) {
2060                 for_each_possible_cpu(cpu) {
2061                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2062                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2063                         if (map && map != new_map)
2064                                 kfree_rcu(map, rcu);
2065                 }
2066
2067                 kfree_rcu(dev_maps, rcu);
2068         }
2069
2070         dev_maps = new_dev_maps;
2071         active = true;
2072
2073 out_no_new_maps:
2074         /* update Tx queue numa node */
2075         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2076                                      (numa_node_id >= 0) ? numa_node_id :
2077                                      NUMA_NO_NODE);
2078
2079         if (!dev_maps)
2080                 goto out_no_maps;
2081
2082         /* removes queue from unused CPUs */
2083         for_each_possible_cpu(cpu) {
2084                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2085                         continue;
2086
2087                 if (remove_xps_queue(dev_maps, cpu, index))
2088                         active = true;
2089         }
2090
2091         /* free map if not active */
2092         if (!active) {
2093                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2094                 kfree_rcu(dev_maps, rcu);
2095         }
2096
2097 out_no_maps:
2098         mutex_unlock(&xps_map_mutex);
2099
2100         return 0;
2101 error:
2102         /* remove any maps that we added */
2103         for_each_possible_cpu(cpu) {
2104                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2105                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2106                                  NULL;
2107                 if (new_map && new_map != map)
2108                         kfree(new_map);
2109         }
2110
2111         mutex_unlock(&xps_map_mutex);
2112
2113         kfree(new_dev_maps);
2114         return -ENOMEM;
2115 }
2116 EXPORT_SYMBOL(netif_set_xps_queue);
2117
2118 #endif
2119 /*
2120  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2121  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2122  */
2123 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2124 {
2125         int rc;
2126
2127         if (txq < 1 || txq > dev->num_tx_queues)
2128                 return -EINVAL;
2129
2130         if (dev->reg_state == NETREG_REGISTERED ||
2131             dev->reg_state == NETREG_UNREGISTERING) {
2132                 ASSERT_RTNL();
2133
2134                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2135                                                   txq);
2136                 if (rc)
2137                         return rc;
2138
2139                 if (dev->num_tc)
2140                         netif_setup_tc(dev, txq);
2141
2142                 if (txq < dev->real_num_tx_queues) {
2143                         qdisc_reset_all_tx_gt(dev, txq);
2144 #ifdef CONFIG_XPS
2145                         netif_reset_xps_queues_gt(dev, txq);
2146 #endif
2147                 }
2148         }
2149
2150         dev->real_num_tx_queues = txq;
2151         return 0;
2152 }
2153 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2154
2155 #ifdef CONFIG_SYSFS
2156 /**
2157  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2158  *      @dev: Network device
2159  *      @rxq: Actual number of RX queues
2160  *
2161  *      This must be called either with the rtnl_lock held or before
2162  *      registration of the net device.  Returns 0 on success, or a
2163  *      negative error code.  If called before registration, it always
2164  *      succeeds.
2165  */
2166 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2167 {
2168         int rc;
2169
2170         if (rxq < 1 || rxq > dev->num_rx_queues)
2171                 return -EINVAL;
2172
2173         if (dev->reg_state == NETREG_REGISTERED) {
2174                 ASSERT_RTNL();
2175
2176                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2177                                                   rxq);
2178                 if (rc)
2179                         return rc;
2180         }
2181
2182         dev->real_num_rx_queues = rxq;
2183         return 0;
2184 }
2185 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2186 #endif
2187
2188 /**
2189  * netif_get_num_default_rss_queues - default number of RSS queues
2190  *
2191  * This routine should set an upper limit on the number of RSS queues
2192  * used by default by multiqueue devices.
2193  */
2194 int netif_get_num_default_rss_queues(void)
2195 {
2196         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2197 }
2198 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2199
2200 static inline void __netif_reschedule(struct Qdisc *q)
2201 {
2202         struct softnet_data *sd;
2203         unsigned long flags;
2204
2205         local_irq_save(flags);
2206         sd = this_cpu_ptr(&softnet_data);
2207         q->next_sched = NULL;
2208         *sd->output_queue_tailp = q;
2209         sd->output_queue_tailp = &q->next_sched;
2210         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2211         local_irq_restore(flags);
2212 }
2213
2214 void __netif_schedule(struct Qdisc *q)
2215 {
2216         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2217                 __netif_reschedule(q);
2218 }
2219 EXPORT_SYMBOL(__netif_schedule);
2220
2221 struct dev_kfree_skb_cb {
2222         enum skb_free_reason reason;
2223 };
2224
2225 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2226 {
2227         return (struct dev_kfree_skb_cb *)skb->cb;
2228 }
2229
2230 void netif_schedule_queue(struct netdev_queue *txq)
2231 {
2232         rcu_read_lock();
2233         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2234                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2235
2236                 __netif_schedule(q);
2237         }
2238         rcu_read_unlock();
2239 }
2240 EXPORT_SYMBOL(netif_schedule_queue);
2241
2242 /**
2243  *      netif_wake_subqueue - allow sending packets on subqueue
2244  *      @dev: network device
2245  *      @queue_index: sub queue index
2246  *
2247  * Resume individual transmit queue of a device with multiple transmit queues.
2248  */
2249 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2250 {
2251         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2252
2253         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2254                 struct Qdisc *q;
2255
2256                 rcu_read_lock();
2257                 q = rcu_dereference(txq->qdisc);
2258                 __netif_schedule(q);
2259                 rcu_read_unlock();
2260         }
2261 }
2262 EXPORT_SYMBOL(netif_wake_subqueue);
2263
2264 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2265 {
2266         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2267                 struct Qdisc *q;
2268
2269                 rcu_read_lock();
2270                 q = rcu_dereference(dev_queue->qdisc);
2271                 __netif_schedule(q);
2272                 rcu_read_unlock();
2273         }
2274 }
2275 EXPORT_SYMBOL(netif_tx_wake_queue);
2276
2277 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2278 {
2279         unsigned long flags;
2280
2281         if (likely(atomic_read(&skb->users) == 1)) {
2282                 smp_rmb();
2283                 atomic_set(&skb->users, 0);
2284         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2285                 return;
2286         }
2287         get_kfree_skb_cb(skb)->reason = reason;
2288         local_irq_save(flags);
2289         skb->next = __this_cpu_read(softnet_data.completion_queue);
2290         __this_cpu_write(softnet_data.completion_queue, skb);
2291         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2292         local_irq_restore(flags);
2293 }
2294 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2295
2296 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2297 {
2298         if (in_irq() || irqs_disabled())
2299                 __dev_kfree_skb_irq(skb, reason);
2300         else
2301                 dev_kfree_skb(skb);
2302 }
2303 EXPORT_SYMBOL(__dev_kfree_skb_any);
2304
2305
2306 /**
2307  * netif_device_detach - mark device as removed
2308  * @dev: network device
2309  *
2310  * Mark device as removed from system and therefore no longer available.
2311  */
2312 void netif_device_detach(struct net_device *dev)
2313 {
2314         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2315             netif_running(dev)) {
2316                 netif_tx_stop_all_queues(dev);
2317         }
2318 }
2319 EXPORT_SYMBOL(netif_device_detach);
2320
2321 /**
2322  * netif_device_attach - mark device as attached
2323  * @dev: network device
2324  *
2325  * Mark device as attached from system and restart if needed.
2326  */
2327 void netif_device_attach(struct net_device *dev)
2328 {
2329         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2330             netif_running(dev)) {
2331                 netif_tx_wake_all_queues(dev);
2332                 __netdev_watchdog_up(dev);
2333         }
2334 }
2335 EXPORT_SYMBOL(netif_device_attach);
2336
2337 static void skb_warn_bad_offload(const struct sk_buff *skb)
2338 {
2339         static const netdev_features_t null_features = 0;
2340         struct net_device *dev = skb->dev;
2341         const char *driver = "";
2342
2343         if (!net_ratelimit())
2344                 return;
2345
2346         if (dev && dev->dev.parent)
2347                 driver = dev_driver_string(dev->dev.parent);
2348
2349         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2350              "gso_type=%d ip_summed=%d\n",
2351              driver, dev ? &dev->features : &null_features,
2352              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2353              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2354              skb_shinfo(skb)->gso_type, skb->ip_summed);
2355 }
2356
2357 /*
2358  * Invalidate hardware checksum when packet is to be mangled, and
2359  * complete checksum manually on outgoing path.
2360  */
2361 int skb_checksum_help(struct sk_buff *skb)
2362 {
2363         __wsum csum;
2364         int ret = 0, offset;
2365
2366         if (skb->ip_summed == CHECKSUM_COMPLETE)
2367                 goto out_set_summed;
2368
2369         if (unlikely(skb_shinfo(skb)->gso_size)) {
2370                 skb_warn_bad_offload(skb);
2371                 return -EINVAL;
2372         }
2373
2374         /* Before computing a checksum, we should make sure no frag could
2375          * be modified by an external entity : checksum could be wrong.
2376          */
2377         if (skb_has_shared_frag(skb)) {
2378                 ret = __skb_linearize(skb);
2379                 if (ret)
2380                         goto out;
2381         }
2382
2383         offset = skb_checksum_start_offset(skb);
2384         BUG_ON(offset >= skb_headlen(skb));
2385         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2386
2387         offset += skb->csum_offset;
2388         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2389
2390         if (skb_cloned(skb) &&
2391             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2392                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2393                 if (ret)
2394                         goto out;
2395         }
2396
2397         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2398 out_set_summed:
2399         skb->ip_summed = CHECKSUM_NONE;
2400 out:
2401         return ret;
2402 }
2403 EXPORT_SYMBOL(skb_checksum_help);
2404
2405 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2406 {
2407         __be16 type = skb->protocol;
2408
2409         /* Tunnel gso handlers can set protocol to ethernet. */
2410         if (type == htons(ETH_P_TEB)) {
2411                 struct ethhdr *eth;
2412
2413                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2414                         return 0;
2415
2416                 eth = (struct ethhdr *)skb_mac_header(skb);
2417                 type = eth->h_proto;
2418         }
2419
2420         return __vlan_get_protocol(skb, type, depth);
2421 }
2422
2423 /**
2424  *      skb_mac_gso_segment - mac layer segmentation handler.
2425  *      @skb: buffer to segment
2426  *      @features: features for the output path (see dev->features)
2427  */
2428 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2429                                     netdev_features_t features)
2430 {
2431         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2432         struct packet_offload *ptype;
2433         int vlan_depth = skb->mac_len;
2434         __be16 type = skb_network_protocol(skb, &vlan_depth);
2435
2436         if (unlikely(!type))
2437                 return ERR_PTR(-EINVAL);
2438
2439         __skb_pull(skb, vlan_depth);
2440
2441         rcu_read_lock();
2442         list_for_each_entry_rcu(ptype, &offload_base, list) {
2443                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2444                         segs = ptype->callbacks.gso_segment(skb, features);
2445                         break;
2446                 }
2447         }
2448         rcu_read_unlock();
2449
2450         __skb_push(skb, skb->data - skb_mac_header(skb));
2451
2452         return segs;
2453 }
2454 EXPORT_SYMBOL(skb_mac_gso_segment);
2455
2456
2457 /* openvswitch calls this on rx path, so we need a different check.
2458  */
2459 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2460 {
2461         if (tx_path)
2462                 return skb->ip_summed != CHECKSUM_PARTIAL;
2463         else
2464                 return skb->ip_summed == CHECKSUM_NONE;
2465 }
2466
2467 /**
2468  *      __skb_gso_segment - Perform segmentation on skb.
2469  *      @skb: buffer to segment
2470  *      @features: features for the output path (see dev->features)
2471  *      @tx_path: whether it is called in TX path
2472  *
2473  *      This function segments the given skb and returns a list of segments.
2474  *
2475  *      It may return NULL if the skb requires no segmentation.  This is
2476  *      only possible when GSO is used for verifying header integrity.
2477  */
2478 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2479                                   netdev_features_t features, bool tx_path)
2480 {
2481         if (unlikely(skb_needs_check(skb, tx_path))) {
2482                 int err;
2483
2484                 skb_warn_bad_offload(skb);
2485
2486                 err = skb_cow_head(skb, 0);
2487                 if (err < 0)
2488                         return ERR_PTR(err);
2489         }
2490
2491         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2492         SKB_GSO_CB(skb)->encap_level = 0;
2493
2494         skb_reset_mac_header(skb);
2495         skb_reset_mac_len(skb);
2496
2497         return skb_mac_gso_segment(skb, features);
2498 }
2499 EXPORT_SYMBOL(__skb_gso_segment);
2500
2501 /* Take action when hardware reception checksum errors are detected. */
2502 #ifdef CONFIG_BUG
2503 void netdev_rx_csum_fault(struct net_device *dev)
2504 {
2505         if (net_ratelimit()) {
2506                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2507                 dump_stack();
2508         }
2509 }
2510 EXPORT_SYMBOL(netdev_rx_csum_fault);
2511 #endif
2512
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb carries a page fragment that @dev cannot DMA from,
 * 0 otherwise. Without CONFIG_HIGHMEM the answer is always 0.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	/* device without NETIF_F_HIGHDMA: no highmem fragment is allowed */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;

		/* every fragment page must lie within the parent's DMA mask */
		for (i = 0; i < shinfo->nr_frags; i++) {
			dma_addr_t addr;

			addr = page_to_phys(skb_frag_page(&shinfo->frags[i]));
			if (!pdev->dma_mask ||
			    addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2545
2546 /* If MPLS offload request, verify we are testing hardware MPLS features
2547  * instead of standard features for the netdev.
2548  */
2549 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2550 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2551                                            netdev_features_t features,
2552                                            __be16 type)
2553 {
2554         if (eth_p_mpls(type))
2555                 features &= skb->dev->mpls_features;
2556
2557         return features;
2558 }
2559 #else
2560 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2561                                            netdev_features_t features,
2562                                            __be16 type)
2563 {
2564         return features;
2565 }
2566 #endif
2567
2568 static netdev_features_t harmonize_features(struct sk_buff *skb,
2569         netdev_features_t features)
2570 {
2571         int tmp;
2572         __be16 type;
2573
2574         type = skb_network_protocol(skb, &tmp);
2575         features = net_mpls_features(skb, features, type);
2576
2577         if (skb->ip_summed != CHECKSUM_NONE &&
2578             !can_checksum_protocol(features, type)) {
2579                 features &= ~NETIF_F_ALL_CSUM;
2580         } else if (illegal_highdma(skb->dev, skb)) {
2581                 features &= ~NETIF_F_SG;
2582         }
2583
2584         return features;
2585 }
2586
2587 netdev_features_t passthru_features_check(struct sk_buff *skb,
2588                                           struct net_device *dev,
2589                                           netdev_features_t features)
2590 {
2591         return features;
2592 }
2593 EXPORT_SYMBOL(passthru_features_check);
2594
2595 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2596                                              struct net_device *dev,
2597                                              netdev_features_t features)
2598 {
2599         return vlan_features_check(skb, features);
2600 }
2601
2602 netdev_features_t netif_skb_features(struct sk_buff *skb)
2603 {
2604         struct net_device *dev = skb->dev;
2605         netdev_features_t features = dev->features;
2606         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2607
2608         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2609                 features &= ~NETIF_F_GSO_MASK;
2610
2611         /* If encapsulation offload request, verify we are testing
2612          * hardware encapsulation features instead of standard
2613          * features for the netdev
2614          */
2615         if (skb->encapsulation)
2616                 features &= dev->hw_enc_features;
2617
2618         if (skb_vlan_tagged(skb))
2619                 features = netdev_intersect_features(features,
2620                                                      dev->vlan_features |
2621                                                      NETIF_F_HW_VLAN_CTAG_TX |
2622                                                      NETIF_F_HW_VLAN_STAG_TX);
2623
2624         if (dev->netdev_ops->ndo_features_check)
2625                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2626                                                                 features);
2627         else
2628                 features &= dflt_features_check(skb, dev, features);
2629
2630         return harmonize_features(skb, features);
2631 }
2632 EXPORT_SYMBOL(netif_skb_features);
2633
2634 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2635                     struct netdev_queue *txq, bool more)
2636 {
2637         unsigned int len;
2638         int rc;
2639
2640         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2641                 dev_queue_xmit_nit(skb, dev);
2642
2643         len = skb->len;
2644         trace_net_dev_start_xmit(skb, dev);
2645         rc = netdev_start_xmit(skb, dev, txq, more);
2646         trace_net_dev_xmit(skb, rc, dev, len);
2647
2648         return rc;
2649 }
2650
2651 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2652                                     struct netdev_queue *txq, int *ret)
2653 {
2654         struct sk_buff *skb = first;
2655         int rc = NETDEV_TX_OK;
2656
2657         while (skb) {
2658                 struct sk_buff *next = skb->next;
2659
2660                 skb->next = NULL;
2661                 rc = xmit_one(skb, dev, txq, next != NULL);
2662                 if (unlikely(!dev_xmit_complete(rc))) {
2663                         skb->next = next;
2664                         goto out;
2665                 }
2666
2667                 skb = next;
2668                 if (netif_xmit_stopped(txq) && skb) {
2669                         rc = NETDEV_TX_BUSY;
2670                         break;
2671                 }
2672         }
2673
2674 out:
2675         *ret = rc;
2676         return skb;
2677 }
2678
2679 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2680                                           netdev_features_t features)
2681 {
2682         if (skb_vlan_tag_present(skb) &&
2683             !vlan_hw_offload_capable(features, skb->vlan_proto))
2684                 skb = __vlan_hwaccel_push_inside(skb);
2685         return skb;
2686 }
2687
2688 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2689 {
2690         netdev_features_t features;
2691
2692         if (skb->next)
2693                 return skb;
2694
2695         features = netif_skb_features(skb);
2696         skb = validate_xmit_vlan(skb, features);
2697         if (unlikely(!skb))
2698                 goto out_null;
2699
2700         if (netif_needs_gso(dev, skb, features)) {
2701                 struct sk_buff *segs;
2702
2703                 segs = skb_gso_segment(skb, features);
2704                 if (IS_ERR(segs)) {
2705                         goto out_kfree_skb;
2706                 } else if (segs) {
2707                         consume_skb(skb);
2708                         skb = segs;
2709                 }
2710         } else {
2711                 if (skb_needs_linearize(skb, features) &&
2712                     __skb_linearize(skb))
2713                         goto out_kfree_skb;
2714
2715                 /* If packet is not checksummed and device does not
2716                  * support checksumming for this protocol, complete
2717                  * checksumming here.
2718                  */
2719                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2720                         if (skb->encapsulation)
2721                                 skb_set_inner_transport_header(skb,
2722                                                                skb_checksum_start_offset(skb));
2723                         else
2724                                 skb_set_transport_header(skb,
2725                                                          skb_checksum_start_offset(skb));
2726                         if (!(features & NETIF_F_ALL_CSUM) &&
2727                             skb_checksum_help(skb))
2728                                 goto out_kfree_skb;
2729                 }
2730         }
2731
2732         return skb;
2733
2734 out_kfree_skb:
2735         kfree_skb(skb);
2736 out_null:
2737         return NULL;
2738 }
2739
2740 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2741 {
2742         struct sk_buff *next, *head = NULL, *tail;
2743
2744         for (; skb != NULL; skb = next) {
2745                 next = skb->next;
2746                 skb->next = NULL;
2747
2748                 /* in case skb wont be segmented, point to itself */
2749                 skb->prev = skb;
2750
2751                 skb = validate_xmit_skb(skb, dev);
2752                 if (!skb)
2753                         continue;
2754
2755                 if (!head)
2756                         head = skb;
2757                 else
2758                         tail->next = skb;
2759                 /* If skb was segmented, skb->prev points to
2760                  * the last segment. If not, it still contains skb.
2761                  */
2762                 tail = skb->prev;
2763         }
2764         return head;
2765 }
2766
2767 static void qdisc_pkt_len_init(struct sk_buff *skb)
2768 {
2769         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2770
2771         qdisc_skb_cb(skb)->pkt_len = skb->len;
2772
2773         /* To get more precise estimation of bytes sent on wire,
2774          * we add to pkt_len the headers size of all segments
2775          */
2776         if (shinfo->gso_size)  {
2777                 unsigned int hdr_len;
2778                 u16 gso_segs = shinfo->gso_segs;
2779
2780                 /* mac layer + network layer */
2781                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2782
2783                 /* + transport layer */
2784                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2785                         hdr_len += tcp_hdrlen(skb);
2786                 else
2787                         hdr_len += sizeof(struct udphdr);
2788
2789                 if (shinfo->gso_type & SKB_GSO_DODGY)
2790                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2791                                                 shinfo->gso_size);
2792
2793                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2794         }
2795 }
2796
2797 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2798                                  struct net_device *dev,
2799                                  struct netdev_queue *txq)
2800 {
2801         spinlock_t *root_lock = qdisc_lock(q);
2802         bool contended;
2803         int rc;
2804
2805         qdisc_pkt_len_init(skb);
2806         qdisc_calculate_pkt_len(skb, q);
2807         /*
2808          * Heuristic to force contended enqueues to serialize on a
2809          * separate lock before trying to get qdisc main lock.
2810          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2811          * often and dequeue packets faster.
2812          */
2813         contended = qdisc_is_running(q);
2814         if (unlikely(contended))
2815                 spin_lock(&q->busylock);
2816
2817         spin_lock(root_lock);
2818         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2819                 kfree_skb(skb);
2820                 rc = NET_XMIT_DROP;
2821         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2822                    qdisc_run_begin(q)) {
2823                 /*
2824                  * This is a work-conserving queue; there are no old skbs
2825                  * waiting to be sent out; and the qdisc is not running -
2826                  * xmit the skb directly.
2827                  */
2828
2829                 qdisc_bstats_update(q, skb);
2830
2831                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2832                         if (unlikely(contended)) {
2833                                 spin_unlock(&q->busylock);
2834                                 contended = false;
2835                         }
2836                         __qdisc_run(q);
2837                 } else
2838                         qdisc_run_end(q);
2839
2840                 rc = NET_XMIT_SUCCESS;
2841         } else {
2842                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2843                 if (qdisc_run_begin(q)) {
2844                         if (unlikely(contended)) {
2845                                 spin_unlock(&q->busylock);
2846                                 contended = false;
2847                         }
2848                         __qdisc_run(q);
2849                 }
2850         }
2851         spin_unlock(root_lock);
2852         if (unlikely(contended))
2853                 spin_unlock(&q->busylock);
2854         return rc;
2855 }
2856
2857 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2858 static void skb_update_prio(struct sk_buff *skb)
2859 {
2860         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2861
2862         if (!skb->priority && skb->sk && map) {
2863                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2864
2865                 if (prioidx < map->priomap_len)
2866                         skb->priority = map->priomap[prioidx];
2867         }
2868 }
2869 #else
2870 #define skb_update_prio(skb)
2871 #endif
2872
2873 DEFINE_PER_CPU(int, xmit_recursion);
2874 EXPORT_SYMBOL(xmit_recursion);
2875
2876 #define RECURSION_LIMIT 10
2877
2878 /**
2879  *      dev_loopback_xmit - loop back @skb
2880  *      @skb: buffer to transmit
2881  */
2882 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2883 {
2884         skb_reset_mac_header(skb);
2885         __skb_pull(skb, skb_network_offset(skb));
2886         skb->pkt_type = PACKET_LOOPBACK;
2887         skb->ip_summed = CHECKSUM_UNNECESSARY;
2888         WARN_ON(!skb_dst(skb));
2889         skb_dst_force(skb);
2890         netif_rx_ni(skb);
2891         return 0;
2892 }
2893 EXPORT_SYMBOL(dev_loopback_xmit);
2894
2895 /**
2896  *      __dev_queue_xmit - transmit a buffer
2897  *      @skb: buffer to transmit
2898  *      @accel_priv: private data used for L2 forwarding offload
2899  *
2900  *      Queue a buffer for transmission to a network device. The caller must
2901  *      have set the device and priority and built the buffer before calling
2902  *      this function. The function can be called from an interrupt.
2903  *
2904  *      A negative errno code is returned on a failure. A success does not
2905  *      guarantee the frame will be transmitted as it may be dropped due
2906  *      to congestion or traffic shaping.
2907  *
2908  * -----------------------------------------------------------------------------------
2909  *      I notice this method can also return errors from the queue disciplines,
2910  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2911  *      be positive.
2912  *
2913  *      Regardless of the return value, the skb is consumed, so it is currently
2914  *      difficult to retry a send to this method.  (You can bump the ref count
2915  *      before sending to hold a reference for retry if you are careful.)
2916  *
2917  *      When calling this method, interrupts MUST be enabled.  This is because
2918  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2919  *          --BLG
2920  */
2921 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2922 {
2923         struct net_device *dev = skb->dev;
2924         struct netdev_queue *txq;
2925         struct Qdisc *q;
2926         int rc = -ENOMEM;
2927
2928         skb_reset_mac_header(skb);
2929
2930         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2931                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2932
2933         /* Disable soft irqs for various locks below. Also
2934          * stops preemption for RCU.
2935          */
2936         rcu_read_lock_bh();
2937
2938         skb_update_prio(skb);
2939
2940         /* If device/qdisc don't need skb->dst, release it right now while
2941          * its hot in this cpu cache.
2942          */
2943         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2944                 skb_dst_drop(skb);
2945         else
2946                 skb_dst_force(skb);
2947
2948         txq = netdev_pick_tx(dev, skb, accel_priv);
2949         q = rcu_dereference_bh(txq->qdisc);
2950
2951 #ifdef CONFIG_NET_CLS_ACT
2952         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2953 #endif
2954         trace_net_dev_queue(skb);
2955         if (q->enqueue) {
2956                 rc = __dev_xmit_skb(skb, q, dev, txq);
2957                 goto out;
2958         }
2959
2960         /* The device has no queue. Common case for software devices:
2961            loopback, all the sorts of tunnels...
2962
2963            Really, it is unlikely that netif_tx_lock protection is necessary
2964            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2965            counters.)
2966            However, it is possible, that they rely on protection
2967            made by us here.
2968
2969            Check this and shot the lock. It is not prone from deadlocks.
2970            Either shot noqueue qdisc, it is even simpler 8)
2971          */
2972         if (dev->flags & IFF_UP) {
2973                 int cpu = smp_processor_id(); /* ok because BHs are off */
2974
2975                 if (txq->xmit_lock_owner != cpu) {
2976
2977                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2978                                 goto recursion_alert;
2979
2980                         skb = validate_xmit_skb(skb, dev);
2981                         if (!skb)
2982                                 goto drop;
2983
2984                         HARD_TX_LOCK(dev, txq, cpu);
2985
2986                         if (!netif_xmit_stopped(txq)) {
2987                                 __this_cpu_inc(xmit_recursion);
2988                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2989                                 __this_cpu_dec(xmit_recursion);
2990                                 if (dev_xmit_complete(rc)) {
2991                                         HARD_TX_UNLOCK(dev, txq);
2992                                         goto out;
2993                                 }
2994                         }
2995                         HARD_TX_UNLOCK(dev, txq);
2996                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2997                                              dev->name);
2998                 } else {
2999                         /* Recursion is detected! It is possible,
3000                          * unfortunately
3001                          */
3002 recursion_alert:
3003                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3004                                              dev->name);
3005                 }
3006         }
3007
3008         rc = -ENETDOWN;
3009 drop:
3010         rcu_read_unlock_bh();
3011
3012         atomic_long_inc(&dev->tx_dropped);
3013         kfree_skb_list(skb);
3014         return rc;
3015 out:
3016         rcu_read_unlock_bh();
3017         return rc;
3018 }
3019
3020 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3021 {
3022         return __dev_queue_xmit(skb, NULL);
3023 }
3024 EXPORT_SYMBOL(dev_queue_xmit_sk);
3025
/* Queue @skb for transmission, handing @accel_priv (private data for L2
 * forwarding offload) through to __dev_queue_xmit().  Returns its result.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3031
3032
3033 /*=======================================================================
3034                         Receiver routines
3035   =======================================================================*/
3036
3037 int netdev_max_backlog __read_mostly = 1000;
3038 EXPORT_SYMBOL(netdev_max_backlog);
3039
3040 int netdev_tstamp_prequeue __read_mostly = 1;
3041 int netdev_budget __read_mostly = 300;
3042 int weight_p __read_mostly = 64;            /* old backlog weight */
3043
3044 /* Called with irq disabled */
3045 static inline void ____napi_schedule(struct softnet_data *sd,
3046                                      struct napi_struct *napi)
3047 {
3048         list_add_tail(&napi->poll_list, &sd->poll_list);
3049         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3050 }
3051
3052 #ifdef CONFIG_RPS
3053
3054 /* One global table that all flow-based protocols share. */
3055 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3056 EXPORT_SYMBOL(rps_sock_flow_table);
3057 u32 rps_cpu_mask __read_mostly;
3058 EXPORT_SYMBOL(rps_cpu_mask);
3059
3060 struct static_key rps_needed __read_mostly;
3061
3062 static struct rps_dev_flow *
3063 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3064             struct rps_dev_flow *rflow, u16 next_cpu)
3065 {
3066         if (next_cpu != RPS_NO_CPU) {
3067 #ifdef CONFIG_RFS_ACCEL
3068                 struct netdev_rx_queue *rxqueue;
3069                 struct rps_dev_flow_table *flow_table;
3070                 struct rps_dev_flow *old_rflow;
3071                 u32 flow_id;
3072                 u16 rxq_index;
3073                 int rc;
3074
3075                 /* Should we steer this flow to a different hardware queue? */
3076                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3077                     !(dev->features & NETIF_F_NTUPLE))
3078                         goto out;
3079                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3080                 if (rxq_index == skb_get_rx_queue(skb))
3081                         goto out;
3082
3083                 rxqueue = dev->_rx + rxq_index;
3084                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3085                 if (!flow_table)
3086                         goto out;
3087                 flow_id = skb_get_hash(skb) & flow_table->mask;
3088                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3089                                                         rxq_index, flow_id);
3090                 if (rc < 0)
3091                         goto out;
3092                 old_rflow = rflow;
3093                 rflow = &flow_table->flows[flow_id];
3094                 rflow->filter = rc;
3095                 if (old_rflow->filter == rflow->filter)
3096                         old_rflow->filter = RPS_NO_FILTER;
3097         out:
3098 #endif
3099                 rflow->last_qtail =
3100                         per_cpu(softnet_data, next_cpu).input_queue_head;
3101         }
3102
3103         rflow->cpu = next_cpu;
3104         return rflow;
3105 }
3106
3107 /*
3108  * get_rps_cpu is called from netif_receive_skb and returns the target
3109  * CPU from the RPS map of the receiving queue for a given skb.
3110  * rcu_read_lock must be held on entry.
3111  */
3112 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3113                        struct rps_dev_flow **rflowp)
3114 {
3115         const struct rps_sock_flow_table *sock_flow_table;
3116         struct netdev_rx_queue *rxqueue = dev->_rx;
3117         struct rps_dev_flow_table *flow_table;
3118         struct rps_map *map;
3119         int cpu = -1;
3120         u32 tcpu;
3121         u32 hash;
3122
3123         if (skb_rx_queue_recorded(skb)) {
3124                 u16 index = skb_get_rx_queue(skb);
3125
3126                 if (unlikely(index >= dev->real_num_rx_queues)) {
3127                         WARN_ONCE(dev->real_num_rx_queues > 1,
3128                                   "%s received packet on queue %u, but number "
3129                                   "of RX queues is %u\n",
3130                                   dev->name, index, dev->real_num_rx_queues);
3131                         goto done;
3132                 }
3133                 rxqueue += index;
3134         }
3135
3136         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3137
3138         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3139         map = rcu_dereference(rxqueue->rps_map);
3140         if (!flow_table && !map)
3141                 goto done;
3142
3143         skb_reset_network_header(skb);
3144         hash = skb_get_hash(skb);
3145         if (!hash)
3146                 goto done;
3147
3148         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3149         if (flow_table && sock_flow_table) {
3150                 struct rps_dev_flow *rflow;
3151                 u32 next_cpu;
3152                 u32 ident;
3153
3154                 /* First check into global flow table if there is a match */
3155                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3156                 if ((ident ^ hash) & ~rps_cpu_mask)
3157                         goto try_rps;
3158
3159                 next_cpu = ident & rps_cpu_mask;
3160
3161                 /* OK, now we know there is a match,
3162                  * we can look at the local (per receive queue) flow table
3163                  */
3164                 rflow = &flow_table->flows[hash & flow_table->mask];
3165                 tcpu = rflow->cpu;
3166
3167                 /*
3168                  * If the desired CPU (where last recvmsg was done) is
3169                  * different from current CPU (one in the rx-queue flow
3170                  * table entry), switch if one of the following holds:
3171                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3172                  *   - Current CPU is offline.
3173                  *   - The current CPU's queue tail has advanced beyond the
3174                  *     last packet that was enqueued using this table entry.
3175                  *     This guarantees that all previous packets for the flow
3176                  *     have been dequeued, thus preserving in order delivery.
3177                  */
3178                 if (unlikely(tcpu != next_cpu) &&
3179                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3180                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3181                       rflow->last_qtail)) >= 0)) {
3182                         tcpu = next_cpu;
3183                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3184                 }
3185
3186                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3187                         *rflowp = rflow;
3188                         cpu = tcpu;
3189                         goto done;
3190                 }
3191         }
3192
3193 try_rps:
3194
3195         if (map) {
3196                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3197                 if (cpu_online(tcpu)) {
3198                         cpu = tcpu;
3199                         goto done;
3200                 }
3201         }
3202
3203 done:
3204         return cpu;
3205 }
3206
3207 #ifdef CONFIG_RFS_ACCEL
3208
3209 /**
3210  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3211  * @dev: Device on which the filter was set
3212  * @rxq_index: RX queue index
3213  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3214  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3215  *
3216  * Drivers that implement ndo_rx_flow_steer() should periodically call
3217  * this function for each installed filter and remove the filters for
3218  * which it returns %true.
3219  */
3220 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3221                          u32 flow_id, u16 filter_id)
3222 {
3223         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3224         struct rps_dev_flow_table *flow_table;
3225         struct rps_dev_flow *rflow;
3226         bool expire = true;
3227         int cpu;
3228
3229         rcu_read_lock();
3230         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3231         if (flow_table && flow_id <= flow_table->mask) {
3232                 rflow = &flow_table->flows[flow_id];
3233                 cpu = ACCESS_ONCE(rflow->cpu);
3234                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3235                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3236                            rflow->last_qtail) <
3237                      (int)(10 * flow_table->mask)))
3238                         expire = false;
3239         }
3240         rcu_read_unlock();
3241         return expire;
3242 }
3243 EXPORT_SYMBOL(rps_may_expire_flow);
3244
3245 #endif /* CONFIG_RFS_ACCEL */
3246
3247 /* Called from hardirq (IPI) context */
3248 static void rps_trigger_softirq(void *data)
3249 {
3250         struct softnet_data *sd = data;
3251
3252         ____napi_schedule(sd, &sd->backlog);
3253         sd->received_rps++;
3254 }
3255
3256 #endif /* CONFIG_RPS */
3257
/*
 * Check whether @sd belongs to another CPU.
 * If it does, chain it onto this CPU's rps_ipi_list, raise the RX softirq
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Push sd on the front of the local IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3278
#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length (4096); only defined with CONFIG_NET_FLOW_LIMIT.
 * NOTE(review): the code that sizes tables from it is not visible here.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3282
3283 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3284 {
3285 #ifdef CONFIG_NET_FLOW_LIMIT
3286         struct sd_flow_limit *fl;
3287         struct softnet_data *sd;
3288         unsigned int old_flow, new_flow;
3289
3290         if (qlen < (netdev_max_backlog >> 1))
3291                 return false;
3292
3293         sd = this_cpu_ptr(&softnet_data);
3294
3295         rcu_read_lock();
3296         fl = rcu_dereference(sd->flow_limit);
3297         if (fl) {
3298                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3299                 old_flow = fl->history[fl->history_head];
3300                 fl->history[fl->history_head] = new_flow;
3301
3302                 fl->history_head++;
3303                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3304
3305                 if (likely(fl->buckets[old_flow]))
3306                         fl->buckets[old_flow]--;
3307
3308                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3309                         fl->count++;
3310                         rcu_read_unlock();
3311                         return true;
3312                 }
3313         }
3314         rcu_read_unlock();
3315 #endif
3316         return false;
3317 }
3318
3319 /*
3320  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3321  * queue (may be a remote CPU queue).
3322  */
3323 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3324                               unsigned int *qtail)
3325 {
3326         struct softnet_data *sd;
3327         unsigned long flags;
3328         unsigned int qlen;
3329
3330         sd = &per_cpu(softnet_data, cpu);
3331
3332         local_irq_save(flags);
3333
3334         rps_lock(sd);
3335         qlen = skb_queue_len(&sd->input_pkt_queue);
3336         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3337                 if (qlen) {
3338 enqueue:
3339                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3340                         input_queue_tail_incr_save(sd, qtail);
3341                         rps_unlock(sd);
3342                         local_irq_restore(flags);
3343                         return NET_RX_SUCCESS;
3344                 }
3345
3346                 /* Schedule NAPI for backlog device
3347                  * We can use non atomic operation since we own the queue lock
3348                  */
3349                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3350                         if (!rps_ipi_queued(sd))
3351                                 ____napi_schedule(sd, &sd->backlog);
3352                 }
3353                 goto enqueue;
3354         }
3355
3356         sd->dropped++;
3357         rps_unlock(sd);
3358
3359         local_irq_restore(flags);
3360
3361         atomic_long_inc(&skb->dev->rx_dropped);
3362         kfree_skb(skb);
3363         return NET_RX_DROP;
3364 }
3365
3366 static int netif_rx_internal(struct sk_buff *skb)
3367 {
3368         int ret;
3369
3370         net_timestamp_check(netdev_tstamp_prequeue, skb);
3371
3372         trace_netif_rx(skb);
3373 #ifdef CONFIG_RPS
3374         if (static_key_false(&rps_needed)) {
3375                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3376                 int cpu;
3377
3378                 preempt_disable();
3379                 rcu_read_lock();
3380
3381                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3382                 if (cpu < 0)
3383                         cpu = smp_processor_id();
3384
3385                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3386
3387                 rcu_read_unlock();
3388                 preempt_enable();
3389         } else
3390 #endif
3391         {
3392                 unsigned int qtail;
3393                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3394                 put_cpu();
3395         }
3396         return ret;
3397 }
3398
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Takes a packet from a device driver and queues it for processing
 *	by the upper (protocol) levels.  The call itself always succeeds;
 *	the buffer may still be dropped during processing, either for
 *	congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_rx);
3421
/* Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the skb was queued.  Returns the result of
 * netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3437
3438 static void net_tx_action(struct softirq_action *h)
3439 {
3440         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3441
3442         if (sd->completion_queue) {
3443                 struct sk_buff *clist;
3444
3445                 local_irq_disable();
3446                 clist = sd->completion_queue;
3447                 sd->completion_queue = NULL;
3448                 local_irq_enable();
3449
3450                 while (clist) {
3451                         struct sk_buff *skb = clist;
3452                         clist = clist->next;
3453
3454                         WARN_ON(atomic_read(&skb->users));
3455                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3456                                 trace_consume_skb(skb);
3457                         else
3458                                 trace_kfree_skb(skb, net_tx_action);
3459                         __kfree_skb(skb);
3460                 }
3461         }
3462
3463         if (sd->output_queue) {
3464                 struct Qdisc *head;
3465
3466                 local_irq_disable();
3467                 head = sd->output_queue;
3468                 sd->output_queue = NULL;
3469                 sd->output_queue_tailp = &sd->output_queue;
3470                 local_irq_enable();
3471
3472                 while (head) {
3473                         struct Qdisc *q = head;
3474                         spinlock_t *root_lock;
3475
3476                         head = head->next_sched;
3477
3478                         root_lock = qdisc_lock(q);
3479                         if (spin_trylock(root_lock)) {
3480                                 smp_mb__before_atomic();
3481                                 clear_bit(__QDISC_STATE_SCHED,
3482                                           &q->state);
3483                                 qdisc_run(q);
3484                                 spin_unlock(root_lock);
3485                         } else {
3486                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3487                                               &q->state)) {
3488                                         __netif_reschedule(q);
3489                                 } else {
3490                                         smp_mb__before_atomic();
3491                                         clear_bit(__QDISC_STATE_SCHED,
3492                                                   &q->state);
3493                                 }
3494                         }
3495                 }
3496         }
3497 }
3498
3499 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3500     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3501 /* This hook is defined here for ATM LANE */
     /* Only built when both bridge and ATM LANE support are configured
      * (built-in or modular).  The pointer takes a device and an address and
      * returns int; it is exported GPL-only.
      * NOTE(review): the code that sets and calls the hook is outside this
      * part of the file - confirm its contract there.
      */
3502 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3503                              unsigned char *addr) __read_mostly;
3504 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3505 #endif
3506
3507 #ifdef CONFIG_NET_CLS_ACT
3508 /* TODO: Maybe we should just force sch_ingress to be compiled in
3509  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3510  * a compare and 2 stores extra right now if we dont have it on
3511  * but have CONFIG_NET_CLS_ACT
3512  * NOTE: This doesn't stop any functionality; if you dont have
3513  * the ingress scheduler, you just can't add policies on ingress.
3514  *
3515  */
3516 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3517 {
3518         struct net_device *dev = skb->dev;
3519         u32 ttl = G_TC_RTTL(skb->tc_verd);
3520         int result = TC_ACT_OK;
3521         struct Qdisc *q;
3522
3523         if (unlikely(MAX_RED_LOOP < ttl++)) {
3524                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3525                                      skb->skb_iif, dev->ifindex);
3526                 return TC_ACT_SHOT;
3527         }
3528
3529         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3530         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3531
3532         q = rcu_dereference(rxq->qdisc);
3533         if (q != &noop_qdisc) {
3534                 spin_lock(qdisc_lock(q));
3535                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3536                         result = qdisc_enqueue_root(skb, q);
3537                 spin_unlock(qdisc_lock(q));
3538         }
3539
3540         return result;
3541 }
3542
3543 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3544                                          struct packet_type **pt_prev,
3545                                          int *ret, struct net_device *orig_dev)
3546 {
3547         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3548
3549         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3550                 goto out;
3551
3552         if (*pt_prev) {
3553                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3554                 *pt_prev = NULL;
3555         }
3556
3557         switch (ing_filter(skb, rxq)) {
3558         case TC_ACT_SHOT:
3559         case TC_ACT_STOLEN:
3560                 kfree_skb(skb);
3561                 return NULL;
3562         }
3563
3564 out:
3565         skb->tc_verd = 0;
3566         return skb;
3567 }
3568 #endif
3569
3570 /**
3571  *      netdev_rx_handler_register - register receive handler
3572  *      @dev: device to register a handler for
3573  *      @rx_handler: receive handler to register
3574  *      @rx_handler_data: data pointer that is used by rx handler
3575  *
3576  *      Register a receive handler for a device. This handler will then be
3577  *      called from __netif_receive_skb. A negative errno code is returned
3578  *      on a failure.
3579  *
3580  *      The caller must hold the rtnl_mutex.
3581  *
3582  *      For a general description of rx_handler, see enum rx_handler_result.
3583  */
3584 int netdev_rx_handler_register(struct net_device *dev,
3585                                rx_handler_func_t *rx_handler,
3586                                void *rx_handler_data)
3587 {
3588         ASSERT_RTNL();
3589
3590         if (dev->rx_handler)
3591                 return -EBUSY;
3592
3593         /* Note: rx_handler_data must be set before rx_handler */
3594         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3595         rcu_assign_pointer(dev->rx_handler, rx_handler);
3596
3597         return 0;
3598 }
3599 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3600
3601 /**
3602  *      netdev_rx_handler_unregister - unregister receive handler
3603  *      @dev: device to unregister a handler from
3604  *
3605  *      Unregister a receive handler from a device.
      *
      *      Teardown is done in two steps separated by synchronize_net():
      *      rx_handler is cleared first, rx_handler_data only afterwards.
3606  *
3607  *      The caller must hold the rtnl_mutex.
3608  */
3609 void netdev_rx_handler_unregister(struct net_device *dev)
3610 {
3611
3612         ASSERT_RTNL();
             /* Step 1: hide the handler from new readers. */
3613         RCU_INIT_POINTER(dev->rx_handler, NULL);
3614         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3615          * section has a guarantee to see a non NULL rx_handler_data
3616          * as well.
3617          */
3618         synchronize_net();
             /* Step 2: only now is it safe to drop the handler's data. */
3619         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3620 }
3621 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3622
3623 /*
3624  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3625  * the special handling of PFMEMALLOC skbs.
3626  */
3627 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3628 {
3629         switch (skb->protocol) {
3630         case htons(ETH_P_ARP):
3631         case htons(ETH_P_IP):
3632         case htons(ETH_P_IPV6):
3633         case htons(ETH_P_8021Q):
3634         case htons(ETH_P_8021AD):
3635                 return true;
3636         default:
3637                 return false;
3638         }
3639 }
3640
     /*
      * Core of the receive path for a single skb.
      *
      * The header offsets are reset, then - inside one rcu_read_lock()
      * section - the skb goes through, in order: VLAN untagging, the taps on
      * ptype_all and on the device's ptype_all (skipped when @pfmemalloc is
      * true), the ingress filter (CONFIG_NET_CLS_ACT only), vlan_do_receive(),
      * the device's rx_handler, and finally the protocol handler lists.
      *
      * Delivery runs one handler behind: each handler found is remembered in
      * pt_prev and the previously remembered one is served via deliver_skb().
      * The very last handler is invoked directly through ->func() at the end,
      * without going through deliver_skb().
      *
      * @skb: buffer to process
      * @pfmemalloc: when true, taps are skipped and the skb is dropped unless
      *              skb_pfmemalloc_protocol() accepts its protocol
      *
      * Returns NET_RX_SUCCESS when the rx_handler consumed the skb,
      * NET_RX_DROP when the skb is dropped at the drop: label, and otherwise
      * the value of the most recent handler delivery (initially NET_RX_DROP).
      */
3641 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3642 {
3643         struct packet_type *ptype, *pt_prev;
3644         rx_handler_func_t *rx_handler;
3645         struct net_device *orig_dev;
3646         bool deliver_exact = false;
3647         int ret = NET_RX_DROP;
3648         __be16 type;
3649
3650         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3651
3652         trace_netif_receive_skb(skb);
3653
             /* Remember the device the skb arrived on; skb->dev is compared
              * against it further down and it is passed to every handler.
              */
3654         orig_dev = skb->dev;
3655
3656         skb_reset_network_header(skb);
3657         if (!skb_transport_header_was_set(skb))
3658                 skb_reset_transport_header(skb);
3659         skb_reset_mac_len(skb);
3660
3661         pt_prev = NULL;
3662
3663         rcu_read_lock();
3664
             /* Re-entry point: taken again when vlan_do_receive() returns true
              * or the rx_handler answers RX_HANDLER_ANOTHER.  skb_iif and the
              * per-CPU "processed" counter are updated on every pass.
              */
3665 another_round:
3666         skb->skb_iif = skb->dev->ifindex;
3667
3668         __this_cpu_inc(softnet_data.processed);
3669
3670         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3671             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3672                 skb = skb_vlan_untag(skb);
3673                 if (unlikely(!skb))
3674                         goto unlock;
3675         }
3676
3677 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and jump past both the taps and the
              * ingress filter.
              */
3678         if (skb->tc_verd & TC_NCLS) {
3679                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3680                 goto ncls;
3681         }
3682 #endif
3683
3684         if (pfmemalloc)
3685                 goto skip_taps;
3686
             /* Taps registered for all devices, then taps on this device. */
3687         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3688                 if (pt_prev)
3689                         ret = deliver_skb(skb, pt_prev, orig_dev);
3690                 pt_prev = ptype;
3691         }
3692
3693         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3694                 if (pt_prev)
3695                         ret = deliver_skb(skb, pt_prev, orig_dev);
3696                 pt_prev = ptype;
3697         }
3698
3699 skip_taps:
3700 #ifdef CONFIG_NET_CLS_ACT
             /* handle_ing() returns NULL once it has freed the skb. */
3701         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3702         if (!skb)
3703                 goto unlock;
3704 ncls:
3705 #endif
3706
3707         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3708                 goto drop;
3709
3710         if (skb_vlan_tag_present(skb)) {
                     /* Flush the pending handler before the skb is handed to
                      * the VLAN code.
                      */
3711                 if (pt_prev) {
3712                         ret = deliver_skb(skb, pt_prev, orig_dev);
3713                         pt_prev = NULL;
3714                 }
3715                 if (vlan_do_receive(&skb))
3716                         goto another_round;
3717                 else if (unlikely(!skb))
3718                         goto unlock;
3719         }
3720
3721         rx_handler = rcu_dereference(skb->dev->rx_handler);
3722         if (rx_handler) {
3723                 if (pt_prev) {
3724                         ret = deliver_skb(skb, pt_prev, orig_dev);
3725                         pt_prev = NULL;
3726                 }
3727                 switch (rx_handler(&skb)) {
3728                 case RX_HANDLER_CONSUMED:
3729                         ret = NET_RX_SUCCESS;
3730                         goto unlock;
3731                 case RX_HANDLER_ANOTHER:
3732                         goto another_round;
3733                 case RX_HANDLER_EXACT:
3734                         deliver_exact = true;
                             /* fall through */
3735                 case RX_HANDLER_PASS:
3736                         break;
3737                 default:
3738                         BUG();
3739                 }
3740         }
3741
             /* A VLAN tag is still present at this point. */
3742         if (unlikely(skb_vlan_tag_present(skb))) {
3743                 if (skb_vlan_tag_get_id(skb))
3744                         skb->pkt_type = PACKET_OTHERHOST;
3745                 /* Note: we might in the future use prio bits
3746                  * and set skb->priority like in vlan_do_receive()
3747                  * For the time being, just ignore Priority Code Point
3748                  */
3749                 skb->vlan_tci = 0;
3750         }
3751
3752         type = skb->protocol;
3753
3754         /* deliver only exact match when indicated */
3755         if (likely(!deliver_exact)) {
3756                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3757                                        &ptype_base[ntohs(type) &
3758                                                    PTYPE_HASH_MASK]);
3759         }
3760
3761         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3762                                &orig_dev->ptype_specific);
3763
             /* skb->dev no longer matches the arrival device: also walk the
              * current device's specific handlers.
              */
3764         if (unlikely(skb->dev != orig_dev)) {
3765                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3766                                        &skb->dev->ptype_specific);
3767         }
3768
             /* Last handler: called directly rather than via deliver_skb().
              * With no handler left, or on any "goto drop" above, the skb is
              * counted in rx_dropped and freed.
              */
3769         if (pt_prev) {
3770                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3771                         goto drop;
3772                 else
3773                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3774         } else {
3775 drop:
3776                 atomic_long_inc(&skb->dev->rx_dropped);
3777                 kfree_skb(skb);
3778                 /* Jamal, now you will not able to escape explaining
3779                  * me how you were going to use this. :-)
3780                  */
3781                 ret = NET_RX_DROP;
3782         }
3783
3784 unlock:
3785         rcu_read_unlock();
3786         return ret;
3787 }
3788
3789 static int __netif_receive_skb(struct sk_buff *skb)
3790 {
3791         int ret;
3792
3793         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3794                 unsigned long pflags = current->flags;
3795
3796                 /*
3797                  * PFMEMALLOC skbs are special, they should
3798                  * - be delivered to SOCK_MEMALLOC sockets only
3799                  * - stay away from userspace
3800                  * - have bounded memory usage
3801                  *
3802                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3803                  * context down to all allocation sites.
3804                  */
3805                 current->flags |= PF_MEMALLOC;
3806                 ret = __netif_receive_skb_core(skb, true);
3807                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3808         } else
3809                 ret = __netif_receive_skb_core(skb, false);
3810
3811         return ret;
3812 }
3813
3814 static int netif_receive_skb_internal(struct sk_buff *skb)
3815 {
3816         net_timestamp_check(netdev_tstamp_prequeue, skb);
3817
3818         if (skb_defer_rx_timestamp(skb))
3819                 return NET_RX_SUCCESS;
3820
3821 #ifdef CONFIG_RPS
3822         if (static_key_false(&rps_needed)) {
3823                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3824                 int cpu, ret;
3825
3826                 rcu_read_lock();
3827
3828                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3829
3830                 if (cpu >= 0) {
3831                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3832                         rcu_read_unlock();
3833                         return ret;
3834                 }
3835                 rcu_read_unlock();
3836         }
3837 #endif
3838         return __netif_receive_skb(skb);
3839 }
3840
/**
 *      netif_receive_skb_sk - process receive buffer from network
 *      @sk: socket argument; not used by this function
 *      @skb: buffer to process
 *
 *      Main receive data processing function. It always succeeds. The
 *      buffer may be dropped during processing for congestion control or
 *      by the protocol layers.
 *
 *      This function may only be called from softirq context and interrupts
 *      should be enabled.
 *
 *      Return values (usually ignored):
 *      NET_RX_SUCCESS: no congestion
 *      NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
{
        int rc;

        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb_sk);
3863
3864 /* Network device is going away, flush any packets still pending
3865  * Called with irqs disabled.
3866  */
3867 static void flush_backlog(void *arg)
3868 {
3869         struct net_device *dev = arg;
3870         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3871         struct sk_buff *skb, *tmp;
3872
3873         rps_lock(sd);
3874         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3875                 if (skb->dev == dev) {
3876                         __skb_unlink(skb, &sd->input_pkt_queue);
3877                         kfree_skb(skb);
3878                         input_queue_head_incr(sd);
3879                 }
3880         }
3881         rps_unlock(sd);
3882
3883         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3884                 if (skb->dev == dev) {
3885                         __skb_unlink(skb, &sd->process_queue);
3886                         kfree_skb(skb);
3887                         input_queue_head_incr(sd);
3888                 }
3889         }
3890 }
3891
3892 static int napi_gro_complete(struct sk_buff *skb)
3893 {
3894         struct packet_offload *ptype;
3895         __be16 type = skb->protocol;
3896         struct list_head *head = &offload_base;
3897         int err = -ENOENT;
3898
3899         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3900
3901         if (NAPI_GRO_CB(skb)->count == 1) {
3902                 skb_shinfo(skb)->gso_size = 0;
3903                 goto out;
3904         }
3905
3906         rcu_read_lock();
3907         list_for_each_entry_rcu(ptype, head, list) {
3908                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3909                         continue;
3910
3911                 err = ptype->callbacks.gro_complete(skb, 0);
3912                 break;
3913         }
3914         rcu_read_unlock();
3915
3916         if (err) {
3917                 WARN_ON(&ptype->list == head);
3918                 kfree_skb(skb);
3919                 return NET_RX_SUCCESS;
3920         }
3921
3922 out:
3923         return netif_receive_skb_internal(skb);
3924 }
3925
3926 /* napi->gro_list contains packets ordered by age.
3927  * youngest packets at the head of it.
3928  * Complete skbs in reverse order to reduce latencies.
3929  */
3930 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3931 {
3932         struct sk_buff *skb, *prev = NULL;
3933
3934         /* scan list and build reverse chain */
3935         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3936                 skb->prev = prev;
3937                 prev = skb;
3938         }
3939
3940         for (skb = prev; skb; skb = prev) {
3941                 skb->next = NULL;
3942
3943                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3944                         return;
3945
3946                 prev = skb->prev;
3947                 napi_gro_complete(skb);
3948                 napi->gro_count--;
3949         }
3950
3951         napi->gro_list = NULL;
3952 }
3953 EXPORT_SYMBOL(napi_gro_flush);
3954
3955 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3956 {
3957         struct sk_buff *p;
3958         unsigned int maclen = skb->dev->hard_header_len;
3959         u32 hash = skb_get_hash_raw(skb);
3960
3961         for (p = napi->gro_list; p; p = p->next) {
3962                 unsigned long diffs;
3963
3964                 NAPI_GRO_CB(p)->flush = 0;
3965
3966                 if (hash != skb_get_hash_raw(p)) {
3967                         NAPI_GRO_CB(p)->same_flow = 0;
3968                         continue;
3969                 }
3970
3971                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3972                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3973                 if (maclen == ETH_HLEN)
3974                         diffs |= compare_ether_header(skb_mac_header(p),
3975                                                       skb_mac_header(skb));
3976                 else if (!diffs)
3977                         diffs = memcmp(skb_mac_header(p),
3978                                        skb_mac_header(skb),
3979                                        maclen);
3980                 NAPI_GRO_CB(p)->same_flow = !diffs;
3981         }
3982 }
3983
3984 static void skb_gro_reset_offset(struct sk_buff *skb)
3985 {
3986         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3987         const skb_frag_t *frag0 = &pinfo->frags[0];
3988
3989         NAPI_GRO_CB(skb)->data_offset = 0;
3990         NAPI_GRO_CB(skb)->frag0 = NULL;
3991         NAPI_GRO_CB(skb)->frag0_len = 0;
3992
3993         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3994             pinfo->nr_frags &&
3995             !PageHighMem(skb_frag_page(frag0))) {
3996                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3997                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3998         }
3999 }
4000
4001 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4002 {
4003         struct skb_shared_info *pinfo = skb_shinfo(skb);
4004
4005         BUG_ON(skb->end - skb->tail < grow);
4006
4007         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4008
4009         skb->data_len -= grow;
4010         skb->tail += grow;
4011
4012         pinfo->frags[0].page_offset += grow;
4013         skb_frag_size_sub(&pinfo->frags[0], grow);
4014
4015         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4016                 skb_frag_unref(skb, 0);
4017                 memmove(pinfo->frags, pinfo->frags + 1,
4018                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4019         }
4020 }
4021
     /*
      * Run GRO on @skb for @napi.
      *
      * Result:
      *  GRO_NORMAL      - GRO was not attempted (feature off, GSO skb, frag
      *                    list, bad checksum, no matching offload) or the
      *                    callback set the flush flag;
      *  GRO_MERGED,
      *  GRO_MERGED_FREE - the callback marked the skb as same_flow; the
      *                    second value is used when the "free" field is set;
      *  GRO_HELD        - the skb was put at the head of napi->gro_list as a
      *                    new flow.
      *
      * On every path except the same_flow one, header bytes beyond the linear
      * area (skb_gro_offset() > skb_headlen()) are pulled in from frag0 before
      * returning.
      */
4022 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4023 {
4024         struct sk_buff **pp = NULL;
4025         struct packet_offload *ptype;
4026         __be16 type = skb->protocol;
4027         struct list_head *head = &offload_base;
4028         int same_flow;
4029         enum gro_result ret;
4030         int grow;
4031
4032         if (!(skb->dev->features & NETIF_F_GRO))
4033                 goto normal;
4034
4035         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4036                 goto normal;
4037
4038         gro_list_prepare(napi, skb);
4039
             /* Only the first offload matching the protocol and providing a
              * gro_receive callback is tried.
              */
4040         rcu_read_lock();
4041         list_for_each_entry_rcu(ptype, head, list) {
4042                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4043                         continue;
4044
4045                 skb_set_network_header(skb, skb_gro_offset(skb));
4046                 skb_reset_mac_len(skb);
4047                 NAPI_GRO_CB(skb)->same_flow = 0;
4048                 NAPI_GRO_CB(skb)->flush = 0;
4049                 NAPI_GRO_CB(skb)->free = 0;
4050                 NAPI_GRO_CB(skb)->udp_mark = 0;
4051                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4052
4053                 /* Setup for GRO checksum validation */
4054                 switch (skb->ip_summed) {
4055                 case CHECKSUM_COMPLETE:
4056                         NAPI_GRO_CB(skb)->csum = skb->csum;
4057                         NAPI_GRO_CB(skb)->csum_valid = 1;
4058                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4059                         break;
4060                 case CHECKSUM_UNNECESSARY:
4061                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4062                         NAPI_GRO_CB(skb)->csum_valid = 0;
4063                         break;
4064                 default:
4065                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4066                         NAPI_GRO_CB(skb)->csum_valid = 0;
4067                 }
4068
4069                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4070                 break;
4071         }
4072         rcu_read_unlock();
4073
             /* The walk reached the list head again: no offload matched. */
4074         if (&ptype->list == head)
4075                 goto normal;
4076
4077         same_flow = NAPI_GRO_CB(skb)->same_flow;
4078         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4079
             /* The callback returned a list slot: unlink the skb found there
              * and complete it now.
              */
4080         if (pp) {
4081                 struct sk_buff *nskb = *pp;
4082
4083                 *pp = nskb->next;
4084                 nskb->next = NULL;
4085                 napi_gro_complete(nskb);
4086                 napi->gro_count--;
4087         }
4088
4089         if (same_flow)
4090                 goto ok;
4091
4092         if (NAPI_GRO_CB(skb)->flush)
4093                 goto normal;
4094
             /* New flow.  When the list is full, the entry at its end is
              * completed to make room and gro_count stays as it is;
              * otherwise gro_count grows by one.
              */
4095         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4096                 struct sk_buff *nskb = napi->gro_list;
4097
4098                 /* locate the end of the list to select the 'oldest' flow */
4099                 while (nskb->next) {
4100                         pp = &nskb->next;
4101                         nskb = *pp;
4102                 }
4103                 *pp = NULL;
4104                 nskb->next = NULL;
4105                 napi_gro_complete(nskb);
4106         } else {
4107                 napi->gro_count++;
4108         }
             /* Hold the skb: initialise its GRO state and push it on the head
              * of the list.
              */
4109         NAPI_GRO_CB(skb)->count = 1;
4110         NAPI_GRO_CB(skb)->age = jiffies;
4111         NAPI_GRO_CB(skb)->last = skb;
4112         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4113         skb->next = napi->gro_list;
4114         napi->gro_list = skb;
4115         ret = GRO_HELD;
4116
             /* Shared exit for the held and normal paths: copy any header
              * bytes still living in frag0 into the linear area.
              */
4117 pull:
4118         grow = skb_gro_offset(skb) - skb_headlen(skb);
4119         if (grow > 0)
4120                 gro_pull_from_frag0(skb, grow);
4121 ok:
4122         return ret;
4123
4124 normal:
4125         ret = GRO_NORMAL;
4126         goto pull;
4127 }
4128
4129 struct packet_offload *gro_find_receive_by_type(__be16 type)
4130 {
4131         struct list_head *offload_head = &offload_base;
4132         struct packet_offload *ptype;
4133
4134         list_for_each_entry_rcu(ptype, offload_head, list) {
4135                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4136                         continue;
4137                 return ptype;
4138         }
4139         return NULL;
4140 }
4141 EXPORT_SYMBOL(gro_find_receive_by_type);
4142
4143 struct packet_offload *gro_find_complete_by_type(__be16 type)
4144 {
4145         struct list_head *offload_head = &offload_base;
4146         struct packet_offload *ptype;
4147
4148         list_for_each_entry_rcu(ptype, offload_head, list) {
4149                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4150                         continue;
4151                 return ptype;
4152         }
4153         return NULL;
4154 }
4155 EXPORT_SYMBOL(gro_find_complete_by_type);
4156
4157 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4158 {
4159         switch (ret) {
4160         case GRO_NORMAL:
4161                 if (netif_receive_skb_internal(skb))
4162                         ret = GRO_DROP;
4163                 break;
4164
4165         case GRO_DROP:
4166                 kfree_skb(skb);
4167                 break;
4168
4169         case GRO_MERGED_FREE:
4170                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4171                         kmem_cache_free(skbuff_head_cache, skb);
4172                 else
4173                         __kfree_skb(skb);
4174                 break;
4175
4176         case GRO_HELD:
4177         case GRO_MERGED:
4178                 break;
4179         }
4180
4181         return ret;
4182 }
4183
4184 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4185 {
4186         trace_napi_gro_receive_entry(skb);
4187
4188         skb_gro_reset_offset(skb);
4189
4190         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4191 }
4192 EXPORT_SYMBOL(napi_gro_receive);
4193
4194 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4195 {
4196         if (unlikely(skb->pfmemalloc)) {
4197                 consume_skb(skb);
4198                 return;
4199         }
4200         __skb_pull(skb, skb_headlen(skb));
4201         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4202         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4203         skb->vlan_tci = 0;
4204         skb->dev = napi->dev;
4205         skb->skb_iif = 0;
4206         skb->encapsulation = 0;
4207         skb_shinfo(skb)->gso_type = 0;
4208         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4209
4210         napi->skb = skb;
4211 }
4212
4213 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4214 {
4215         struct sk_buff *skb = napi->skb;
4216
4217         if (!skb) {
4218                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4219                 napi->skb = skb;
4220         }
4221         return skb;
4222 }
4223 EXPORT_SYMBOL(napi_get_frags);
4224
4225 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4226                                       struct sk_buff *skb,
4227                                       gro_result_t ret)
4228 {
4229         switch (ret) {
4230         case GRO_NORMAL:
4231         case GRO_HELD:
4232                 __skb_push(skb, ETH_HLEN);
4233                 skb->protocol = eth_type_trans(skb, skb->dev);
4234                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4235                         ret = GRO_DROP;
4236                 break;
4237
4238         case GRO_DROP:
4239         case GRO_MERGED_FREE:
4240                 napi_reuse_skb(napi, skb);
4241                 break;
4242
4243         case GRO_MERGED:
4244                 break;
4245         }
4246
4247         return ret;
4248 }
4249
4250 /* Upper GRO stack assumes network header starts at gro_offset=0
4251  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4252  * We copy ethernet header into skb->data to have a common layout.
4253  */
4254 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4255 {
4256         struct sk_buff *skb = napi->skb;
4257         const struct ethhdr *eth;
4258         unsigned int hlen = sizeof(*eth);
4259
4260         napi->skb = NULL;
4261
4262         skb_reset_mac_header(skb);
4263         skb_gro_reset_offset(skb);
4264
4265         eth = skb_gro_header_fast(skb, 0);
4266         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4267                 eth = skb_gro_header_slow(skb, hlen, 0);
4268                 if (unlikely(!eth)) {
4269                         napi_reuse_skb(napi, skb);
4270                         return NULL;
4271                 }
4272         } else {
4273                 gro_pull_from_frag0(skb, hlen);
4274                 NAPI_GRO_CB(skb)->frag0 += hlen;
4275                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4276         }
4277         __skb_pull(skb, hlen);
4278
4279         /*
4280          * This works because the only protocols we care about don't require
4281          * special handling.
4282          * We'll fix it up properly in napi_frags_finish()
4283          */
4284         skb->protocol = eth->h_proto;
4285
4286         return skb;
4287 }
4288
4289 gro_result_t napi_gro_frags(struct napi_struct *napi)
4290 {
4291         struct sk_buff *skb = napi_frags_skb(napi);
4292
4293         if (!skb)
4294                 return GRO_DROP;
4295
4296         trace_napi_gro_frags_entry(skb);
4297
4298         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4299 }
4300 EXPORT_SYMBOL(napi_gro_frags);
4301
4302 /* Compute the checksum from gro_offset and return the folded value
4303  * after adding in any pseudo checksum.
4304  */
4305 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4306 {
4307         __wsum wsum;
4308         __sum16 sum;
4309
4310         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4311
4312         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4313         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4314         if (likely(!sum)) {
4315                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4316                     !skb->csum_complete_sw)
4317                         netdev_rx_csum_fault(skb->dev);
4318         }
4319
4320         NAPI_GRO_CB(skb)->csum = wsum;
4321         NAPI_GRO_CB(skb)->csum_valid = 1;
4322
4323         return sum;
4324 }
4325 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4326
/*
 * net_rps_action_and_irq_enable - send the IPIs queued on sd->rps_ipi_list.
 *
 * Must be entered with local irqs disabled; always returns with local irqs
 * enabled. Without CONFIG_RPS it only re-enables irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *cur = sd->rps_ipi_list;

        if (cur) {
                struct softnet_data *next;

                /* Detach the whole list while irqs are still off. */
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Send pending IPI's to kick RPS processing on remote cpus. */
                for (; cur; cur = next) {
                        /* Fetch the successor before issuing the call. */
                        next = cur->rps_ipi_next;

                        if (cpu_online(cur->cpu))
                                smp_call_function_single_async(cur->cpu,
                                                               &cur->csd);
                }
                return;
        }
#endif
        local_irq_enable();
}
4354
4355 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4356 {
4357 #ifdef CONFIG_RPS
4358         return sd->rps_ipi_list != NULL;
4359 #else
4360         return false;
4361 #endif
4362 }
4363
4364 static int process_backlog(struct napi_struct *napi, int quota)
4365 {
4366         int work = 0;
4367         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4368
4369         /* Check if we have pending ipi, its better to send them now,
4370          * not waiting net_rx_action() end.
4371          */
4372         if (sd_has_rps_ipi_waiting(sd)) {
4373                 local_irq_disable();
4374                 net_rps_action_and_irq_enable(sd);
4375         }
4376
4377         napi->weight = weight_p;
4378         local_irq_disable();
4379         while (1) {
4380                 struct sk_buff *skb;
4381
4382                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4383                         local_irq_enable();
4384                         __netif_receive_skb(skb);
4385                         local_irq_disable();
4386                         input_queue_head_incr(sd);
4387                         if (++work >= quota) {
4388                                 local_irq_enable();
4389                                 return work;
4390                         }
4391                 }
4392
4393                 rps_lock(sd);
4394                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4395                         /*
4396                          * Inline a custom version of __napi_complete().
4397                          * only current cpu owns and manipulates this napi,
4398                          * and NAPI_STATE_SCHED is the only possible flag set
4399                          * on backlog.
4400                          * We can use a plain write instead of clear_bit(),
4401                          * and we dont need an smp_mb() memory barrier.
4402                          */
4403                         napi->state = 0;
4404                         rps_unlock(sd);
4405
4406                         break;
4407                 }
4408
4409                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4410                                            &sd->process_queue);
4411                 rps_unlock(sd);
4412         }
4413         local_irq_enable();
4414
4415         return work;
4416 }
4417
4418 /**
4419  * __napi_schedule - schedule for receive
4420  * @n: entry to schedule
4421  *
4422  * The entry's receive function will be scheduled to run.
4423  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4424  */
4425 void __napi_schedule(struct napi_struct *n)
4426 {
4427         unsigned long flags;
4428
4429         local_irq_save(flags);
4430         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4431         local_irq_restore(flags);
4432 }
4433 EXPORT_SYMBOL(__napi_schedule);
4434
4435 /**
4436  * __napi_schedule_irqoff - schedule for receive
4437  * @n: entry to schedule
4438  *
4439  * Variant of __napi_schedule() assuming hard irqs are masked
4440  */
4441 void __napi_schedule_irqoff(struct napi_struct *n)
4442 {
4443         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4444 }
4445 EXPORT_SYMBOL(__napi_schedule_irqoff);
4446
4447 void __napi_complete(struct napi_struct *n)
4448 {
4449         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4450
4451         list_del_init(&n->poll_list);
4452         smp_mb__before_atomic();
4453         clear_bit(NAPI_STATE_SCHED, &n->state);
4454 }
4455 EXPORT_SYMBOL(__napi_complete);
4456
4457 void napi_complete_done(struct napi_struct *n, int work_done)
4458 {
4459         unsigned long flags;
4460
4461         /*
4462          * don't let napi dequeue from the cpu poll list
4463          * just in case its running on a different cpu
4464          */
4465         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4466                 return;
4467
4468         if (n->gro_list) {
4469                 unsigned long timeout = 0;
4470
4471                 if (work_done)
4472                         timeout = n->dev->gro_flush_timeout;
4473
4474                 if (timeout)
4475                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4476                                       HRTIMER_MODE_REL_PINNED);
4477                 else
4478                         napi_gro_flush(n, false);
4479         }
4480         if (likely(list_empty(&n->poll_list))) {
4481                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4482         } else {
4483                 /* If n->poll_list is not empty, we need to mask irqs */
4484                 local_irq_save(flags);
4485                 __napi_complete(n);
4486                 local_irq_restore(flags);
4487         }
4488 }
4489 EXPORT_SYMBOL(napi_complete_done);
4490
4491 /* must be called under rcu_read_lock(), as we dont take a reference */
4492 struct napi_struct *napi_by_id(unsigned int napi_id)
4493 {
4494         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4495         struct napi_struct *napi;
4496
4497         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4498                 if (napi->napi_id == napi_id)
4499                         return napi;
4500
4501         return NULL;
4502 }
4503 EXPORT_SYMBOL_GPL(napi_by_id);
4504
4505 void napi_hash_add(struct napi_struct *napi)
4506 {
4507         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4508
4509                 spin_lock(&napi_hash_lock);
4510
4511                 /* 0 is not a valid id, we also skip an id that is taken
4512                  * we expect both events to be extremely rare
4513                  */
4514                 napi->napi_id = 0;
4515                 while (!napi->napi_id) {
4516                         napi->napi_id = ++napi_gen_id;
4517                         if (napi_by_id(napi->napi_id))
4518                                 napi->napi_id = 0;
4519                 }
4520
4521                 hlist_add_head_rcu(&napi->napi_hash_node,
4522                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4523
4524                 spin_unlock(&napi_hash_lock);
4525         }
4526 }
4527 EXPORT_SYMBOL_GPL(napi_hash_add);
4528
4529 /* Warning : caller is responsible to make sure rcu grace period
4530  * is respected before freeing memory containing @napi
4531  */
4532 void napi_hash_del(struct napi_struct *napi)
4533 {
4534         spin_lock(&napi_hash_lock);
4535
4536         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4537                 hlist_del_rcu(&napi->napi_hash_node);
4538
4539         spin_unlock(&napi_hash_lock);
4540 }
4541 EXPORT_SYMBOL_GPL(napi_hash_del);
4542
4543 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4544 {
4545         struct napi_struct *napi;
4546
4547         napi = container_of(timer, struct napi_struct, timer);
4548         if (napi->gro_list)
4549                 napi_schedule(napi);
4550
4551         return HRTIMER_NORESTART;
4552 }
4553
4554 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4555                     int (*poll)(struct napi_struct *, int), int weight)
4556 {
4557         INIT_LIST_HEAD(&napi->poll_list);
4558         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4559         napi->timer.function = napi_watchdog;
4560         napi->gro_count = 0;
4561         napi->gro_list = NULL;
4562         napi->skb = NULL;
4563         napi->poll = poll;
4564         if (weight > NAPI_POLL_WEIGHT)
4565                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4566                             weight, dev->name);
4567         napi->weight = weight;
4568         list_add(&napi->dev_list, &dev->napi_list);
4569         napi->dev = dev;
4570 #ifdef CONFIG_NETPOLL
4571         spin_lock_init(&napi->poll_lock);
4572         napi->poll_owner = -1;
4573 #endif
4574         set_bit(NAPI_STATE_SCHED, &napi->state);
4575 }
4576 EXPORT_SYMBOL(netif_napi_add);
4577
4578 void napi_disable(struct napi_struct *n)
4579 {
4580         might_sleep();
4581         set_bit(NAPI_STATE_DISABLE, &n->state);
4582
4583         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4584                 msleep(1);
4585
4586         hrtimer_cancel(&n->timer);
4587
4588         clear_bit(NAPI_STATE_DISABLE, &n->state);
4589 }
4590 EXPORT_SYMBOL(napi_disable);
4591
4592 void netif_napi_del(struct napi_struct *napi)
4593 {
4594         list_del_init(&napi->dev_list);
4595         napi_free_frags(napi);
4596
4597         kfree_skb_list(napi->gro_list);
4598         napi->gro_list = NULL;
4599         napi->gro_count = 0;
4600 }
4601 EXPORT_SYMBOL(netif_napi_del);
4602
4603 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4604 {
4605         void *have;
4606         int work, weight;
4607
4608         list_del_init(&n->poll_list);
4609
4610         have = netpoll_poll_lock(n);
4611
4612         weight = n->weight;
4613
4614         /* This NAPI_STATE_SCHED test is for avoiding a race
4615          * with netpoll's poll_napi().  Only the entity which
4616          * obtains the lock and sees NAPI_STATE_SCHED set will
4617          * actually make the ->poll() call.  Therefore we avoid
4618          * accidentally calling ->poll() when NAPI is not scheduled.
4619          */
4620         work = 0;
4621         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4622                 work = n->poll(n, weight);
4623                 trace_napi_poll(n);
4624         }
4625
4626         WARN_ON_ONCE(work > weight);
4627
4628         if (likely(work < weight))
4629                 goto out_unlock;
4630
4631         /* Drivers must not modify the NAPI state if they
4632          * consume the entire weight.  In such cases this code
4633          * still "owns" the NAPI instance and therefore can
4634          * move the instance around on the list at-will.
4635          */
4636         if (unlikely(napi_disable_pending(n))) {
4637                 napi_complete(n);
4638                 goto out_unlock;
4639         }
4640
4641         if (n->gro_list) {
4642                 /* flush too old packets
4643                  * If HZ < 1000, flush all packets.
4644                  */
4645                 napi_gro_flush(n, HZ >= 1000);
4646         }
4647
4648         /* Some drivers may have called napi_schedule
4649          * prior to exhausting their budget.
4650          */
4651         if (unlikely(!list_empty(&n->poll_list))) {
4652                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4653                              n->dev ? n->dev->name : "backlog");
4654                 goto out_unlock;
4655         }
4656
4657         list_add_tail(&n->poll_list, repoll);
4658
4659 out_unlock:
4660         netpoll_poll_unlock(have);
4661
4662         return work;
4663 }
4664
4665 static void net_rx_action(struct softirq_action *h)
4666 {
4667         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4668         unsigned long time_limit = jiffies + 2;
4669         int budget = netdev_budget;
4670         LIST_HEAD(list);
4671         LIST_HEAD(repoll);
4672
4673         local_irq_disable();
4674         list_splice_init(&sd->poll_list, &list);
4675         local_irq_enable();
4676
4677         for (;;) {
4678                 struct napi_struct *n;
4679
4680                 if (list_empty(&list)) {
4681                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4682                                 return;
4683                         break;
4684                 }
4685
4686                 n = list_first_entry(&list, struct napi_struct, poll_list);
4687                 budget -= napi_poll(n, &repoll);
4688
4689                 /* If softirq window is exhausted then punt.
4690                  * Allow this to run for 2 jiffies since which will allow
4691                  * an average latency of 1.5/HZ.
4692                  */
4693                 if (unlikely(budget <= 0 ||
4694                              time_after_eq(jiffies, time_limit))) {
4695                         sd->time_squeeze++;
4696                         break;
4697                 }
4698         }
4699
4700         local_irq_disable();
4701
4702         list_splice_tail_init(&sd->poll_list, &list);
4703         list_splice_tail(&repoll, &list);
4704         list_splice(&list, &sd->poll_list);
4705         if (!list_empty(&sd->poll_list))
4706                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4707
4708         net_rps_action_and_irq_enable(sd);
4709 }
4710
/* One entry of a net_device adjacency list (upper or lower). */
struct netdev_adjacent {
        /* the adjacent device this entry refers to */
        struct net_device *dev;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* linkage into the adjacency list this entry lives on */
        struct list_head list;
        /* used by kfree_rcu() when the entry is released */
        struct rcu_head rcu;
};
4726
4727 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4728                                                  struct net_device *adj_dev,
4729                                                  struct list_head *adj_list)
4730 {
4731         struct netdev_adjacent *adj;
4732
4733         list_for_each_entry(adj, adj_list, list) {
4734                 if (adj->dev == adj_dev)
4735                         return adj;
4736         }
4737         return NULL;
4738 }
4739
4740 /**
4741  * netdev_has_upper_dev - Check if device is linked to an upper device
4742  * @dev: device
4743  * @upper_dev: upper device to check
4744  *
4745  * Find out if a device is linked to specified upper device and return true
4746  * in case it is. Note that this checks only immediate upper device,
4747  * not through a complete stack of devices. The caller must hold the RTNL lock.
4748  */
4749 bool netdev_has_upper_dev(struct net_device *dev,
4750                           struct net_device *upper_dev)
4751 {
4752         ASSERT_RTNL();
4753
4754         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4755 }
4756 EXPORT_SYMBOL(netdev_has_upper_dev);
4757
4758 /**
4759  * netdev_has_any_upper_dev - Check if device is linked to some device
4760  * @dev: device
4761  *
4762  * Find out if a device is linked to an upper device and return true in case
4763  * it is. The caller must hold the RTNL lock.
4764  */
4765 static bool netdev_has_any_upper_dev(struct net_device *dev)
4766 {
4767         ASSERT_RTNL();
4768
4769         return !list_empty(&dev->all_adj_list.upper);
4770 }
4771
4772 /**
4773  * netdev_master_upper_dev_get - Get master upper device
4774  * @dev: device
4775  *
4776  * Find a master upper device and return pointer to it or NULL in case
4777  * it's not there. The caller must hold the RTNL lock.
4778  */
4779 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4780 {
4781         struct netdev_adjacent *upper;
4782
4783         ASSERT_RTNL();
4784
4785         if (list_empty(&dev->adj_list.upper))
4786                 return NULL;
4787
4788         upper = list_first_entry(&dev->adj_list.upper,
4789                                  struct netdev_adjacent, list);
4790         if (likely(upper->master))
4791                 return upper->dev;
4792         return NULL;
4793 }
4794 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4795
4796 void *netdev_adjacent_get_private(struct list_head *adj_list)
4797 {
4798         struct netdev_adjacent *adj;
4799
4800         adj = list_entry(adj_list, struct netdev_adjacent, list);
4801
4802         return adj->private;
4803 }
4804 EXPORT_SYMBOL(netdev_adjacent_get_private);
4805
4806 /**
4807  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4808  * @dev: device
4809  * @iter: list_head ** of the current position
4810  *
4811  * Gets the next device from the dev's upper list, starting from iter
4812  * position. The caller must hold RCU read lock.
4813  */
4814 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4815                                                  struct list_head **iter)
4816 {
4817         struct netdev_adjacent *upper;
4818
4819         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4820
4821         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4822
4823         if (&upper->list == &dev->adj_list.upper)
4824                 return NULL;
4825
4826         *iter = &upper->list;
4827
4828         return upper->dev;
4829 }
4830 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4831
4832 /**
4833  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4834  * @dev: device
4835  * @iter: list_head ** of the current position
4836  *
4837  * Gets the next device from the dev's upper list, starting from iter
4838  * position. The caller must hold RCU read lock.
4839  */
4840 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4841                                                      struct list_head **iter)
4842 {
4843         struct netdev_adjacent *upper;
4844
4845         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4846
4847         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4848
4849         if (&upper->list == &dev->all_adj_list.upper)
4850                 return NULL;
4851
4852         *iter = &upper->list;
4853
4854         return upper->dev;
4855 }
4856 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4857
4858 /**
4859  * netdev_lower_get_next_private - Get the next ->private from the
4860  *                                 lower neighbour list
4861  * @dev: device
4862  * @iter: list_head ** of the current position
4863  *
4864  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4865  * list, starting from iter position. The caller must hold either hold the
4866  * RTNL lock or its own locking that guarantees that the neighbour lower
4867  * list will remain unchainged.
4868  */
4869 void *netdev_lower_get_next_private(struct net_device *dev,
4870                                     struct list_head **iter)
4871 {
4872         struct netdev_adjacent *lower;
4873
4874         lower = list_entry(*iter, struct netdev_adjacent, list);
4875
4876         if (&lower->list == &dev->adj_list.lower)
4877                 return NULL;
4878
4879         *iter = lower->list.next;
4880
4881         return lower->private;
4882 }
4883 EXPORT_SYMBOL(netdev_lower_get_next_private);
4884
4885 /**
4886  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4887  *                                     lower neighbour list, RCU
4888  *                                     variant
4889  * @dev: device
4890  * @iter: list_head ** of the current position
4891  *
4892  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4893  * list, starting from iter position. The caller must hold RCU read lock.
4894  */
4895 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4896                                         struct list_head **iter)
4897 {
4898         struct netdev_adjacent *lower;
4899
4900         WARN_ON_ONCE(!rcu_read_lock_held());
4901
4902         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4903
4904         if (&lower->list == &dev->adj_list.lower)
4905                 return NULL;
4906
4907         *iter = &lower->list;
4908
4909         return lower->private;
4910 }
4911 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4912
4913 /**
4914  * netdev_lower_get_next - Get the next device from the lower neighbour
4915  *                         list
4916  * @dev: device
4917  * @iter: list_head ** of the current position
4918  *
4919  * Gets the next netdev_adjacent from the dev's lower neighbour
4920  * list, starting from iter position. The caller must hold RTNL lock or
4921  * its own locking that guarantees that the neighbour lower
4922  * list will remain unchainged.
4923  */
4924 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4925 {
4926         struct netdev_adjacent *lower;
4927
4928         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4929
4930         if (&lower->list == &dev->adj_list.lower)
4931                 return NULL;
4932
4933         *iter = &lower->list;
4934
4935         return lower->dev;
4936 }
4937 EXPORT_SYMBOL(netdev_lower_get_next);
4938
4939 /**
4940  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4941  *                                     lower neighbour list, RCU
4942  *                                     variant
4943  * @dev: device
4944  *
4945  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4946  * list. The caller must hold RCU read lock.
4947  */
4948 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4949 {
4950         struct netdev_adjacent *lower;
4951
4952         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4953                         struct netdev_adjacent, list);
4954         if (lower)
4955                 return lower->private;
4956         return NULL;
4957 }
4958 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4959
4960 /**
4961  * netdev_master_upper_dev_get_rcu - Get master upper device
4962  * @dev: device
4963  *
4964  * Find a master upper device and return pointer to it or NULL in case
4965  * it's not there. The caller must hold the RCU read lock.
4966  */
4967 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4968 {
4969         struct netdev_adjacent *upper;
4970
4971         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4972                                        struct netdev_adjacent, list);
4973         if (upper && likely(upper->master))
4974                 return upper->dev;
4975         return NULL;
4976 }
4977 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4978
4979 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4980                               struct net_device *adj_dev,
4981                               struct list_head *dev_list)
4982 {
4983         char linkname[IFNAMSIZ+7];
4984         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4985                 "upper_%s" : "lower_%s", adj_dev->name);
4986         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4987                                  linkname);
4988 }
4989 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4990                                char *name,
4991                                struct list_head *dev_list)
4992 {
4993         char linkname[IFNAMSIZ+7];
4994         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4995                 "upper_%s" : "lower_%s", name);
4996         sysfs_remove_link(&(dev->dev.kobj), linkname);
4997 }
4998
4999 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5000                                                  struct net_device *adj_dev,
5001                                                  struct list_head *dev_list)
5002 {
5003         return (dev_list == &dev->adj_list.upper ||
5004                 dev_list == &dev->adj_list.lower) &&
5005                 net_eq(dev_net(dev), dev_net(adj_dev));
5006 }
5007
5008 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5009                                         struct net_device *adj_dev,
5010                                         struct list_head *dev_list,
5011                                         void *private, bool master)
5012 {
5013         struct netdev_adjacent *adj;
5014         int ret;
5015
5016         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5017
5018         if (adj) {
5019                 adj->ref_nr++;
5020                 return 0;
5021         }
5022
5023         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5024         if (!adj)
5025                 return -ENOMEM;
5026
5027         adj->dev = adj_dev;
5028         adj->master = master;
5029         adj->ref_nr = 1;
5030         adj->private = private;
5031         dev_hold(adj_dev);
5032
5033         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5034                  adj_dev->name, dev->name, adj_dev->name);
5035
5036         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5037                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5038                 if (ret)
5039                         goto free_adj;
5040         }
5041
5042         /* Ensure that master link is always the first item in list. */
5043         if (master) {
5044                 ret = sysfs_create_link(&(dev->dev.kobj),
5045                                         &(adj_dev->dev.kobj), "master");
5046                 if (ret)
5047                         goto remove_symlinks;
5048
5049                 list_add_rcu(&adj->list, dev_list);
5050         } else {
5051                 list_add_tail_rcu(&adj->list, dev_list);
5052         }
5053
5054         return 0;
5055
5056 remove_symlinks:
5057         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5058                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5059 free_adj:
5060         kfree(adj);
5061         dev_put(adj_dev);
5062
5063         return ret;
5064 }
5065
5066 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5067                                          struct net_device *adj_dev,
5068                                          struct list_head *dev_list)
5069 {
5070         struct netdev_adjacent *adj;
5071
5072         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5073
5074         if (!adj) {
5075                 pr_err("tried to remove device %s from %s\n",
5076                        dev->name, adj_dev->name);
5077                 BUG();
5078         }
5079
5080         if (adj->ref_nr > 1) {
5081                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5082                          adj->ref_nr-1);
5083                 adj->ref_nr--;
5084                 return;
5085         }
5086
5087         if (adj->master)
5088                 sysfs_remove_link(&(dev->dev.kobj), "master");
5089
5090         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5091                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5092
5093         list_del_rcu(&adj->list);
5094         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5095                  adj_dev->name, dev->name, adj_dev->name);
5096         dev_put(adj_dev);
5097         kfree_rcu(adj, rcu);
5098 }
5099
5100 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5101                                             struct net_device *upper_dev,
5102                                             struct list_head *up_list,
5103                                             struct list_head *down_list,
5104                                             void *private, bool master)
5105 {
5106         int ret;
5107
5108         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5109                                            master);
5110         if (ret)
5111                 return ret;
5112
5113         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5114                                            false);
5115         if (ret) {
5116                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5117                 return ret;
5118         }
5119
5120         return 0;
5121 }
5122
5123 static int __netdev_adjacent_dev_link(struct net_device *dev,
5124                                       struct net_device *upper_dev)
5125 {
5126         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5127                                                 &dev->all_adj_list.upper,
5128                                                 &upper_dev->all_adj_list.lower,
5129                                                 NULL, false);
5130 }
5131
/* Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * of @dev, then @dev from @down_list of @upper_dev.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        /* upward direction first, then the reverse link */
        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5140
5141 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5142                                          struct net_device *upper_dev)
5143 {
5144         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5145                                            &dev->all_adj_list.upper,
5146                                            &upper_dev->all_adj_list.lower);
5147 }
5148
5149 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5150                                                 struct net_device *upper_dev,
5151                                                 void *private, bool master)
5152 {
5153         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5154
5155         if (ret)
5156                 return ret;
5157
5158         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5159                                                &dev->adj_list.upper,
5160                                                &upper_dev->adj_list.lower,
5161                                                private, master);
5162         if (ret) {
5163                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5164                 return ret;
5165         }
5166
5167         return 0;
5168 }
5169
5170 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5171                                                    struct net_device *upper_dev)
5172 {
5173         __netdev_adjacent_dev_unlink(dev, upper_dev);
5174         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5175                                            &dev->adj_list.upper,
5176                                            &upper_dev->adj_list.lower);
5177 }
5178
5179 static int __netdev_upper_dev_link(struct net_device *dev,
5180                                    struct net_device *upper_dev, bool master,
5181                                    void *private)
5182 {
5183         struct netdev_adjacent *i, *j, *to_i, *to_j;
5184         int ret = 0;
5185
5186         ASSERT_RTNL();
5187
5188         if (dev == upper_dev)
5189                 return -EBUSY;
5190
5191         /* To prevent loops, check if dev is not upper device to upper_dev. */
5192         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5193                 return -EBUSY;
5194
5195         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5196                 return -EEXIST;
5197
5198         if (master && netdev_master_upper_dev_get(dev))
5199                 return -EBUSY;
5200
5201         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5202                                                    master);
5203         if (ret)
5204                 return ret;
5205
5206         /* Now that we linked these devs, make all the upper_dev's
5207          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5208          * versa, and don't forget the devices itself. All of these
5209          * links are non-neighbours.
5210          */
5211         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5212                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5213                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5214                                  i->dev->name, j->dev->name);
5215                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5216                         if (ret)
5217                                 goto rollback_mesh;
5218                 }
5219         }
5220
5221         /* add dev to every upper_dev's upper device */
5222         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5223                 pr_debug("linking %s's upper device %s with %s\n",
5224                          upper_dev->name, i->dev->name, dev->name);
5225                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5226                 if (ret)
5227                         goto rollback_upper_mesh;
5228         }
5229
5230         /* add upper_dev to every dev's lower device */
5231         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5232                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5233                          i->dev->name, upper_dev->name);
5234                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5235                 if (ret)
5236                         goto rollback_lower_mesh;
5237         }
5238
5239         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5240         return 0;
5241
5242 rollback_lower_mesh:
5243         to_i = i;
5244         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5245                 if (i == to_i)
5246                         break;
5247                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5248         }
5249
5250         i = NULL;
5251
5252 rollback_upper_mesh:
5253         to_i = i;
5254         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5255                 if (i == to_i)
5256                         break;
5257                 __netdev_adjacent_dev_unlink(dev, i->dev);
5258         }
5259
5260         i = j = NULL;
5261
5262 rollback_mesh:
5263         to_i = i;
5264         to_j = j;
5265         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5266                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5267                         if (i == to_i && j == to_j)
5268                                 break;
5269                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5270                 }
5271                 if (i == to_i)
5272                         break;
5273         }
5274
5275         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5276
5277         return ret;
5278 }
5279
5280 /**
5281  * netdev_upper_dev_link - Add a link to the upper device
5282  * @dev: device
5283  * @upper_dev: new upper device
5284  *
5285  * Adds a link to device which is upper to this one. The caller must hold
5286  * the RTNL lock. On a failure a negative errno code is returned.
5287  * On success the reference counts are adjusted and the function
5288  * returns zero.
5289  */
5290 int netdev_upper_dev_link(struct net_device *dev,
5291                           struct net_device *upper_dev)
5292 {
5293         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5294 }
5295 EXPORT_SYMBOL(netdev_upper_dev_link);
5296
5297 /**
5298  * netdev_master_upper_dev_link - Add a master link to the upper device
5299  * @dev: device
5300  * @upper_dev: new upper device
5301  *
5302  * Adds a link to device which is upper to this one. In this case, only
5303  * one master upper device can be linked, although other non-master devices
5304  * might be linked as well. The caller must hold the RTNL lock.
5305  * On a failure a negative errno code is returned. On success the reference
5306  * counts are adjusted and the function returns zero.
5307  */
5308 int netdev_master_upper_dev_link(struct net_device *dev,
5309                                  struct net_device *upper_dev)
5310 {
5311         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5312 }
5313 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5314
5315 int netdev_master_upper_dev_link_private(struct net_device *dev,
5316                                          struct net_device *upper_dev,
5317                                          void *private)
5318 {
5319         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5320 }
5321 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5322
5323 /**
5324  * netdev_upper_dev_unlink - Removes a link to upper device
5325  * @dev: device
5326  * @upper_dev: new upper device
5327  *
5328  * Removes a link to device which is upper to this one. The caller must hold
5329  * the RTNL lock.
5330  */
5331 void netdev_upper_dev_unlink(struct net_device *dev,
5332                              struct net_device *upper_dev)
5333 {
5334         struct netdev_adjacent *i, *j;
5335         ASSERT_RTNL();
5336
5337         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5338
5339         /* Here is the tricky part. We must remove all dev's lower
5340          * devices from all upper_dev's upper devices and vice
5341          * versa, to maintain the graph relationship.
5342          */
5343         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5344                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5345                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5346
5347         /* remove also the devices itself from lower/upper device
5348          * list
5349          */
5350         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5351                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5352
5353         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5354                 __netdev_adjacent_dev_unlink(dev, i->dev);
5355
5356         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5357 }
5358 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5359
5360 /**
5361  * netdev_bonding_info_change - Dispatch event about slave change
5362  * @dev: device
5363  * @bonding_info: info to dispatch
5364  *
5365  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5366  * The caller must hold the RTNL lock.
5367  */
5368 void netdev_bonding_info_change(struct net_device *dev,
5369                                 struct netdev_bonding_info *bonding_info)
5370 {
5371         struct netdev_notifier_bonding_info     info;
5372
5373         memcpy(&info.bonding_info, bonding_info,
5374                sizeof(struct netdev_bonding_info));
5375         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5376                                       &info.info);
5377 }
5378 EXPORT_SYMBOL(netdev_bonding_info_change);
5379
5380 static void netdev_adjacent_add_links(struct net_device *dev)
5381 {
5382         struct netdev_adjacent *iter;
5383
5384         struct net *net = dev_net(dev);
5385
5386         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5387                 if (!net_eq(net,dev_net(iter->dev)))
5388                         continue;
5389                 netdev_adjacent_sysfs_add(iter->dev, dev,
5390                                           &iter->dev->adj_list.lower);
5391                 netdev_adjacent_sysfs_add(dev, iter->dev,
5392                                           &dev->adj_list.upper);
5393         }
5394
5395         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5396                 if (!net_eq(net,dev_net(iter->dev)))
5397                         continue;
5398                 netdev_adjacent_sysfs_add(iter->dev, dev,
5399                                           &iter->dev->adj_list.upper);
5400                 netdev_adjacent_sysfs_add(dev, iter->dev,
5401                                           &dev->adj_list.lower);
5402         }
5403 }
5404
5405 static void netdev_adjacent_del_links(struct net_device *dev)
5406 {
5407         struct netdev_adjacent *iter;
5408
5409         struct net *net = dev_net(dev);
5410
5411         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5412                 if (!net_eq(net,dev_net(iter->dev)))
5413                         continue;
5414                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5415                                           &iter->dev->adj_list.lower);
5416                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5417                                           &dev->adj_list.upper);
5418         }
5419
5420         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5421                 if (!net_eq(net,dev_net(iter->dev)))
5422                         continue;
5423                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5424                                           &iter->dev->adj_list.upper);
5425                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5426                                           &dev->adj_list.lower);
5427         }
5428 }
5429
5430 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5431 {
5432         struct netdev_adjacent *iter;
5433
5434         struct net *net = dev_net(dev);
5435
5436         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5437                 if (!net_eq(net,dev_net(iter->dev)))
5438                         continue;
5439                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5440                                           &iter->dev->adj_list.lower);
5441                 netdev_adjacent_sysfs_add(iter->dev, dev,
5442                                           &iter->dev->adj_list.lower);
5443         }
5444
5445         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5446                 if (!net_eq(net,dev_net(iter->dev)))
5447                         continue;
5448                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5449                                           &iter->dev->adj_list.upper);
5450                 netdev_adjacent_sysfs_add(iter->dev, dev,
5451                                           &iter->dev->adj_list.upper);
5452         }
5453 }
5454
5455 void *netdev_lower_dev_get_private(struct net_device *dev,
5456                                    struct net_device *lower_dev)
5457 {
5458         struct netdev_adjacent *lower;
5459
5460         if (!lower_dev)
5461                 return NULL;
5462         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5463         if (!lower)
5464                 return NULL;
5465
5466         return lower->private;
5467 }
5468 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5469
5470
5471 int dev_get_nest_level(struct net_device *dev,
5472                        bool (*type_check)(struct net_device *dev))
5473 {
5474         struct net_device *lower = NULL;
5475         struct list_head *iter;
5476         int max_nest = -1;
5477         int nest;
5478
5479         ASSERT_RTNL();
5480
5481         netdev_for_each_lower_dev(dev, lower, iter) {
5482                 nest = dev_get_nest_level(lower, type_check);
5483                 if (max_nest < nest)
5484                         max_nest = nest;
5485         }
5486
5487         if (type_check(dev))
5488                 max_nest++;
5489
5490         return max_nest;
5491 }
5492 EXPORT_SYMBOL(dev_get_nest_level);
5493
5494 static void dev_change_rx_flags(struct net_device *dev, int flags)
5495 {
5496         const struct net_device_ops *ops = dev->netdev_ops;
5497
5498         if (ops->ndo_change_rx_flags)
5499                 ops->ndo_change_rx_flags(dev, flags);
5500 }
5501
5502 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5503 {
5504         unsigned int old_flags = dev->flags;
5505         kuid_t uid;
5506         kgid_t gid;
5507
5508         ASSERT_RTNL();
5509
5510         dev->flags |= IFF_PROMISC;
5511         dev->promiscuity += inc;
5512         if (dev->promiscuity == 0) {
5513                 /*
5514                  * Avoid overflow.
5515                  * If inc causes overflow, untouch promisc and return error.
5516                  */
5517                 if (inc < 0)
5518                         dev->flags &= ~IFF_PROMISC;
5519                 else {
5520                         dev->promiscuity -= inc;
5521                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5522                                 dev->name);
5523                         return -EOVERFLOW;
5524                 }
5525         }
5526         if (dev->flags != old_flags) {
5527                 pr_info("device %s %s promiscuous mode\n",
5528                         dev->name,
5529                         dev->flags & IFF_PROMISC ? "entered" : "left");
5530                 if (audit_enabled) {
5531                         current_uid_gid(&uid, &gid);
5532                         audit_log(current->audit_context, GFP_ATOMIC,
5533                                 AUDIT_ANOM_PROMISCUOUS,
5534                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5535                                 dev->name, (dev->flags & IFF_PROMISC),
5536                                 (old_flags & IFF_PROMISC),
5537                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5538                                 from_kuid(&init_user_ns, uid),
5539                                 from_kgid(&init_user_ns, gid),
5540                                 audit_get_sessionid(current));
5541                 }
5542
5543                 dev_change_rx_flags(dev, IFF_PROMISC);
5544         }
5545         if (notify)
5546                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5547         return 0;
5548 }
5549
5550 /**
5551  *      dev_set_promiscuity     - update promiscuity count on a device
5552  *      @dev: device
5553  *      @inc: modifier
5554  *
5555  *      Add or remove promiscuity from a device. While the count in the device
5556  *      remains above zero the interface remains promiscuous. Once it hits zero
5557  *      the device reverts back to normal filtering operation. A negative inc
5558  *      value is used to drop promiscuity on the device.
5559  *      Return 0 if successful or a negative errno code on error.
5560  */
5561 int dev_set_promiscuity(struct net_device *dev, int inc)
5562 {
5563         unsigned int old_flags = dev->flags;
5564         int err;
5565
5566         err = __dev_set_promiscuity(dev, inc, true);
5567         if (err < 0)
5568                 return err;
5569         if (dev->flags != old_flags)
5570                 dev_set_rx_mode(dev);
5571         return err;
5572 }
5573 EXPORT_SYMBOL(dev_set_promiscuity);
5574
5575 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5576 {
5577         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5578
5579         ASSERT_RTNL();
5580
5581         dev->flags |= IFF_ALLMULTI;
5582         dev->allmulti += inc;
5583         if (dev->allmulti == 0) {
5584                 /*
5585                  * Avoid overflow.
5586                  * If inc causes overflow, untouch allmulti and return error.
5587                  */
5588                 if (inc < 0)
5589                         dev->flags &= ~IFF_ALLMULTI;
5590                 else {
5591                         dev->allmulti -= inc;
5592                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5593                                 dev->name);
5594                         return -EOVERFLOW;
5595                 }
5596         }
5597         if (dev->flags ^ old_flags) {
5598                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5599                 dev_set_rx_mode(dev);
5600                 if (notify)
5601                         __dev_notify_flags(dev, old_flags,
5602                                            dev->gflags ^ old_gflags);
5603         }
5604         return 0;
5605 }
5606
5607 /**
5608  *      dev_set_allmulti        - update allmulti count on a device
5609  *      @dev: device
5610  *      @inc: modifier
5611  *
5612  *      Add or remove reception of all multicast frames to a device. While the
5613  *      count in the device remains above zero the interface remains listening
5614  *      to all interfaces. Once it hits zero the device reverts back to normal
5615  *      filtering operation. A negative @inc value is used to drop the counter
5616  *      when releasing a resource needing all multicasts.
5617  *      Return 0 if successful or a negative errno code on error.
5618  */
5619
5620 int dev_set_allmulti(struct net_device *dev, int inc)
5621 {
5622         return __dev_set_allmulti(dev, inc, true);
5623 }
5624 EXPORT_SYMBOL(dev_set_allmulti);
5625
5626 /*
5627  *      Upload unicast and multicast address lists to device and
5628  *      configure RX filtering. When the device doesn't support unicast
5629  *      filtering it is put in promiscuous mode while unicast addresses
5630  *      are present.
5631  */
5632 void __dev_set_rx_mode(struct net_device *dev)
5633 {
5634         const struct net_device_ops *ops = dev->netdev_ops;
5635
5636         /* dev_open will call this function so the list will stay sane. */
5637         if (!(dev->flags&IFF_UP))
5638                 return;
5639
5640         if (!netif_device_present(dev))
5641                 return;
5642
5643         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5644                 /* Unicast addresses changes may only happen under the rtnl,
5645                  * therefore calling __dev_set_promiscuity here is safe.
5646                  */
5647                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5648                         __dev_set_promiscuity(dev, 1, false);
5649                         dev->uc_promisc = true;
5650                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5651                         __dev_set_promiscuity(dev, -1, false);
5652                         dev->uc_promisc = false;
5653                 }
5654         }
5655
5656         if (ops->ndo_set_rx_mode)
5657                 ops->ndo_set_rx_mode(dev);
5658 }
5659
/* Locked variant of __dev_set_rx_mode(): runs it between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
5666
5667 /**
5668  *      dev_get_flags - get flags reported to userspace
5669  *      @dev: device
5670  *
5671  *      Get the combination of flag bits exported through APIs to userspace.
5672  */
5673 unsigned int dev_get_flags(const struct net_device *dev)
5674 {
5675         unsigned int flags;
5676
5677         flags = (dev->flags & ~(IFF_PROMISC |
5678                                 IFF_ALLMULTI |
5679                                 IFF_RUNNING |
5680                                 IFF_LOWER_UP |
5681                                 IFF_DORMANT)) |
5682                 (dev->gflags & (IFF_PROMISC |
5683                                 IFF_ALLMULTI));
5684
5685         if (netif_running(dev)) {
5686                 if (netif_oper_up(dev))
5687                         flags |= IFF_RUNNING;
5688                 if (netif_carrier_ok(dev))
5689                         flags |= IFF_LOWER_UP;
5690                 if (netif_dormant(dev))
5691                         flags |= IFF_DORMANT;
5692         }
5693
5694         return flags;
5695 }
5696 EXPORT_SYMBOL(dev_get_flags);
5697
5698 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5699 {
5700         unsigned int old_flags = dev->flags;
5701         int ret;
5702
5703         ASSERT_RTNL();
5704
5705         /*
5706          *      Set the flags on our device.
5707          */
5708
5709         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5710                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5711                                IFF_AUTOMEDIA)) |
5712                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5713                                     IFF_ALLMULTI));
5714
5715         /*
5716          *      Load in the correct multicast list now the flags have changed.
5717          */
5718
5719         if ((old_flags ^ flags) & IFF_MULTICAST)
5720                 dev_change_rx_flags(dev, IFF_MULTICAST);
5721
5722         dev_set_rx_mode(dev);
5723
5724         /*
5725          *      Have we downed the interface. We handle IFF_UP ourselves
5726          *      according to user attempts to set it, rather than blindly
5727          *      setting it.
5728          */
5729
5730         ret = 0;
5731         if ((old_flags ^ flags) & IFF_UP)
5732                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5733
5734         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5735                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5736                 unsigned int old_flags = dev->flags;
5737
5738                 dev->gflags ^= IFF_PROMISC;
5739
5740                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5741                         if (dev->flags != old_flags)
5742                                 dev_set_rx_mode(dev);
5743         }
5744
5745         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5746            is important. Some (broken) drivers set IFF_PROMISC, when
5747            IFF_ALLMULTI is requested not asking us and not reporting.
5748          */
5749         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5750                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5751
5752                 dev->gflags ^= IFF_ALLMULTI;
5753                 __dev_set_allmulti(dev, inc, false);
5754         }
5755
5756         return ret;
5757 }
5758
5759 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5760                         unsigned int gchanges)
5761 {
5762         unsigned int changes = dev->flags ^ old_flags;
5763
5764         if (gchanges)
5765                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5766
5767         if (changes & IFF_UP) {
5768                 if (dev->flags & IFF_UP)
5769                         call_netdevice_notifiers(NETDEV_UP, dev);
5770                 else
5771                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5772         }
5773
5774         if (dev->flags & IFF_UP &&
5775             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5776                 struct netdev_notifier_change_info change_info;
5777
5778                 change_info.flags_changed = changes;
5779                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5780                                               &change_info.info);
5781         }
5782 }
5783
5784 /**
5785  *      dev_change_flags - change device settings
5786  *      @dev: device
5787  *      @flags: device state flags
5788  *
5789  *      Change settings on device based state flags. The flags are
5790  *      in the userspace exported format.
5791  */
5792 int dev_change_flags(struct net_device *dev, unsigned int flags)
5793 {
5794         int ret;
5795         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5796
5797         ret = __dev_change_flags(dev, flags);
5798         if (ret < 0)
5799                 return ret;
5800
5801         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5802         __dev_notify_flags(dev, old_flags, changes);
5803         return ret;
5804 }
5805 EXPORT_SYMBOL(dev_change_flags);
5806
5807 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5808 {
5809         const struct net_device_ops *ops = dev->netdev_ops;
5810
5811         if (ops->ndo_change_mtu)
5812                 return ops->ndo_change_mtu(dev, new_mtu);
5813
5814         dev->mtu = new_mtu;
5815         return 0;
5816 }
5817
5818 /**
5819  *      dev_set_mtu - Change maximum transfer unit
5820  *      @dev: device
5821  *      @new_mtu: new transfer unit
5822  *
5823  *      Change the maximum transfer size of the network device.
5824  */
5825 int dev_set_mtu(struct net_device *dev, int new_mtu)
5826 {
5827         int err, orig_mtu;
5828
5829         if (new_mtu == dev->mtu)
5830                 return 0;
5831
5832         /*      MTU must be positive.    */
5833         if (new_mtu < 0)
5834                 return -EINVAL;
5835
5836         if (!netif_device_present(dev))
5837                 return -ENODEV;
5838
5839         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5840         err = notifier_to_errno(err);
5841         if (err)
5842                 return err;
5843
5844         orig_mtu = dev->mtu;
5845         err = __dev_set_mtu(dev, new_mtu);
5846
5847         if (!err) {
5848                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5849                 err = notifier_to_errno(err);
5850                 if (err) {
5851                         /* setting mtu back and notifying everyone again,
5852                          * so that they have a chance to revert changes.
5853                          */
5854                         __dev_set_mtu(dev, orig_mtu);
5855                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5856                 }
5857         }
5858         return err;
5859 }
5860 EXPORT_SYMBOL(dev_set_mtu);
5861
5862 /**
5863  *      dev_set_group - Change group this device belongs to
5864  *      @dev: device
5865  *      @new_group: group this device should belong to
5866  */
5867 void dev_set_group(struct net_device *dev, int new_group)
5868 {
5869         dev->group = new_group;
5870 }
5871 EXPORT_SYMBOL(dev_set_group);
5872
5873 /**
5874  *      dev_set_mac_address - Change Media Access Control Address
5875  *      @dev: device
5876  *      @sa: new address
5877  *
5878  *      Change the hardware (MAC) address of the device
5879  */
5880 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5881 {
5882         const struct net_device_ops *ops = dev->netdev_ops;
5883         int err;
5884
5885         if (!ops->ndo_set_mac_address)
5886                 return -EOPNOTSUPP;
5887         if (sa->sa_family != dev->type)
5888                 return -EINVAL;
5889         if (!netif_device_present(dev))
5890                 return -ENODEV;
5891         err = ops->ndo_set_mac_address(dev, sa);
5892         if (err)
5893                 return err;
5894         dev->addr_assign_type = NET_ADDR_SET;
5895         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5896         add_device_randomness(dev->dev_addr, dev->addr_len);
5897         return 0;
5898 }
5899 EXPORT_SYMBOL(dev_set_mac_address);
5900
5901 /**
5902  *      dev_change_carrier - Change device carrier
5903  *      @dev: device
5904  *      @new_carrier: new value
5905  *
5906  *      Change device carrier
5907  */
5908 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5909 {
5910         const struct net_device_ops *ops = dev->netdev_ops;
5911
5912         if (!ops->ndo_change_carrier)
5913                 return -EOPNOTSUPP;
5914         if (!netif_device_present(dev))
5915                 return -ENODEV;
5916         return ops->ndo_change_carrier(dev, new_carrier);
5917 }
5918 EXPORT_SYMBOL(dev_change_carrier);
5919
5920 /**
5921  *      dev_get_phys_port_id - Get device physical port ID
5922  *      @dev: device
5923  *      @ppid: port ID
5924  *
5925  *      Get device physical port ID
5926  */
5927 int dev_get_phys_port_id(struct net_device *dev,
5928                          struct netdev_phys_item_id *ppid)
5929 {
5930         const struct net_device_ops *ops = dev->netdev_ops;
5931
5932         if (!ops->ndo_get_phys_port_id)
5933                 return -EOPNOTSUPP;
5934         return ops->ndo_get_phys_port_id(dev, ppid);
5935 }
5936 EXPORT_SYMBOL(dev_get_phys_port_id);
5937
5938 /**
5939  *      dev_get_phys_port_name - Get device physical port name
5940  *      @dev: device
5941  *      @name: port name
5942  *
5943  *      Get device physical port name
5944  */
5945 int dev_get_phys_port_name(struct net_device *dev,
5946                            char *name, size_t len)
5947 {
5948         const struct net_device_ops *ops = dev->netdev_ops;
5949
5950         if (!ops->ndo_get_phys_port_name)
5951                 return -EOPNOTSUPP;
5952         return ops->ndo_get_phys_port_name(dev, name, len);
5953 }
5954 EXPORT_SYMBOL(dev_get_phys_port_name);
5955
5956 /**
5957  *      dev_new_index   -       allocate an ifindex
5958  *      @net: the applicable net namespace
5959  *
5960  *      Returns a suitable unique value for a new device interface
5961  *      number.  The caller must hold the rtnl semaphore or the
5962  *      dev_base_lock to be sure it remains unique.
5963  */
5964 static int dev_new_index(struct net *net)
5965 {
5966         int ifindex = net->ifindex;
5967         for (;;) {
5968                 if (++ifindex <= 0)
5969                         ifindex = 1;
5970                 if (!__dev_get_by_index(net, ifindex))
5971                         return net->ifindex = ifindex;
5972         }
5973 }
5974
5975 /* Delayed registration/unregisteration */
5976 static LIST_HEAD(net_todo_list);
5977 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5978
5979 static void net_set_todo(struct net_device *dev)
5980 {
5981         list_add_tail(&dev->todo_list, &net_todo_list);
5982         dev_net(dev)->dev_unreg_count++;
5983 }
5984
5985 static void rollback_registered_many(struct list_head *head)
5986 {
5987         struct net_device *dev, *tmp;
5988         LIST_HEAD(close_head);
5989
5990         BUG_ON(dev_boot_phase);
5991         ASSERT_RTNL();
5992
5993         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5994                 /* Some devices call without registering
5995                  * for initialization unwind. Remove those
5996                  * devices and proceed with the remaining.
5997                  */
5998                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5999                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6000                                  dev->name, dev);
6001
6002                         WARN_ON(1);
6003                         list_del(&dev->unreg_list);
6004                         continue;
6005                 }
6006                 dev->dismantle = true;
6007                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6008         }
6009
6010         /* If device is running, close it first. */
6011         list_for_each_entry(dev, head, unreg_list)
6012                 list_add_tail(&dev->close_list, &close_head);
6013         dev_close_many(&close_head, true);
6014
6015         list_for_each_entry(dev, head, unreg_list) {
6016                 /* And unlink it from device chain. */
6017                 unlist_netdevice(dev);
6018
6019                 dev->reg_state = NETREG_UNREGISTERING;
6020         }
6021
6022         synchronize_net();
6023
6024         list_for_each_entry(dev, head, unreg_list) {
6025                 struct sk_buff *skb = NULL;
6026
6027                 /* Shutdown queueing discipline. */
6028                 dev_shutdown(dev);
6029
6030
6031                 /* Notify protocols, that we are about to destroy
6032                    this device. They should clean all the things.
6033                 */
6034                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6035
6036                 if (!dev->rtnl_link_ops ||
6037                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6038                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6039                                                      GFP_KERNEL);
6040
6041                 /*
6042                  *      Flush the unicast and multicast chains
6043                  */
6044                 dev_uc_flush(dev);
6045                 dev_mc_flush(dev);
6046
6047                 if (dev->netdev_ops->ndo_uninit)
6048                         dev->netdev_ops->ndo_uninit(dev);
6049
6050                 if (skb)
6051                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6052
6053                 /* Notifier chain MUST detach us all upper devices. */
6054                 WARN_ON(netdev_has_any_upper_dev(dev));
6055
6056                 /* Remove entries from kobject tree */
6057                 netdev_unregister_kobject(dev);
6058 #ifdef CONFIG_XPS
6059                 /* Remove XPS queueing entries */
6060                 netif_reset_xps_queues_gt(dev, 0);
6061 #endif
6062         }
6063
6064         synchronize_net();
6065
6066         list_for_each_entry(dev, head, unreg_list)
6067                 dev_put(dev);
6068 }
6069
6070 static void rollback_registered(struct net_device *dev)
6071 {
6072         LIST_HEAD(single);
6073
6074         list_add(&dev->unreg_list, &single);
6075         rollback_registered_many(&single);
6076         list_del(&single);
6077 }
6078
6079 static netdev_features_t netdev_fix_features(struct net_device *dev,
6080         netdev_features_t features)
6081 {
6082         /* Fix illegal checksum combinations */
6083         if ((features & NETIF_F_HW_CSUM) &&
6084             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6085                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6086                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6087         }
6088
6089         /* TSO requires that SG is present as well. */
6090         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6091                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6092                 features &= ~NETIF_F_ALL_TSO;
6093         }
6094
6095         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6096                                         !(features & NETIF_F_IP_CSUM)) {
6097                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6098                 features &= ~NETIF_F_TSO;
6099                 features &= ~NETIF_F_TSO_ECN;
6100         }
6101
6102         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6103                                          !(features & NETIF_F_IPV6_CSUM)) {
6104                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6105                 features &= ~NETIF_F_TSO6;
6106         }
6107
6108         /* TSO ECN requires that TSO is present as well. */
6109         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6110                 features &= ~NETIF_F_TSO_ECN;
6111
6112         /* Software GSO depends on SG. */
6113         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6114                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6115                 features &= ~NETIF_F_GSO;
6116         }
6117
6118         /* UFO needs SG and checksumming */
6119         if (features & NETIF_F_UFO) {
6120                 /* maybe split UFO into V4 and V6? */
6121                 if (!((features & NETIF_F_GEN_CSUM) ||
6122                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6123                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6124                         netdev_dbg(dev,
6125                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6126                         features &= ~NETIF_F_UFO;
6127                 }
6128
6129                 if (!(features & NETIF_F_SG)) {
6130                         netdev_dbg(dev,
6131                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6132                         features &= ~NETIF_F_UFO;
6133                 }
6134         }
6135
6136 #ifdef CONFIG_NET_RX_BUSY_POLL
6137         if (dev->netdev_ops->ndo_busy_poll)
6138                 features |= NETIF_F_BUSY_POLL;
6139         else
6140 #endif
6141                 features &= ~NETIF_F_BUSY_POLL;
6142
6143         return features;
6144 }
6145
6146 int __netdev_update_features(struct net_device *dev)
6147 {
6148         netdev_features_t features;
6149         int err = 0;
6150
6151         ASSERT_RTNL();
6152
6153         features = netdev_get_wanted_features(dev);
6154
6155         if (dev->netdev_ops->ndo_fix_features)
6156                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6157
6158         /* driver might be less strict about feature dependencies */
6159         features = netdev_fix_features(dev, features);
6160
6161         if (dev->features == features)
6162                 return 0;
6163
6164         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6165                 &dev->features, &features);
6166
6167         if (dev->netdev_ops->ndo_set_features)
6168                 err = dev->netdev_ops->ndo_set_features(dev, features);
6169
6170         if (unlikely(err < 0)) {
6171                 netdev_err(dev,
6172                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6173                         err, &features, &dev->features);
6174                 return -1;
6175         }
6176
6177         if (!err)
6178                 dev->features = features;
6179
6180         return 1;
6181 }
6182
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute dev->features and call netdev_features_change() whenever
 *	__netdev_update_features() returns non-zero. Meant to be called
 *	after driver or hardware dependent conditions that influence the
 *	features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6197
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute dev->features and send the change notification
 *	unconditionally, even when the set did not change. Use this
 *	instead of netdev_update_features() when dev->vlan_features might
 *	have changed too, so that the change can be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is ignored on purpose: listeners are always notified. */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6214
6215 /**
6216  *      netif_stacked_transfer_operstate -      transfer operstate
6217  *      @rootdev: the root or lower level device to transfer state from
6218  *      @dev: the device to transfer operstate to
6219  *
6220  *      Transfer operational state from root to device. This is normally
6221  *      called when a stacking relationship exists between the root
6222  *      device and the device(a leaf device).
6223  */
6224 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6225                                         struct net_device *dev)
6226 {
6227         if (rootdev->operstate == IF_OPER_DORMANT)
6228                 netif_dormant_on(dev);
6229         else
6230                 netif_dormant_off(dev);
6231
6232         if (netif_carrier_ok(rootdev)) {
6233                 if (!netif_carrier_ok(dev))
6234                         netif_carrier_on(dev);
6235         } else {
6236                 if (netif_carrier_ok(dev))
6237                         netif_carrier_off(dev);
6238         }
6239 }
6240 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6241
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 *
 * kzalloc() is tried first, with vzalloc() as a fallback.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated,
 * including when its size in bytes would not fit in a size_t.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx;
	size_t sz;

	BUG_ON(count < 1);

	/*
	 * Unlike the TX path there is no upper bound on the queue count
	 * here, so make sure count * sizeof(*rx) cannot wrap.  A wrapped
	 * size would yield an undersized buffer that the initialisation
	 * loop below would then overrun.
	 */
	if (count > SIZE_MAX / sizeof(*rx))
		return -ENOMEM;
	sz = count * sizeof(*rx);

	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!rx) {
		rx = vzalloc(sz);
		if (!rx)
			return -ENOMEM;
	}
	dev->_rx = rx;

	for (i = 0; i < count; i++)
		rx[i].dev = dev;
	return 0;
}
#endif
6264
6265 static void netdev_init_one_queue(struct net_device *dev,
6266                                   struct netdev_queue *queue, void *_unused)
6267 {
6268         /* Initialize queue lock */
6269         spin_lock_init(&queue->_xmit_lock);
6270         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6271         queue->xmit_lock_owner = -1;
6272         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6273         queue->dev = dev;
6274 #ifdef CONFIG_BQL
6275         dql_init(&queue->dql, HZ);
6276 #endif
6277 }
6278
6279 static void netif_free_tx_queues(struct net_device *dev)
6280 {
6281         kvfree(dev->_tx);
6282 }
6283
6284 static int netif_alloc_netdev_queues(struct net_device *dev)
6285 {
6286         unsigned int count = dev->num_tx_queues;
6287         struct netdev_queue *tx;
6288         size_t sz = count * sizeof(*tx);
6289
6290         BUG_ON(count < 1 || count > 0xffff);
6291
6292         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6293         if (!tx) {
6294                 tx = vzalloc(sz);
6295                 if (!tx)
6296                         return -ENOMEM;
6297         }
6298         dev->_tx = tx;
6299
6300         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6301         spin_lock_init(&dev->tx_global_lock);
6302
6303         return 0;
6304 }
6305
6306 /**
6307  *      register_netdevice      - register a network device
6308  *      @dev: device to register
6309  *
6310  *      Take a completed network device structure and add it to the kernel
6311  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6312  *      chain. 0 is returned on success. A negative errno code is returned
6313  *      on a failure to set up the device, or if the name is a duplicate.
6314  *
6315  *      Callers must hold the rtnl semaphore. You may want
6316  *      register_netdev() instead of this.
6317  *
6318  *      BUGS:
6319  *      The locking appears insufficient to guarantee two parallel registers
6320  *      will not get the same name.
6321  */
6322
6323 int register_netdevice(struct net_device *dev)
6324 {
6325         int ret;
6326         struct net *net = dev_net(dev);
6327
6328         BUG_ON(dev_boot_phase);
6329         ASSERT_RTNL();
6330
6331         might_sleep();
6332
6333         /* When net_device's are persistent, this will be fatal. */
6334         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6335         BUG_ON(!net);
6336
6337         spin_lock_init(&dev->addr_list_lock);
6338         netdev_set_addr_lockdep_class(dev);
6339
6340         ret = dev_get_valid_name(net, dev, dev->name);
6341         if (ret < 0)
6342                 goto out;
6343
6344         /* Init, if this function is available */
6345         if (dev->netdev_ops->ndo_init) {
6346                 ret = dev->netdev_ops->ndo_init(dev);
6347                 if (ret) {
6348                         if (ret > 0)
6349                                 ret = -EIO;
6350                         goto out;
6351                 }
6352         }
6353
6354         if (((dev->hw_features | dev->features) &
6355              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6356             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6357              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6358                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6359                 ret = -EINVAL;
6360                 goto err_uninit;
6361         }
6362
6363         ret = -EBUSY;
6364         if (!dev->ifindex)
6365                 dev->ifindex = dev_new_index(net);
6366         else if (__dev_get_by_index(net, dev->ifindex))
6367                 goto err_uninit;
6368
6369         /* Transfer changeable features to wanted_features and enable
6370          * software offloads (GSO and GRO).
6371          */
6372         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6373         dev->features |= NETIF_F_SOFT_FEATURES;
6374         dev->wanted_features = dev->features & dev->hw_features;
6375
6376         if (!(dev->flags & IFF_LOOPBACK)) {
6377                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6378         }
6379
6380         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6381          */
6382         dev->vlan_features |= NETIF_F_HIGHDMA;
6383
6384         /* Make NETIF_F_SG inheritable to tunnel devices.
6385          */
6386         dev->hw_enc_features |= NETIF_F_SG;
6387
6388         /* Make NETIF_F_SG inheritable to MPLS.
6389          */
6390         dev->mpls_features |= NETIF_F_SG;
6391
6392         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6393         ret = notifier_to_errno(ret);
6394         if (ret)
6395                 goto err_uninit;
6396
6397         ret = netdev_register_kobject(dev);
6398         if (ret)
6399                 goto err_uninit;
6400         dev->reg_state = NETREG_REGISTERED;
6401
6402         __netdev_update_features(dev);
6403
6404         /*
6405          *      Default initial state at registry is that the
6406          *      device is present.
6407          */
6408
6409         set_bit(__LINK_STATE_PRESENT, &dev->state);
6410
6411         linkwatch_init_dev(dev);
6412
6413         dev_init_scheduler(dev);
6414         dev_hold(dev);
6415         list_netdevice(dev);
6416         add_device_randomness(dev->dev_addr, dev->addr_len);
6417
6418         /* If the device has permanent device address, driver should
6419          * set dev_addr and also addr_assign_type should be set to
6420          * NET_ADDR_PERM (default value).
6421          */
6422         if (dev->addr_assign_type == NET_ADDR_PERM)
6423                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6424
6425         /* Notify protocols, that a new device appeared. */
6426         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6427         ret = notifier_to_errno(ret);
6428         if (ret) {
6429                 rollback_registered(dev);
6430                 dev->reg_state = NETREG_UNREGISTERED;
6431         }
6432         /*
6433          *      Prevent userspace races by waiting until the network
6434          *      device is fully setup before sending notifications.
6435          */
6436         if (!dev->rtnl_link_ops ||
6437             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6438                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6439
6440 out:
6441         return ret;
6442
6443 err_uninit:
6444         if (dev->netdev_ops->ndo_uninit)
6445                 dev->netdev_ops->ndo_uninit(dev);
6446         goto out;
6447 }
6448 EXPORT_SYMBOL(register_netdevice);
6449
6450 /**
6451  *      init_dummy_netdev       - init a dummy network device for NAPI
6452  *      @dev: device to init
6453  *
6454  *      This takes a network device structure and initialize the minimum
6455  *      amount of fields so it can be used to schedule NAPI polls without
6456  *      registering a full blown interface. This is to be used by drivers
6457  *      that need to tie several hardware interfaces to a single NAPI
6458  *      poll scheduler due to HW limitations.
6459  */
6460 int init_dummy_netdev(struct net_device *dev)
6461 {
6462         /* Clear everything. Note we don't initialize spinlocks
6463          * are they aren't supposed to be taken by any of the
6464          * NAPI code and this dummy netdev is supposed to be
6465          * only ever used for NAPI polls
6466          */
6467         memset(dev, 0, sizeof(struct net_device));
6468
6469         /* make sure we BUG if trying to hit standard
6470          * register/unregister code path
6471          */
6472         dev->reg_state = NETREG_DUMMY;
6473
6474         /* NAPI wants this */
6475         INIT_LIST_HEAD(&dev->napi_list);
6476
6477         /* a dummy interface is started by default */
6478         set_bit(__LINK_STATE_PRESENT, &dev->state);
6479         set_bit(__LINK_STATE_START, &dev->state);
6480
6481         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6482          * because users of this 'device' dont need to change
6483          * its refcount.
6484          */
6485
6486         return 0;
6487 }
6488 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6489
6490
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock().
 *	A format string passed as the name to alloc_netdev is expanded as
 *	part of the registration.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
6514
6515 int netdev_refcnt_read(const struct net_device *dev)
6516 {
6517         int i, refcnt = 0;
6518
6519         for_each_possible_cpu(i)
6520                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6521         return refcnt;
6522 }
6523 EXPORT_SYMBOL(netdev_refcnt_read);
6524
6525 /**
6526  * netdev_wait_allrefs - wait until all references are gone.
6527  * @dev: target net_device
6528  *
6529  * This is called when unregistering network devices.
6530  *
6531  * Any protocol or device that holds a reference should register
6532  * for netdevice notification, and cleanup and put back the
6533  * reference if they receive an UNREGISTER event.
6534  * We can get stuck here if buggy protocols don't correctly
6535  * call dev_put.
6536  */
6537 static void netdev_wait_allrefs(struct net_device *dev)
6538 {
6539         unsigned long rebroadcast_time, warning_time;
6540         int refcnt;
6541
6542         linkwatch_forget_dev(dev);
6543
6544         rebroadcast_time = warning_time = jiffies;
6545         refcnt = netdev_refcnt_read(dev);
6546
6547         while (refcnt != 0) {
6548                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6549                         rtnl_lock();
6550
6551                         /* Rebroadcast unregister notification */
6552                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6553
6554                         __rtnl_unlock();
6555                         rcu_barrier();
6556                         rtnl_lock();
6557
6558                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6559                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6560                                      &dev->state)) {
6561                                 /* We must not have linkwatch events
6562                                  * pending on unregister. If this
6563                                  * happens, we simply run the queue
6564                                  * unscheduled, resulting in a noop
6565                                  * for this device.
6566                                  */
6567                                 linkwatch_run_queue();
6568                         }
6569
6570                         __rtnl_unlock();
6571
6572                         rebroadcast_time = jiffies;
6573                 }
6574
6575                 msleep(250);
6576
6577                 refcnt = netdev_refcnt_read(dev);
6578
6579                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6580                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6581                                  dev->name, refcnt);
6582                         warning_time = jiffies;
6583                 }
6584         }
6585 }
6586
6587 /* The sequence is:
6588  *
6589  *      rtnl_lock();
6590  *      ...
6591  *      register_netdevice(x1);
6592  *      register_netdevice(x2);
6593  *      ...
6594  *      unregister_netdevice(y1);
6595  *      unregister_netdevice(y2);
6596  *      ...
6597  *      rtnl_unlock();
6598  *      free_netdev(y1);
6599  *      free_netdev(y2);
6600  *
6601  * We are invoked by rtnl_unlock().
6602  * This allows us to deal with problems:
6603  * 1) We can delete sysfs objects which invoke hotplug
6604  *    without deadlocking with linkwatch via keventd.
6605  * 2) Since we run with the RTNL semaphore not held, we can sleep
6606  *    safely in order to wait for the netdev refcnt to drop to zero.
6607  *
6608  * We must not return until all unregister events added during
6609  * the interval the lock was held have been completed.
6610  */
6611 void netdev_run_todo(void)
6612 {
6613         struct list_head list;
6614
6615         /* Snapshot list, allow later requests */
6616         list_replace_init(&net_todo_list, &list);
6617
6618         __rtnl_unlock();
6619
6620
6621         /* Wait for rcu callbacks to finish before next phase */
6622         if (!list_empty(&list))
6623                 rcu_barrier();
6624
6625         while (!list_empty(&list)) {
6626                 struct net_device *dev
6627                         = list_first_entry(&list, struct net_device, todo_list);
6628                 list_del(&dev->todo_list);
6629
6630                 rtnl_lock();
6631                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6632                 __rtnl_unlock();
6633
6634                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6635                         pr_err("network todo '%s' but state %d\n",
6636                                dev->name, dev->reg_state);
6637                         dump_stack();
6638                         continue;
6639                 }
6640
6641                 dev->reg_state = NETREG_UNREGISTERED;
6642
6643                 on_each_cpu(flush_backlog, dev, 1);
6644
6645                 netdev_wait_allrefs(dev);
6646
6647                 /* paranoia */
6648                 BUG_ON(netdev_refcnt_read(dev));
6649                 BUG_ON(!list_empty(&dev->ptype_all));
6650                 BUG_ON(!list_empty(&dev->ptype_specific));
6651                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6652                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6653                 WARN_ON(dev->dn_ptr);
6654
6655                 if (dev->destructor)
6656                         dev->destructor(dev);
6657
6658                 /* Report a network device has been unregistered */
6659                 rtnl_lock();
6660                 dev_net(dev)->dev_unreg_count--;
6661                 __rtnl_unlock();
6662                 wake_up(&netdev_unregistering_wq);
6663
6664                 /* Free network device */
6665                 kobject_put(&dev->dev.kobj);
6666         }
6667 }
6668
6669 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6670  * fields in the same order, with only the type differing.
6671  */
6672 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6673                              const struct net_device_stats *netdev_stats)
6674 {
6675 #if BITS_PER_LONG == 64
6676         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6677         memcpy(stats64, netdev_stats, sizeof(*stats64));
6678 #else
6679         size_t i, n = sizeof(*stats64) / sizeof(u64);
6680         const unsigned long *src = (const unsigned long *)netdev_stats;
6681         u64 *dst = (u64 *)stats64;
6682
6683         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6684                      sizeof(*stats64) / sizeof(u64));
6685         for (i = 0; i < n; i++)
6686                 dst[i] = src[i];
6687 #endif
6688 }
6689 EXPORT_SYMBOL(netdev_stats_to_stats64);
6690
6691 /**
6692  *      dev_get_stats   - get network device statistics
6693  *      @dev: device to get statistics from
6694  *      @storage: place to store stats
6695  *
6696  *      Get network statistics from device. Return @storage.
6697  *      The device driver may provide its own method by setting
6698  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6699  *      otherwise the internal statistics structure is used.
6700  */
6701 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6702                                         struct rtnl_link_stats64 *storage)
6703 {
6704         const struct net_device_ops *ops = dev->netdev_ops;
6705
6706         if (ops->ndo_get_stats64) {
6707                 memset(storage, 0, sizeof(*storage));
6708                 ops->ndo_get_stats64(dev, storage);
6709         } else if (ops->ndo_get_stats) {
6710                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6711         } else {
6712                 netdev_stats_to_stats64(storage, &dev->stats);
6713         }
6714         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6715         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6716         return storage;
6717 }
6718 EXPORT_SYMBOL(dev_get_stats);
6719
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.  Without
 * CONFIG_NET_CLS_ACT the current value is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6737
6738 static const struct ethtool_ops default_ethtool_ops;
6739
6740 void netdev_set_default_ethtool_ops(struct net_device *dev,
6741                                     const struct ethtool_ops *ops)
6742 {
6743         if (dev->ethtool_ops == &default_ethtool_ops)
6744                 dev->ethtool_ops = ops;
6745 }
6746 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6747
6748 void netdev_freemem(struct net_device *dev)
6749 {
6750         char *addr = (char *)dev - dev->padded;
6751
6752         kvfree(addr);
6753 }
6754
6755 /**
6756  *      alloc_netdev_mqs - allocate network device
6757  *      @sizeof_priv:           size of private data to allocate space for
6758  *      @name:                  device name format string
6759  *      @name_assign_type:      origin of device name
6760  *      @setup:                 callback to initialize device
6761  *      @txqs:                  the number of TX subqueues to allocate
6762  *      @rxqs:                  the number of RX subqueues to allocate
6763  *
6764  *      Allocates a struct net_device with private data area for driver use
6765  *      and performs basic initialization.  Also allocates subqueue structs
6766  *      for each queue on the device.
6767  */
6768 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6769                 unsigned char name_assign_type,
6770                 void (*setup)(struct net_device *),
6771                 unsigned int txqs, unsigned int rxqs)
6772 {
6773         struct net_device *dev;
6774         size_t alloc_size;
6775         struct net_device *p;
6776
6777         BUG_ON(strlen(name) >= sizeof(dev->name));
6778
6779         if (txqs < 1) {
6780                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6781                 return NULL;
6782         }
6783
6784 #ifdef CONFIG_SYSFS
6785         if (rxqs < 1) {
6786                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6787                 return NULL;
6788         }
6789 #endif
6790
6791         alloc_size = sizeof(struct net_device);
6792         if (sizeof_priv) {
6793                 /* ensure 32-byte alignment of private area */
6794                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6795                 alloc_size += sizeof_priv;
6796         }
6797         /* ensure 32-byte alignment of whole construct */
6798         alloc_size += NETDEV_ALIGN - 1;
6799
6800         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6801         if (!p)
6802                 p = vzalloc(alloc_size);
6803         if (!p)
6804                 return NULL;
6805
6806         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6807         dev->padded = (char *)dev - (char *)p;
6808
6809         dev->pcpu_refcnt = alloc_percpu(int);
6810         if (!dev->pcpu_refcnt)
6811                 goto free_dev;
6812
6813         if (dev_addr_init(dev))
6814                 goto free_pcpu;
6815
6816         dev_mc_init(dev);
6817         dev_uc_init(dev);
6818
6819         dev_net_set(dev, &init_net);
6820
6821         dev->gso_max_size = GSO_MAX_SIZE;
6822         dev->gso_max_segs = GSO_MAX_SEGS;
6823         dev->gso_min_segs = 0;
6824
6825         INIT_LIST_HEAD(&dev->napi_list);
6826         INIT_LIST_HEAD(&dev->unreg_list);
6827         INIT_LIST_HEAD(&dev->close_list);
6828         INIT_LIST_HEAD(&dev->link_watch_list);
6829         INIT_LIST_HEAD(&dev->adj_list.upper);
6830         INIT_LIST_HEAD(&dev->adj_list.lower);
6831         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6832         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6833         INIT_LIST_HEAD(&dev->ptype_all);
6834         INIT_LIST_HEAD(&dev->ptype_specific);
6835         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6836         setup(dev);
6837
6838         dev->num_tx_queues = txqs;
6839         dev->real_num_tx_queues = txqs;
6840         if (netif_alloc_netdev_queues(dev))
6841                 goto free_all;
6842
6843 #ifdef CONFIG_SYSFS
6844         dev->num_rx_queues = rxqs;
6845         dev->real_num_rx_queues = rxqs;
6846         if (netif_alloc_rx_queues(dev))
6847                 goto free_all;
6848 #endif
6849
6850         strcpy(dev->name, name);
6851         dev->name_assign_type = name_assign_type;
6852         dev->group = INIT_NETDEV_GROUP;
6853         if (!dev->ethtool_ops)
6854                 dev->ethtool_ops = &default_ethtool_ops;
6855         return dev;
6856
6857 free_all:
6858         free_netdev(dev);
6859         return NULL;
6860
6861 free_pcpu:
6862         free_percpu(dev->pcpu_refcnt);
6863 free_dev:
6864         netdev_freemem(dev);
6865         return NULL;
6866 }
6867 EXPORT_SYMBOL(alloc_netdev_mqs);
6868
6869 /**
6870  *      free_netdev - free network device
6871  *      @dev: device
6872  *
6873  *      This function does the last stage of destroying an allocated device
6874  *      interface. The reference to the device object is released.
6875  *      If this is the last reference then it will be freed.
6876  */
6877 void free_netdev(struct net_device *dev)
6878 {
6879         struct napi_struct *p, *n;
6880
6881         netif_free_tx_queues(dev);
6882 #ifdef CONFIG_SYSFS
6883         kvfree(dev->_rx);
6884 #endif
6885
6886         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6887
6888         /* Flush device addresses */
6889         dev_addr_flush(dev);
6890
6891         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6892                 netif_napi_del(p);
6893
6894         free_percpu(dev->pcpu_refcnt);
6895         dev->pcpu_refcnt = NULL;
6896
6897         /*  Compatibility with error handling in drivers */
6898         if (dev->reg_state == NETREG_UNINITIALIZED) {
6899                 netdev_freemem(dev);
6900                 return;
6901         }
6902
6903         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6904         dev->reg_state = NETREG_RELEASED;
6905
6906         /* will free via device release */
6907         put_device(&dev->dev);
6908 }
6909 EXPORT_SYMBOL(free_netdev);
6910
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are currently being received have been
 *	processed; packets that start later are not held back.
 *
 *	May sleep. The expedited RCU grace period is used when the rtnl
 *	lock is held, the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6926
6927 /**
6928  *      unregister_netdevice_queue - remove device from the kernel
6929  *      @dev: device
6930  *      @head: list
6931  *
6932  *      This function shuts down a device interface and removes it
6933  *      from the kernel tables.
6934  *      If head not NULL, device is queued to be unregistered later.
6935  *
6936  *      Callers must hold the rtnl semaphore.  You may want
6937  *      unregister_netdev() instead of this.
6938  */
6939
6940 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6941 {
6942         ASSERT_RTNL();
6943
6944         if (head) {
6945                 list_move_tail(&dev->unreg_list, head);
6946         } else {
6947                 rollback_registered(dev);
6948                 /* Finish processing unregister after unlock */
6949                 net_set_todo(dev);
6950         }
6951 }
6952 EXPORT_SYMBOL(unregister_netdevice_queue);
6953
6954 /**
6955  *      unregister_netdevice_many - unregister many devices
6956  *      @head: list of devices
6957  *
6958  *  Note: As most callers use a stack allocated list_head,
6959  *  we force a list_del() to make sure stack wont be corrupted later.
6960  */
6961 void unregister_netdevice_many(struct list_head *head)
6962 {
6963         struct net_device *dev;
6964
6965         if (!list_empty(head)) {
6966                 rollback_registered_many(head);
6967                 list_for_each_entry(dev, head, unreg_list)
6968                         net_set_todo(dev);
6969                 list_del(head);
6970         }
6971 }
6972 EXPORT_SYMBOL(unregister_netdevice_many);
6973
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper that takes the rtnl semaphore around
 *	unregister_netdevice(). Prefer this over calling
 *	unregister_netdevice() directly unless the rtnl semaphore is
 *	already held.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6992
6993 /**
6994  *      dev_change_net_namespace - move device to different nethost namespace
6995  *      @dev: device
6996  *      @net: network namespace
6997  *      @pat: If not NULL name pattern to try if the current device name
6998  *            is already taken in the destination network namespace.
6999  *
7000  *      This function shuts down a device interface and moves it
7001  *      to a new network namespace. On success 0 is returned, on
7002  *      a failure a netagive errno code is returned.
7003  *
7004  *      Callers must hold the rtnl semaphore.
7005  */
7006
7007 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7008 {
7009         int err;
7010
7011         ASSERT_RTNL();
7012
7013         /* Don't allow namespace local devices to be moved. */
7014         err = -EINVAL;
7015         if (dev->features & NETIF_F_NETNS_LOCAL)
7016                 goto out;
7017
7018         /* Ensure the device has been registrered */
7019         if (dev->reg_state != NETREG_REGISTERED)
7020                 goto out;
7021
7022         /* Get out if there is nothing todo */
7023         err = 0;
7024         if (net_eq(dev_net(dev), net))
7025                 goto out;
7026
7027         /* Pick the destination device name, and ensure
7028          * we can use it in the destination network namespace.
7029          */
7030         err = -EEXIST;
7031         if (__dev_get_by_name(net, dev->name)) {
7032                 /* We get here if we can't use the current device name */
7033                 if (!pat)
7034                         goto out;
7035                 if (dev_get_valid_name(net, dev, pat) < 0)
7036                         goto out;
7037         }
7038
7039         /*
7040          * And now a mini version of register_netdevice unregister_netdevice.
7041          */
7042
7043         /* If device is running close it first. */
7044         dev_close(dev);
7045
7046         /* And unlink it from device chain */
7047         err = -ENODEV;
7048         unlist_netdevice(dev);
7049
7050         synchronize_net();
7051
7052         /* Shutdown queueing discipline. */
7053         dev_shutdown(dev);
7054
7055         /* Notify protocols, that we are about to destroy
7056            this device. They should clean all the things.
7057
7058            Note that dev->reg_state stays at NETREG_REGISTERED.
7059            This is wanted because this way 8021q and macvlan know
7060            the device is just moving and can keep their slaves up.
7061         */
7062         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7063         rcu_barrier();
7064         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7065         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7066
7067         /*
7068          *      Flush the unicast and multicast chains
7069          */
7070         dev_uc_flush(dev);
7071         dev_mc_flush(dev);
7072
7073         /* Send a netdev-removed uevent to the old namespace */
7074         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7075         netdev_adjacent_del_links(dev);
7076
7077         /* Actually switch the network namespace */
7078         dev_net_set(dev, net);
7079
7080         /* If there is an ifindex conflict assign a new one */
7081         if (__dev_get_by_index(net, dev->ifindex))
7082                 dev->ifindex = dev_new_index(net);
7083
7084         /* Send a netdev-add uevent to the new namespace */
7085         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7086         netdev_adjacent_add_links(dev);
7087
7088         /* Fixup kobjects */
7089         err = device_rename(&dev->dev, dev->name);
7090         WARN_ON(err);
7091
7092         /* Add the device back in the hashes */
7093         list_netdevice(dev);
7094
7095         /* Notify protocols, that a new device appeared. */
7096         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7097
7098         /*
7099          *      Prevent userspace races by waiting until the network
7100          *      device is fully setup before sending notifications.
7101          */
7102         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7103
7104         synchronize_net();
7105         err = 0;
7106 out:
7107         return err;
7108 }
7109 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7110
7111 static int dev_cpu_callback(struct notifier_block *nfb,
7112                             unsigned long action,
7113                             void *ocpu)
7114 {
7115         struct sk_buff **list_skb;
7116         struct sk_buff *skb;
7117         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7118         struct softnet_data *sd, *oldsd;
7119
7120         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7121                 return NOTIFY_OK;
7122
7123         local_irq_disable();
7124         cpu = smp_processor_id();
7125         sd = &per_cpu(softnet_data, cpu);
7126         oldsd = &per_cpu(softnet_data, oldcpu);
7127
7128         /* Find end of our completion_queue. */
7129         list_skb = &sd->completion_queue;
7130         while (*list_skb)
7131                 list_skb = &(*list_skb)->next;
7132         /* Append completion queue from offline CPU. */
7133         *list_skb = oldsd->completion_queue;
7134         oldsd->completion_queue = NULL;
7135
7136         /* Append output queue from offline CPU. */
7137         if (oldsd->output_queue) {
7138                 *sd->output_queue_tailp = oldsd->output_queue;
7139                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7140                 oldsd->output_queue = NULL;
7141                 oldsd->output_queue_tailp = &oldsd->output_queue;
7142         }
7143         /* Append NAPI poll list from offline CPU, with one exception :
7144          * process_backlog() must be called by cpu owning percpu backlog.
7145          * We properly handle process_queue & input_pkt_queue later.
7146          */
7147         while (!list_empty(&oldsd->poll_list)) {
7148                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7149                                                             struct napi_struct,
7150                                                             poll_list);
7151
7152                 list_del_init(&napi->poll_list);
7153                 if (napi->poll == process_backlog)
7154                         napi->state = 0;
7155                 else
7156                         ____napi_schedule(sd, napi);
7157         }
7158
7159         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7160         local_irq_enable();
7161
7162         /* Process offline CPU's input_pkt_queue */
7163         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7164                 netif_rx_ni(skb);
7165                 input_queue_head_incr(oldsd);
7166         }
7167         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7168                 netif_rx_ni(skb);
7169                 input_queue_head_incr(oldsd);
7170         }
7171
7172         return NOTIFY_OK;
7173 }
7174
7175
7176 /**
7177  *      netdev_increment_features - increment feature set by one
7178  *      @all: current feature set
7179  *      @one: new feature set
7180  *      @mask: mask feature set
7181  *
7182  *      Computes a new feature set after adding a device with feature set
7183  *      @one to the master device with current feature set @all.  Will not
7184  *      enable anything that is off in @mask. Returns the new feature set.
7185  */
7186 netdev_features_t netdev_increment_features(netdev_features_t all,
7187         netdev_features_t one, netdev_features_t mask)
7188 {
7189         if (mask & NETIF_F_GEN_CSUM)
7190                 mask |= NETIF_F_ALL_CSUM;
7191         mask |= NETIF_F_VLAN_CHALLENGED;
7192
7193         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7194         all &= one | ~NETIF_F_ALL_FOR_ALL;
7195
7196         /* If one device supports hw checksumming, set for all. */
7197         if (all & NETIF_F_GEN_CSUM)
7198                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7199
7200         return all;
7201 }
7202 EXPORT_SYMBOL(netdev_increment_features);
7203
7204 static struct hlist_head * __net_init netdev_create_hash(void)
7205 {
7206         int i;
7207         struct hlist_head *hash;
7208
7209         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7210         if (hash != NULL)
7211                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7212                         INIT_HLIST_HEAD(&hash[i]);
7213
7214         return hash;
7215 }
7216
7217 /* Initialize per network namespace state */
7218 static int __net_init netdev_init(struct net *net)
7219 {
7220         if (net != &init_net)
7221                 INIT_LIST_HEAD(&net->dev_base_head);
7222
7223         net->dev_name_head = netdev_create_hash();
7224         if (net->dev_name_head == NULL)
7225                 goto err_name;
7226
7227         net->dev_index_head = netdev_create_hash();
7228         if (net->dev_index_head == NULL)
7229                 goto err_idx;
7230
7231         return 0;
7232
7233 err_idx:
7234         kfree(net->dev_name_head);
7235 err_name:
7236         return -ENOMEM;
7237 }
7238
7239 /**
7240  *      netdev_drivername - network driver for the device
7241  *      @dev: network device
7242  *
7243  *      Determine network driver for device.
7244  */
7245 const char *netdev_drivername(const struct net_device *dev)
7246 {
7247         const struct device_driver *driver;
7248         const struct device *parent;
7249         const char *empty = "";
7250
7251         parent = dev->dev.parent;
7252         if (!parent)
7253                 return empty;
7254
7255         driver = parent->driver;
7256         if (driver && driver->name)
7257                 return driver->name;
7258         return empty;
7259 }
7260
7261 static void __netdev_printk(const char *level, const struct net_device *dev,
7262                             struct va_format *vaf)
7263 {
7264         if (dev && dev->dev.parent) {
7265                 dev_printk_emit(level[1] - '0',
7266                                 dev->dev.parent,
7267                                 "%s %s %s%s: %pV",
7268                                 dev_driver_string(dev->dev.parent),
7269                                 dev_name(dev->dev.parent),
7270                                 netdev_name(dev), netdev_reg_state(dev),
7271                                 vaf);
7272         } else if (dev) {
7273                 printk("%s%s%s: %pV",
7274                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7275         } else {
7276                 printk("%s(NULL net_device): %pV", level, vaf);
7277         }
7278 }
7279
7280 void netdev_printk(const char *level, const struct net_device *dev,
7281                    const char *format, ...)
7282 {
7283         struct va_format vaf;
7284         va_list args;
7285
7286         va_start(args, format);
7287
7288         vaf.fmt = format;
7289         vaf.va = &args;
7290
7291         __netdev_printk(level, dev, &vaf);
7292
7293         va_end(args);
7294 }
7295 EXPORT_SYMBOL(netdev_printk);
7296
7297 #define define_netdev_printk_level(func, level)                 \
7298 void func(const struct net_device *dev, const char *fmt, ...)   \
7299 {                                                               \
7300         struct va_format vaf;                                   \
7301         va_list args;                                           \
7302                                                                 \
7303         va_start(args, fmt);                                    \
7304                                                                 \
7305         vaf.fmt = fmt;                                          \
7306         vaf.va = &args;                                         \
7307                                                                 \
7308         __netdev_printk(level, dev, &vaf);                      \
7309                                                                 \
7310         va_end(args);                                           \
7311 }                                                               \
7312 EXPORT_SYMBOL(func);
7313
7314 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7315 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7316 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7317 define_netdev_printk_level(netdev_err, KERN_ERR);
7318 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7319 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7320 define_netdev_printk_level(netdev_info, KERN_INFO);
7321
7322 static void __net_exit netdev_exit(struct net *net)
7323 {
7324         kfree(net->dev_name_head);
7325         kfree(net->dev_index_head);
7326 }
7327
7328 static struct pernet_operations __net_initdata netdev_net_ops = {
7329         .init = netdev_init,
7330         .exit = netdev_exit,
7331 };
7332
7333 static void __net_exit default_device_exit(struct net *net)
7334 {
7335         struct net_device *dev, *aux;
7336         /*
7337          * Push all migratable network devices back to the
7338          * initial network namespace
7339          */
7340         rtnl_lock();
7341         for_each_netdev_safe(net, dev, aux) {
7342                 int err;
7343                 char fb_name[IFNAMSIZ];
7344
7345                 /* Ignore unmoveable devices (i.e. loopback) */
7346                 if (dev->features & NETIF_F_NETNS_LOCAL)
7347                         continue;
7348
7349                 /* Leave virtual devices for the generic cleanup */
7350                 if (dev->rtnl_link_ops)
7351                         continue;
7352
7353                 /* Push remaining network devices to init_net */
7354                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7355                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7356                 if (err) {
7357                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7358                                  __func__, dev->name, err);
7359                         BUG();
7360                 }
7361         }
7362         rtnl_unlock();
7363 }
7364
7365 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7366 {
7367         /* Return with the rtnl_lock held when there are no network
7368          * devices unregistering in any network namespace in net_list.
7369          */
7370         struct net *net;
7371         bool unregistering;
7372         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7373
7374         add_wait_queue(&netdev_unregistering_wq, &wait);
7375         for (;;) {
7376                 unregistering = false;
7377                 rtnl_lock();
7378                 list_for_each_entry(net, net_list, exit_list) {
7379                         if (net->dev_unreg_count > 0) {
7380                                 unregistering = true;
7381                                 break;
7382                         }
7383                 }
7384                 if (!unregistering)
7385                         break;
7386                 __rtnl_unlock();
7387
7388                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7389         }
7390         remove_wait_queue(&netdev_unregistering_wq, &wait);
7391 }
7392
7393 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7394 {
7395         /* At exit all network devices most be removed from a network
7396          * namespace.  Do this in the reverse order of registration.
7397          * Do this across as many network namespaces as possible to
7398          * improve batching efficiency.
7399          */
7400         struct net_device *dev;
7401         struct net *net;
7402         LIST_HEAD(dev_kill_list);
7403
7404         /* To prevent network device cleanup code from dereferencing
7405          * loopback devices or network devices that have been freed
7406          * wait here for all pending unregistrations to complete,
7407          * before unregistring the loopback device and allowing the
7408          * network namespace be freed.
7409          *
7410          * The netdev todo list containing all network devices
7411          * unregistrations that happen in default_device_exit_batch
7412          * will run in the rtnl_unlock() at the end of
7413          * default_device_exit_batch.
7414          */
7415         rtnl_lock_unregistering(net_list);
7416         list_for_each_entry(net, net_list, exit_list) {
7417                 for_each_netdev_reverse(net, dev) {
7418                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7419                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7420                         else
7421                                 unregister_netdevice_queue(dev, &dev_kill_list);
7422                 }
7423         }
7424         unregister_netdevice_many(&dev_kill_list);
7425         rtnl_unlock();
7426 }
7427
7428 static struct pernet_operations __net_initdata default_device_ops = {
7429         .exit = default_device_exit,
7430         .exit_batch = default_device_exit_batch,
7431 };
7432
7433 /*
7434  *      Initialize the DEV module. At boot time this walks the device list and
7435  *      unhooks any devices that fail to initialise (normally hardware not
7436  *      present) and leaves us with a valid list of present and active devices.
7437  *
7438  */
7439
7440 /*
7441  *       This is called single threaded during boot, so no need
7442  *       to take the rtnl semaphore.
7443  */
7444 static int __init net_dev_init(void)
7445 {
7446         int i, rc = -ENOMEM;
7447
7448         BUG_ON(!dev_boot_phase);
7449
7450         if (dev_proc_init())
7451                 goto out;
7452
7453         if (netdev_kobject_init())
7454                 goto out;
7455
7456         INIT_LIST_HEAD(&ptype_all);
7457         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7458                 INIT_LIST_HEAD(&ptype_base[i]);
7459
7460         INIT_LIST_HEAD(&offload_base);
7461
7462         if (register_pernet_subsys(&netdev_net_ops))
7463                 goto out;
7464
7465         /*
7466          *      Initialise the packet receive queues.
7467          */
7468
7469         for_each_possible_cpu(i) {
7470                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7471
7472                 skb_queue_head_init(&sd->input_pkt_queue);
7473                 skb_queue_head_init(&sd->process_queue);
7474                 INIT_LIST_HEAD(&sd->poll_list);
7475                 sd->output_queue_tailp = &sd->output_queue;
7476 #ifdef CONFIG_RPS
7477                 sd->csd.func = rps_trigger_softirq;
7478                 sd->csd.info = sd;
7479                 sd->cpu = i;
7480 #endif
7481
7482                 sd->backlog.poll = process_backlog;
7483                 sd->backlog.weight = weight_p;
7484         }
7485
7486         dev_boot_phase = 0;
7487
7488         /* The loopback device is special if any other network devices
7489          * is present in a network namespace the loopback device must
7490          * be present. Since we now dynamically allocate and free the
7491          * loopback device ensure this invariant is maintained by
7492          * keeping the loopback device as the first device on the
7493          * list of network devices.  Ensuring the loopback devices
7494          * is the first device that appears and the last network device
7495          * that disappears.
7496          */
7497         if (register_pernet_device(&loopback_net_ops))
7498                 goto out;
7499
7500         if (register_pernet_device(&default_device_ops))
7501                 goto out;
7502
7503         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7504         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7505
7506         hotcpu_notifier(dev_cpu_callback, 0);
7507         dst_init();
7508         rc = 0;
7509 out:
7510         return rc;
7511 }
7512
7513 subsys_initcall(net_dev_init);