net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <[email protected]>
  12  *                              Mark Evans, <[email protected]>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <[email protected]>
  16  *              Alan Cox <[email protected]>
  17  *              David Hinds <[email protected]>
  18  *              Alexey Kuznetsov <[email protected]>
  19  *              Adam Sulmicki <[email protected]>
  20  *              Pekka Riikonen <[email protected]>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);
 148 static DEFINE_SPINLOCK(offload_lock);
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 150 struct list_head ptype_all __read_mostly;       /* Taps */
 151 static struct list_head offload_base __read_mostly;
 152
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);
 185
 186 static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers, mangling input packets,
 361  *      MUST BE last in hash buckets and checking protocol handlers
 362  *      MUST start from promiscuous ptype_all chain in net_bh.
 363  *      It is true now, do not change it.
 364  *      Explanation follows: if protocol handler, mangling packet, will
 365  *      be the first on list, it is not able to sense, that packet
 366  *      is cloned and should be copied-on-write, so that it will
 367  *      change it and subsequent readers will get broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return &ptype_all;
 375         else
 376                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 377 }
 378
 379 /**
 380  *      dev_add_pack - add packet handler
 381  *      @pt: packet type declaration
 382  *
 383  *      Add a protocol handler to the networking stack. The passed &packet_type
 384  *      is linked into kernel lists and may not be freed until it has been
 385  *      removed from the kernel lists.
 386  *
 387  *      This call does not sleep therefore it can not
 388  *      guarantee all CPU's that are in middle of receiving packets
 389  *      will see the new packet type (until the next received packet).
 390  */
 391
 392 void dev_add_pack(struct packet_type *pt)
 393 {
 394         struct list_head *head = ptype_head(pt);
 395
 396         spin_lock(&ptype_lock);
 397         list_add_rcu(&pt->list, head);
 398         spin_unlock(&ptype_lock);
 399 }
 400 EXPORT_SYMBOL(dev_add_pack);
 401
 402 /**
 403  *      __dev_remove_pack        - remove packet handler
 404  *      @pt: packet type declaration
 405  *
 406  *      Remove a protocol handler that was previously added to the kernel
 407  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 408  *      from the kernel lists and can be freed or reused once this function
 409  *      returns.
 410  *
 411  *      The packet type might still be in use by receivers
 412  *      and must not be freed until after all the CPU's have gone
 413  *      through a quiescent state.
 414  */
 415 void __dev_remove_pack(struct packet_type *pt)
 416 {
 417         struct list_head *head = ptype_head(pt);
 418         struct packet_type *pt1;
 419
 420         spin_lock(&ptype_lock);
 421
 422         list_for_each_entry(pt1, head, list) {
 423                 if (pt == pt1) {
 424                         list_del_rcu(&pt->list);
 425                         goto out;
 426                 }
 427         }
 428
 429         pr_warn("dev_remove_pack: %p not found\n", pt);
 430 out:
 431         spin_unlock(&ptype_lock);
 432 }
 433 EXPORT_SYMBOL(__dev_remove_pack);
 434
 435 /**
 436  *      dev_remove_pack  - remove packet handler
 437  *      @pt: packet type declaration
 438  *
 439  *      Remove a protocol handler that was previously added to the kernel
 440  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 441  *      from the kernel lists and can be freed or reused once this function
 442  *      returns.
 443  *
 444  *      This call sleeps to guarantee that no CPU is looking at the packet
 445  *      type after return.
 446  */
 447 void dev_remove_pack(struct packet_type *pt)
 448 {
 449         __dev_remove_pack(pt);
 450
 451         synchronize_net();
 452 }
 453 EXPORT_SYMBOL(dev_remove_pack);
 454
 455
 456 /**
 457  *      dev_add_offload - register offload handlers
 458  *      @po: protocol offload declaration
 459  *
 460  *      Add protocol offload handlers to the networking stack. The passed
 461  *      &proto_offload is linked into kernel lists and may not be freed until
 462  *      it has been removed from the kernel lists.
 463  *
 464  *      This call does not sleep therefore it can not
 465  *      guarantee all CPU's that are in middle of receiving packets
 466  *      will see the new offload handlers (until the next received packet).
 467  */
 468 void dev_add_offload(struct packet_offload *po)
 469 {
 470         struct list_head *head = &offload_base;
 471
 472         spin_lock(&offload_lock);
 473         list_add_rcu(&po->list, head);
 474         spin_unlock(&offload_lock);
 475 }
 476 EXPORT_SYMBOL(dev_add_offload);
 477
 478 /**
 479  *      __dev_remove_offload     - remove offload handler
 480  *      @po: packet offload declaration
 481  *
 482  *      Remove a protocol offload handler that was previously added to the
 483  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 484  *      is removed from the kernel lists and can be freed or reused once this
 485  *      function returns.
 486  *
 487  *      The packet type might still be in use by receivers
 488  *      and must not be freed until after all the CPU's have gone
 489  *      through a quiescent state.
 490  */
 491 static void __dev_remove_offload(struct packet_offload *po)
 492 {
 493         struct list_head *head = &offload_base;
 494         struct packet_offload *po1;
 495
 496         spin_lock(&offload_lock);
 497
 498         list_for_each_entry(po1, head, list) {
 499                 if (po == po1) {
 500                         list_del_rcu(&po->list);
 501                         goto out;
 502                 }
 503         }
 504
 505         pr_warn("dev_remove_offload: %p not found\n", po);
 506 out:
 507         spin_unlock(&offload_lock);
 508 }
 509
 510 /**
 511  *      dev_remove_offload       - remove packet offload handler
 512  *      @po: packet offload declaration
 513  *
 514  *      Remove a packet offload handler that was previously added to the kernel
 515  *      offload handlers by dev_add_offload(). The passed &offload_type is
 516  *      removed from the kernel lists and can be freed or reused once this
 517  *      function returns.
 518  *
 519  *      This call sleeps to guarantee that no CPU is looking at the packet
 520  *      type after return.
 521  */
 522 void dev_remove_offload(struct packet_offload *po)
 523 {
 524         __dev_remove_offload(po);
 525
 526         synchronize_net();
 527 }
 528 EXPORT_SYMBOL(dev_remove_offload);
 529
 530 /******************************************************************************
 531
 532                       Device Boot-time Settings Routines
 533
 534 *******************************************************************************/
 535
 536 /* Boot time configuration table */
 537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 538
 539 /**
 540  *      netdev_boot_setup_add   - add new setup entry
 541  *      @name: name of the device
 542  *      @map: configured settings for the device
 543  *
 544  *      Adds new setup entry to the dev_boot_setup list.  The function
 545  *      returns 0 on error and 1 on success.  This is a generic routine to
 546  *      all netdevices.
 547  */
 548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 549 {
 550         struct netdev_boot_setup *s;
 551         int i;
 552
 553         s = dev_boot_setup;
 554         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 555                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 556                         memset(s[i].name, 0, sizeof(s[i].name));
 557                         strlcpy(s[i].name, name, IFNAMSIZ);
 558                         memcpy(&s[i].map, map, sizeof(s[i].map));
 559                         break;
 560                 }
 561         }
 562
 563         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 564 }
 565
 566 /**
 567  *      netdev_boot_setup_check - check boot time settings
 568  *      @dev: the netdevice
 569  *
 570  *      Check boot time settings for the device.
 571  *      The found settings are set for the device to be used
 572  *      later in the device probing.
 573  *      Returns 0 if no settings found, 1 if they are.
 574  */
 575 int netdev_boot_setup_check(struct net_device *dev)
 576 {
 577         struct netdev_boot_setup *s = dev_boot_setup;
 578         int i;
 579
 580         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 581                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 582                     !strcmp(dev->name, s[i].name)) {
 583                         dev->irq        = s[i].map.irq;
 584                         dev->base_addr  = s[i].map.base_addr;
 585                         dev->mem_start  = s[i].map.mem_start;
 586                         dev->mem_end    = s[i].map.mem_end;
 587                         return 1;
 588                 }
 589         }
 590         return 0;
 591 }
 592 EXPORT_SYMBOL(netdev_boot_setup_check);
 593
 594
 595 /**
 596  *      netdev_boot_base        - get address from boot time settings
 597  *      @prefix: prefix for network device
 598  *      @unit: id for network device
 599  *
 600  *      Check boot time settings for the base address of device.
 601  *      The found settings are set for the device to be used
 602  *      later in the device probing.
 603  *      Returns 0 if no settings found.
 604  */
 605 unsigned long netdev_boot_base(const char *prefix, int unit)
 606 {
 607         const struct netdev_boot_setup *s = dev_boot_setup;
 608         char name[IFNAMSIZ];
 609         int i;
 610
 611         sprintf(name, "%s%d", prefix, unit);
 612
 613         /*
 614          * If device already registered then return base of 1
 615          * to indicate not to probe for this interface
 616          */
 617         if (__dev_get_by_name(&init_net, name))
 618                 return 1;
 619
 620         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 621                 if (!strcmp(name, s[i].name))
 622                         return s[i].map.base_addr;
 623         return 0;
 624 }
 625
 626 /*
 627  * Saves at boot time configured settings for any netdevice.
 628  */
 629 int __init netdev_boot_setup(char *str)
 630 {
 631         int ints[5];
 632         struct ifmap map;
 633
 634         str = get_options(str, ARRAY_SIZE(ints), ints);
 635         if (!str || !*str)
 636                 return 0;
 637
 638         /* Save settings */
 639         memset(&map, 0, sizeof(map));
 640         if (ints[0] > 0)
 641                 map.irq = ints[1];
 642         if (ints[0] > 1)
 643                 map.base_addr = ints[2];
 644         if (ints[0] > 2)
 645                 map.mem_start = ints[3];
 646         if (ints[0] > 3)
 647                 map.mem_end = ints[4];
 648
 649         /* Add new entry to the list */
 650         return netdev_boot_setup_add(str, &map);
 651 }
 652
 653 __setup("netdev=", netdev_boot_setup);
 654
 655 /*******************************************************************************
 656
 657                             Device Interface Subroutines
 658
 659 *******************************************************************************/
 660
 661 /**
 662  *      __dev_get_by_name       - find a device by its name
 663  *      @net: the applicable net namespace
 664  *      @name: name to find
 665  *
 666  *      Find an interface by name. Must be called under RTNL semaphore
 667  *      or @dev_base_lock. If the name is found a pointer to the device
 668  *      is returned. If the name is not found then %NULL is returned. The
 669  *      reference counters are not incremented so the caller must be
 670  *      careful with locks.
 671  */
 672
 673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 674 {
 675         struct net_device *dev;
 676         struct hlist_head *head = dev_name_hash(net, name);
 677
 678         hlist_for_each_entry(dev, head, name_hlist)
 679                 if (!strncmp(dev->name, name, IFNAMSIZ))
 680                         return dev;
 681
 682         return NULL;
 683 }
 684 EXPORT_SYMBOL(__dev_get_by_name);
 685
 686 /**
 687  *      dev_get_by_name_rcu     - find a device by its name
 688  *      @net: the applicable net namespace
 689  *      @name: name to find
 690  *
 691  *      Find an interface by name.
 692  *      If the name is found a pointer to the device is returned.
 693  *      If the name is not found then %NULL is returned.
 694  *      The reference counters are not incremented so the caller must be
 695  *      careful with locks. The caller must hold RCU lock.
 696  */
 697
 698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 699 {
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_name_hash(net, name);
 702
 703         hlist_for_each_entry_rcu(dev, head, name_hlist)
 704                 if (!strncmp(dev->name, name, IFNAMSIZ))
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(dev_get_by_name_rcu);
 710
 711 /**
 712  *      dev_get_by_name         - find a device by its name
 713  *      @net: the applicable net namespace
 714  *      @name: name to find
 715  *
 716  *      Find an interface by name. This can be called from any
 717  *      context and does its own locking. The returned handle has
 718  *      the usage count incremented and the caller must use dev_put() to
 719  *      release it when it is no longer needed. %NULL is returned if no
 720  *      matching device is found.
 721  */
 722
 723 struct net_device *dev_get_by_name(struct net *net, const char *name)
 724 {
 725         struct net_device *dev;
 726
 727         rcu_read_lock();
 728         dev = dev_get_by_name_rcu(net, name);
 729         if (dev)
 730                 dev_hold(dev);
 731         rcu_read_unlock();
 732         return dev;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_name);
 735
 736 /**
 737  *      __dev_get_by_index - find a device by its ifindex
 738  *      @net: the applicable net namespace
 739  *      @ifindex: index of device
 740  *
 741  *      Search for an interface by index. Returns %NULL if the device
 742  *      is not found or a pointer to the device. The device has not
 743  *      had its reference counter increased so the caller must be careful
 744  *      about locking. The caller must hold either the RTNL semaphore
 745  *      or @dev_base_lock.
 746  */
 747
 748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751         struct hlist_head *head = dev_index_hash(net, ifindex);
 752
 753         hlist_for_each_entry(dev, head, index_hlist)
 754                 if (dev->ifindex == ifindex)
 755                         return dev;
 756
 757         return NULL;
 758 }
 759 EXPORT_SYMBOL(__dev_get_by_index);
 760
 761 /**
 762  *      dev_get_by_index_rcu - find a device by its ifindex
 763  *      @net: the applicable net namespace
 764  *      @ifindex: index of device
 765  *
 766  *      Search for an interface by index. Returns %NULL if the device
 767  *      is not found or a pointer to the device. The device has not
 768  *      had its reference counter increased so the caller must be careful
 769  *      about locking. The caller must hold RCU lock.
 770  */
 771
 772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 773 {
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry_rcu(dev, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(dev_get_by_index_rcu);
 784
 785
 786 /**
 787  *      dev_get_by_index - find a device by its ifindex
 788  *      @net: the applicable net namespace
 789  *      @ifindex: index of device
 790  *
 791  *      Search for an interface by index. Returns NULL if the device
 792  *      is not found or a pointer to the device. The device returned has
 793  *      had a reference added and the pointer is safe until the user calls
 794  *      dev_put to indicate they have finished with it.
 795  */
 796
 797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 798 {
 799         struct net_device *dev;
 800
 801         rcu_read_lock();
 802         dev = dev_get_by_index_rcu(net, ifindex);
 803         if (dev)
 804                 dev_hold(dev);
 805         rcu_read_unlock();
 806         return dev;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index);
 809
 810 /**
 811  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 812  *      @net: network namespace
 813  *      @name: a pointer to the buffer where the name will be stored.
 814  *      @ifindex: the ifindex of the interface to get the name from.
 815  *
 816  *      The use of raw_seqcount_begin() and cond_resched() before
 817  *      retrying is required as we want to give the writers a chance
 818  *      to complete when CONFIG_PREEMPT is not set.
 819  */
 820 int netdev_get_name(struct net *net, char *name, int ifindex)
 821 {
 822         struct net_device *dev;
 823         unsigned int seq;
 824
 825 retry:
 826         seq = raw_seqcount_begin(&devnet_rename_seq);
 827         rcu_read_lock();
 828         dev = dev_get_by_index_rcu(net, ifindex);
 829         if (!dev) {
 830                 rcu_read_unlock();
 831                 return -ENODEV;
 832         }
 833
 834         strcpy(name, dev->name);
 835         rcu_read_unlock();
 836         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 837                 cond_resched();
 838                 goto retry;
 839         }
 840
 841         return 0;
 842 }
 843
 844 /**
 845  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 846  *      @net: the applicable net namespace
 847  *      @type: media type of device
 848  *      @ha: hardware address
 849  *
 850  *      Search for an interface by MAC address. Returns NULL if the device
 851  *      is not found or a pointer to the device.
 852  *      The caller must hold RCU or RTNL.
 853  *      The returned device has not had its ref count increased
 854  *      and the caller must therefore be careful about locking
 855  *
 856  */
 857
 858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 859                                        const char *ha)
 860 {
 861         struct net_device *dev;
 862
 863         for_each_netdev_rcu(net, dev)
 864                 if (dev->type == type &&
 865                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 866                         return dev;
 867
 868         return NULL;
 869 }
 870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 871
 872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 873 {
 874         struct net_device *dev;
 875
 876         ASSERT_RTNL();
 877         for_each_netdev(net, dev)
 878                 if (dev->type == type)
 879                         return dev;
 880
 881         return NULL;
 882 }
 883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 884
 885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 886 {
 887         struct net_device *dev, *ret = NULL;
 888
 889         rcu_read_lock();
 890         for_each_netdev_rcu(net, dev)
 891                 if (dev->type == type) {
 892                         dev_hold(dev);
 893                         ret = dev;
 894                         break;
 895                 }
 896         rcu_read_unlock();
 897         return ret;
 898 }
 899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 900
 901 /**
 902  *      __dev_get_by_flags - find any device with given flags
 903  *      @net: the applicable net namespace
 904  *      @if_flags: IFF_* values
 905  *      @mask: bitmask of bits in if_flags to check
 906  *
 907  *      Search for any interface with the given flags. Returns NULL if a device
 908  *      is not found or a pointer to the device. Must be called inside
 909  *      rtnl_lock(), and result refcount is unchanged.
 910  */
 911
 912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 913                                       unsigned short mask)
 914 {
 915         struct net_device *dev, *ret;
 916
 917         ASSERT_RTNL();
 918
 919         ret = NULL;
 920         for_each_netdev(net, dev) {
 921                 if (((dev->flags ^ if_flags) & mask) == 0) {
 922                         ret = dev;
 923                         break;
 924                 }
 925         }
 926         return ret;
 927 }
 928 EXPORT_SYMBOL(__dev_get_by_flags);
 929
 930 /**
 931  *      dev_valid_name - check if name is okay for network device
 932  *      @name: name string
 933  *
 934  *      Network device names need to be valid file names to
 935  *      to allow sysfs to work.  We also disallow any kind of
 936  *      whitespace.
 937  */
 938 bool dev_valid_name(const char *name)
 939 {
 940         if (*name == '\0')
 941                 return false;
 942         if (strlen(name) >= IFNAMSIZ)
 943                 return false;
 944         if (!strcmp(name, ".") || !strcmp(name, ".."))
 945                 return false;
 946
 947         while (*name) {
 948                 if (*name == '/' || isspace(*name))
 949                         return false;
 950                 name++;
 951         }
 952         return true;
 953 }
 954 EXPORT_SYMBOL(dev_valid_name);
 955
 956 /**
 957  *      __dev_alloc_name - allocate a name for a device
 958  *      @net: network namespace to allocate the device name in
 959  *      @name: name format string
 960  *      @buf:  scratch buffer and result name string
 961  *
 962  *      Passed a format string - eg "lt%d" it will try and find a suitable
 963  *      id. It scans list of devices to build up a free map, then chooses
 964  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 965  *      while allocating the name and adding the device in order to avoid
 966  *      duplicates.
 967  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 968  *      Returns the number of the unit assigned or a negative errno code.
 969  */
 970
 971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 972 {
 973         int i = 0;
 974         const char *p;
 975         const int max_netdevices = 8*PAGE_SIZE;
 976         unsigned long *inuse;
 977         struct net_device *d;
 978
 979         p = strnchr(name, IFNAMSIZ-1, '%');
 980         if (p) {
 981                 /*
 982                  * Verify the string as this thing may have come from
 983                  * the user.  There must be either one "%d" and no other "%"
 984                  * characters.
 985                  */
 986                 if (p[1] != 'd' || strchr(p + 2, '%'))
 987                         return -EINVAL;
 988
 989                 /* Use one page as a bit array of possible slots */
 990                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 991                 if (!inuse)
 992                         return -ENOMEM;
 993
 994                 for_each_netdev(net, d) {
 995                         if (!sscanf(d->name, name, &i))
 996                                 continue;
 997                         if (i < 0 || i >= max_netdevices)
 998                                 continue;
 999
1000                         /*  avoid cases where sscanf is not exact inverse of printf */
1001                         snprintf(buf, IFNAMSIZ, name, i);
1002                         if (!strncmp(buf, d->name, IFNAMSIZ))
1003                                 set_bit(i, inuse);
1004                 }
1005
1006                 i = find_first_zero_bit(inuse, max_netdevices);
1007                 free_page((unsigned long) inuse);
1008         }
1009
1010         if (buf != name)
1011                 snprintf(buf, IFNAMSIZ, name, i);
1012         if (!__dev_get_by_name(net, buf))
1013                 return i;
1014
1015         /* It is possible to run out of possible slots
1016          * when the name is long and there isn't enough space left
1017          * for the digits, or if all bits are used.
1018          */
1019         return -ENFILE;
1020 }
1021
1022 /**
1023  *      dev_alloc_name - allocate a name for a device
1024  *      @dev: device
1025  *      @name: name format string
1026  *
1027  *      Passed a format string - eg "lt%d" it will try and find a suitable
1028  *      id. It scans list of devices to build up a free map, then chooses
1029  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *      while allocating the name and adding the device in order to avoid
1031  *      duplicates.
1032  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *      Returns the number of the unit assigned or a negative errno code.
1034  */
1035
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038         char buf[IFNAMSIZ];
1039         struct net *net;
1040         int ret;
1041
1042         BUG_ON(!dev_net(dev));
1043         net = dev_net(dev);
1044         ret = __dev_alloc_name(net, name, buf);
1045         if (ret >= 0)
1046                 strlcpy(dev->name, buf, IFNAMSIZ);
1047         return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
1050
1051 static int dev_alloc_name_ns(struct net *net,
1052                              struct net_device *dev,
1053                              const char *name)
1054 {
1055         char buf[IFNAMSIZ];
1056         int ret;
1057
1058         ret = __dev_alloc_name(net, name, buf);
1059         if (ret >= 0)
1060                 strlcpy(dev->name, buf, IFNAMSIZ);
1061         return ret;
1062 }
1063
1064 static int dev_get_valid_name(struct net *net,
1065                               struct net_device *dev,
1066                               const char *name)
1067 {
1068         BUG_ON(!net);
1069
1070         if (!dev_valid_name(name))
1071                 return -EINVAL;
1072
1073         if (strchr(name, '%'))
1074                 return dev_alloc_name_ns(net, dev, name);
1075         else if (__dev_get_by_name(net, name))
1076                 return -EEXIST;
1077         else if (dev->name != name)
1078                 strlcpy(dev->name, name, IFNAMSIZ);
1079
1080         return 0;
1081 }
1082
1083 /**
1084  *      dev_change_name - change name of a device
1085  *      @dev: device
1086  *      @newname: name (or format string) must be at least IFNAMSIZ
1087  *
1088  *      Change name of a device, can pass format strings "eth%d".
1089  *      for wildcarding.
1090  */
1091 int dev_change_name(struct net_device *dev, const char *newname)
1092 {
1093         unsigned char old_assign_type;
1094         char oldname[IFNAMSIZ];
1095         int err = 0;
1096         int ret;
1097         struct net *net;
1098
1099         ASSERT_RTNL();
1100         BUG_ON(!dev_net(dev));
1101
1102         net = dev_net(dev);
1103         if (dev->flags & IFF_UP)
1104                 return -EBUSY;
1105
1106         write_seqcount_begin(&devnet_rename_seq);
1107
1108         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109                 write_seqcount_end(&devnet_rename_seq);
1110                 return 0;
1111         }
1112
1113         memcpy(oldname, dev->name, IFNAMSIZ);
1114
1115         err = dev_get_valid_name(net, dev, newname);
1116         if (err < 0) {
1117                 write_seqcount_end(&devnet_rename_seq);
1118                 return err;
1119         }
1120
1121         if (oldname[0] && !strchr(oldname, '%'))
1122                 netdev_info(dev, "renamed from %s\n", oldname);
1123
1124         old_assign_type = dev->name_assign_type;
1125         dev->name_assign_type = NET_NAME_RENAMED;
1126
1127 rollback:
1128         ret = device_rename(&dev->dev, dev->name);
1129         if (ret) {
1130                 memcpy(dev->name, oldname, IFNAMSIZ);
1131                 dev->name_assign_type = old_assign_type;
1132                 write_seqcount_end(&devnet_rename_seq);
1133                 return ret;
1134         }
1135
1136         write_seqcount_end(&devnet_rename_seq);
1137
1138         netdev_adjacent_rename_links(dev, oldname);
1139
1140         write_lock_bh(&dev_base_lock);
1141         hlist_del_rcu(&dev->name_hlist);
1142         write_unlock_bh(&dev_base_lock);
1143
1144         synchronize_rcu();
1145
1146         write_lock_bh(&dev_base_lock);
1147         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148         write_unlock_bh(&dev_base_lock);
1149
1150         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151         ret = notifier_to_errno(ret);
1152
1153         if (ret) {
1154                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1155                 if (err >= 0) {
1156                         err = ret;
1157                         write_seqcount_begin(&devnet_rename_seq);
1158                         memcpy(dev->name, oldname, IFNAMSIZ);
1159                         memcpy(oldname, newname, IFNAMSIZ);
1160                         dev->name_assign_type = old_assign_type;
1161                         old_assign_type = NET_NAME_RENAMED;
1162                         goto rollback;
1163                 } else {
1164                         pr_err("%s: name change rollback failed: %d\n",
1165                                dev->name, ret);
1166                 }
1167         }
1168
1169         return err;
1170 }
1171
1172 /**
1173  *      dev_set_alias - change ifalias of a device
1174  *      @dev: device
1175  *      @alias: name up to IFALIASZ
1176  *      @len: limit of bytes to copy from info
1177  *
1178  *      Set ifalias for a device,
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182         char *new_ifalias;
1183
1184         ASSERT_RTNL();
1185
1186         if (len >= IFALIASZ)
1187                 return -EINVAL;
1188
1189         if (!len) {
1190                 kfree(dev->ifalias);
1191                 dev->ifalias = NULL;
1192                 return 0;
1193         }
1194
1195         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196         if (!new_ifalias)
1197                 return -ENOMEM;
1198         dev->ifalias = new_ifalias;
1199
1200         strlcpy(dev->ifalias, alias, len+1);
1201         return len;
1202 }
1203
1204
/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features. Sends a
 *      %NETDEV_FEAT_CHANGE event for @dev to the netdevice notifier
 *      chain; the result of the notifier call is discarded.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
1215 EXPORT_SYMBOL(netdev_features_change);
1216
1217 /**
1218  *      netdev_state_change - device changes state
1219  *      @dev: device to cause notification
1220  *
1221  *      Called to indicate a device has changed state. This function calls
1222  *      the notifier chains for netdev_chain and sends a NEWLINK message
1223  *      to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227         if (dev->flags & IFF_UP) {
1228                 struct netdev_notifier_change_info change_info;
1229
1230                 change_info.flags_changed = 0;
1231                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232                                               &change_info.info);
1233                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234         }
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237
/**
 *      netdev_notify_peers - notify network peers about existence of @dev
 *      @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * This function only sends %NETDEV_NOTIFY_PEERS to the netdevice
 * notifier chain; it takes and releases the RTNL itself around the call.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        rtnl_unlock();
}
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255
1256 static int __dev_open(struct net_device *dev)
1257 {
1258         const struct net_device_ops *ops = dev->netdev_ops;
1259         int ret;
1260
1261         ASSERT_RTNL();
1262
1263         if (!netif_device_present(dev))
1264                 return -ENODEV;
1265
1266         /* Block netpoll from trying to do any rx path servicing.
1267          * If we don't do this there is a chance ndo_poll_controller
1268          * or ndo_poll may be running while we open the device
1269          */
1270         netpoll_poll_disable(dev);
1271
1272         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273         ret = notifier_to_errno(ret);
1274         if (ret)
1275                 return ret;
1276
1277         set_bit(__LINK_STATE_START, &dev->state);
1278
1279         if (ops->ndo_validate_addr)
1280                 ret = ops->ndo_validate_addr(dev);
1281
1282         if (!ret && ops->ndo_open)
1283                 ret = ops->ndo_open(dev);
1284
1285         netpoll_poll_enable(dev);
1286
1287         if (ret)
1288                 clear_bit(__LINK_STATE_START, &dev->state);
1289         else {
1290                 dev->flags |= IFF_UP;
1291                 dev_set_rx_mode(dev);
1292                 dev_activate(dev);
1293                 add_device_randomness(dev->dev_addr, dev->addr_len);
1294         }
1295
1296         return ret;
1297 }
1298
1299 /**
1300  *      dev_open        - prepare an interface for use.
1301  *      @dev:   device to open
1302  *
1303  *      Takes a device from down to up state. The device's private open
1304  *      function is invoked and then the multicast lists are loaded. Finally
1305  *      the device is moved into the up state and a %NETDEV_UP message is
1306  *      sent to the netdev notifier chain.
1307  *
1308  *      Calling this function on an active interface is a nop. On a failure
1309  *      a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313         int ret;
1314
1315         if (dev->flags & IFF_UP)
1316                 return 0;
1317
1318         ret = __dev_open(dev);
1319         if (ret < 0)
1320                 return ret;
1321
1322         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323         call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325         return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331         struct net_device *dev;
1332
1333         ASSERT_RTNL();
1334         might_sleep();
1335
1336         list_for_each_entry(dev, head, close_list) {
1337                 /* Temporarily disable netpoll until the interface is down */
1338                 netpoll_poll_disable(dev);
1339
1340                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341
1342                 clear_bit(__LINK_STATE_START, &dev->state);
1343
1344                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1345                  * can be even on different cpu. So just clear netif_running().
1346                  *
1347                  * dev->stop() will invoke napi_disable() on all of it's
1348                  * napi_struct instances on this device.
1349                  */
1350                 smp_mb__after_atomic(); /* Commit netif_running(). */
1351         }
1352
1353         dev_deactivate_many(head);
1354
1355         list_for_each_entry(dev, head, close_list) {
1356                 const struct net_device_ops *ops = dev->netdev_ops;
1357
1358                 /*
1359                  *      Call the device specific close. This cannot fail.
1360                  *      Only if device is UP
1361                  *
1362                  *      We allow it to be called even after a DETACH hot-plug
1363                  *      event.
1364                  */
1365                 if (ops->ndo_stop)
1366                         ops->ndo_stop(dev);
1367
1368                 dev->flags &= ~IFF_UP;
1369                 netpoll_poll_enable(dev);
1370         }
1371
1372         return 0;
1373 }
1374
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383
1384         return retval;
1385 }
1386
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395
1396         __dev_close_many(head);
1397
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403
1404         return 0;
1405 }
1406
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428
1429
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         struct net_device *lower_dev;
1441         struct list_head *iter;
1442
1443         dev->wanted_features &= ~NETIF_F_LRO;
1444         netdev_update_features(dev);
1445
1446         if (unlikely(dev->features & NETIF_F_LRO))
1447                 netdev_WARN(dev, "failed to disable LRO!\n");
1448
1449         netdev_for_each_lower_dev(dev, lower_dev, iter)
1450                 dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453
/*
 * Deliver event @val for @dev to the single notifier block @nb, bypassing
 * the netdev notifier chain. The notifier info is built on the stack and
 * initialised for @dev. Returns whatever nb->notifier_call() returns.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info;

        netdev_notifier_info_init(&info, dev);
        return nb->notifier_call(nb, val, &info);
}
1462
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * does not replay registration/up events to a newly registered notifier.
 * NOTE(review): presumably cleared once boot-time device initialisation
 * has finished; the code that clears it is not in this part of the file.
 */
static int dev_boot_phase = 1;
1464
1465 /**
1466  *      register_netdevice_notifier - register a network notifier block
1467  *      @nb: notifier
1468  *
1469  *      Register a notifier to be called when network device events occur.
1470  *      The notifier passed is linked into the kernel structures and must
1471  *      not be reused until it has been unregistered. A negative errno code
1472  *      is returned on a failure.
1473  *
1474  *      When registered all registration and up events are replayed
1475  *      to the new notifier to allow device to have a race free
1476  *      view of the network device list.
1477  */
1478
1479 int register_netdevice_notifier(struct notifier_block *nb)
1480 {
1481         struct net_device *dev;
1482         struct net_device *last;
1483         struct net *net;
1484         int err;
1485
1486         rtnl_lock();
1487         err = raw_notifier_chain_register(&netdev_chain, nb);
1488         if (err)
1489                 goto unlock;
1490         if (dev_boot_phase)
1491                 goto unlock;
1492         for_each_net(net) {
1493                 for_each_netdev(net, dev) {
1494                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495                         err = notifier_to_errno(err);
1496                         if (err)
1497                                 goto rollback;
1498
1499                         if (!(dev->flags & IFF_UP))
1500                                 continue;
1501
1502                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1503                 }
1504         }
1505
1506 unlock:
1507         rtnl_unlock();
1508         return err;
1509
1510 rollback:
1511         last = dev;
1512         for_each_net(net) {
1513                 for_each_netdev(net, dev) {
1514                         if (dev == last)
1515                                 goto outroll;
1516
1517                         if (dev->flags & IFF_UP) {
1518                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519                                                         dev);
1520                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521                         }
1522                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523                 }
1524         }
1525
1526 outroll:
1527         raw_notifier_chain_unregister(&netdev_chain, nb);
1528         goto unlock;
1529 }
1530 EXPORT_SYMBOL(register_netdevice_notifier);
1531
1532 /**
1533  *      unregister_netdevice_notifier - unregister a network notifier block
1534  *      @nb: notifier
1535  *
1536  *      Unregister a notifier previously registered by
1537  *      register_netdevice_notifier(). The notifier is unlinked into the
1538  *      kernel structures and may then be reused. A negative errno code
1539  *      is returned on a failure.
1540  *
1541  *      After unregistering unregister and down device events are synthesized
1542  *      for all devices on the device list to the removed notifier to remove
1543  *      the need for special case cleanup code.
1544  */
1545
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548         struct net_device *dev;
1549         struct net *net;
1550         int err;
1551
1552         rtnl_lock();
1553         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554         if (err)
1555                 goto unlock;
1556
1557         for_each_net(net) {
1558                 for_each_netdev(net, dev) {
1559                         if (dev->flags & IFF_UP) {
1560                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561                                                         dev);
1562                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563                         }
1564                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565                 }
1566         }
1567 unlock:
1568         rtnl_unlock();
1569         return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
1572
/**
 *      call_netdevice_notifiers_info - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *      @info: notifier information data
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain(). @info is initialised for
 *      @dev here before the chain is called. The RTNL must be held
 *      (checked with ASSERT_RTNL()).
 */

static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info)
{
        ASSERT_RTNL();
        netdev_notifier_info_init(info, dev);
        return raw_notifier_call_chain(&netdev_chain, val, info);
}
1591
/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain(). This is the variant for
 *      events that carry no extra data: a plain notifier info is built
 *      on the stack and passed to call_netdevice_notifiers_info().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        struct netdev_notifier_info info;

        return call_netdevice_notifiers_info(val, dev, &info);
}
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608
/*
 * Static key counting the users that want packets timestamped. It is
 * incremented by net_enable_timestamp(), decremented by
 * net_disable_timestamp(), and tested by net_timestamp_set() and
 * net_timestamp_check().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
/* Number of disables deferred so far; drained by net_enable_timestamp(). */
static atomic_t netstamp_needed_deferred;
#endif
1617
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622
1623         if (deferred) {
1624                 while (--deferred)
1625                         static_key_slow_dec(&netstamp_needed);
1626                 return;
1627         }
1628 #endif
1629         static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632
/*
 * Drop one user of packet timestamps.
 *
 * With jump labels, a call made from interrupt context only bumps
 * netstamp_needed_deferred; the matching static_key_slow_dec() is then
 * carried out later by net_enable_timestamp(). Otherwise the static key
 * is decremented right away.
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        if (in_interrupt()) {
                atomic_inc(&netstamp_needed_deferred);
                return;
        }
#endif
        static_key_slow_dec(&netstamp_needed);
}
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644
/*
 * Reset the timestamp of @skb to zero, then call __net_timestamp() on it
 * if the netstamp_needed static key is enabled.
 * NOTE(review): __net_timestamp() presumably stores the current time in
 * skb->tstamp; it is defined outside this file section - confirm.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp.tv64 = 0;
        if (static_key_false(&netstamp_needed))
                __net_timestamp(skb);
}
1651
/*
 * net_timestamp_check - conditionally timestamp an skb
 * @COND: extra condition that must hold for the skb to be stamped
 * @SKB:  buffer to stamp
 *
 * When the netstamp_needed static key is enabled, calls
 * __net_timestamp(SKB) if COND is true and SKB has no timestamp yet
 * (tstamp.tv64 is zero).
 *
 * NOTE: this expands to a bare if statement, not a do { } while (0)
 * block, so it must not be used directly in front of an else.
 */
#define net_timestamp_check(COND, SKB)                  \
        if (static_key_false(&netstamp_needed)) {               \
                if ((COND) && !(SKB)->tstamp.tv64)      \
                        __net_timestamp(SKB);           \
        }                                               \
1657
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660         unsigned int len;
1661
1662         if (!(dev->flags & IFF_UP))
1663                 return false;
1664
1665         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666         if (skb->len <= len)
1667                 return true;
1668
1669         /* if TSO is enabled, we don't care about the length as the packet
1670          * could be forwarded without being segmented before
1671          */
1672         if (skb_is_gso(skb))
1673                 return true;
1674
1675         return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                         atomic_long_inc(&dev->rx_dropped);
1684                         kfree_skb(skb);
1685                         return NET_RX_DROP;
1686                 }
1687         }
1688
1689         if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                 atomic_long_inc(&dev->rx_dropped);
1691                 kfree_skb(skb);
1692                 return NET_RX_DROP;
1693         }
1694
1695         skb_scrub_packet(skb, true);
1696         skb->protocol = eth_type_trans(skb, dev);
1697         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698
1699         return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc = __dev_forward_skb(dev, skb);

        /* A non-zero result means the skb was already dropped and freed. */
        if (rc)
                return rc;

        return netif_rx_internal(skb);
}
1725 EXPORT_SYMBOL_GPL(dev_forward_skb);
1726
1727 static inline int deliver_skb(struct sk_buff *skb,
1728                               struct packet_type *pt_prev,
1729                               struct net_device *orig_dev)
1730 {
1731         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732                 return -ENOMEM;
1733         atomic_inc(&skb->users);
1734         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739         if (!ptype->af_packet_priv || !skb->sk)
1740                 return false;
1741
1742         if (ptype->id_match)
1743                 return ptype->id_match(ptype, skb->sk);
1744         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745                 return true;
1746
1747         return false;
1748 }
1749
1750 /*
1751  *      Support routine. Sends outgoing frames to any network
1752  *      taps currently in use.
1753  */
1754
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757         struct packet_type *ptype;
1758         struct sk_buff *skb2 = NULL;
1759         struct packet_type *pt_prev = NULL;
1760
1761         rcu_read_lock();
1762         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763                 /* Never send packets back to the socket
1764                  * they originated from - MvS ([email protected])
1765                  */
1766                 if ((ptype->dev == dev || !ptype->dev) &&
1767                     (!skb_loop_sk(ptype, skb))) {
1768                         if (pt_prev) {
1769                                 deliver_skb(skb2, pt_prev, skb->dev);
1770                                 pt_prev = ptype;
1771                                 continue;
1772                         }
1773
1774                         skb2 = skb_clone(skb, GFP_ATOMIC);
1775                         if (!skb2)
1776                                 break;
1777
1778                         net_timestamp_set(skb2);
1779
1780                         /* skb->nh should be correctly
1781                            set by sender, so that the second statement is
1782                            just protection against buggy protocols.
1783                          */
1784                         skb_reset_mac_header(skb2);
1785
1786                         if (skb_network_header(skb2) < skb2->data ||
1787                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789                                                      ntohs(skb2->protocol),
1790                                                      dev->name);
1791                                 skb_reset_network_header(skb2);
1792                         }
1793
1794                         skb2->transport_header = skb2->network_header;
1795                         skb2->pkt_type = PACKET_OUTGOING;
1796                         pt_prev = ptype;
1797                 }
1798         }
1799         if (pt_prev)
1800                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801         rcu_read_unlock();
1802 }
1803
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this verify the tc mapping remains valid and if
1811  * not NULL the mapping. With no priorities mapping to this
1812  * offset/count pair it will no longer be used. In the worst case TC0
1813  * is invalid nothing can be done so disable priority mappings. If is
1814  * expected that drivers will fix this mapping if they can before
1815  * calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819         int i;
1820         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821
1822         /* If TC0 is invalidated disable TC mapping */
1823         if (tc->offset + tc->count > txq) {
1824                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825                 dev->num_tc = 0;
1826                 return;
1827         }
1828
1829         /* Invalidated prio to tc mappings set to TC0 */
1830         for (i = 1; i < TC_BITMASK + 1; i++) {
1831                 int q = netdev_get_prio_tc_map(dev, i);
1832
1833                 tc = &dev->tc_to_txq[q];
1834                 if (tc->offset + tc->count > txq) {
1835                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836                                 i, q);
1837                         netdev_set_prio_tc_map(dev, i, 0);
1838                 }
1839         }
1840 }
1841
1842 #ifdef CONFIG_XPS
1843 static DEFINE_MUTEX(xps_map_mutex);
1844 #define xmap_dereference(P)             \
1845         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848                                         int cpu, u16 index)
1849 {
1850         struct xps_map *map = NULL;
1851         int pos;
1852
1853         if (dev_maps)
1854                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855
1856         for (pos = 0; map && pos < map->len; pos++) {
1857                 if (map->queues[pos] == index) {
1858                         if (map->len > 1) {
1859                                 map->queues[pos] = map->queues[--map->len];
1860                         } else {
1861                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862                                 kfree_rcu(map, rcu);
1863                                 map = NULL;
1864                         }
1865                         break;
1866                 }
1867         }
1868
1869         return map;
1870 }
1871
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874         struct xps_dev_maps *dev_maps;
1875         int cpu, i;
1876         bool active = false;
1877
1878         mutex_lock(&xps_map_mutex);
1879         dev_maps = xmap_dereference(dev->xps_maps);
1880
1881         if (!dev_maps)
1882                 goto out_no_maps;
1883
1884         for_each_possible_cpu(cpu) {
1885                 for (i = index; i < dev->num_tx_queues; i++) {
1886                         if (!remove_xps_queue(dev_maps, cpu, i))
1887                                 break;
1888                 }
1889                 if (i == dev->num_tx_queues)
1890                         active = true;
1891         }
1892
1893         if (!active) {
1894                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1895                 kfree_rcu(dev_maps, rcu);
1896         }
1897
1898         for (i = index; i < dev->num_tx_queues; i++)
1899                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900                                              NUMA_NO_NODE);
1901
1902 out_no_maps:
1903         mutex_unlock(&xps_map_mutex);
1904 }
1905
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907                                       int cpu, u16 index)
1908 {
1909         struct xps_map *new_map;
1910         int alloc_len = XPS_MIN_MAP_ALLOC;
1911         int i, pos;
1912
1913         for (pos = 0; map && pos < map->len; pos++) {
1914                 if (map->queues[pos] != index)
1915                         continue;
1916                 return map;
1917         }
1918
1919         /* Need to add queue to this CPU's existing map */
1920         if (map) {
1921                 if (pos < map->alloc_len)
1922                         return map;
1923
1924                 alloc_len = map->alloc_len * 2;
1925         }
1926
1927         /* Need to allocate new map to store queue on this CPU's map */
1928         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929                                cpu_to_node(cpu));
1930         if (!new_map)
1931                 return NULL;
1932
1933         for (i = 0; i < pos; i++)
1934                 new_map->queues[i] = map->queues[i];
1935         new_map->alloc_len = alloc_len;
1936         new_map->len = pos;
1937
1938         return new_map;
1939 }
1940
1941 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942                         u16 index)
1943 {
1944         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945         struct xps_map *map, *new_map;
1946         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947         int cpu, numa_node_id = -2;
1948         bool active = false;
1949
1950         mutex_lock(&xps_map_mutex);
1951
1952         dev_maps = xmap_dereference(dev->xps_maps);
1953
1954         /* allocate memory for queue storage */
1955         for_each_online_cpu(cpu) {
1956                 if (!cpumask_test_cpu(cpu, mask))
1957                         continue;
1958
1959                 if (!new_dev_maps)
1960                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961                 if (!new_dev_maps) {
1962                         mutex_unlock(&xps_map_mutex);
1963                         return -ENOMEM;
1964                 }
1965
1966                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967                                  NULL;
1968
1969                 map = expand_xps_map(map, cpu, index);
1970                 if (!map)
1971                         goto error;
1972
1973                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974         }
1975
1976         if (!new_dev_maps)
1977                 goto out_no_new_maps;
1978
1979         for_each_possible_cpu(cpu) {
1980                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981                         /* add queue to CPU maps */
1982                         int pos = 0;
1983
1984                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985                         while ((pos < map->len) && (map->queues[pos] != index))
1986                                 pos++;
1987
1988                         if (pos == map->len)
1989                                 map->queues[map->len++] = index;
1990 #ifdef CONFIG_NUMA
1991                         if (numa_node_id == -2)
1992                                 numa_node_id = cpu_to_node(cpu);
1993                         else if (numa_node_id != cpu_to_node(cpu))
1994                                 numa_node_id = -1;
1995 #endif
1996                 } else if (dev_maps) {
1997                         /* fill in the new device map from the old device map */
1998                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000                 }
2001
2002         }
2003
2004         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005
2006         /* Cleanup old maps */
2007         if (dev_maps) {
2008                 for_each_possible_cpu(cpu) {
2009                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011                         if (map && map != new_map)
2012                                 kfree_rcu(map, rcu);
2013                 }
2014
2015                 kfree_rcu(dev_maps, rcu);
2016         }
2017
2018         dev_maps = new_dev_maps;
2019         active = true;
2020
2021 out_no_new_maps:
2022         /* update Tx queue numa node */
2023         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024                                      (numa_node_id >= 0) ? numa_node_id :
2025                                      NUMA_NO_NODE);
2026
2027         if (!dev_maps)
2028                 goto out_no_maps;
2029
2030         /* removes queue from unused CPUs */
2031         for_each_possible_cpu(cpu) {
2032                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033                         continue;
2034
2035                 if (remove_xps_queue(dev_maps, cpu, index))
2036                         active = true;
2037         }
2038
2039         /* free map if not active */
2040         if (!active) {
2041                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2042                 kfree_rcu(dev_maps, rcu);
2043         }
2044
2045 out_no_maps:
2046         mutex_unlock(&xps_map_mutex);
2047
2048         return 0;
2049 error:
2050         /* remove any maps that we added */
2051         for_each_possible_cpu(cpu) {
2052                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054                                  NULL;
2055                 if (new_map && new_map != map)
2056                         kfree(new_map);
2057         }
2058
2059         mutex_unlock(&xps_map_mutex);
2060
2061         kfree(new_dev_maps);
2062         return -ENOMEM;
2063 }
2064 EXPORT_SYMBOL(netif_set_xps_queue);
2065
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073         int rc;
2074
2075         if (txq < 1 || txq > dev->num_tx_queues)
2076                 return -EINVAL;
2077
2078         if (dev->reg_state == NETREG_REGISTERED ||
2079             dev->reg_state == NETREG_UNREGISTERING) {
2080                 ASSERT_RTNL();
2081
2082                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083                                                   txq);
2084                 if (rc)
2085                         return rc;
2086
2087                 if (dev->num_tc)
2088                         netif_setup_tc(dev, txq);
2089
2090                 if (txq < dev->real_num_tx_queues) {
2091                         qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093                         netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095                 }
2096         }
2097
2098         dev->real_num_tx_queues = txq;
2099         return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called either with the rtnl_lock held or before the net
 *	device is registered.  Returns 0 on success, -EINVAL if @rxq is
 *	zero or larger than dev->num_rx_queues, or the negative error code
 *	from the kobject update done for a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq == 0 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2135
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine should set an upper limit on the number of RSS queues
2140  * used by default by multiqueue devices.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2147
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150         struct softnet_data *sd;
2151         unsigned long flags;
2152
2153         local_irq_save(flags);
2154         sd = this_cpu_ptr(&softnet_data);
2155         q->next_sched = NULL;
2156         *sd->output_queue_tailp = q;
2157         sd->output_queue_tailp = &q->next_sched;
2158         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159         local_irq_restore(flags);
2160 }
2161
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165                 __netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168
2169 struct dev_kfree_skb_cb {
2170         enum skb_free_reason reason;
2171 };
2172
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175         return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180         rcu_read_lock();
2181         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2183
2184                 __netif_schedule(q);
2185         }
2186         rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189
2190 /**
2191  *      netif_wake_subqueue - allow sending packets on subqueue
2192  *      @dev: network device
2193  *      @queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200
2201         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202                 struct Qdisc *q;
2203
2204                 rcu_read_lock();
2205                 q = rcu_dereference(txq->qdisc);
2206                 __netif_schedule(q);
2207                 rcu_read_unlock();
2208         }
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
2211
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215                 struct Qdisc *q;
2216
2217                 rcu_read_lock();
2218                 q = rcu_dereference(dev_queue->qdisc);
2219                 __netif_schedule(q);
2220                 rcu_read_unlock();
2221         }
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227         unsigned long flags;
2228
2229         if (likely(atomic_read(&skb->users) == 1)) {
2230                 smp_rmb();
2231                 atomic_set(&skb->users, 0);
2232         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2233                 return;
2234         }
2235         get_kfree_skb_cb(skb)->reason = reason;
2236         local_irq_save(flags);
2237         skb->next = __this_cpu_read(softnet_data.completion_queue);
2238         __this_cpu_write(softnet_data.completion_queue, skb);
2239         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240         local_irq_restore(flags);
2241 }
2242 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243
2244 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245 {
2246         if (in_irq() || irqs_disabled())
2247                 __dev_kfree_skb_irq(skb, reason);
2248         else
2249                 dev_kfree_skb(skb);
2250 }
2251 EXPORT_SYMBOL(__dev_kfree_skb_any);
2252
2253
2254 /**
2255  * netif_device_detach - mark device as removed
2256  * @dev: network device
2257  *
2258  * Mark device as removed from system and therefore no longer available.
2259  */
2260 void netif_device_detach(struct net_device *dev)
2261 {
2262         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263             netif_running(dev)) {
2264                 netif_tx_stop_all_queues(dev);
2265         }
2266 }
2267 EXPORT_SYMBOL(netif_device_detach);
2268
2269 /**
2270  * netif_device_attach - mark device as attached
2271  * @dev: network device
2272  *
2273  * Mark device as attached from system and restart if needed.
2274  */
2275 void netif_device_attach(struct net_device *dev)
2276 {
2277         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278             netif_running(dev)) {
2279                 netif_tx_wake_all_queues(dev);
2280                 __netdev_watchdog_up(dev);
2281         }
2282 }
2283 EXPORT_SYMBOL(netif_device_attach);
2284
2285 static void skb_warn_bad_offload(const struct sk_buff *skb)
2286 {
2287         static const netdev_features_t null_features = 0;
2288         struct net_device *dev = skb->dev;
2289         const char *driver = "";
2290
2291         if (!net_ratelimit())
2292                 return;
2293
2294         if (dev && dev->dev.parent)
2295                 driver = dev_driver_string(dev->dev.parent);
2296
2297         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298              "gso_type=%d ip_summed=%d\n",
2299              driver, dev ? &dev->features : &null_features,
2300              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302              skb_shinfo(skb)->gso_type, skb->ip_summed);
2303 }
2304
2305 /*
2306  * Invalidate hardware checksum when packet is to be mangled, and
2307  * complete checksum manually on outgoing path.
2308  */
2309 int skb_checksum_help(struct sk_buff *skb)
2310 {
2311         __wsum csum;
2312         int ret = 0, offset;
2313
2314         if (skb->ip_summed == CHECKSUM_COMPLETE)
2315                 goto out_set_summed;
2316
2317         if (unlikely(skb_shinfo(skb)->gso_size)) {
2318                 skb_warn_bad_offload(skb);
2319                 return -EINVAL;
2320         }
2321
2322         /* Before computing a checksum, we should make sure no frag could
2323          * be modified by an external entity : checksum could be wrong.
2324          */
2325         if (skb_has_shared_frag(skb)) {
2326                 ret = __skb_linearize(skb);
2327                 if (ret)
2328                         goto out;
2329         }
2330
2331         offset = skb_checksum_start_offset(skb);
2332         BUG_ON(offset >= skb_headlen(skb));
2333         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334
2335         offset += skb->csum_offset;
2336         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337
2338         if (skb_cloned(skb) &&
2339             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341                 if (ret)
2342                         goto out;
2343         }
2344
2345         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346 out_set_summed:
2347         skb->ip_summed = CHECKSUM_NONE;
2348 out:
2349         return ret;
2350 }
2351 EXPORT_SYMBOL(skb_checksum_help);
2352
2353 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354 {
2355         unsigned int vlan_depth = skb->mac_len;
2356         __be16 type = skb->protocol;
2357
2358         /* Tunnel gso handlers can set protocol to ethernet. */
2359         if (type == htons(ETH_P_TEB)) {
2360                 struct ethhdr *eth;
2361
2362                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2363                         return 0;
2364
2365                 eth = (struct ethhdr *)skb_mac_header(skb);
2366                 type = eth->h_proto;
2367         }
2368
2369         /* if skb->protocol is 802.1Q/AD then the header should already be
2370          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2371          * ETH_HLEN otherwise
2372          */
2373         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2374                 if (vlan_depth) {
2375                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2376                                 return 0;
2377                         vlan_depth -= VLAN_HLEN;
2378                 } else {
2379                         vlan_depth = ETH_HLEN;
2380                 }
2381                 do {
2382                         struct vlan_hdr *vh;
2383
2384                         if (unlikely(!pskb_may_pull(skb,
2385                                                     vlan_depth + VLAN_HLEN)))
2386                                 return 0;
2387
2388                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2389                         type = vh->h_vlan_encapsulated_proto;
2390                         vlan_depth += VLAN_HLEN;
2391                 } while (type == htons(ETH_P_8021Q) ||
2392                          type == htons(ETH_P_8021AD));
2393         }
2394
2395         *depth = vlan_depth;
2396
2397         return type;
2398 }
2399
2400 /**
2401  *      skb_mac_gso_segment - mac layer segmentation handler.
2402  *      @skb: buffer to segment
2403  *      @features: features for the output path (see dev->features)
2404  */
2405 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2406                                     netdev_features_t features)
2407 {
2408         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2409         struct packet_offload *ptype;
2410         int vlan_depth = skb->mac_len;
2411         __be16 type = skb_network_protocol(skb, &vlan_depth);
2412
2413         if (unlikely(!type))
2414                 return ERR_PTR(-EINVAL);
2415
2416         __skb_pull(skb, vlan_depth);
2417
2418         rcu_read_lock();
2419         list_for_each_entry_rcu(ptype, &offload_base, list) {
2420                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2421                         segs = ptype->callbacks.gso_segment(skb, features);
2422                         break;
2423                 }
2424         }
2425         rcu_read_unlock();
2426
2427         __skb_push(skb, skb->data - skb_mac_header(skb));
2428
2429         return segs;
2430 }
2431 EXPORT_SYMBOL(skb_mac_gso_segment);
2432
2433
2434 /* openvswitch calls this on rx path, so we need a different check.
2435  */
2436 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2437 {
2438         if (tx_path)
2439                 return skb->ip_summed != CHECKSUM_PARTIAL;
2440         else
2441                 return skb->ip_summed == CHECKSUM_NONE;
2442 }
2443
2444 /**
2445  *      __skb_gso_segment - Perform segmentation on skb.
2446  *      @skb: buffer to segment
2447  *      @features: features for the output path (see dev->features)
2448  *      @tx_path: whether it is called in TX path
2449  *
2450  *      This function segments the given skb and returns a list of segments.
2451  *
2452  *      It may return NULL if the skb requires no segmentation.  This is
2453  *      only possible when GSO is used for verifying header integrity.
2454  */
2455 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2456                                   netdev_features_t features, bool tx_path)
2457 {
2458         if (unlikely(skb_needs_check(skb, tx_path))) {
2459                 int err;
2460
2461                 skb_warn_bad_offload(skb);
2462
2463                 err = skb_cow_head(skb, 0);
2464                 if (err < 0)
2465                         return ERR_PTR(err);
2466         }
2467
2468         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2469         SKB_GSO_CB(skb)->encap_level = 0;
2470
2471         skb_reset_mac_header(skb);
2472         skb_reset_mac_len(skb);
2473
2474         return skb_mac_gso_segment(skb, features);
2475 }
2476 EXPORT_SYMBOL(__skb_gso_segment);
2477
/* Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>") and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2489
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev must not be given:
 * a highmem page while the device lacks NETIF_F_HIGHDMA, or (when
 * PCI_DMA_BUS_IS_PHYS) a page beyond the DMA mask of the parent device.
 * Returns 0 otherwise, and always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int i;

	if ((dev->features & NETIF_F_HIGHDMA) == 0) {
		for (i = 0; i < shinfo->nr_frags; i++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[i])))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		dma_addr_t addr = page_to_phys(skb_frag_page(&shinfo->frags[i]));

		/* the whole page has to be reachable under the DMA mask */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2522
2523 /* If MPLS offload request, verify we are testing hardware MPLS features
2524  * instead of standard features for the netdev.
2525  */
2526 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2527 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528                                            netdev_features_t features,
2529                                            __be16 type)
2530 {
2531         if (eth_p_mpls(type))
2532                 features &= skb->dev->mpls_features;
2533
2534         return features;
2535 }
2536 #else
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538                                            netdev_features_t features,
2539                                            __be16 type)
2540 {
2541         return features;
2542 }
2543 #endif
2544
2545 static netdev_features_t harmonize_features(struct sk_buff *skb,
2546         netdev_features_t features)
2547 {
2548         int tmp;
2549         __be16 type;
2550
2551         type = skb_network_protocol(skb, &tmp);
2552         features = net_mpls_features(skb, features, type);
2553
2554         if (skb->ip_summed != CHECKSUM_NONE &&
2555             !can_checksum_protocol(features, type)) {
2556                 features &= ~NETIF_F_ALL_CSUM;
2557         } else if (illegal_highdma(skb->dev, skb)) {
2558                 features &= ~NETIF_F_SG;
2559         }
2560
2561         return features;
2562 }
2563
2564 netdev_features_t netif_skb_features(struct sk_buff *skb)
2565 {
2566         struct net_device *dev = skb->dev;
2567         netdev_features_t features = dev->features;
2568         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2569         __be16 protocol = skb->protocol;
2570
2571         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2572                 features &= ~NETIF_F_GSO_MASK;
2573
2574         /* If encapsulation offload request, verify we are testing
2575          * hardware encapsulation features instead of standard
2576          * features for the netdev
2577          */
2578         if (skb->encapsulation)
2579                 features &= dev->hw_enc_features;
2580
2581         if (!vlan_tx_tag_present(skb)) {
2582                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2583                              protocol == htons(ETH_P_8021AD))) {
2584                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2585                         protocol = veh->h_vlan_encapsulated_proto;
2586                 } else {
2587                         goto finalize;
2588                 }
2589         }
2590
2591         features = netdev_intersect_features(features,
2592                                              dev->vlan_features |
2593                                              NETIF_F_HW_VLAN_CTAG_TX |
2594                                              NETIF_F_HW_VLAN_STAG_TX);
2595
2596         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2597                 features = netdev_intersect_features(features,
2598                                                      NETIF_F_SG |
2599                                                      NETIF_F_HIGHDMA |
2600                                                      NETIF_F_FRAGLIST |
2601                                                      NETIF_F_GEN_CSUM |
2602                                                      NETIF_F_HW_VLAN_CTAG_TX |
2603                                                      NETIF_F_HW_VLAN_STAG_TX);
2604
2605 finalize:
2606         if (dev->netdev_ops->ndo_features_check)
2607                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2608                                                                 features);
2609
2610         return harmonize_features(skb, features);
2611 }
2612 EXPORT_SYMBOL(netif_skb_features);
2613
2614 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2615                     struct netdev_queue *txq, bool more)
2616 {
2617         unsigned int len;
2618         int rc;
2619
2620         if (!list_empty(&ptype_all))
2621                 dev_queue_xmit_nit(skb, dev);
2622
2623         len = skb->len;
2624         trace_net_dev_start_xmit(skb, dev);
2625         rc = netdev_start_xmit(skb, dev, txq, more);
2626         trace_net_dev_xmit(skb, rc, dev, len);
2627
2628         return rc;
2629 }
2630
2631 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2632                                     struct netdev_queue *txq, int *ret)
2633 {
2634         struct sk_buff *skb = first;
2635         int rc = NETDEV_TX_OK;
2636
2637         while (skb) {
2638                 struct sk_buff *next = skb->next;
2639
2640                 skb->next = NULL;
2641                 rc = xmit_one(skb, dev, txq, next != NULL);
2642                 if (unlikely(!dev_xmit_complete(rc))) {
2643                         skb->next = next;
2644                         goto out;
2645                 }
2646
2647                 skb = next;
2648                 if (netif_xmit_stopped(txq) && skb) {
2649                         rc = NETDEV_TX_BUSY;
2650                         break;
2651                 }
2652         }
2653
2654 out:
2655         *ret = rc;
2656         return skb;
2657 }
2658
2659 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2660                                           netdev_features_t features)
2661 {
2662         if (vlan_tx_tag_present(skb) &&
2663             !vlan_hw_offload_capable(features, skb->vlan_proto))
2664                 skb = __vlan_hwaccel_push_inside(skb);
2665         return skb;
2666 }
2667
2668 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2669 {
2670         netdev_features_t features;
2671
2672         if (skb->next)
2673                 return skb;
2674
2675         features = netif_skb_features(skb);
2676         skb = validate_xmit_vlan(skb, features);
2677         if (unlikely(!skb))
2678                 goto out_null;
2679
2680         if (netif_needs_gso(dev, skb, features)) {
2681                 struct sk_buff *segs;
2682
2683                 segs = skb_gso_segment(skb, features);
2684                 if (IS_ERR(segs)) {
2685                         goto out_kfree_skb;
2686                 } else if (segs) {
2687                         consume_skb(skb);
2688                         skb = segs;
2689                 }
2690         } else {
2691                 if (skb_needs_linearize(skb, features) &&
2692                     __skb_linearize(skb))
2693                         goto out_kfree_skb;
2694
2695                 /* If packet is not checksummed and device does not
2696                  * support checksumming for this protocol, complete
2697                  * checksumming here.
2698                  */
2699                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2700                         if (skb->encapsulation)
2701                                 skb_set_inner_transport_header(skb,
2702                                                                skb_checksum_start_offset(skb));
2703                         else
2704                                 skb_set_transport_header(skb,
2705                                                          skb_checksum_start_offset(skb));
2706                         if (!(features & NETIF_F_ALL_CSUM) &&
2707                             skb_checksum_help(skb))
2708                                 goto out_kfree_skb;
2709                 }
2710         }
2711
2712         return skb;
2713
2714 out_kfree_skb:
2715         kfree_skb(skb);
2716 out_null:
2717         return NULL;
2718 }
2719
2720 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2721 {
2722         struct sk_buff *next, *head = NULL, *tail;
2723
2724         for (; skb != NULL; skb = next) {
2725                 next = skb->next;
2726                 skb->next = NULL;
2727
2728                 /* in case skb wont be segmented, point to itself */
2729                 skb->prev = skb;
2730
2731                 skb = validate_xmit_skb(skb, dev);
2732                 if (!skb)
2733                         continue;
2734
2735                 if (!head)
2736                         head = skb;
2737                 else
2738                         tail->next = skb;
2739                 /* If skb was segmented, skb->prev points to
2740                  * the last segment. If not, it still contains skb.
2741                  */
2742                 tail = skb->prev;
2743         }
2744         return head;
2745 }
2746
2747 static void qdisc_pkt_len_init(struct sk_buff *skb)
2748 {
2749         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2750
2751         qdisc_skb_cb(skb)->pkt_len = skb->len;
2752
2753         /* To get more precise estimation of bytes sent on wire,
2754          * we add to pkt_len the headers size of all segments
2755          */
2756         if (shinfo->gso_size)  {
2757                 unsigned int hdr_len;
2758                 u16 gso_segs = shinfo->gso_segs;
2759
2760                 /* mac layer + network layer */
2761                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2762
2763                 /* + transport layer */
2764                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2765                         hdr_len += tcp_hdrlen(skb);
2766                 else
2767                         hdr_len += sizeof(struct udphdr);
2768
2769                 if (shinfo->gso_type & SKB_GSO_DODGY)
2770                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2771                                                 shinfo->gso_size);
2772
2773                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2774         }
2775 }
2776
2777 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2778                                  struct net_device *dev,
2779                                  struct netdev_queue *txq)
2780 {
2781         spinlock_t *root_lock = qdisc_lock(q);
2782         bool contended;
2783         int rc;
2784
2785         qdisc_pkt_len_init(skb);
2786         qdisc_calculate_pkt_len(skb, q);
2787         /*
2788          * Heuristic to force contended enqueues to serialize on a
2789          * separate lock before trying to get qdisc main lock.
2790          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2791          * often and dequeue packets faster.
2792          */
2793         contended = qdisc_is_running(q);
2794         if (unlikely(contended))
2795                 spin_lock(&q->busylock);
2796
2797         spin_lock(root_lock);
2798         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2799                 kfree_skb(skb);
2800                 rc = NET_XMIT_DROP;
2801         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2802                    qdisc_run_begin(q)) {
2803                 /*
2804                  * This is a work-conserving queue; there are no old skbs
2805                  * waiting to be sent out; and the qdisc is not running -
2806                  * xmit the skb directly.
2807                  */
2808
2809                 qdisc_bstats_update(q, skb);
2810
2811                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2812                         if (unlikely(contended)) {
2813                                 spin_unlock(&q->busylock);
2814                                 contended = false;
2815                         }
2816                         __qdisc_run(q);
2817                 } else
2818                         qdisc_run_end(q);
2819
2820                 rc = NET_XMIT_SUCCESS;
2821         } else {
2822                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2823                 if (qdisc_run_begin(q)) {
2824                         if (unlikely(contended)) {
2825                                 spin_unlock(&q->busylock);
2826                                 contended = false;
2827                         }
2828                         __qdisc_run(q);
2829                 }
2830         }
2831         spin_unlock(root_lock);
2832         if (unlikely(contended))
2833                 spin_unlock(&q->busylock);
2834         return rc;
2835 }
2836
2837 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2838 static void skb_update_prio(struct sk_buff *skb)
2839 {
2840         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2841
2842         if (!skb->priority && skb->sk && map) {
2843                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2844
2845                 if (prioidx < map->priomap_len)
2846                         skb->priority = map->priomap[prioidx];
2847         }
2848 }
2849 #else
2850 #define skb_update_prio(skb)
2851 #endif
2852
/* Per-CPU count of dev_hard_start_xmit() calls currently in progress on
 * the queueless path of __dev_queue_xmit().
 */
static DEFINE_PER_CPU(int, xmit_recursion);
/* __dev_queue_xmit() refuses to transmit once xmit_recursion exceeds this. */
#define RECURSION_LIMIT 10
2855
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 *
 *	Strips everything in front of the network header, marks the
 *	packet as PACKET_LOOPBACK with no checksum verification needed,
 *	and feeds it to the receive path through netif_rx_ni().
 *	Always returns 0; the result of netif_rx_ni() is not propagated.
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* A looped-back packet is expected to carry a dst entry. */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
2872
2873 /**
2874  *      __dev_queue_xmit - transmit a buffer
2875  *      @skb: buffer to transmit
2876  *      @accel_priv: private data used for L2 forwarding offload
2877  *
2878  *      Queue a buffer for transmission to a network device. The caller must
2879  *      have set the device and priority and built the buffer before calling
2880  *      this function. The function can be called from an interrupt.
2881  *
2882  *      A negative errno code is returned on a failure. A success does not
2883  *      guarantee the frame will be transmitted as it may be dropped due
2884  *      to congestion or traffic shaping.
2885  *
2886  * -----------------------------------------------------------------------------------
2887  *      I notice this method can also return errors from the queue disciplines,
2888  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2889  *      be positive.
2890  *
2891  *      Regardless of the return value, the skb is consumed, so it is currently
2892  *      difficult to retry a send to this method.  (You can bump the ref count
2893  *      before sending to hold a reference for retry if you are careful.)
2894  *
2895  *      When calling this method, interrupts MUST be enabled.  This is because
2896  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2897  *          --BLG
2898  */
2899 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2900 {
2901         struct net_device *dev = skb->dev;
2902         struct netdev_queue *txq;
2903         struct Qdisc *q;
2904         int rc = -ENOMEM;
2905
2906         skb_reset_mac_header(skb);
2907
2908         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2909                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2910
2911         /* Disable soft irqs for various locks below. Also
2912          * stops preemption for RCU.
2913          */
2914         rcu_read_lock_bh();
2915
2916         skb_update_prio(skb);
2917
2918         /* If device/qdisc don't need skb->dst, release it right now while
2919          * its hot in this cpu cache.
2920          */
2921         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2922                 skb_dst_drop(skb);
2923         else
2924                 skb_dst_force(skb);
2925
2926         txq = netdev_pick_tx(dev, skb, accel_priv);
2927         q = rcu_dereference_bh(txq->qdisc);
2928
2929 #ifdef CONFIG_NET_CLS_ACT
2930         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2931 #endif
2932         trace_net_dev_queue(skb);
2933         if (q->enqueue) {
2934                 rc = __dev_xmit_skb(skb, q, dev, txq);
2935                 goto out;
2936         }
2937
2938         /* The device has no queue. Common case for software devices:
2939            loopback, all the sorts of tunnels...
2940
2941            Really, it is unlikely that netif_tx_lock protection is necessary
2942            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2943            counters.)
2944            However, it is possible, that they rely on protection
2945            made by us here.
2946
2947            Check this and shot the lock. It is not prone from deadlocks.
2948            Either shot noqueue qdisc, it is even simpler 8)
2949          */
2950         if (dev->flags & IFF_UP) {
2951                 int cpu = smp_processor_id(); /* ok because BHs are off */
2952
2953                 if (txq->xmit_lock_owner != cpu) {
2954
2955                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2956                                 goto recursion_alert;
2957
2958                         skb = validate_xmit_skb(skb, dev);
2959                         if (!skb)
2960                                 goto drop;
2961
2962                         HARD_TX_LOCK(dev, txq, cpu);
2963
2964                         if (!netif_xmit_stopped(txq)) {
2965                                 __this_cpu_inc(xmit_recursion);
2966                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2967                                 __this_cpu_dec(xmit_recursion);
2968                                 if (dev_xmit_complete(rc)) {
2969                                         HARD_TX_UNLOCK(dev, txq);
2970                                         goto out;
2971                                 }
2972                         }
2973                         HARD_TX_UNLOCK(dev, txq);
2974                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2975                                              dev->name);
2976                 } else {
2977                         /* Recursion is detected! It is possible,
2978                          * unfortunately
2979                          */
2980 recursion_alert:
2981                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2982                                              dev->name);
2983                 }
2984         }
2985
2986         rc = -ENETDOWN;
2987 drop:
2988         rcu_read_unlock_bh();
2989
2990         atomic_long_inc(&dev->tx_dropped);
2991         kfree_skb_list(skb);
2992         return rc;
2993 out:
2994         rcu_read_unlock_bh();
2995         return rc;
2996 }
2997
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Calls __dev_queue_xmit() with no L2 forwarding offload data and
 *	returns its result.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
3003
/**
 *	dev_queue_xmit_accel - transmit a buffer with offload private data
 *	@skb: buffer to transmit
 *	@accel_priv: private data handed through to __dev_queue_xmit()
 *
 *	Returns the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3009
3010
3011 /*=======================================================================
3012                         Receiver routines
3013   =======================================================================*/
3014
/* Limit on the per-CPU input_pkt_queue length; enqueue_to_backlog()
 * drops packets once the queue is longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Condition passed to net_timestamp_check() in netif_rx_internal(). */
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
3021
/*
 * Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3029
3030 #ifdef CONFIG_RPS
3031
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested by netif_rx_internal() before taking the RPS path. */
struct static_key rps_needed __read_mostly;
3037
3038 static struct rps_dev_flow *
3039 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3040             struct rps_dev_flow *rflow, u16 next_cpu)
3041 {
3042         if (next_cpu != RPS_NO_CPU) {
3043 #ifdef CONFIG_RFS_ACCEL
3044                 struct netdev_rx_queue *rxqueue;
3045                 struct rps_dev_flow_table *flow_table;
3046                 struct rps_dev_flow *old_rflow;
3047                 u32 flow_id;
3048                 u16 rxq_index;
3049                 int rc;
3050
3051                 /* Should we steer this flow to a different hardware queue? */
3052                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3053                     !(dev->features & NETIF_F_NTUPLE))
3054                         goto out;
3055                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3056                 if (rxq_index == skb_get_rx_queue(skb))
3057                         goto out;
3058
3059                 rxqueue = dev->_rx + rxq_index;
3060                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3061                 if (!flow_table)
3062                         goto out;
3063                 flow_id = skb_get_hash(skb) & flow_table->mask;
3064                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3065                                                         rxq_index, flow_id);
3066                 if (rc < 0)
3067                         goto out;
3068                 old_rflow = rflow;
3069                 rflow = &flow_table->flows[flow_id];
3070                 rflow->filter = rc;
3071                 if (old_rflow->filter == rflow->filter)
3072                         old_rflow->filter = RPS_NO_FILTER;
3073         out:
3074 #endif
3075                 rflow->last_qtail =
3076                         per_cpu(softnet_data, next_cpu).input_queue_head;
3077         }
3078
3079         rflow->cpu = next_cpu;
3080         return rflow;
3081 }
3082
3083 /*
3084  * get_rps_cpu is called from netif_receive_skb and returns the target
3085  * CPU from the RPS map of the receiving queue for a given skb.
3086  * rcu_read_lock must be held on entry.
3087  */
3088 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3089                        struct rps_dev_flow **rflowp)
3090 {
3091         struct netdev_rx_queue *rxqueue;
3092         struct rps_map *map;
3093         struct rps_dev_flow_table *flow_table;
3094         struct rps_sock_flow_table *sock_flow_table;
3095         int cpu = -1;
3096         u16 tcpu;
3097         u32 hash;
3098
3099         if (skb_rx_queue_recorded(skb)) {
3100                 u16 index = skb_get_rx_queue(skb);
3101                 if (unlikely(index >= dev->real_num_rx_queues)) {
3102                         WARN_ONCE(dev->real_num_rx_queues > 1,
3103                                   "%s received packet on queue %u, but number "
3104                                   "of RX queues is %u\n",
3105                                   dev->name, index, dev->real_num_rx_queues);
3106                         goto done;
3107                 }
3108                 rxqueue = dev->_rx + index;
3109         } else
3110                 rxqueue = dev->_rx;
3111
3112         map = rcu_dereference(rxqueue->rps_map);
3113         if (map) {
3114                 if (map->len == 1 &&
3115                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3116                         tcpu = map->cpus[0];
3117                         if (cpu_online(tcpu))
3118                                 cpu = tcpu;
3119                         goto done;
3120                 }
3121         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3122                 goto done;
3123         }
3124
3125         skb_reset_network_header(skb);
3126         hash = skb_get_hash(skb);
3127         if (!hash)
3128                 goto done;
3129
3130         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3131         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3132         if (flow_table && sock_flow_table) {
3133                 u16 next_cpu;
3134                 struct rps_dev_flow *rflow;
3135
3136                 rflow = &flow_table->flows[hash & flow_table->mask];
3137                 tcpu = rflow->cpu;
3138
3139                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3140
3141                 /*
3142                  * If the desired CPU (where last recvmsg was done) is
3143                  * different from current CPU (one in the rx-queue flow
3144                  * table entry), switch if one of the following holds:
3145                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3146                  *   - Current CPU is offline.
3147                  *   - The current CPU's queue tail has advanced beyond the
3148                  *     last packet that was enqueued using this table entry.
3149                  *     This guarantees that all previous packets for the flow
3150                  *     have been dequeued, thus preserving in order delivery.
3151                  */
3152                 if (unlikely(tcpu != next_cpu) &&
3153                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3154                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3155                       rflow->last_qtail)) >= 0)) {
3156                         tcpu = next_cpu;
3157                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3158                 }
3159
3160                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3161                         *rflowp = rflow;
3162                         cpu = tcpu;
3163                         goto done;
3164                 }
3165         }
3166
3167         if (map) {
3168                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3169                 if (cpu_online(tcpu)) {
3170                         cpu = tcpu;
3171                         goto done;
3172                 }
3173         }
3174
3175 done:
3176         return cpu;
3177 }
3178
3179 #ifdef CONFIG_RFS_ACCEL
3180
3181 /**
3182  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3183  * @dev: Device on which the filter was set
3184  * @rxq_index: RX queue index
3185  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3186  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3187  *
3188  * Drivers that implement ndo_rx_flow_steer() should periodically call
3189  * this function for each installed filter and remove the filters for
3190  * which it returns %true.
3191  */
3192 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3193                          u32 flow_id, u16 filter_id)
3194 {
3195         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3196         struct rps_dev_flow_table *flow_table;
3197         struct rps_dev_flow *rflow;
3198         bool expire = true;
3199         int cpu;
3200
3201         rcu_read_lock();
3202         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3203         if (flow_table && flow_id <= flow_table->mask) {
3204                 rflow = &flow_table->flows[flow_id];
3205                 cpu = ACCESS_ONCE(rflow->cpu);
3206                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3207                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3208                            rflow->last_qtail) <
3209                      (int)(10 * flow_table->mask)))
3210                         expire = false;
3211         }
3212         rcu_read_unlock();
3213         return expire;
3214 }
3215 EXPORT_SYMBOL(rps_may_expire_flow);
3216
3217 #endif /* CONFIG_RFS_ACCEL */
3218
/*
 * Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI gets scheduled; the
 * received_rps counter of that softnet_data is incremented as well.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3227
3228 #endif /* CONFIG_RPS */
3229
/*
 * rps_ipi_queued - queue a remote CPU's softnet_data for an IPI
 * @sd: softnet_data a packet was just queued on
 *
 * When @sd belongs to another CPU it is pushed onto this CPU's
 * rps_ipi_list, NET_RX_SOFTIRQ is raised locally and 1 is returned.
 * Returns 0 when @sd is this CPU's own structure, and always when
 * CONFIG_RPS is disabled.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd == mysd)
		return 0;

	/* Push @sd onto the head of the local IPI list. */
	sd->rps_ipi_next = mysd->rps_ipi_list;
	mysd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3250
#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length, 4096 by default.
 * NOTE(review): presumably the bucket count used when a flow limit
 * table is allocated - confirm against the allocation site.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3254
3255 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3256 {
3257 #ifdef CONFIG_NET_FLOW_LIMIT
3258         struct sd_flow_limit *fl;
3259         struct softnet_data *sd;
3260         unsigned int old_flow, new_flow;
3261
3262         if (qlen < (netdev_max_backlog >> 1))
3263                 return false;
3264
3265         sd = this_cpu_ptr(&softnet_data);
3266
3267         rcu_read_lock();
3268         fl = rcu_dereference(sd->flow_limit);
3269         if (fl) {
3270                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3271                 old_flow = fl->history[fl->history_head];
3272                 fl->history[fl->history_head] = new_flow;
3273
3274                 fl->history_head++;
3275                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3276
3277                 if (likely(fl->buckets[old_flow]))
3278                         fl->buckets[old_flow]--;
3279
3280                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3281                         fl->count++;
3282                         rcu_read_unlock();
3283                         return true;
3284                 }
3285         }
3286         rcu_read_unlock();
3287 #endif
3288         return false;
3289 }
3290
3291 /*
3292  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3293  * queue (may be a remote CPU queue).
3294  */
3295 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3296                               unsigned int *qtail)
3297 {
3298         struct softnet_data *sd;
3299         unsigned long flags;
3300         unsigned int qlen;
3301
3302         sd = &per_cpu(softnet_data, cpu);
3303
3304         local_irq_save(flags);
3305
3306         rps_lock(sd);
3307         qlen = skb_queue_len(&sd->input_pkt_queue);
3308         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3309                 if (qlen) {
3310 enqueue:
3311                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3312                         input_queue_tail_incr_save(sd, qtail);
3313                         rps_unlock(sd);
3314                         local_irq_restore(flags);
3315                         return NET_RX_SUCCESS;
3316                 }
3317
3318                 /* Schedule NAPI for backlog device
3319                  * We can use non atomic operation since we own the queue lock
3320                  */
3321                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3322                         if (!rps_ipi_queued(sd))
3323                                 ____napi_schedule(sd, &sd->backlog);
3324                 }
3325                 goto enqueue;
3326         }
3327
3328         sd->dropped++;
3329         rps_unlock(sd);
3330
3331         local_irq_restore(flags);
3332
3333         atomic_long_inc(&skb->dev->rx_dropped);
3334         kfree_skb(skb);
3335         return NET_RX_DROP;
3336 }
3337
3338 static int netif_rx_internal(struct sk_buff *skb)
3339 {
3340         int ret;
3341
3342         net_timestamp_check(netdev_tstamp_prequeue, skb);
3343
3344         trace_netif_rx(skb);
3345 #ifdef CONFIG_RPS
3346         if (static_key_false(&rps_needed)) {
3347                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3348                 int cpu;
3349
3350                 preempt_disable();
3351                 rcu_read_lock();
3352
3353                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3354                 if (cpu < 0)
3355                         cpu = smp_processor_id();
3356
3357                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3358
3359                 rcu_read_unlock();
3360                 preempt_enable();
3361         } else
3362 #endif
3363         {
3364                 unsigned int qtail;
3365                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3366                 put_cpu();
3367         }
3368         return ret;
3369 }
3370
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  The buffer is always taken
 *	over from the caller; it may be dropped during processing for
 *	congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
3393
/**
 *	netif_rx_ni	-	post buffer to the network code and run softirqs
 *	@skb: buffer to post
 *
 *	Queues @skb like netif_rx() and then, still with preemption
 *	disabled, runs any pending softirqs through do_softirq().
 *	Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	/* Process the softirq raised by the enqueue right away. */
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);
3409
3410 static void net_tx_action(struct softirq_action *h)
3411 {
3412         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3413
3414         if (sd->completion_queue) {
3415                 struct sk_buff *clist;
3416
3417                 local_irq_disable();
3418                 clist = sd->completion_queue;
3419                 sd->completion_queue = NULL;
3420                 local_irq_enable();
3421
3422                 while (clist) {
3423                         struct sk_buff *skb = clist;
3424                         clist = clist->next;
3425
3426                         WARN_ON(atomic_read(&skb->users));
3427                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3428                                 trace_consume_skb(skb);
3429                         else
3430                                 trace_kfree_skb(skb, net_tx_action);
3431                         __kfree_skb(skb);
3432                 }
3433         }
3434
3435         if (sd->output_queue) {
3436                 struct Qdisc *head;
3437
3438                 local_irq_disable();
3439                 head = sd->output_queue;
3440                 sd->output_queue = NULL;
3441                 sd->output_queue_tailp = &sd->output_queue;
3442                 local_irq_enable();
3443
3444                 while (head) {
3445                         struct Qdisc *q = head;
3446                         spinlock_t *root_lock;
3447
3448                         head = head->next_sched;
3449
3450                         root_lock = qdisc_lock(q);
3451                         if (spin_trylock(root_lock)) {
3452                                 smp_mb__before_atomic();
3453                                 clear_bit(__QDISC_STATE_SCHED,
3454                                           &q->state);
3455                                 qdisc_run(q);
3456                                 spin_unlock(root_lock);
3457                         } else {
3458                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3459                                               &q->state)) {
3460                                         __netif_reschedule(q);
3461                                 } else {
3462                                         smp_mb__before_atomic();
3463                                         clear_bit(__QDISC_STATE_SCHED,
3464                                                   &q->state);
3465                                 }
3466                         }
3467                 }
3468         }
3469 }
3470
3471 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3472     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3473 /* This hook is defined here for ATM LANE.
      * It is only compiled when both bridging and ATM LANE are enabled
      * (built in or as modules).  The pointer takes a device and an address
      * and is exported GPL-only; nothing in this block assigns it.
      */
3474 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3475                              unsigned char *addr) __read_mostly;
3476 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3477 #endif
3478
3479 #ifdef CONFIG_NET_CLS_ACT
3480 /* TODO: Maybe we should just force sch_ingress to be compiled in
3481  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3482  * a compare and 2 stores extra right now if we dont have it on
3483  * but have CONFIG_NET_CLS_ACT
3484  * NOTE: This doesn't stop any functionality; if you dont have
3485  * the ingress scheduler, you just can't add policies on ingress.
3486  *
3487  */
3488 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3489 {
3490         struct net_device *dev = skb->dev;
3491         u32 ttl = G_TC_RTTL(skb->tc_verd);
3492         int result = TC_ACT_OK;
3493         struct Qdisc *q;
3494
3495         if (unlikely(MAX_RED_LOOP < ttl++)) {
3496                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3497                                      skb->skb_iif, dev->ifindex);
3498                 return TC_ACT_SHOT;
3499         }
3500
3501         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3502         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3503
3504         q = rcu_dereference(rxq->qdisc);
3505         if (q != &noop_qdisc) {
3506                 spin_lock(qdisc_lock(q));
3507                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3508                         result = qdisc_enqueue_root(skb, q);
3509                 spin_unlock(qdisc_lock(q));
3510         }
3511
3512         return result;
3513 }
3514
3515 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3516                                          struct packet_type **pt_prev,
3517                                          int *ret, struct net_device *orig_dev)
3518 {
3519         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3520
3521         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3522                 goto out;
3523
3524         if (*pt_prev) {
3525                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3526                 *pt_prev = NULL;
3527         }
3528
3529         switch (ing_filter(skb, rxq)) {
3530         case TC_ACT_SHOT:
3531         case TC_ACT_STOLEN:
3532                 kfree_skb(skb);
3533                 return NULL;
3534         }
3535
3536 out:
3537         skb->tc_verd = 0;
3538         return skb;
3539 }
3540 #endif
3541
3542 /**
3543  *      netdev_rx_handler_register - register receive handler
3544  *      @dev: device to register a handler for
3545  *      @rx_handler: receive handler to register
3546  *      @rx_handler_data: data pointer that is used by rx handler
3547  *
3548  *      Register a receive handler for a device. This handler will then be
3549  *      called from __netif_receive_skb. A negative errno code is returned
3550  *      on a failure.
3551  *
3552  *      The caller must hold the rtnl_mutex.
3553  *
3554  *      For a general description of rx_handler, see enum rx_handler_result.
3555  */
3556 int netdev_rx_handler_register(struct net_device *dev,
3557                                rx_handler_func_t *rx_handler,
3558                                void *rx_handler_data)
3559 {
3560         ASSERT_RTNL();
3561
3562         if (dev->rx_handler)
3563                 return -EBUSY;
3564
3565         /* Note: rx_handler_data must be set before rx_handler */
3566         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3567         rcu_assign_pointer(dev->rx_handler, rx_handler);
3568
3569         return 0;
3570 }
3571 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3572
3573 /**
3574  *      netdev_rx_handler_unregister - unregister receive handler
3575  *      @dev: device to unregister a handler from
3576  *
3577  *      Unregister a receive handler from a device.
3578  *
3579  *      The caller must hold the rtnl_mutex.
3580  */
3581 void netdev_rx_handler_unregister(struct net_device *dev)
3582 {
3583
3584         ASSERT_RTNL();
3585         RCU_INIT_POINTER(dev->rx_handler, NULL);
3586         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3587          * section has a guarantee to see a non NULL rx_handler_data
3588          * as well.
3589          */
3590         synchronize_net();
3591         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3592 }
3593 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3594
3595 /*
3596  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3597  * the special handling of PFMEMALLOC skbs.
3598  */
3599 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3600 {
3601         switch (skb->protocol) {
3602         case htons(ETH_P_ARP):
3603         case htons(ETH_P_IP):
3604         case htons(ETH_P_IPV6):
3605         case htons(ETH_P_8021Q):
3606         case htons(ETH_P_8021AD):
3607                 return true;
3608         default:
3609                 return false;
3610         }
3611 }
3612
/*
 * __netif_receive_skb_core - walk one received skb through the receive path
 * @skb: buffer to process
 * @pfmemalloc: when true the ptype_all taps are skipped and the skb is
 *              dropped unless skb_pfmemalloc_protocol() accepts it
 *
 * In order: ptype_all taps, the ingress hook (CONFIG_NET_CLS_ACT), VLAN
 * receive, the device rx_handler, and finally the ptype_base handlers
 * matching skb->protocol.  The whole walk runs under rcu_read_lock().
 *
 * pt_prev remembers the most recent matching handler; it is served through
 * deliver_skb() only once another match (or another stage) comes up, and
 * the very last match is called directly through ->func().
 *
 * Returns the value produced by the last handler that ran, NET_RX_SUCCESS
 * when the rx_handler consumed the skb, or NET_RX_DROP when the skb was
 * dropped or nothing claimed it.
 */
3613 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3614 {
3615         struct packet_type *ptype, *pt_prev;
3616         rx_handler_func_t *rx_handler;
3617         struct net_device *orig_dev;
3618         struct net_device *null_or_dev;
3619         bool deliver_exact = false;
3620         int ret = NET_RX_DROP;
3621         __be16 type;
3622
3623         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3624
3625         trace_netif_receive_skb(skb);
3626
             /* Remember the arrival device; skb->dev may differ on later rounds. */
3627         orig_dev = skb->dev;
3628
3629         skb_reset_network_header(skb);
3630         if (!skb_transport_header_was_set(skb))
3631                 skb_reset_transport_header(skb);
3632         skb_reset_mac_len(skb);
3633
3634         pt_prev = NULL;
3635
3636         rcu_read_lock();
3637
     /* Re-entered when vlan_do_receive() returns true or the rx_handler
      * returns RX_HANDLER_ANOTHER.
      */
3638 another_round:
3639         skb->skb_iif = skb->dev->ifindex;
3640
3641         __this_cpu_inc(softnet_data.processed);
3642
3643         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3644             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3645                 skb = skb_vlan_untag(skb);
3646                 if (unlikely(!skb))
3647                         goto unlock;
3648         }
3649
3650 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and bypass both the taps and the ingress hook. */
3651         if (skb->tc_verd & TC_NCLS) {
3652                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3653                 goto ncls;
3654         }
3655 #endif
3656
3657         if (pfmemalloc)
3658                 goto skip_taps;
3659
             /* Taps registered for all protocols, optionally bound to one device. */
3660         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3661                 if (!ptype->dev || ptype->dev == skb->dev) {
3662                         if (pt_prev)
3663                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3664                         pt_prev = ptype;
3665                 }
3666         }
3667
3668 skip_taps:
3669 #ifdef CONFIG_NET_CLS_ACT
3670         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3671         if (!skb)
3672                 goto unlock;
3673 ncls:
3674 #endif
3675
3676         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3677                 goto drop;
3678
3679         if (vlan_tx_tag_present(skb)) {
3680                 if (pt_prev) {
3681                         ret = deliver_skb(skb, pt_prev, orig_dev);
3682                         pt_prev = NULL;
3683                 }
3684                 if (vlan_do_receive(&skb))
3685                         goto another_round;
3686                 else if (unlikely(!skb))
3687                         goto unlock;
3688         }
3689
3690         rx_handler = rcu_dereference(skb->dev->rx_handler);
3691         if (rx_handler) {
3692                 if (pt_prev) {
3693                         ret = deliver_skb(skb, pt_prev, orig_dev);
3694                         pt_prev = NULL;
3695                 }
3696                 switch (rx_handler(&skb)) {
3697                 case RX_HANDLER_CONSUMED:
3698                         ret = NET_RX_SUCCESS;
3699                         goto unlock;
3700                 case RX_HANDLER_ANOTHER:
3701                         goto another_round;
3702                 case RX_HANDLER_EXACT:
3703                         deliver_exact = true;
                             /* fall through */
3704                 case RX_HANDLER_PASS:
3705                         break;
3706                 default:
3707                         BUG();
3708                 }
3709         }
3710
             /* A VLAN tag still present here was not taken by vlan_do_receive(). */
3711         if (unlikely(vlan_tx_tag_present(skb))) {
3712                 if (vlan_tx_tag_get_id(skb))
3713                         skb->pkt_type = PACKET_OTHERHOST;
3714                 /* Note: we might in the future use prio bits
3715                  * and set skb->priority like in vlan_do_receive()
3716                  * For the time being, just ignore Priority Code Point
3717                  */
3718                 skb->vlan_tci = 0;
3719         }
3720
3721         /* deliver only exact match when indicated */
3722         null_or_dev = deliver_exact ? skb->dev : NULL;
3723
3724         type = skb->protocol;
3725         list_for_each_entry_rcu(ptype,
3726                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3727                 if (ptype->type == type &&
3728                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3729                      ptype->dev == orig_dev)) {
3730                         if (pt_prev)
3731                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3732                         pt_prev = ptype;
3733                 }
3734         }
3735
             /* The last match is called directly instead of via deliver_skb(). */
3736         if (pt_prev) {
3737                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3738                         goto drop;
3739                 else
3740                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3741         } else {
3742 drop:
3743                 atomic_long_inc(&skb->dev->rx_dropped);
3744                 kfree_skb(skb);
3745                 /* Jamal, now you will not able to escape explaining
3746                  * me how you were going to use this. :-)
3747                  */
3748                 ret = NET_RX_DROP;
3749         }
3750
3751 unlock:
3752         rcu_read_unlock();
3753         return ret;
3754 }
3755
3756 static int __netif_receive_skb(struct sk_buff *skb)
3757 {
3758         int ret;
3759
3760         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3761                 unsigned long pflags = current->flags;
3762
3763                 /*
3764                  * PFMEMALLOC skbs are special, they should
3765                  * - be delivered to SOCK_MEMALLOC sockets only
3766                  * - stay away from userspace
3767                  * - have bounded memory usage
3768                  *
3769                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3770                  * context down to all allocation sites.
3771                  */
3772                 current->flags |= PF_MEMALLOC;
3773                 ret = __netif_receive_skb_core(skb, true);
3774                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3775         } else
3776                 ret = __netif_receive_skb_core(skb, false);
3777
3778         return ret;
3779 }
3780
3781 static int netif_receive_skb_internal(struct sk_buff *skb)
3782 {
3783         net_timestamp_check(netdev_tstamp_prequeue, skb);
3784
3785         if (skb_defer_rx_timestamp(skb))
3786                 return NET_RX_SUCCESS;
3787
3788 #ifdef CONFIG_RPS
3789         if (static_key_false(&rps_needed)) {
3790                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3791                 int cpu, ret;
3792
3793                 rcu_read_lock();
3794
3795                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3796
3797                 if (cpu >= 0) {
3798                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3799                         rcu_read_unlock();
3800                         return ret;
3801                 }
3802                 rcu_read_unlock();
3803         }
3804 #endif
3805         return __netif_receive_skb(skb);
3806 }
3807
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main receive data processing function: fires the entry tracepoint
 *	and hands @skb to netif_receive_skb_internal().  It always
 *	succeeds; the buffer may be dropped during processing for
 *	congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and
 *	interrupts should be enabled.
 *
 *	Return (usually ignored): NET_RX_SUCCESS when there is no
 *	congestion, NET_RX_DROP when the packet was dropped.
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
3830
3831 /* Network device is going away, flush any packets still pending
3832  * Called with irqs disabled.
3833  */
3834 static void flush_backlog(void *arg)
3835 {
3836         struct net_device *dev = arg;
3837         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3838         struct sk_buff *skb, *tmp;
3839
3840         rps_lock(sd);
3841         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3842                 if (skb->dev == dev) {
3843                         __skb_unlink(skb, &sd->input_pkt_queue);
3844                         kfree_skb(skb);
3845                         input_queue_head_incr(sd);
3846                 }
3847         }
3848         rps_unlock(sd);
3849
3850         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3851                 if (skb->dev == dev) {
3852                         __skb_unlink(skb, &sd->process_queue);
3853                         kfree_skb(skb);
3854                         input_queue_head_incr(sd);
3855                 }
3856         }
3857 }
3858
3859 static int napi_gro_complete(struct sk_buff *skb)
3860 {
3861         struct packet_offload *ptype;
3862         __be16 type = skb->protocol;
3863         struct list_head *head = &offload_base;
3864         int err = -ENOENT;
3865
3866         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3867
3868         if (NAPI_GRO_CB(skb)->count == 1) {
3869                 skb_shinfo(skb)->gso_size = 0;
3870                 goto out;
3871         }
3872
3873         rcu_read_lock();
3874         list_for_each_entry_rcu(ptype, head, list) {
3875                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3876                         continue;
3877
3878                 err = ptype->callbacks.gro_complete(skb, 0);
3879                 break;
3880         }
3881         rcu_read_unlock();
3882
3883         if (err) {
3884                 WARN_ON(&ptype->list == head);
3885                 kfree_skb(skb);
3886                 return NET_RX_SUCCESS;
3887         }
3888
3889 out:
3890         return netif_receive_skb_internal(skb);
3891 }
3892
3893 /* napi->gro_list contains packets ordered by age.
3894  * youngest packets at the head of it.
3895  * Complete skbs in reverse order to reduce latencies.
3896  */
3897 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3898 {
3899         struct sk_buff *skb, *prev = NULL;
3900
3901         /* scan list and build reverse chain */
3902         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3903                 skb->prev = prev;
3904                 prev = skb;
3905         }
3906
3907         for (skb = prev; skb; skb = prev) {
3908                 skb->next = NULL;
3909
3910                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3911                         return;
3912
3913                 prev = skb->prev;
3914                 napi_gro_complete(skb);
3915                 napi->gro_count--;
3916         }
3917
3918         napi->gro_list = NULL;
3919 }
3920 EXPORT_SYMBOL(napi_gro_flush);
3921
3922 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3923 {
3924         struct sk_buff *p;
3925         unsigned int maclen = skb->dev->hard_header_len;
3926         u32 hash = skb_get_hash_raw(skb);
3927
3928         for (p = napi->gro_list; p; p = p->next) {
3929                 unsigned long diffs;
3930
3931                 NAPI_GRO_CB(p)->flush = 0;
3932
3933                 if (hash != skb_get_hash_raw(p)) {
3934                         NAPI_GRO_CB(p)->same_flow = 0;
3935                         continue;
3936                 }
3937
3938                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3939                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3940                 if (maclen == ETH_HLEN)
3941                         diffs |= compare_ether_header(skb_mac_header(p),
3942                                                       skb_mac_header(skb));
3943                 else if (!diffs)
3944                         diffs = memcmp(skb_mac_header(p),
3945                                        skb_mac_header(skb),
3946                                        maclen);
3947                 NAPI_GRO_CB(p)->same_flow = !diffs;
3948         }
3949 }
3950
3951 static void skb_gro_reset_offset(struct sk_buff *skb)
3952 {
3953         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3954         const skb_frag_t *frag0 = &pinfo->frags[0];
3955
3956         NAPI_GRO_CB(skb)->data_offset = 0;
3957         NAPI_GRO_CB(skb)->frag0 = NULL;
3958         NAPI_GRO_CB(skb)->frag0_len = 0;
3959
3960         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3961             pinfo->nr_frags &&
3962             !PageHighMem(skb_frag_page(frag0))) {
3963                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3964                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3965         }
3966 }
3967
3968 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3969 {
3970         struct skb_shared_info *pinfo = skb_shinfo(skb);
3971
3972         BUG_ON(skb->end - skb->tail < grow);
3973
3974         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3975
3976         skb->data_len -= grow;
3977         skb->tail += grow;
3978
3979         pinfo->frags[0].page_offset += grow;
3980         skb_frag_size_sub(&pinfo->frags[0], grow);
3981
3982         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3983                 skb_frag_unref(skb, 0);
3984                 memmove(pinfo->frags, pinfo->frags + 1,
3985                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3986         }
3987 }
3988
/*
 * dev_gro_receive - try to aggregate @skb with a flow held on napi->gro_list
 * @napi: NAPI context owning the GRO list
 * @skb: newly received skb
 *
 * Returns
 *   GRO_NORMAL      - GRO not attempted or declined; caller passes skb up
 *   GRO_MERGED      - skb was merged into a held flow
 *   GRO_MERGED_FREE - as above, and NAPI_GRO_CB(skb)->free is set
 *   GRO_HELD        - skb was put on napi->gro_list as the head of a flow
 */
3989 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3990 {
3991         struct sk_buff **pp = NULL;
3992         struct packet_offload *ptype;
3993         __be16 type = skb->protocol;
3994         struct list_head *head = &offload_base;
3995         int same_flow;
3996         enum gro_result ret;
3997         int grow;
3998
3999         if (!(skb->dev->features & NETIF_F_GRO))
4000                 goto normal;
4001
4002         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4003                 goto normal;
4004
4005         gro_list_prepare(napi, skb);
4006
             /* Only the first offload handler matching the protocol is tried. */
4007         rcu_read_lock();
4008         list_for_each_entry_rcu(ptype, head, list) {
4009                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4010                         continue;
4011
4012                 skb_set_network_header(skb, skb_gro_offset(skb));
4013                 skb_reset_mac_len(skb);
4014                 NAPI_GRO_CB(skb)->same_flow = 0;
4015                 NAPI_GRO_CB(skb)->flush = 0;
4016                 NAPI_GRO_CB(skb)->free = 0;
4017                 NAPI_GRO_CB(skb)->udp_mark = 0;
4018
4019                 /* Setup for GRO checksum validation */
4020                 switch (skb->ip_summed) {
4021                 case CHECKSUM_COMPLETE:
4022                         NAPI_GRO_CB(skb)->csum = skb->csum;
4023                         NAPI_GRO_CB(skb)->csum_valid = 1;
4024                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4025                         break;
4026                 case CHECKSUM_UNNECESSARY:
4027                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4028                         NAPI_GRO_CB(skb)->csum_valid = 0;
4029                         break;
4030                 default:
4031                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4032                         NAPI_GRO_CB(skb)->csum_valid = 0;
4033                 }
4034
4035                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4036                 break;
4037         }
4038         rcu_read_unlock();
4039
             /* The walk ran off the end of the list: no handler for this protocol. */
4040         if (&ptype->list == head)
4041                 goto normal;
4042
4043         same_flow = NAPI_GRO_CB(skb)->same_flow;
4044         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4045
             /* A non-NULL pp points at the list link of a held skb to complete now. */
4046         if (pp) {
4047                 struct sk_buff *nskb = *pp;
4048
4049                 *pp = nskb->next;
4050                 nskb->next = NULL;
4051                 napi_gro_complete(nskb);
4052                 napi->gro_count--;
4053         }
4054
4055         if (same_flow)
4056                 goto ok;
4057
4058         if (NAPI_GRO_CB(skb)->flush)
4059                 goto normal;
4060
             /* List is full: complete the skb at the tail instead of growing
              * the list, so gro_count stays unchanged in that branch.
              */
4061         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4062                 struct sk_buff *nskb = napi->gro_list;
4063
4064                 /* locate the end of the list to select the 'oldest' flow */
4065                 while (nskb->next) {
4066                         pp = &nskb->next;
4067                         nskb = *pp;
4068                 }
4069                 *pp = NULL;
4070                 nskb->next = NULL;
4071                 napi_gro_complete(nskb);
4072         } else {
4073                 napi->gro_count++;
4074         }
             /* Start a new flow with skb at the head of the list. */
4075         NAPI_GRO_CB(skb)->count = 1;
4076         NAPI_GRO_CB(skb)->age = jiffies;
4077         NAPI_GRO_CB(skb)->last = skb;
4078         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4079         skb->next = napi->gro_list;
4080         napi->gro_list = skb;
4081         ret = GRO_HELD;
4082
     /* If the GRO offset lies beyond the linear area, copy the missing
      * bytes from frag0 into the head.
      */
4083 pull:
4084         grow = skb_gro_offset(skb) - skb_headlen(skb);
4085         if (grow > 0)
4086                 gro_pull_from_frag0(skb, grow);
4087 ok:
4088         return ret;
4089
4090 normal:
4091         ret = GRO_NORMAL;
4092         goto pull;
4093 }
4094
4095 struct packet_offload *gro_find_receive_by_type(__be16 type)
4096 {
4097         struct list_head *offload_head = &offload_base;
4098         struct packet_offload *ptype;
4099
4100         list_for_each_entry_rcu(ptype, offload_head, list) {
4101                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4102                         continue;
4103                 return ptype;
4104         }
4105         return NULL;
4106 }
4107 EXPORT_SYMBOL(gro_find_receive_by_type);
4108
4109 struct packet_offload *gro_find_complete_by_type(__be16 type)
4110 {
4111         struct list_head *offload_head = &offload_base;
4112         struct packet_offload *ptype;
4113
4114         list_for_each_entry_rcu(ptype, offload_head, list) {
4115                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4116                         continue;
4117                 return ptype;
4118         }
4119         return NULL;
4120 }
4121 EXPORT_SYMBOL(gro_find_complete_by_type);
4122
4123 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4124 {
4125         switch (ret) {
4126         case GRO_NORMAL:
4127                 if (netif_receive_skb_internal(skb))
4128                         ret = GRO_DROP;
4129                 break;
4130
4131         case GRO_DROP:
4132                 kfree_skb(skb);
4133                 break;
4134
4135         case GRO_MERGED_FREE:
4136                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4137                         kmem_cache_free(skbuff_head_cache, skb);
4138                 else
4139                         __kfree_skb(skb);
4140                 break;
4141
4142         case GRO_HELD:
4143         case GRO_MERGED:
4144                 break;
4145         }
4146
4147         return ret;
4148 }
4149
4150 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4151 {
4152         trace_napi_gro_receive_entry(skb);
4153
4154         skb_gro_reset_offset(skb);
4155
4156         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4157 }
4158 EXPORT_SYMBOL(napi_gro_receive);
4159
4160 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4161 {
4162         if (unlikely(skb->pfmemalloc)) {
4163                 consume_skb(skb);
4164                 return;
4165         }
4166         __skb_pull(skb, skb_headlen(skb));
4167         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4168         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4169         skb->vlan_tci = 0;
4170         skb->dev = napi->dev;
4171         skb->skb_iif = 0;
4172         skb->encapsulation = 0;
4173         skb_shinfo(skb)->gso_type = 0;
4174         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4175
4176         napi->skb = skb;
4177 }
4178
4179 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4180 {
4181         struct sk_buff *skb = napi->skb;
4182
4183         if (!skb) {
4184                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4185                 napi->skb = skb;
4186         }
4187         return skb;
4188 }
4189 EXPORT_SYMBOL(napi_get_frags);
4190
4191 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4192                                       struct sk_buff *skb,
4193                                       gro_result_t ret)
4194 {
4195         switch (ret) {
4196         case GRO_NORMAL:
4197         case GRO_HELD:
4198                 __skb_push(skb, ETH_HLEN);
4199                 skb->protocol = eth_type_trans(skb, skb->dev);
4200                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4201                         ret = GRO_DROP;
4202                 break;
4203
4204         case GRO_DROP:
4205         case GRO_MERGED_FREE:
4206                 napi_reuse_skb(napi, skb);
4207                 break;
4208
4209         case GRO_MERGED:
4210                 break;
4211         }
4212
4213         return ret;
4214 }
4215
4216 /* Upper GRO stack assumes network header starts at gro_offset=0
4217  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4218  * We copy ethernet header into skb->data to have a common layout.
4219  */
4220 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4221 {
4222         struct sk_buff *skb = napi->skb;
4223         const struct ethhdr *eth;
4224         unsigned int hlen = sizeof(*eth);
4225
4226         napi->skb = NULL;
4227
4228         skb_reset_mac_header(skb);
4229         skb_gro_reset_offset(skb);
4230
4231         eth = skb_gro_header_fast(skb, 0);
4232         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4233                 eth = skb_gro_header_slow(skb, hlen, 0);
4234                 if (unlikely(!eth)) {
4235                         napi_reuse_skb(napi, skb);
4236                         return NULL;
4237                 }
4238         } else {
4239                 gro_pull_from_frag0(skb, hlen);
4240                 NAPI_GRO_CB(skb)->frag0 += hlen;
4241                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4242         }
4243         __skb_pull(skb, hlen);
4244
4245         /*
4246          * This works because the only protocols we care about don't require
4247          * special handling.
4248          * We'll fix it up properly in napi_frags_finish()
4249          */
4250         skb->protocol = eth->h_proto;
4251
4252         return skb;
4253 }
4254
4255 gro_result_t napi_gro_frags(struct napi_struct *napi)
4256 {
4257         struct sk_buff *skb = napi_frags_skb(napi);
4258
4259         if (!skb)
4260                 return GRO_DROP;
4261
4262         trace_napi_gro_frags_entry(skb);
4263
4264         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4265 }
4266 EXPORT_SYMBOL(napi_gro_frags);
4267
4268 /* Compute the checksum from gro_offset and return the folded value
4269  * after adding in any pseudo checksum.
4270  */
4271 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4272 {
4273         __wsum wsum;
4274         __sum16 sum;
4275
4276         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4277
4278         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4279         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4280         if (likely(!sum)) {
4281                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4282                     !skb->csum_complete_sw)
4283                         netdev_rx_csum_fault(skb->dev);
4284         }
4285
4286         NAPI_GRO_CB(skb)->csum = wsum;
4287         NAPI_GRO_CB(skb)->csum_valid = 1;
4288
4289         return sum;
4290 }
4291 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4292
/*
 * net_rps_action_and_irq_enable - send the IPIs pending for RPS, if any
 * @sd: softnet_data of the current CPU
 *
 * Note: called with local irq disabled, but exits with local irq enabled.
 * The pending list is detached while interrupts are still off; the IPIs
 * themselves are sent after interrupts have been re-enabled, and only to
 * CPUs that are online.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	do {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu,
						       &remsd->csd);
		remsd = next;
	} while (remsd);
#else
	local_irq_enable();
#endif
}
4320
4321 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4322 {
4323 #ifdef CONFIG_RPS
4324         return sd->rps_ipi_list != NULL;
4325 #else
4326         return false;
4327 #endif
4328 }
4329
4330 static int process_backlog(struct napi_struct *napi, int quota)
4331 {
4332         int work = 0;
4333         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4334
4335         /* Check if we have pending ipi, its better to send them now,
4336          * not waiting net_rx_action() end.
4337          */
4338         if (sd_has_rps_ipi_waiting(sd)) {
4339                 local_irq_disable();
4340                 net_rps_action_and_irq_enable(sd);
4341         }
4342
4343         napi->weight = weight_p;
4344         local_irq_disable();
4345         while (1) {
4346                 struct sk_buff *skb;
4347
4348                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4349                         local_irq_enable();
4350                         __netif_receive_skb(skb);
4351                         local_irq_disable();
4352                         input_queue_head_incr(sd);
4353                         if (++work >= quota) {
4354                                 local_irq_enable();
4355                                 return work;
4356                         }
4357                 }
4358
4359                 rps_lock(sd);
4360                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4361                         /*
4362                          * Inline a custom version of __napi_complete().
4363                          * only current cpu owns and manipulates this napi,
4364                          * and NAPI_STATE_SCHED is the only possible flag set
4365                          * on backlog.
4366                          * We can use a plain write instead of clear_bit(),
4367                          * and we dont need an smp_mb() memory barrier.
4368                          */
4369                         napi->state = 0;
4370                         rps_unlock(sd);
4371
4372                         break;
4373                 }
4374
4375                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4376                                            &sd->process_queue);
4377                 rps_unlock(sd);
4378         }
4379         local_irq_enable();
4380
4381         return work;
4382 }
4383
4384 /**
4385  * __napi_schedule - schedule for receive
4386  * @n: entry to schedule
4387  *
4388  * The entry's receive function will be scheduled to run.
4389  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4390  */
4391 void __napi_schedule(struct napi_struct *n)
4392 {
4393         unsigned long flags;
4394
4395         local_irq_save(flags);
4396         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4397         local_irq_restore(flags);
4398 }
4399 EXPORT_SYMBOL(__napi_schedule);
4400
4401 /**
4402  * __napi_schedule_irqoff - schedule for receive
4403  * @n: entry to schedule
4404  *
4405  * Variant of __napi_schedule() assuming hard irqs are masked
4406  */
4407 void __napi_schedule_irqoff(struct napi_struct *n)
4408 {
4409         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4410 }
4411 EXPORT_SYMBOL(__napi_schedule_irqoff);
4412
4413 void __napi_complete(struct napi_struct *n)
4414 {
4415         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4416
4417         list_del_init(&n->poll_list);
4418         smp_mb__before_atomic();
4419         clear_bit(NAPI_STATE_SCHED, &n->state);
4420 }
4421 EXPORT_SYMBOL(__napi_complete);
4422
4423 void napi_complete_done(struct napi_struct *n, int work_done)
4424 {
4425         unsigned long flags;
4426
4427         /*
4428          * don't let napi dequeue from the cpu poll list
4429          * just in case its running on a different cpu
4430          */
4431         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4432                 return;
4433
4434         if (n->gro_list) {
4435                 unsigned long timeout = 0;
4436
4437                 if (work_done)
4438                         timeout = n->dev->gro_flush_timeout;
4439
4440                 if (timeout)
4441                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4442                                       HRTIMER_MODE_REL_PINNED);
4443                 else
4444                         napi_gro_flush(n, false);
4445         }
4446         if (likely(list_empty(&n->poll_list))) {
4447                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4448         } else {
4449                 /* If n->poll_list is not empty, we need to mask irqs */
4450                 local_irq_save(flags);
4451                 __napi_complete(n);
4452                 local_irq_restore(flags);
4453         }
4454 }
4455 EXPORT_SYMBOL(napi_complete_done);
4456
4457 /* must be called under rcu_read_lock(), as we dont take a reference */
4458 struct napi_struct *napi_by_id(unsigned int napi_id)
4459 {
4460         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4461         struct napi_struct *napi;
4462
4463         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4464                 if (napi->napi_id == napi_id)
4465                         return napi;
4466
4467         return NULL;
4468 }
4469 EXPORT_SYMBOL_GPL(napi_by_id);
4470
4471 void napi_hash_add(struct napi_struct *napi)
4472 {
4473         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4474
4475                 spin_lock(&napi_hash_lock);
4476
4477                 /* 0 is not a valid id, we also skip an id that is taken
4478                  * we expect both events to be extremely rare
4479                  */
4480                 napi->napi_id = 0;
4481                 while (!napi->napi_id) {
4482                         napi->napi_id = ++napi_gen_id;
4483                         if (napi_by_id(napi->napi_id))
4484                                 napi->napi_id = 0;
4485                 }
4486
4487                 hlist_add_head_rcu(&napi->napi_hash_node,
4488                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4489
4490                 spin_unlock(&napi_hash_lock);
4491         }
4492 }
4493 EXPORT_SYMBOL_GPL(napi_hash_add);
4494
4495 /* Warning : caller is responsible to make sure rcu grace period
4496  * is respected before freeing memory containing @napi
4497  */
4498 void napi_hash_del(struct napi_struct *napi)
4499 {
4500         spin_lock(&napi_hash_lock);
4501
4502         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4503                 hlist_del_rcu(&napi->napi_hash_node);
4504
4505         spin_unlock(&napi_hash_lock);
4506 }
4507 EXPORT_SYMBOL_GPL(napi_hash_del);
4508
4509 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4510 {
4511         struct napi_struct *napi;
4512
4513         napi = container_of(timer, struct napi_struct, timer);
4514         if (napi->gro_list)
4515                 napi_schedule(napi);
4516
4517         return HRTIMER_NORESTART;
4518 }
4519
4520 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4521                     int (*poll)(struct napi_struct *, int), int weight)
4522 {
4523         INIT_LIST_HEAD(&napi->poll_list);
4524         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4525         napi->timer.function = napi_watchdog;
4526         napi->gro_count = 0;
4527         napi->gro_list = NULL;
4528         napi->skb = NULL;
4529         napi->poll = poll;
4530         if (weight > NAPI_POLL_WEIGHT)
4531                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4532                             weight, dev->name);
4533         napi->weight = weight;
4534         list_add(&napi->dev_list, &dev->napi_list);
4535         napi->dev = dev;
4536 #ifdef CONFIG_NETPOLL
4537         spin_lock_init(&napi->poll_lock);
4538         napi->poll_owner = -1;
4539 #endif
4540         set_bit(NAPI_STATE_SCHED, &napi->state);
4541 }
4542 EXPORT_SYMBOL(netif_napi_add);
4543
4544 void napi_disable(struct napi_struct *n)
4545 {
4546         might_sleep();
4547         set_bit(NAPI_STATE_DISABLE, &n->state);
4548
4549         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4550                 msleep(1);
4551
4552         hrtimer_cancel(&n->timer);
4553
4554         clear_bit(NAPI_STATE_DISABLE, &n->state);
4555 }
4556 EXPORT_SYMBOL(napi_disable);
4557
4558 void netif_napi_del(struct napi_struct *napi)
4559 {
4560         list_del_init(&napi->dev_list);
4561         napi_free_frags(napi);
4562
4563         kfree_skb_list(napi->gro_list);
4564         napi->gro_list = NULL;
4565         napi->gro_count = 0;
4566 }
4567 EXPORT_SYMBOL(netif_napi_del);
4568
4569 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4570 {
4571         void *have;
4572         int work, weight;
4573
4574         list_del_init(&n->poll_list);
4575
4576         have = netpoll_poll_lock(n);
4577
4578         weight = n->weight;
4579
4580         /* This NAPI_STATE_SCHED test is for avoiding a race
4581          * with netpoll's poll_napi().  Only the entity which
4582          * obtains the lock and sees NAPI_STATE_SCHED set will
4583          * actually make the ->poll() call.  Therefore we avoid
4584          * accidentally calling ->poll() when NAPI is not scheduled.
4585          */
4586         work = 0;
4587         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4588                 work = n->poll(n, weight);
4589                 trace_napi_poll(n);
4590         }
4591
4592         WARN_ON_ONCE(work > weight);
4593
4594         if (likely(work < weight))
4595                 goto out_unlock;
4596
4597         /* Drivers must not modify the NAPI state if they
4598          * consume the entire weight.  In such cases this code
4599          * still "owns" the NAPI instance and therefore can
4600          * move the instance around on the list at-will.
4601          */
4602         if (unlikely(napi_disable_pending(n))) {
4603                 napi_complete(n);
4604                 goto out_unlock;
4605         }
4606
4607         if (n->gro_list) {
4608                 /* flush too old packets
4609                  * If HZ < 1000, flush all packets.
4610                  */
4611                 napi_gro_flush(n, HZ >= 1000);
4612         }
4613
4614         /* Some drivers may have called napi_schedule
4615          * prior to exhausting their budget.
4616          */
4617         if (unlikely(!list_empty(&n->poll_list))) {
4618                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4619                              n->dev ? n->dev->name : "backlog");
4620                 goto out_unlock;
4621         }
4622
4623         list_add_tail(&n->poll_list, repoll);
4624
4625 out_unlock:
4626         netpoll_poll_unlock(have);
4627
4628         return work;
4629 }
4630
4631 static void net_rx_action(struct softirq_action *h)
4632 {
4633         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4634         unsigned long time_limit = jiffies + 2;
4635         int budget = netdev_budget;
4636         LIST_HEAD(list);
4637         LIST_HEAD(repoll);
4638
4639         local_irq_disable();
4640         list_splice_init(&sd->poll_list, &list);
4641         local_irq_enable();
4642
4643         for (;;) {
4644                 struct napi_struct *n;
4645
4646                 if (list_empty(&list)) {
4647                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4648                                 return;
4649                         break;
4650                 }
4651
4652                 n = list_first_entry(&list, struct napi_struct, poll_list);
4653                 budget -= napi_poll(n, &repoll);
4654
4655                 /* If softirq window is exhausted then punt.
4656                  * Allow this to run for 2 jiffies since which will allow
4657                  * an average latency of 1.5/HZ.
4658                  */
4659                 if (unlikely(budget <= 0 ||
4660                              time_after_eq(jiffies, time_limit))) {
4661                         sd->time_squeeze++;
4662                         break;
4663                 }
4664         }
4665
4666         local_irq_disable();
4667
4668         list_splice_tail_init(&sd->poll_list, &list);
4669         list_splice_tail(&repoll, &list);
4670         list_splice(&list, &sd->poll_list);
4671         if (!list_empty(&sd->poll_list))
4672                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4673
4674         net_rps_action_and_irq_enable(sd);
4675 }
4676
4677 struct netdev_adjacent {
4678         struct net_device *dev;
4679
4680         /* upper master flag, there can only be one master device per list */
4681         bool master;
4682
4683         /* counter for the number of times this device was added to us */
4684         u16 ref_nr;
4685
4686         /* private field for the users */
4687         void *private;
4688
4689         struct list_head list;
4690         struct rcu_head rcu;
4691 };
4692
4693 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4694                                                  struct net_device *adj_dev,
4695                                                  struct list_head *adj_list)
4696 {
4697         struct netdev_adjacent *adj;
4698
4699         list_for_each_entry(adj, adj_list, list) {
4700                 if (adj->dev == adj_dev)
4701                         return adj;
4702         }
4703         return NULL;
4704 }
4705
4706 /**
4707  * netdev_has_upper_dev - Check if device is linked to an upper device
4708  * @dev: device
4709  * @upper_dev: upper device to check
4710  *
4711  * Find out if a device is linked to specified upper device and return true
4712  * in case it is. Note that this checks only immediate upper device,
4713  * not through a complete stack of devices. The caller must hold the RTNL lock.
4714  */
4715 bool netdev_has_upper_dev(struct net_device *dev,
4716                           struct net_device *upper_dev)
4717 {
4718         ASSERT_RTNL();
4719
4720         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4721 }
4722 EXPORT_SYMBOL(netdev_has_upper_dev);
4723
4724 /**
4725  * netdev_has_any_upper_dev - Check if device is linked to some device
4726  * @dev: device
4727  *
4728  * Find out if a device is linked to an upper device and return true in case
4729  * it is. The caller must hold the RTNL lock.
4730  */
4731 static bool netdev_has_any_upper_dev(struct net_device *dev)
4732 {
4733         ASSERT_RTNL();
4734
4735         return !list_empty(&dev->all_adj_list.upper);
4736 }
4737
4738 /**
4739  * netdev_master_upper_dev_get - Get master upper device
4740  * @dev: device
4741  *
4742  * Find a master upper device and return pointer to it or NULL in case
4743  * it's not there. The caller must hold the RTNL lock.
4744  */
4745 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4746 {
4747         struct netdev_adjacent *upper;
4748
4749         ASSERT_RTNL();
4750
4751         if (list_empty(&dev->adj_list.upper))
4752                 return NULL;
4753
4754         upper = list_first_entry(&dev->adj_list.upper,
4755                                  struct netdev_adjacent, list);
4756         if (likely(upper->master))
4757                 return upper->dev;
4758         return NULL;
4759 }
4760 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4761
4762 void *netdev_adjacent_get_private(struct list_head *adj_list)
4763 {
4764         struct netdev_adjacent *adj;
4765
4766         adj = list_entry(adj_list, struct netdev_adjacent, list);
4767
4768         return adj->private;
4769 }
4770 EXPORT_SYMBOL(netdev_adjacent_get_private);
4771
4772 /**
4773  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4774  * @dev: device
4775  * @iter: list_head ** of the current position
4776  *
4777  * Gets the next device from the dev's upper list, starting from iter
4778  * position. The caller must hold RCU read lock.
4779  */
4780 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4781                                                  struct list_head **iter)
4782 {
4783         struct netdev_adjacent *upper;
4784
4785         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4786
4787         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4788
4789         if (&upper->list == &dev->adj_list.upper)
4790                 return NULL;
4791
4792         *iter = &upper->list;
4793
4794         return upper->dev;
4795 }
4796 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4797
4798 /**
4799  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4800  * @dev: device
4801  * @iter: list_head ** of the current position
4802  *
4803  * Gets the next device from the dev's upper list, starting from iter
4804  * position. The caller must hold RCU read lock.
4805  */
4806 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4807                                                      struct list_head **iter)
4808 {
4809         struct netdev_adjacent *upper;
4810
4811         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4812
4813         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4814
4815         if (&upper->list == &dev->all_adj_list.upper)
4816                 return NULL;
4817
4818         *iter = &upper->list;
4819
4820         return upper->dev;
4821 }
4822 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4823
4824 /**
4825  * netdev_lower_get_next_private - Get the next ->private from the
4826  *                                 lower neighbour list
4827  * @dev: device
4828  * @iter: list_head ** of the current position
4829  *
4830  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4831  * list, starting from iter position. The caller must hold either hold the
4832  * RTNL lock or its own locking that guarantees that the neighbour lower
4833  * list will remain unchainged.
4834  */
4835 void *netdev_lower_get_next_private(struct net_device *dev,
4836                                     struct list_head **iter)
4837 {
4838         struct netdev_adjacent *lower;
4839
4840         lower = list_entry(*iter, struct netdev_adjacent, list);
4841
4842         if (&lower->list == &dev->adj_list.lower)
4843                 return NULL;
4844
4845         *iter = lower->list.next;
4846
4847         return lower->private;
4848 }
4849 EXPORT_SYMBOL(netdev_lower_get_next_private);
4850
4851 /**
4852  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4853  *                                     lower neighbour list, RCU
4854  *                                     variant
4855  * @dev: device
4856  * @iter: list_head ** of the current position
4857  *
4858  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4859  * list, starting from iter position. The caller must hold RCU read lock.
4860  */
4861 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4862                                         struct list_head **iter)
4863 {
4864         struct netdev_adjacent *lower;
4865
4866         WARN_ON_ONCE(!rcu_read_lock_held());
4867
4868         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4869
4870         if (&lower->list == &dev->adj_list.lower)
4871                 return NULL;
4872
4873         *iter = &lower->list;
4874
4875         return lower->private;
4876 }
4877 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4878
4879 /**
4880  * netdev_lower_get_next - Get the next device from the lower neighbour
4881  *                         list
4882  * @dev: device
4883  * @iter: list_head ** of the current position
4884  *
4885  * Gets the next netdev_adjacent from the dev's lower neighbour
4886  * list, starting from iter position. The caller must hold RTNL lock or
4887  * its own locking that guarantees that the neighbour lower
4888  * list will remain unchainged.
4889  */
4890 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4891 {
4892         struct netdev_adjacent *lower;
4893
4894         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4895
4896         if (&lower->list == &dev->adj_list.lower)
4897                 return NULL;
4898
4899         *iter = &lower->list;
4900
4901         return lower->dev;
4902 }
4903 EXPORT_SYMBOL(netdev_lower_get_next);
4904
4905 /**
4906  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4907  *                                     lower neighbour list, RCU
4908  *                                     variant
4909  * @dev: device
4910  *
4911  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4912  * list. The caller must hold RCU read lock.
4913  */
4914 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4915 {
4916         struct netdev_adjacent *lower;
4917
4918         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4919                         struct netdev_adjacent, list);
4920         if (lower)
4921                 return lower->private;
4922         return NULL;
4923 }
4924 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4925
4926 /**
4927  * netdev_master_upper_dev_get_rcu - Get master upper device
4928  * @dev: device
4929  *
4930  * Find a master upper device and return pointer to it or NULL in case
4931  * it's not there. The caller must hold the RCU read lock.
4932  */
4933 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4934 {
4935         struct netdev_adjacent *upper;
4936
4937         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4938                                        struct netdev_adjacent, list);
4939         if (upper && likely(upper->master))
4940                 return upper->dev;
4941         return NULL;
4942 }
4943 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4944
4945 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4946                               struct net_device *adj_dev,
4947                               struct list_head *dev_list)
4948 {
4949         char linkname[IFNAMSIZ+7];
4950         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4951                 "upper_%s" : "lower_%s", adj_dev->name);
4952         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4953                                  linkname);
4954 }
4955 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4956                                char *name,
4957                                struct list_head *dev_list)
4958 {
4959         char linkname[IFNAMSIZ+7];
4960         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4961                 "upper_%s" : "lower_%s", name);
4962         sysfs_remove_link(&(dev->dev.kobj), linkname);
4963 }
4964
4965 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4966                                                  struct net_device *adj_dev,
4967                                                  struct list_head *dev_list)
4968 {
4969         return (dev_list == &dev->adj_list.upper ||
4970                 dev_list == &dev->adj_list.lower) &&
4971                 net_eq(dev_net(dev), dev_net(adj_dev));
4972 }
4973
4974 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4975                                         struct net_device *adj_dev,
4976                                         struct list_head *dev_list,
4977                                         void *private, bool master)
4978 {
4979         struct netdev_adjacent *adj;
4980         int ret;
4981
4982         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4983
4984         if (adj) {
4985                 adj->ref_nr++;
4986                 return 0;
4987         }
4988
4989         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4990         if (!adj)
4991                 return -ENOMEM;
4992
4993         adj->dev = adj_dev;
4994         adj->master = master;
4995         adj->ref_nr = 1;
4996         adj->private = private;
4997         dev_hold(adj_dev);
4998
4999         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5000                  adj_dev->name, dev->name, adj_dev->name);
5001
5002         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5003                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5004                 if (ret)
5005                         goto free_adj;
5006         }
5007
5008         /* Ensure that master link is always the first item in list. */
5009         if (master) {
5010                 ret = sysfs_create_link(&(dev->dev.kobj),
5011                                         &(adj_dev->dev.kobj), "master");
5012                 if (ret)
5013                         goto remove_symlinks;
5014
5015                 list_add_rcu(&adj->list, dev_list);
5016         } else {
5017                 list_add_tail_rcu(&adj->list, dev_list);
5018         }
5019
5020         return 0;
5021
5022 remove_symlinks:
5023         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5024                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5025 free_adj:
5026         kfree(adj);
5027         dev_put(adj_dev);
5028
5029         return ret;
5030 }
5031
5032 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5033                                          struct net_device *adj_dev,
5034                                          struct list_head *dev_list)
5035 {
5036         struct netdev_adjacent *adj;
5037
5038         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5039
5040         if (!adj) {
5041                 pr_err("tried to remove device %s from %s\n",
5042                        dev->name, adj_dev->name);
5043                 BUG();
5044         }
5045
5046         if (adj->ref_nr > 1) {
5047                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5048                          adj->ref_nr-1);
5049                 adj->ref_nr--;
5050                 return;
5051         }
5052
5053         if (adj->master)
5054                 sysfs_remove_link(&(dev->dev.kobj), "master");
5055
5056         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5057                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5058
5059         list_del_rcu(&adj->list);
5060         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5061                  adj_dev->name, dev->name, adj_dev->name);
5062         dev_put(adj_dev);
5063         kfree_rcu(adj, rcu);
5064 }
5065
5066 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5067                                             struct net_device *upper_dev,
5068                                             struct list_head *up_list,
5069                                             struct list_head *down_list,
5070                                             void *private, bool master)
5071 {
5072         int ret;
5073
5074         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5075                                            master);
5076         if (ret)
5077                 return ret;
5078
5079         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5080                                            false);
5081         if (ret) {
5082                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5083                 return ret;
5084         }
5085
5086         return 0;
5087 }
5088
5089 static int __netdev_adjacent_dev_link(struct net_device *dev,
5090                                       struct net_device *upper_dev)
5091 {
5092         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5093                                                 &dev->all_adj_list.upper,
5094                                                 &upper_dev->all_adj_list.lower,
5095                                                 NULL, false);
5096 }
5097
/* Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * of @dev first, then @dev from @down_list of @upper_dev.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upward direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5106
5107 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5108                                          struct net_device *upper_dev)
5109 {
5110         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5111                                            &dev->all_adj_list.upper,
5112                                            &upper_dev->all_adj_list.lower);
5113 }
5114
5115 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5116                                                 struct net_device *upper_dev,
5117                                                 void *private, bool master)
5118 {
5119         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5120
5121         if (ret)
5122                 return ret;
5123
5124         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5125                                                &dev->adj_list.upper,
5126                                                &upper_dev->adj_list.lower,
5127                                                private, master);
5128         if (ret) {
5129                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5130                 return ret;
5131         }
5132
5133         return 0;
5134 }
5135
5136 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5137                                                    struct net_device *upper_dev)
5138 {
5139         __netdev_adjacent_dev_unlink(dev, upper_dev);
5140         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5141                                            &dev->adj_list.upper,
5142                                            &upper_dev->adj_list.lower);
5143 }
5144
5145 static int __netdev_upper_dev_link(struct net_device *dev,
5146                                    struct net_device *upper_dev, bool master,
5147                                    void *private)
5148 {
5149         struct netdev_adjacent *i, *j, *to_i, *to_j;
5150         int ret = 0;
5151
5152         ASSERT_RTNL();
5153
5154         if (dev == upper_dev)
5155                 return -EBUSY;
5156
5157         /* To prevent loops, check if dev is not upper device to upper_dev. */
5158         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5159                 return -EBUSY;
5160
5161         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5162                 return -EEXIST;
5163
5164         if (master && netdev_master_upper_dev_get(dev))
5165                 return -EBUSY;
5166
5167         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5168                                                    master);
5169         if (ret)
5170                 return ret;
5171
5172         /* Now that we linked these devs, make all the upper_dev's
5173          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5174          * versa, and don't forget the devices itself. All of these
5175          * links are non-neighbours.
5176          */
5177         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5178                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5179                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5180                                  i->dev->name, j->dev->name);
5181                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5182                         if (ret)
5183                                 goto rollback_mesh;
5184                 }
5185         }
5186
5187         /* add dev to every upper_dev's upper device */
5188         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5189                 pr_debug("linking %s's upper device %s with %s\n",
5190                          upper_dev->name, i->dev->name, dev->name);
5191                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5192                 if (ret)
5193                         goto rollback_upper_mesh;
5194         }
5195
5196         /* add upper_dev to every dev's lower device */
5197         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5198                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5199                          i->dev->name, upper_dev->name);
5200                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5201                 if (ret)
5202                         goto rollback_lower_mesh;
5203         }
5204
5205         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5206         return 0;
5207
5208 rollback_lower_mesh:
5209         to_i = i;
5210         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5211                 if (i == to_i)
5212                         break;
5213                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5214         }
5215
5216         i = NULL;
5217
5218 rollback_upper_mesh:
5219         to_i = i;
5220         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5221                 if (i == to_i)
5222                         break;
5223                 __netdev_adjacent_dev_unlink(dev, i->dev);
5224         }
5225
5226         i = j = NULL;
5227
5228 rollback_mesh:
5229         to_i = i;
5230         to_j = j;
5231         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5232                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5233                         if (i == to_i && j == to_j)
5234                                 break;
5235                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5236                 }
5237                 if (i == to_i)
5238                         break;
5239         }
5240
5241         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5242
5243         return ret;
5244 }
5245
5246 /**
5247  * netdev_upper_dev_link - Add a link to the upper device
5248  * @dev: device
5249  * @upper_dev: new upper device
5250  *
5251  * Adds a link to device which is upper to this one. The caller must hold
5252  * the RTNL lock. On a failure a negative errno code is returned.
5253  * On success the reference counts are adjusted and the function
5254  * returns zero.
5255  */
5256 int netdev_upper_dev_link(struct net_device *dev,
5257                           struct net_device *upper_dev)
5258 {
5259         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5260 }
5261 EXPORT_SYMBOL(netdev_upper_dev_link);
5262
5263 /**
5264  * netdev_master_upper_dev_link - Add a master link to the upper device
5265  * @dev: device
5266  * @upper_dev: new upper device
5267  *
5268  * Adds a link to device which is upper to this one. In this case, only
5269  * one master upper device can be linked, although other non-master devices
5270  * might be linked as well. The caller must hold the RTNL lock.
5271  * On a failure a negative errno code is returned. On success the reference
5272  * counts are adjusted and the function returns zero.
5273  */
5274 int netdev_master_upper_dev_link(struct net_device *dev,
5275                                  struct net_device *upper_dev)
5276 {
5277         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5278 }
5279 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5280
5281 int netdev_master_upper_dev_link_private(struct net_device *dev,
5282                                          struct net_device *upper_dev,
5283                                          void *private)
5284 {
5285         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5286 }
5287 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5288
5289 /**
5290  * netdev_upper_dev_unlink - Removes a link to upper device
5291  * @dev: device
5292  * @upper_dev: new upper device
5293  *
5294  * Removes a link to device which is upper to this one. The caller must hold
5295  * the RTNL lock.
5296  */
5297 void netdev_upper_dev_unlink(struct net_device *dev,
5298                              struct net_device *upper_dev)
5299 {
5300         struct netdev_adjacent *i, *j;
5301         ASSERT_RTNL();
5302
5303         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5304
5305         /* Here is the tricky part. We must remove all dev's lower
5306          * devices from all upper_dev's upper devices and vice
5307          * versa, to maintain the graph relationship.
5308          */
5309         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5310                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5311                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5312
5313         /* remove also the devices itself from lower/upper device
5314          * list
5315          */
5316         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5317                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5318
5319         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5320                 __netdev_adjacent_dev_unlink(dev, i->dev);
5321
5322         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5323 }
5324 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5325
5326 void netdev_adjacent_add_links(struct net_device *dev)
5327 {
5328         struct netdev_adjacent *iter;
5329
5330         struct net *net = dev_net(dev);
5331
5332         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5333                 if (!net_eq(net,dev_net(iter->dev)))
5334                         continue;
5335                 netdev_adjacent_sysfs_add(iter->dev, dev,
5336                                           &iter->dev->adj_list.lower);
5337                 netdev_adjacent_sysfs_add(dev, iter->dev,
5338                                           &dev->adj_list.upper);
5339         }
5340
5341         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5342                 if (!net_eq(net,dev_net(iter->dev)))
5343                         continue;
5344                 netdev_adjacent_sysfs_add(iter->dev, dev,
5345                                           &iter->dev->adj_list.upper);
5346                 netdev_adjacent_sysfs_add(dev, iter->dev,
5347                                           &dev->adj_list.lower);
5348         }
5349 }
5350
5351 void netdev_adjacent_del_links(struct net_device *dev)
5352 {
5353         struct netdev_adjacent *iter;
5354
5355         struct net *net = dev_net(dev);
5356
5357         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5358                 if (!net_eq(net,dev_net(iter->dev)))
5359                         continue;
5360                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5361                                           &iter->dev->adj_list.lower);
5362                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5363                                           &dev->adj_list.upper);
5364         }
5365
5366         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5367                 if (!net_eq(net,dev_net(iter->dev)))
5368                         continue;
5369                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5370                                           &iter->dev->adj_list.upper);
5371                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5372                                           &dev->adj_list.lower);
5373         }
5374 }
5375
5376 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5377 {
5378         struct netdev_adjacent *iter;
5379
5380         struct net *net = dev_net(dev);
5381
5382         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5383                 if (!net_eq(net,dev_net(iter->dev)))
5384                         continue;
5385                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5386                                           &iter->dev->adj_list.lower);
5387                 netdev_adjacent_sysfs_add(iter->dev, dev,
5388                                           &iter->dev->adj_list.lower);
5389         }
5390
5391         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5392                 if (!net_eq(net,dev_net(iter->dev)))
5393                         continue;
5394                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5395                                           &iter->dev->adj_list.upper);
5396                 netdev_adjacent_sysfs_add(iter->dev, dev,
5397                                           &iter->dev->adj_list.upper);
5398         }
5399 }
5400
5401 void *netdev_lower_dev_get_private(struct net_device *dev,
5402                                    struct net_device *lower_dev)
5403 {
5404         struct netdev_adjacent *lower;
5405
5406         if (!lower_dev)
5407                 return NULL;
5408         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5409         if (!lower)
5410                 return NULL;
5411
5412         return lower->private;
5413 }
5414 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5415
5416
5417 int dev_get_nest_level(struct net_device *dev,
5418                        bool (*type_check)(struct net_device *dev))
5419 {
5420         struct net_device *lower = NULL;
5421         struct list_head *iter;
5422         int max_nest = -1;
5423         int nest;
5424
5425         ASSERT_RTNL();
5426
5427         netdev_for_each_lower_dev(dev, lower, iter) {
5428                 nest = dev_get_nest_level(lower, type_check);
5429                 if (max_nest < nest)
5430                         max_nest = nest;
5431         }
5432
5433         if (type_check(dev))
5434                 max_nest++;
5435
5436         return max_nest;
5437 }
5438 EXPORT_SYMBOL(dev_get_nest_level);
5439
5440 static void dev_change_rx_flags(struct net_device *dev, int flags)
5441 {
5442         const struct net_device_ops *ops = dev->netdev_ops;
5443
5444         if (ops->ndo_change_rx_flags)
5445                 ops->ndo_change_rx_flags(dev, flags);
5446 }
5447
5448 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5449 {
5450         unsigned int old_flags = dev->flags;
5451         kuid_t uid;
5452         kgid_t gid;
5453
5454         ASSERT_RTNL();
5455
5456         dev->flags |= IFF_PROMISC;
5457         dev->promiscuity += inc;
5458         if (dev->promiscuity == 0) {
5459                 /*
5460                  * Avoid overflow.
5461                  * If inc causes overflow, untouch promisc and return error.
5462                  */
5463                 if (inc < 0)
5464                         dev->flags &= ~IFF_PROMISC;
5465                 else {
5466                         dev->promiscuity -= inc;
5467                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5468                                 dev->name);
5469                         return -EOVERFLOW;
5470                 }
5471         }
5472         if (dev->flags != old_flags) {
5473                 pr_info("device %s %s promiscuous mode\n",
5474                         dev->name,
5475                         dev->flags & IFF_PROMISC ? "entered" : "left");
5476                 if (audit_enabled) {
5477                         current_uid_gid(&uid, &gid);
5478                         audit_log(current->audit_context, GFP_ATOMIC,
5479                                 AUDIT_ANOM_PROMISCUOUS,
5480                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5481                                 dev->name, (dev->flags & IFF_PROMISC),
5482                                 (old_flags & IFF_PROMISC),
5483                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5484                                 from_kuid(&init_user_ns, uid),
5485                                 from_kgid(&init_user_ns, gid),
5486                                 audit_get_sessionid(current));
5487                 }
5488
5489                 dev_change_rx_flags(dev, IFF_PROMISC);
5490         }
5491         if (notify)
5492                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5493         return 0;
5494 }
5495
5496 /**
5497  *      dev_set_promiscuity     - update promiscuity count on a device
5498  *      @dev: device
5499  *      @inc: modifier
5500  *
5501  *      Add or remove promiscuity from a device. While the count in the device
5502  *      remains above zero the interface remains promiscuous. Once it hits zero
5503  *      the device reverts back to normal filtering operation. A negative inc
5504  *      value is used to drop promiscuity on the device.
5505  *      Return 0 if successful or a negative errno code on error.
5506  */
5507 int dev_set_promiscuity(struct net_device *dev, int inc)
5508 {
5509         unsigned int old_flags = dev->flags;
5510         int err;
5511
5512         err = __dev_set_promiscuity(dev, inc, true);
5513         if (err < 0)
5514                 return err;
5515         if (dev->flags != old_flags)
5516                 dev_set_rx_mode(dev);
5517         return err;
5518 }
5519 EXPORT_SYMBOL(dev_set_promiscuity);
5520
5521 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5522 {
5523         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5524
5525         ASSERT_RTNL();
5526
5527         dev->flags |= IFF_ALLMULTI;
5528         dev->allmulti += inc;
5529         if (dev->allmulti == 0) {
5530                 /*
5531                  * Avoid overflow.
5532                  * If inc causes overflow, untouch allmulti and return error.
5533                  */
5534                 if (inc < 0)
5535                         dev->flags &= ~IFF_ALLMULTI;
5536                 else {
5537                         dev->allmulti -= inc;
5538                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5539                                 dev->name);
5540                         return -EOVERFLOW;
5541                 }
5542         }
5543         if (dev->flags ^ old_flags) {
5544                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5545                 dev_set_rx_mode(dev);
5546                 if (notify)
5547                         __dev_notify_flags(dev, old_flags,
5548                                            dev->gflags ^ old_gflags);
5549         }
5550         return 0;
5551 }
5552
5553 /**
5554  *      dev_set_allmulti        - update allmulti count on a device
5555  *      @dev: device
5556  *      @inc: modifier
5557  *
5558  *      Add or remove reception of all multicast frames to a device. While the
5559  *      count in the device remains above zero the interface remains listening
5560  *      to all interfaces. Once it hits zero the device reverts back to normal
5561  *      filtering operation. A negative @inc value is used to drop the counter
5562  *      when releasing a resource needing all multicasts.
5563  *      Return 0 if successful or a negative errno code on error.
5564  */
5565
5566 int dev_set_allmulti(struct net_device *dev, int inc)
5567 {
5568         return __dev_set_allmulti(dev, inc, true);
5569 }
5570 EXPORT_SYMBOL(dev_set_allmulti);
5571
5572 /*
5573  *      Upload unicast and multicast address lists to device and
5574  *      configure RX filtering. When the device doesn't support unicast
5575  *      filtering it is put in promiscuous mode while unicast addresses
5576  *      are present.
5577  */
5578 void __dev_set_rx_mode(struct net_device *dev)
5579 {
5580         const struct net_device_ops *ops = dev->netdev_ops;
5581
5582         /* dev_open will call this function so the list will stay sane. */
5583         if (!(dev->flags&IFF_UP))
5584                 return;
5585
5586         if (!netif_device_present(dev))
5587                 return;
5588
5589         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5590                 /* Unicast addresses changes may only happen under the rtnl,
5591                  * therefore calling __dev_set_promiscuity here is safe.
5592                  */
5593                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5594                         __dev_set_promiscuity(dev, 1, false);
5595                         dev->uc_promisc = true;
5596                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5597                         __dev_set_promiscuity(dev, -1, false);
5598                         dev->uc_promisc = false;
5599                 }
5600         }
5601
5602         if (ops->ndo_set_rx_mode)
5603                 ops->ndo_set_rx_mode(dev);
5604 }
5605
/* Locked variant of __dev_set_rx_mode(): the rx mode of @dev is reloaded
 * between netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5612
5613 /**
5614  *      dev_get_flags - get flags reported to userspace
5615  *      @dev: device
5616  *
5617  *      Get the combination of flag bits exported through APIs to userspace.
5618  */
5619 unsigned int dev_get_flags(const struct net_device *dev)
5620 {
5621         unsigned int flags;
5622
5623         flags = (dev->flags & ~(IFF_PROMISC |
5624                                 IFF_ALLMULTI |
5625                                 IFF_RUNNING |
5626                                 IFF_LOWER_UP |
5627                                 IFF_DORMANT)) |
5628                 (dev->gflags & (IFF_PROMISC |
5629                                 IFF_ALLMULTI));
5630
5631         if (netif_running(dev)) {
5632                 if (netif_oper_up(dev))
5633                         flags |= IFF_RUNNING;
5634                 if (netif_carrier_ok(dev))
5635                         flags |= IFF_LOWER_UP;
5636                 if (netif_dormant(dev))
5637                         flags |= IFF_DORMANT;
5638         }
5639
5640         return flags;
5641 }
5642 EXPORT_SYMBOL(dev_get_flags);
5643
5644 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5645 {
5646         unsigned int old_flags = dev->flags;
5647         int ret;
5648
5649         ASSERT_RTNL();
5650
5651         /*
5652          *      Set the flags on our device.
5653          */
5654
5655         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5656                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5657                                IFF_AUTOMEDIA)) |
5658                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5659                                     IFF_ALLMULTI));
5660
5661         /*
5662          *      Load in the correct multicast list now the flags have changed.
5663          */
5664
5665         if ((old_flags ^ flags) & IFF_MULTICAST)
5666                 dev_change_rx_flags(dev, IFF_MULTICAST);
5667
5668         dev_set_rx_mode(dev);
5669
5670         /*
5671          *      Have we downed the interface. We handle IFF_UP ourselves
5672          *      according to user attempts to set it, rather than blindly
5673          *      setting it.
5674          */
5675
5676         ret = 0;
5677         if ((old_flags ^ flags) & IFF_UP)
5678                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5679
5680         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5681                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5682                 unsigned int old_flags = dev->flags;
5683
5684                 dev->gflags ^= IFF_PROMISC;
5685
5686                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5687                         if (dev->flags != old_flags)
5688                                 dev_set_rx_mode(dev);
5689         }
5690
5691         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5692            is important. Some (broken) drivers set IFF_PROMISC, when
5693            IFF_ALLMULTI is requested not asking us and not reporting.
5694          */
5695         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5696                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5697
5698                 dev->gflags ^= IFF_ALLMULTI;
5699                 __dev_set_allmulti(dev, inc, false);
5700         }
5701
5702         return ret;
5703 }
5704
5705 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5706                         unsigned int gchanges)
5707 {
5708         unsigned int changes = dev->flags ^ old_flags;
5709
5710         if (gchanges)
5711                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5712
5713         if (changes & IFF_UP) {
5714                 if (dev->flags & IFF_UP)
5715                         call_netdevice_notifiers(NETDEV_UP, dev);
5716                 else
5717                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5718         }
5719
5720         if (dev->flags & IFF_UP &&
5721             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5722                 struct netdev_notifier_change_info change_info;
5723
5724                 change_info.flags_changed = changes;
5725                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5726                                               &change_info.info);
5727         }
5728 }
5729
5730 /**
5731  *      dev_change_flags - change device settings
5732  *      @dev: device
5733  *      @flags: device state flags
5734  *
5735  *      Change settings on device based state flags. The flags are
5736  *      in the userspace exported format.
5737  */
5738 int dev_change_flags(struct net_device *dev, unsigned int flags)
5739 {
5740         int ret;
5741         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5742
5743         ret = __dev_change_flags(dev, flags);
5744         if (ret < 0)
5745                 return ret;
5746
5747         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5748         __dev_notify_flags(dev, old_flags, changes);
5749         return ret;
5750 }
5751 EXPORT_SYMBOL(dev_change_flags);
5752
5753 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5754 {
5755         const struct net_device_ops *ops = dev->netdev_ops;
5756
5757         if (ops->ndo_change_mtu)
5758                 return ops->ndo_change_mtu(dev, new_mtu);
5759
5760         dev->mtu = new_mtu;
5761         return 0;
5762 }
5763
5764 /**
5765  *      dev_set_mtu - Change maximum transfer unit
5766  *      @dev: device
5767  *      @new_mtu: new transfer unit
5768  *
5769  *      Change the maximum transfer size of the network device.
5770  */
5771 int dev_set_mtu(struct net_device *dev, int new_mtu)
5772 {
5773         int err, orig_mtu;
5774
5775         if (new_mtu == dev->mtu)
5776                 return 0;
5777
5778         /*      MTU must be positive.    */
5779         if (new_mtu < 0)
5780                 return -EINVAL;
5781
5782         if (!netif_device_present(dev))
5783                 return -ENODEV;
5784
5785         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5786         err = notifier_to_errno(err);
5787         if (err)
5788                 return err;
5789
5790         orig_mtu = dev->mtu;
5791         err = __dev_set_mtu(dev, new_mtu);
5792
5793         if (!err) {
5794                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5795                 err = notifier_to_errno(err);
5796                 if (err) {
5797                         /* setting mtu back and notifying everyone again,
5798                          * so that they have a chance to revert changes.
5799                          */
5800                         __dev_set_mtu(dev, orig_mtu);
5801                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5802                 }
5803         }
5804         return err;
5805 }
5806 EXPORT_SYMBOL(dev_set_mtu);
5807
5808 /**
5809  *      dev_set_group - Change group this device belongs to
5810  *      @dev: device
5811  *      @new_group: group this device should belong to
5812  */
5813 void dev_set_group(struct net_device *dev, int new_group)
5814 {
5815         dev->group = new_group;
5816 }
5817 EXPORT_SYMBOL(dev_set_group);
5818
5819 /**
5820  *      dev_set_mac_address - Change Media Access Control Address
5821  *      @dev: device
5822  *      @sa: new address
5823  *
5824  *      Change the hardware (MAC) address of the device
5825  */
5826 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5827 {
5828         const struct net_device_ops *ops = dev->netdev_ops;
5829         int err;
5830
5831         if (!ops->ndo_set_mac_address)
5832                 return -EOPNOTSUPP;
5833         if (sa->sa_family != dev->type)
5834                 return -EINVAL;
5835         if (!netif_device_present(dev))
5836                 return -ENODEV;
5837         err = ops->ndo_set_mac_address(dev, sa);
5838         if (err)
5839                 return err;
5840         dev->addr_assign_type = NET_ADDR_SET;
5841         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5842         add_device_randomness(dev->dev_addr, dev->addr_len);
5843         return 0;
5844 }
5845 EXPORT_SYMBOL(dev_set_mac_address);
5846
5847 /**
5848  *      dev_change_carrier - Change device carrier
5849  *      @dev: device
5850  *      @new_carrier: new value
5851  *
5852  *      Change device carrier
5853  */
5854 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5855 {
5856         const struct net_device_ops *ops = dev->netdev_ops;
5857
5858         if (!ops->ndo_change_carrier)
5859                 return -EOPNOTSUPP;
5860         if (!netif_device_present(dev))
5861                 return -ENODEV;
5862         return ops->ndo_change_carrier(dev, new_carrier);
5863 }
5864 EXPORT_SYMBOL(dev_change_carrier);
5865
5866 /**
5867  *      dev_get_phys_port_id - Get device physical port ID
5868  *      @dev: device
5869  *      @ppid: port ID
5870  *
5871  *      Get device physical port ID
5872  */
5873 int dev_get_phys_port_id(struct net_device *dev,
5874                          struct netdev_phys_item_id *ppid)
5875 {
5876         const struct net_device_ops *ops = dev->netdev_ops;
5877
5878         if (!ops->ndo_get_phys_port_id)
5879                 return -EOPNOTSUPP;
5880         return ops->ndo_get_phys_port_id(dev, ppid);
5881 }
5882 EXPORT_SYMBOL(dev_get_phys_port_id);
5883
5884 /**
5885  *      dev_new_index   -       allocate an ifindex
5886  *      @net: the applicable net namespace
5887  *
5888  *      Returns a suitable unique value for a new device interface
5889  *      number.  The caller must hold the rtnl semaphore or the
5890  *      dev_base_lock to be sure it remains unique.
5891  */
5892 static int dev_new_index(struct net *net)
5893 {
5894         int ifindex = net->ifindex;
5895         for (;;) {
5896                 if (++ifindex <= 0)
5897                         ifindex = 1;
5898                 if (!__dev_get_by_index(net, ifindex))
5899                         return net->ifindex = ifindex;
5900         }
5901 }
5902
/* Delayed registration/unregistration.
 *
 * net_todo_list collects devices queued by net_set_todo().
 * NOTE(review): netdev_unregistering_wq is not used in this part of the
 * file; presumably waiters on pending unregistrations sleep on it - confirm
 * against its users.
 */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5906
5907 static void net_set_todo(struct net_device *dev)
5908 {
5909         list_add_tail(&dev->todo_list, &net_todo_list);
5910         dev_net(dev)->dev_unreg_count++;
5911 }
5912
/* Take every device on @head (linked through unreg_list) out of service.
 *
 * The work is done in passes over the whole list so that the two
 * synchronize_net() calls are paid once per batch rather than once per
 * device: validate and mark, close, unlist, per-device cleanup, and
 * finally drop a reference on each device.
 *
 * Must not run during the boot phase and requires RTNL.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* Pass 1: validate the list and mark the devices as being dismantled.
	 * The _safe iterator is needed because entries may be removed.
	 */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		/* Anything other than a fully registered device is a bug. */
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* Pass 2: close the devices. They are collected on a separate list
	 * through close_list so that unreg_list is left untouched.
	 */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head);

	/* Pass 3: unlink each device from the device chain and record that
	 * it is now unregistering.
	 */
	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	synchronize_net();

	/* Pass 4: per-device teardown. */
	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols that we are about to destroy
		 * this device. They should clean up everything
		 * they hold on it.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* The RTM_DELLINK message is built here, before ndo_uninit
		 * runs, and only sent afterwards. It is built only for
		 * devices without rtnl_link_ops or whose rtnl link state is
		 * RTNL_LINK_INITIALIZED.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		/* Give the driver its uninit callback, if it has one. */
		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		/* rtmsg_ifinfo_build_skb() may have returned NULL. */
		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* The notifier chain MUST have detached all upper devices
		 * by now.
		 */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Pass 5: drop one reference on every device. */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
5997
5998 static void rollback_registered(struct net_device *dev)
5999 {
6000         LIST_HEAD(single);
6001
6002         list_add(&dev->unreg_list, &single);
6003         rollback_registered_many(&single);
6004         list_del(&single);
6005 }
6006
6007 static netdev_features_t netdev_fix_features(struct net_device *dev,
6008         netdev_features_t features)
6009 {
6010         /* Fix illegal checksum combinations */
6011         if ((features & NETIF_F_HW_CSUM) &&
6012             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6013                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6014                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6015         }
6016
6017         /* TSO requires that SG is present as well. */
6018         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6019                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6020                 features &= ~NETIF_F_ALL_TSO;
6021         }
6022
6023         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6024                                         !(features & NETIF_F_IP_CSUM)) {
6025                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6026                 features &= ~NETIF_F_TSO;
6027                 features &= ~NETIF_F_TSO_ECN;
6028         }
6029
6030         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6031                                          !(features & NETIF_F_IPV6_CSUM)) {
6032                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6033                 features &= ~NETIF_F_TSO6;
6034         }
6035
6036         /* TSO ECN requires that TSO is present as well. */
6037         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6038                 features &= ~NETIF_F_TSO_ECN;
6039
6040         /* Software GSO depends on SG. */
6041         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6042                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6043                 features &= ~NETIF_F_GSO;
6044         }
6045
6046         /* UFO needs SG and checksumming */
6047         if (features & NETIF_F_UFO) {
6048                 /* maybe split UFO into V4 and V6? */
6049                 if (!((features & NETIF_F_GEN_CSUM) ||
6050                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6051                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6052                         netdev_dbg(dev,
6053                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6054                         features &= ~NETIF_F_UFO;
6055                 }
6056
6057                 if (!(features & NETIF_F_SG)) {
6058                         netdev_dbg(dev,
6059                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6060                         features &= ~NETIF_F_UFO;
6061                 }
6062         }
6063
6064 #ifdef CONFIG_NET_RX_BUSY_POLL
6065         if (dev->netdev_ops->ndo_busy_poll)
6066                 features |= NETIF_F_BUSY_POLL;
6067         else
6068 #endif
6069                 features &= ~NETIF_F_BUSY_POLL;
6070
6071         return features;
6072 }
6073
6074 int __netdev_update_features(struct net_device *dev)
6075 {
6076         netdev_features_t features;
6077         int err = 0;
6078
6079         ASSERT_RTNL();
6080
6081         features = netdev_get_wanted_features(dev);
6082
6083         if (dev->netdev_ops->ndo_fix_features)
6084                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6085
6086         /* driver might be less strict about feature dependencies */
6087         features = netdev_fix_features(dev, features);
6088
6089         if (dev->features == features)
6090                 return 0;
6091
6092         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6093                 &dev->features, &features);
6094
6095         if (dev->netdev_ops->ndo_set_features)
6096                 err = dev->netdev_ops->ndo_set_features(dev, features);
6097
6098         if (unlikely(err < 0)) {
6099                 netdev_err(dev,
6100                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6101                         err, &features, &dev->features);
6102                 return -1;
6103         }
6104
6105         if (!err)
6106                 dev->features = features;
6107
6108         return 1;
6109 }
6110
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and, if the recomputation reports
 *	a change (or a failure), send the feature-change notification.
 *	Call this after driver or hardware dependent conditions that
 *	influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	/* Zero means "nothing changed": no notification needed. */
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6125
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recompute the dev->features set and send the feature-change
 *	notification unconditionally, even when nothing changed.  Use this
 *	instead of netdev_update_features() when dev->vlan_features might
 *	have changed as well, so that the change is propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6142
6143 /**
6144  *      netif_stacked_transfer_operstate -      transfer operstate
6145  *      @rootdev: the root or lower level device to transfer state from
6146  *      @dev: the device to transfer operstate to
6147  *
6148  *      Transfer operational state from root to device. This is normally
6149  *      called when a stacking relationship exists between the root
6150  *      device and the device(a leaf device).
6151  */
6152 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6153                                         struct net_device *dev)
6154 {
6155         if (rootdev->operstate == IF_OPER_DORMANT)
6156                 netif_dormant_on(dev);
6157         else
6158                 netif_dormant_off(dev);
6159
6160         if (netif_carrier_ok(rootdev)) {
6161                 if (!netif_carrier_ok(dev))
6162                         netif_carrier_on(dev);
6163         } else {
6164                 if (netif_carrier_ok(dev))
6165                         netif_carrier_off(dev);
6166         }
6167 }
6168 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6169
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = queues; q < queues + nr_queues; q++)
		q->dev = dev;

	return 0;
}
#endif
6189
6190 static void netdev_init_one_queue(struct net_device *dev,
6191                                   struct netdev_queue *queue, void *_unused)
6192 {
6193         /* Initialize queue lock */
6194         spin_lock_init(&queue->_xmit_lock);
6195         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6196         queue->xmit_lock_owner = -1;
6197         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6198         queue->dev = dev;
6199 #ifdef CONFIG_BQL
6200         dql_init(&queue->dql, HZ);
6201 #endif
6202 }
6203
6204 static void netif_free_tx_queues(struct net_device *dev)
6205 {
6206         kvfree(dev->_tx);
6207 }
6208
6209 static int netif_alloc_netdev_queues(struct net_device *dev)
6210 {
6211         unsigned int count = dev->num_tx_queues;
6212         struct netdev_queue *tx;
6213         size_t sz = count * sizeof(*tx);
6214
6215         BUG_ON(count < 1 || count > 0xffff);
6216
6217         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6218         if (!tx) {
6219                 tx = vzalloc(sz);
6220                 if (!tx)
6221                         return -ENOMEM;
6222         }
6223         dev->_tx = tx;
6224
6225         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6226         spin_lock_init(&dev->tx_global_lock);
6227
6228         return 0;
6229 }
6230
6231 /**
6232  *      register_netdevice      - register a network device
6233  *      @dev: device to register
6234  *
6235  *      Take a completed network device structure and add it to the kernel
6236  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6237  *      chain. 0 is returned on success. A negative errno code is returned
6238  *      on a failure to set up the device, or if the name is a duplicate.
6239  *
6240  *      Callers must hold the rtnl semaphore. You may want
6241  *      register_netdev() instead of this.
6242  *
6243  *      BUGS:
6244  *      The locking appears insufficient to guarantee two parallel registers
6245  *      will not get the same name.
6246  */
6247
6248 int register_netdevice(struct net_device *dev)
6249 {
6250         int ret;
6251         struct net *net = dev_net(dev);
6252
6253         BUG_ON(dev_boot_phase);
6254         ASSERT_RTNL();
6255
6256         might_sleep();
6257
6258         /* When net_device's are persistent, this will be fatal. */
6259         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6260         BUG_ON(!net);
6261
6262         spin_lock_init(&dev->addr_list_lock);
6263         netdev_set_addr_lockdep_class(dev);
6264
6265         dev->iflink = -1;
6266
6267         ret = dev_get_valid_name(net, dev, dev->name);
6268         if (ret < 0)
6269                 goto out;
6270
6271         /* Init, if this function is available */
6272         if (dev->netdev_ops->ndo_init) {
6273                 ret = dev->netdev_ops->ndo_init(dev);
6274                 if (ret) {
6275                         if (ret > 0)
6276                                 ret = -EIO;
6277                         goto out;
6278                 }
6279         }
6280
6281         if (((dev->hw_features | dev->features) &
6282              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6283             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6284              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6285                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6286                 ret = -EINVAL;
6287                 goto err_uninit;
6288         }
6289
6290         ret = -EBUSY;
6291         if (!dev->ifindex)
6292                 dev->ifindex = dev_new_index(net);
6293         else if (__dev_get_by_index(net, dev->ifindex))
6294                 goto err_uninit;
6295
6296         if (dev->iflink == -1)
6297                 dev->iflink = dev->ifindex;
6298
6299         /* Transfer changeable features to wanted_features and enable
6300          * software offloads (GSO and GRO).
6301          */
6302         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6303         dev->features |= NETIF_F_SOFT_FEATURES;
6304         dev->wanted_features = dev->features & dev->hw_features;
6305
6306         if (!(dev->flags & IFF_LOOPBACK)) {
6307                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6308         }
6309
6310         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6311          */
6312         dev->vlan_features |= NETIF_F_HIGHDMA;
6313
6314         /* Make NETIF_F_SG inheritable to tunnel devices.
6315          */
6316         dev->hw_enc_features |= NETIF_F_SG;
6317
6318         /* Make NETIF_F_SG inheritable to MPLS.
6319          */
6320         dev->mpls_features |= NETIF_F_SG;
6321
6322         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6323         ret = notifier_to_errno(ret);
6324         if (ret)
6325                 goto err_uninit;
6326
6327         ret = netdev_register_kobject(dev);
6328         if (ret)
6329                 goto err_uninit;
6330         dev->reg_state = NETREG_REGISTERED;
6331
6332         __netdev_update_features(dev);
6333
6334         /*
6335          *      Default initial state at registry is that the
6336          *      device is present.
6337          */
6338
6339         set_bit(__LINK_STATE_PRESENT, &dev->state);
6340
6341         linkwatch_init_dev(dev);
6342
6343         dev_init_scheduler(dev);
6344         dev_hold(dev);
6345         list_netdevice(dev);
6346         add_device_randomness(dev->dev_addr, dev->addr_len);
6347
6348         /* If the device has permanent device address, driver should
6349          * set dev_addr and also addr_assign_type should be set to
6350          * NET_ADDR_PERM (default value).
6351          */
6352         if (dev->addr_assign_type == NET_ADDR_PERM)
6353                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6354
6355         /* Notify protocols, that a new device appeared. */
6356         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6357         ret = notifier_to_errno(ret);
6358         if (ret) {
6359                 rollback_registered(dev);
6360                 dev->reg_state = NETREG_UNREGISTERED;
6361         }
6362         /*
6363          *      Prevent userspace races by waiting until the network
6364          *      device is fully setup before sending notifications.
6365          */
6366         if (!dev->rtnl_link_ops ||
6367             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6368                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6369
6370 out:
6371         return ret;
6372
6373 err_uninit:
6374         if (dev->netdev_ops->ndo_uninit)
6375                 dev->netdev_ops->ndo_uninit(dev);
6376         goto out;
6377 }
6378 EXPORT_SYMBOL(register_netdevice);
6379
6380 /**
6381  *      init_dummy_netdev       - init a dummy network device for NAPI
6382  *      @dev: device to init
6383  *
6384  *      This takes a network device structure and initialize the minimum
6385  *      amount of fields so it can be used to schedule NAPI polls without
6386  *      registering a full blown interface. This is to be used by drivers
6387  *      that need to tie several hardware interfaces to a single NAPI
6388  *      poll scheduler due to HW limitations.
6389  */
6390 int init_dummy_netdev(struct net_device *dev)
6391 {
6392         /* Clear everything. Note we don't initialize spinlocks
6393          * are they aren't supposed to be taken by any of the
6394          * NAPI code and this dummy netdev is supposed to be
6395          * only ever used for NAPI polls
6396          */
6397         memset(dev, 0, sizeof(struct net_device));
6398
6399         /* make sure we BUG if trying to hit standard
6400          * register/unregister code path
6401          */
6402         dev->reg_state = NETREG_DUMMY;
6403
6404         /* NAPI wants this */
6405         INIT_LIST_HEAD(&dev->napi_list);
6406
6407         /* a dummy interface is started by default */
6408         set_bit(__LINK_STATE_PRESENT, &dev->state);
6409         set_bit(__LINK_STATE_START, &dev->state);
6410
6411         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6412          * because users of this 'device' dont need to change
6413          * its refcount.
6414          */
6415
6416         return 0;
6417 }
6418 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6419
6420
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore and expands the device name if you passed a format string
 *	to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
6444
6445 int netdev_refcnt_read(const struct net_device *dev)
6446 {
6447         int i, refcnt = 0;
6448
6449         for_each_possible_cpu(i)
6450                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6451         return refcnt;
6452 }
6453 EXPORT_SYMBOL(netdev_refcnt_read);
6454
6455 /**
6456  * netdev_wait_allrefs - wait until all references are gone.
6457  * @dev: target net_device
6458  *
6459  * This is called when unregistering network devices.
6460  *
6461  * Any protocol or device that holds a reference should register
6462  * for netdevice notification, and cleanup and put back the
6463  * reference if they receive an UNREGISTER event.
6464  * We can get stuck here if buggy protocols don't correctly
6465  * call dev_put.
6466  */
6467 static void netdev_wait_allrefs(struct net_device *dev)
6468 {
6469         unsigned long rebroadcast_time, warning_time;
6470         int refcnt;
6471
6472         linkwatch_forget_dev(dev);
6473
6474         rebroadcast_time = warning_time = jiffies;
6475         refcnt = netdev_refcnt_read(dev);
6476
6477         while (refcnt != 0) {
6478                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6479                         rtnl_lock();
6480
6481                         /* Rebroadcast unregister notification */
6482                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6483
6484                         __rtnl_unlock();
6485                         rcu_barrier();
6486                         rtnl_lock();
6487
6488                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6489                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6490                                      &dev->state)) {
6491                                 /* We must not have linkwatch events
6492                                  * pending on unregister. If this
6493                                  * happens, we simply run the queue
6494                                  * unscheduled, resulting in a noop
6495                                  * for this device.
6496                                  */
6497                                 linkwatch_run_queue();
6498                         }
6499
6500                         __rtnl_unlock();
6501
6502                         rebroadcast_time = jiffies;
6503                 }
6504
6505                 msleep(250);
6506
6507                 refcnt = netdev_refcnt_read(dev);
6508
6509                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6510                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6511                                  dev->name, refcnt);
6512                         warning_time = jiffies;
6513                 }
6514         }
6515 }
6516
6517 /* The sequence is:
6518  *
6519  *      rtnl_lock();
6520  *      ...
6521  *      register_netdevice(x1);
6522  *      register_netdevice(x2);
6523  *      ...
6524  *      unregister_netdevice(y1);
6525  *      unregister_netdevice(y2);
6526  *      ...
6527  *      rtnl_unlock();
6528  *      free_netdev(y1);
6529  *      free_netdev(y2);
6530  *
6531  * We are invoked by rtnl_unlock().
6532  * This allows us to deal with problems:
6533  * 1) We can delete sysfs objects which invoke hotplug
6534  *    without deadlocking with linkwatch via keventd.
6535  * 2) Since we run with the RTNL semaphore not held, we can sleep
6536  *    safely in order to wait for the netdev refcnt to drop to zero.
6537  *
6538  * We must not return until all unregister events added during
6539  * the interval the lock was held have been completed.
6540  */
6541 void netdev_run_todo(void)
6542 {
6543         struct list_head list;
6544
6545         /* Snapshot list, allow later requests */
6546         list_replace_init(&net_todo_list, &list);
6547
6548         __rtnl_unlock();
6549
6550
6551         /* Wait for rcu callbacks to finish before next phase */
6552         if (!list_empty(&list))
6553                 rcu_barrier();
6554
6555         while (!list_empty(&list)) {
6556                 struct net_device *dev
6557                         = list_first_entry(&list, struct net_device, todo_list);
6558                 list_del(&dev->todo_list);
6559
6560                 rtnl_lock();
6561                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6562                 __rtnl_unlock();
6563
6564                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6565                         pr_err("network todo '%s' but state %d\n",
6566                                dev->name, dev->reg_state);
6567                         dump_stack();
6568                         continue;
6569                 }
6570
6571                 dev->reg_state = NETREG_UNREGISTERED;
6572
6573                 on_each_cpu(flush_backlog, dev, 1);
6574
6575                 netdev_wait_allrefs(dev);
6576
6577                 /* paranoia */
6578                 BUG_ON(netdev_refcnt_read(dev));
6579                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6580                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6581                 WARN_ON(dev->dn_ptr);
6582
6583                 if (dev->destructor)
6584                         dev->destructor(dev);
6585
6586                 /* Report a network device has been unregistered */
6587                 rtnl_lock();
6588                 dev_net(dev)->dev_unreg_count--;
6589                 __rtnl_unlock();
6590                 wake_up(&netdev_unregistering_wq);
6591
6592                 /* Free network device */
6593                 kobject_put(&dev->dev.kobj);
6594         }
6595 }
6596
6597 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6598  * fields in the same order, with only the type differing.
6599  */
6600 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6601                              const struct net_device_stats *netdev_stats)
6602 {
6603 #if BITS_PER_LONG == 64
6604         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6605         memcpy(stats64, netdev_stats, sizeof(*stats64));
6606 #else
6607         size_t i, n = sizeof(*stats64) / sizeof(u64);
6608         const unsigned long *src = (const unsigned long *)netdev_stats;
6609         u64 *dst = (u64 *)stats64;
6610
6611         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6612                      sizeof(*stats64) / sizeof(u64));
6613         for (i = 0; i < n; i++)
6614                 dst[i] = src[i];
6615 #endif
6616 }
6617 EXPORT_SYMBOL(netdev_stats_to_stats64);
6618
6619 /**
6620  *      dev_get_stats   - get network device statistics
6621  *      @dev: device to get statistics from
6622  *      @storage: place to store stats
6623  *
6624  *      Get network statistics from device. Return @storage.
6625  *      The device driver may provide its own method by setting
6626  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6627  *      otherwise the internal statistics structure is used.
6628  */
6629 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6630                                         struct rtnl_link_stats64 *storage)
6631 {
6632         const struct net_device_ops *ops = dev->netdev_ops;
6633
6634         if (ops->ndo_get_stats64) {
6635                 memset(storage, 0, sizeof(*storage));
6636                 ops->ndo_get_stats64(dev, storage);
6637         } else if (ops->ndo_get_stats) {
6638                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6639         } else {
6640                 netdev_stats_to_stats64(storage, &dev->stats);
6641         }
6642         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6643         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6644         return storage;
6645 }
6646 EXPORT_SYMBOL(dev_get_stats);
6647
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialized with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6665
6666 static const struct ethtool_ops default_ethtool_ops;
6667
6668 void netdev_set_default_ethtool_ops(struct net_device *dev,
6669                                     const struct ethtool_ops *ops)
6670 {
6671         if (dev->ethtool_ops == &default_ethtool_ops)
6672                 dev->ethtool_ops = ops;
6673 }
6674 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6675
6676 void netdev_freemem(struct net_device *dev)
6677 {
6678         char *addr = (char *)dev - dev->padded;
6679
6680         kvfree(addr);
6681 }
6682
6683 /**
6684  *      alloc_netdev_mqs - allocate network device
6685  *      @sizeof_priv:           size of private data to allocate space for
6686  *      @name:                  device name format string
6687  *      @name_assign_type:      origin of device name
6688  *      @setup:                 callback to initialize device
6689  *      @txqs:                  the number of TX subqueues to allocate
6690  *      @rxqs:                  the number of RX subqueues to allocate
6691  *
6692  *      Allocates a struct net_device with private data area for driver use
6693  *      and performs basic initialization.  Also allocates subqueue structs
6694  *      for each queue on the device.
6695  */
6696 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6697                 unsigned char name_assign_type,
6698                 void (*setup)(struct net_device *),
6699                 unsigned int txqs, unsigned int rxqs)
6700 {
6701         struct net_device *dev;
6702         size_t alloc_size;
6703         struct net_device *p;
6704
6705         BUG_ON(strlen(name) >= sizeof(dev->name));
6706
6707         if (txqs < 1) {
6708                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6709                 return NULL;
6710         }
6711
6712 #ifdef CONFIG_SYSFS
6713         if (rxqs < 1) {
6714                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6715                 return NULL;
6716         }
6717 #endif
6718
6719         alloc_size = sizeof(struct net_device);
6720         if (sizeof_priv) {
6721                 /* ensure 32-byte alignment of private area */
6722                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6723                 alloc_size += sizeof_priv;
6724         }
6725         /* ensure 32-byte alignment of whole construct */
6726         alloc_size += NETDEV_ALIGN - 1;
6727
6728         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6729         if (!p)
6730                 p = vzalloc(alloc_size);
6731         if (!p)
6732                 return NULL;
6733
6734         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6735         dev->padded = (char *)dev - (char *)p;
6736
6737         dev->pcpu_refcnt = alloc_percpu(int);
6738         if (!dev->pcpu_refcnt)
6739                 goto free_dev;
6740
6741         if (dev_addr_init(dev))
6742                 goto free_pcpu;
6743
6744         dev_mc_init(dev);
6745         dev_uc_init(dev);
6746
6747         dev_net_set(dev, &init_net);
6748
6749         dev->gso_max_size = GSO_MAX_SIZE;
6750         dev->gso_max_segs = GSO_MAX_SEGS;
6751         dev->gso_min_segs = 0;
6752
6753         INIT_LIST_HEAD(&dev->napi_list);
6754         INIT_LIST_HEAD(&dev->unreg_list);
6755         INIT_LIST_HEAD(&dev->close_list);
6756         INIT_LIST_HEAD(&dev->link_watch_list);
6757         INIT_LIST_HEAD(&dev->adj_list.upper);
6758         INIT_LIST_HEAD(&dev->adj_list.lower);
6759         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6760         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6761         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6762         setup(dev);
6763
6764         dev->num_tx_queues = txqs;
6765         dev->real_num_tx_queues = txqs;
6766         if (netif_alloc_netdev_queues(dev))
6767                 goto free_all;
6768
6769 #ifdef CONFIG_SYSFS
6770         dev->num_rx_queues = rxqs;
6771         dev->real_num_rx_queues = rxqs;
6772         if (netif_alloc_rx_queues(dev))
6773                 goto free_all;
6774 #endif
6775
6776         strcpy(dev->name, name);
6777         dev->name_assign_type = name_assign_type;
6778         dev->group = INIT_NETDEV_GROUP;
6779         if (!dev->ethtool_ops)
6780                 dev->ethtool_ops = &default_ethtool_ops;
6781         return dev;
6782
6783 free_all:
6784         free_netdev(dev);
6785         return NULL;
6786
6787 free_pcpu:
6788         free_percpu(dev->pcpu_refcnt);
6789 free_dev:
6790         netdev_freemem(dev);
6791         return NULL;
6792 }
6793 EXPORT_SYMBOL(alloc_netdev_mqs);
6794
6795 /**
6796  *      free_netdev - free network device
6797  *      @dev: device
6798  *
6799  *      This function does the last stage of destroying an allocated device
6800  *      interface. The reference to the device object is released.
6801  *      If this is the last reference then it will be freed.
6802  */
6803 void free_netdev(struct net_device *dev)
6804 {
6805         struct napi_struct *p, *n;
6806
6807         release_net(dev_net(dev));
6808
6809         netif_free_tx_queues(dev);
6810 #ifdef CONFIG_SYSFS
6811         kfree(dev->_rx);
6812 #endif
6813
6814         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6815
6816         /* Flush device addresses */
6817         dev_addr_flush(dev);
6818
6819         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6820                 netif_napi_del(p);
6821
6822         free_percpu(dev->pcpu_refcnt);
6823         dev->pcpu_refcnt = NULL;
6824
6825         /*  Compatibility with error handling in drivers */
6826         if (dev->reg_state == NETREG_UNINITIALIZED) {
6827                 netdev_freemem(dev);
6828                 return;
6829         }
6830
6831         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6832         dev->reg_state = NETREG_RELEASED;
6833
6834         /* will free via device release */
6835         put_device(&dev->dev);
6836 }
6837 EXPORT_SYMBOL(free_netdev);
6838
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used when RTNL is held, the
 *	normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6854
6855 /**
6856  *      unregister_netdevice_queue - remove device from the kernel
6857  *      @dev: device
6858  *      @head: list
6859  *
6860  *      This function shuts down a device interface and removes it
6861  *      from the kernel tables.
6862  *      If head not NULL, device is queued to be unregistered later.
6863  *
6864  *      Callers must hold the rtnl semaphore.  You may want
6865  *      unregister_netdev() instead of this.
6866  */
6867
6868 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6869 {
6870         ASSERT_RTNL();
6871
6872         if (head) {
6873                 list_move_tail(&dev->unreg_list, head);
6874         } else {
6875                 rollback_registered(dev);
6876                 /* Finish processing unregister after unlock */
6877                 net_set_todo(dev);
6878         }
6879 }
6880 EXPORT_SYMBOL(unregister_netdevice_queue);
6881
6882 /**
6883  *      unregister_netdevice_many - unregister many devices
6884  *      @head: list of devices
6885  *
6886  *  Note: As most callers use a stack allocated list_head,
6887  *  we force a list_del() to make sure stack wont be corrupted later.
6888  */
6889 void unregister_netdevice_many(struct list_head *head)
6890 {
6891         struct net_device *dev;
6892
6893         if (!list_empty(head)) {
6894                 rollback_registered_many(head);
6895                 list_for_each_entry(dev, head, unreg_list)
6896                         net_set_todo(dev);
6897                 list_del(head);
6898         }
6899 }
6900 EXPORT_SYMBOL(unregister_netdevice_many);
6901
6902 /**
6903  *      unregister_netdev - remove device from the kernel
6904  *      @dev: device
6905  *
6906  *      This function shuts down a device interface and removes it
6907  *      from the kernel tables.
6908  *
6909  *      This is just a wrapper for unregister_netdevice that takes
6910  *      the rtnl semaphore.  In general you want to use this and not
6911  *      unregister_netdevice.
6912  */
6913 void unregister_netdev(struct net_device *dev)
6914 {
6915         rtnl_lock();
6916         unregister_netdevice(dev);
6917         rtnl_unlock();
6918 }
6919 EXPORT_SYMBOL(unregister_netdev);
6920
6921 /**
6922  *      dev_change_net_namespace - move device to different nethost namespace
6923  *      @dev: device
6924  *      @net: network namespace
6925  *      @pat: If not NULL name pattern to try if the current device name
6926  *            is already taken in the destination network namespace.
6927  *
6928  *      This function shuts down a device interface and moves it
6929  *      to a new network namespace. On success 0 is returned, on
6930  *      a failure a netagive errno code is returned.
6931  *
6932  *      Callers must hold the rtnl semaphore.
6933  */
6934
6935 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6936 {
6937         int err;
6938
6939         ASSERT_RTNL();
6940
6941         /* Don't allow namespace local devices to be moved. */
6942         err = -EINVAL;
6943         if (dev->features & NETIF_F_NETNS_LOCAL)
6944                 goto out;
6945
6946         /* Ensure the device has been registrered */
6947         if (dev->reg_state != NETREG_REGISTERED)
6948                 goto out;
6949
6950         /* Get out if there is nothing todo */
6951         err = 0;
6952         if (net_eq(dev_net(dev), net))
6953                 goto out;
6954
6955         /* Pick the destination device name, and ensure
6956          * we can use it in the destination network namespace.
6957          */
6958         err = -EEXIST;
6959         if (__dev_get_by_name(net, dev->name)) {
6960                 /* We get here if we can't use the current device name */
6961                 if (!pat)
6962                         goto out;
6963                 if (dev_get_valid_name(net, dev, pat) < 0)
6964                         goto out;
6965         }
6966
6967         /*
6968          * And now a mini version of register_netdevice unregister_netdevice.
6969          */
6970
6971         /* If device is running close it first. */
6972         dev_close(dev);
6973
6974         /* And unlink it from device chain */
6975         err = -ENODEV;
6976         unlist_netdevice(dev);
6977
6978         synchronize_net();
6979
6980         /* Shutdown queueing discipline. */
6981         dev_shutdown(dev);
6982
6983         /* Notify protocols, that we are about to destroy
6984            this device. They should clean all the things.
6985
6986            Note that dev->reg_state stays at NETREG_REGISTERED.
6987            This is wanted because this way 8021q and macvlan know
6988            the device is just moving and can keep their slaves up.
6989         */
6990         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6991         rcu_barrier();
6992         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6993         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6994
6995         /*
6996          *      Flush the unicast and multicast chains
6997          */
6998         dev_uc_flush(dev);
6999         dev_mc_flush(dev);
7000
7001         /* Send a netdev-removed uevent to the old namespace */
7002         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7003         netdev_adjacent_del_links(dev);
7004
7005         /* Actually switch the network namespace */
7006         dev_net_set(dev, net);
7007
7008         /* If there is an ifindex conflict assign a new one */
7009         if (__dev_get_by_index(net, dev->ifindex)) {
7010                 int iflink = (dev->iflink == dev->ifindex);
7011                 dev->ifindex = dev_new_index(net);
7012                 if (iflink)
7013                         dev->iflink = dev->ifindex;
7014         }
7015
7016         /* Send a netdev-add uevent to the new namespace */
7017         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7018         netdev_adjacent_add_links(dev);
7019
7020         /* Fixup kobjects */
7021         err = device_rename(&dev->dev, dev->name);
7022         WARN_ON(err);
7023
7024         /* Add the device back in the hashes */
7025         list_netdevice(dev);
7026
7027         /* Notify protocols, that a new device appeared. */
7028         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7029
7030         /*
7031          *      Prevent userspace races by waiting until the network
7032          *      device is fully setup before sending notifications.
7033          */
7034         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7035
7036         synchronize_net();
7037         err = 0;
7038 out:
7039         return err;
7040 }
7041 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7042
7043 static int dev_cpu_callback(struct notifier_block *nfb,
7044                             unsigned long action,
7045                             void *ocpu)
7046 {
7047         struct sk_buff **list_skb;
7048         struct sk_buff *skb;
7049         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7050         struct softnet_data *sd, *oldsd;
7051
7052         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7053                 return NOTIFY_OK;
7054
7055         local_irq_disable();
7056         cpu = smp_processor_id();
7057         sd = &per_cpu(softnet_data, cpu);
7058         oldsd = &per_cpu(softnet_data, oldcpu);
7059
7060         /* Find end of our completion_queue. */
7061         list_skb = &sd->completion_queue;
7062         while (*list_skb)
7063                 list_skb = &(*list_skb)->next;
7064         /* Append completion queue from offline CPU. */
7065         *list_skb = oldsd->completion_queue;
7066         oldsd->completion_queue = NULL;
7067
7068         /* Append output queue from offline CPU. */
7069         if (oldsd->output_queue) {
7070                 *sd->output_queue_tailp = oldsd->output_queue;
7071                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7072                 oldsd->output_queue = NULL;
7073                 oldsd->output_queue_tailp = &oldsd->output_queue;
7074         }
7075         /* Append NAPI poll list from offline CPU. */
7076         if (!list_empty(&oldsd->poll_list)) {
7077                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
7078                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
7079         }
7080
7081         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7082         local_irq_enable();
7083
7084         /* Process offline CPU's input_pkt_queue */
7085         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7086                 netif_rx_internal(skb);
7087                 input_queue_head_incr(oldsd);
7088         }
7089         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7090                 netif_rx_internal(skb);
7091                 input_queue_head_incr(oldsd);
7092         }
7093
7094         return NOTIFY_OK;
7095 }
7096
7097
7098 /**
7099  *      netdev_increment_features - increment feature set by one
7100  *      @all: current feature set
7101  *      @one: new feature set
7102  *      @mask: mask feature set
7103  *
7104  *      Computes a new feature set after adding a device with feature set
7105  *      @one to the master device with current feature set @all.  Will not
7106  *      enable anything that is off in @mask. Returns the new feature set.
7107  */
7108 netdev_features_t netdev_increment_features(netdev_features_t all,
7109         netdev_features_t one, netdev_features_t mask)
7110 {
7111         if (mask & NETIF_F_GEN_CSUM)
7112                 mask |= NETIF_F_ALL_CSUM;
7113         mask |= NETIF_F_VLAN_CHALLENGED;
7114
7115         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7116         all &= one | ~NETIF_F_ALL_FOR_ALL;
7117
7118         /* If one device supports hw checksumming, set for all. */
7119         if (all & NETIF_F_GEN_CSUM)
7120                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7121
7122         return all;
7123 }
7124 EXPORT_SYMBOL(netdev_increment_features);
7125
7126 static struct hlist_head * __net_init netdev_create_hash(void)
7127 {
7128         int i;
7129         struct hlist_head *hash;
7130
7131         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7132         if (hash != NULL)
7133                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7134                         INIT_HLIST_HEAD(&hash[i]);
7135
7136         return hash;
7137 }
7138
7139 /* Initialize per network namespace state */
7140 static int __net_init netdev_init(struct net *net)
7141 {
7142         if (net != &init_net)
7143                 INIT_LIST_HEAD(&net->dev_base_head);
7144
7145         net->dev_name_head = netdev_create_hash();
7146         if (net->dev_name_head == NULL)
7147                 goto err_name;
7148
7149         net->dev_index_head = netdev_create_hash();
7150         if (net->dev_index_head == NULL)
7151                 goto err_idx;
7152
7153         return 0;
7154
7155 err_idx:
7156         kfree(net->dev_name_head);
7157 err_name:
7158         return -ENOMEM;
7159 }
7160
7161 /**
7162  *      netdev_drivername - network driver for the device
7163  *      @dev: network device
7164  *
7165  *      Determine network driver for device.
7166  */
7167 const char *netdev_drivername(const struct net_device *dev)
7168 {
7169         const struct device_driver *driver;
7170         const struct device *parent;
7171         const char *empty = "";
7172
7173         parent = dev->dev.parent;
7174         if (!parent)
7175                 return empty;
7176
7177         driver = parent->driver;
7178         if (driver && driver->name)
7179                 return driver->name;
7180         return empty;
7181 }
7182
7183 static void __netdev_printk(const char *level, const struct net_device *dev,
7184                             struct va_format *vaf)
7185 {
7186         if (dev && dev->dev.parent) {
7187                 dev_printk_emit(level[1] - '0',
7188                                 dev->dev.parent,
7189                                 "%s %s %s%s: %pV",
7190                                 dev_driver_string(dev->dev.parent),
7191                                 dev_name(dev->dev.parent),
7192                                 netdev_name(dev), netdev_reg_state(dev),
7193                                 vaf);
7194         } else if (dev) {
7195                 printk("%s%s%s: %pV",
7196                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7197         } else {
7198                 printk("%s(NULL net_device): %pV", level, vaf);
7199         }
7200 }
7201
7202 void netdev_printk(const char *level, const struct net_device *dev,
7203                    const char *format, ...)
7204 {
7205         struct va_format vaf;
7206         va_list args;
7207
7208         va_start(args, format);
7209
7210         vaf.fmt = format;
7211         vaf.va = &args;
7212
7213         __netdev_printk(level, dev, &vaf);
7214
7215         va_end(args);
7216 }
7217 EXPORT_SYMBOL(netdev_printk);
7218
7219 #define define_netdev_printk_level(func, level)                 \
7220 void func(const struct net_device *dev, const char *fmt, ...)   \
7221 {                                                               \
7222         struct va_format vaf;                                   \
7223         va_list args;                                           \
7224                                                                 \
7225         va_start(args, fmt);                                    \
7226                                                                 \
7227         vaf.fmt = fmt;                                          \
7228         vaf.va = &args;                                         \
7229                                                                 \
7230         __netdev_printk(level, dev, &vaf);                      \
7231                                                                 \
7232         va_end(args);                                           \
7233 }                                                               \
7234 EXPORT_SYMBOL(func);
7235
7236 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7237 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7238 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7239 define_netdev_printk_level(netdev_err, KERN_ERR);
7240 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7241 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7242 define_netdev_printk_level(netdev_info, KERN_INFO);
7243
7244 static void __net_exit netdev_exit(struct net *net)
7245 {
7246         kfree(net->dev_name_head);
7247         kfree(net->dev_index_head);
7248 }
7249
7250 static struct pernet_operations __net_initdata netdev_net_ops = {
7251         .init = netdev_init,
7252         .exit = netdev_exit,
7253 };
7254
7255 static void __net_exit default_device_exit(struct net *net)
7256 {
7257         struct net_device *dev, *aux;
7258         /*
7259          * Push all migratable network devices back to the
7260          * initial network namespace
7261          */
7262         rtnl_lock();
7263         for_each_netdev_safe(net, dev, aux) {
7264                 int err;
7265                 char fb_name[IFNAMSIZ];
7266
7267                 /* Ignore unmoveable devices (i.e. loopback) */
7268                 if (dev->features & NETIF_F_NETNS_LOCAL)
7269                         continue;
7270
7271                 /* Leave virtual devices for the generic cleanup */
7272                 if (dev->rtnl_link_ops)
7273                         continue;
7274
7275                 /* Push remaining network devices to init_net */
7276                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7277                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7278                 if (err) {
7279                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7280                                  __func__, dev->name, err);
7281                         BUG();
7282                 }
7283         }
7284         rtnl_unlock();
7285 }
7286
7287 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7288 {
7289         /* Return with the rtnl_lock held when there are no network
7290          * devices unregistering in any network namespace in net_list.
7291          */
7292         struct net *net;
7293         bool unregistering;
7294         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7295
7296         add_wait_queue(&netdev_unregistering_wq, &wait);
7297         for (;;) {
7298                 unregistering = false;
7299                 rtnl_lock();
7300                 list_for_each_entry(net, net_list, exit_list) {
7301                         if (net->dev_unreg_count > 0) {
7302                                 unregistering = true;
7303                                 break;
7304                         }
7305                 }
7306                 if (!unregistering)
7307                         break;
7308                 __rtnl_unlock();
7309
7310                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7311         }
7312         remove_wait_queue(&netdev_unregistering_wq, &wait);
7313 }
7314
7315 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7316 {
7317         /* At exit all network devices most be removed from a network
7318          * namespace.  Do this in the reverse order of registration.
7319          * Do this across as many network namespaces as possible to
7320          * improve batching efficiency.
7321          */
7322         struct net_device *dev;
7323         struct net *net;
7324         LIST_HEAD(dev_kill_list);
7325
7326         /* To prevent network device cleanup code from dereferencing
7327          * loopback devices or network devices that have been freed
7328          * wait here for all pending unregistrations to complete,
7329          * before unregistring the loopback device and allowing the
7330          * network namespace be freed.
7331          *
7332          * The netdev todo list containing all network devices
7333          * unregistrations that happen in default_device_exit_batch
7334          * will run in the rtnl_unlock() at the end of
7335          * default_device_exit_batch.
7336          */
7337         rtnl_lock_unregistering(net_list);
7338         list_for_each_entry(net, net_list, exit_list) {
7339                 for_each_netdev_reverse(net, dev) {
7340                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7341                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7342                         else
7343                                 unregister_netdevice_queue(dev, &dev_kill_list);
7344                 }
7345         }
7346         unregister_netdevice_many(&dev_kill_list);
7347         rtnl_unlock();
7348 }
7349
7350 static struct pernet_operations __net_initdata default_device_ops = {
7351         .exit = default_device_exit,
7352         .exit_batch = default_device_exit_batch,
7353 };
7354
7355 /*
7356  *      Initialize the DEV module. At boot time this walks the device list and
7357  *      unhooks any devices that fail to initialise (normally hardware not
7358  *      present) and leaves us with a valid list of present and active devices.
7359  *
7360  */
7361
7362 /*
7363  *       This is called single threaded during boot, so no need
7364  *       to take the rtnl semaphore.
7365  */
7366 static int __init net_dev_init(void)
7367 {
7368         int i, rc = -ENOMEM;
7369
7370         BUG_ON(!dev_boot_phase);
7371
7372         if (dev_proc_init())
7373                 goto out;
7374
7375         if (netdev_kobject_init())
7376                 goto out;
7377
7378         INIT_LIST_HEAD(&ptype_all);
7379         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7380                 INIT_LIST_HEAD(&ptype_base[i]);
7381
7382         INIT_LIST_HEAD(&offload_base);
7383
7384         if (register_pernet_subsys(&netdev_net_ops))
7385                 goto out;
7386
7387         /*
7388          *      Initialise the packet receive queues.
7389          */
7390
7391         for_each_possible_cpu(i) {
7392                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7393
7394                 skb_queue_head_init(&sd->input_pkt_queue);
7395                 skb_queue_head_init(&sd->process_queue);
7396                 INIT_LIST_HEAD(&sd->poll_list);
7397                 sd->output_queue_tailp = &sd->output_queue;
7398 #ifdef CONFIG_RPS
7399                 sd->csd.func = rps_trigger_softirq;
7400                 sd->csd.info = sd;
7401                 sd->cpu = i;
7402 #endif
7403
7404                 sd->backlog.poll = process_backlog;
7405                 sd->backlog.weight = weight_p;
7406         }
7407
7408         dev_boot_phase = 0;
7409
7410         /* The loopback device is special if any other network devices
7411          * is present in a network namespace the loopback device must
7412          * be present. Since we now dynamically allocate and free the
7413          * loopback device ensure this invariant is maintained by
7414          * keeping the loopback device as the first device on the
7415          * list of network devices.  Ensuring the loopback devices
7416          * is the first device that appears and the last network device
7417          * that disappears.
7418          */
7419         if (register_pernet_device(&loopback_net_ops))
7420                 goto out;
7421
7422         if (register_pernet_device(&default_device_ops))
7423                 goto out;
7424
7425         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7426         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7427
7428         hotcpu_notifier(dev_cpu_callback, 0);
7429         dst_init();
7430         rc = 0;
7431 out:
7432         return rc;
7433 }
7434
7435 subsys_initcall(net_dev_init);