2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/notifier.h>
94 #include <linux/skbuff.h>
96 #include <linux/rtnetlink.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/stat.h>
100 #include <linux/if_bridge.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <linux/highmem.h>
105 #include <linux/init.h>
106 #include <linux/kmod.h>
107 #include <linux/module.h>
108 #include <linux/kallsyms.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/wext.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
121 * The list of packet types we will receive (as opposed to discard)
122 * and the routines to invoke.
124 * Why 16. Because with 16 the only overlap we get on a hash of the
125 * low nibble of the protocol value is RARP/SNAP/X.25.
127 * NOTE: That is no longer true with the addition of VLAN tags. Not
128 * sure which should go first, but I bet it won't make much
129 * difference if we are running VLANs. The good news is that
130 * this protocol won't be in the list unless compiled in, so
131 * the average user (w/out VLANs) will not be adversely affected.
148 static DEFINE_SPINLOCK(ptype_lock);
149 static struct list_head ptype_base[16] __read_mostly; /* 16 way hashed list */
150 static struct list_head ptype_all __read_mostly; /* Taps */
152 #ifdef CONFIG_NET_DMA
153 static struct dma_client *net_dma_client;
154 static unsigned int net_dma_count;
155 static spinlock_t net_dma_event_lock;
159 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
162 * Pure readers hold dev_base_lock for reading.
164 * Writers must hold the rtnl semaphore while they loop through the
165 * dev_base_head list, and hold dev_base_lock for writing when they do the
166 * actual updates. This allows pure readers to access the list even
167 * while a writer is preparing to update it.
169 * To put it another way, dev_base_lock is held for writing only to
170 * protect against pure readers; the rtnl semaphore provides the
171 * protection against other writers.
173 * See, for example usages, register_netdevice() and
174 * unregister_netdevice(), which must be called with the rtnl
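/*
 * A minimal sketch of the pure-reader pattern described above (the walk
 * itself is illustrative; for_each_netdev() and dev_base_lock are the real
 * primitives used throughout this file):
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(dev) {
 *		... inspect dev; do not sleep and do not modify the list ...
 *	}
 *	read_unlock(&dev_base_lock);
 *
 * A writer instead holds the rtnl lock for the walk and only takes
 * dev_base_lock for writing around the actual list update.
 */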
177 LIST_HEAD(dev_base_head);
178 DEFINE_RWLOCK(dev_base_lock);
180 EXPORT_SYMBOL(dev_base_head);
181 EXPORT_SYMBOL(dev_base_lock);
183 #define NETDEV_HASHBITS 8
184 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
185 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
187 static inline struct hlist_head *dev_name_hash(const char *name)
189 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
190 return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
193 static inline struct hlist_head *dev_index_hash(int ifindex)
195 return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
202 static RAW_NOTIFIER_HEAD(netdev_chain);
205 * Device drivers call our routines to queue packets here. We empty the
206 * queue in the local softnet handler.
208 DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
211 extern int netdev_sysfs_init(void);
212 extern int netdev_register_sysfs(struct net_device *);
213 extern void netdev_unregister_sysfs(struct net_device *);
215 #define netdev_sysfs_init() (0)
216 #define netdev_register_sysfs(dev) (0)
217 #define netdev_unregister_sysfs(dev) do { } while(0)
221 /*******************************************************************************
223 Protocol management and registration routines
225 *******************************************************************************/
228 * Add a protocol ID to the list. Now that the input handler is
229 * smarter we can dispense with all the messy stuff that used to be
232 * BEWARE!!! Protocol handlers that mangle input packets
233 * MUST BE last in the hash buckets, and checking protocol handlers
234 * MUST start from the promiscuous ptype_all chain in net_bh.
235 * This holds now; do not change it.
236 * Explanation: if a packet-mangling protocol handler were
237 * the first on the list, it could not sense that the packet
238 * is cloned and should be copied-on-write, so it would
239 * change it and subsequent readers would get a broken packet.
244 * dev_add_pack - add packet handler
245 * @pt: packet type declaration
247 * Add a protocol handler to the networking stack. The passed &packet_type
248 * is linked into kernel lists and may not be freed until it has been
249 * removed from the kernel lists.
251 * This call does not sleep, therefore it cannot
252 * guarantee that all CPUs that are in the middle of receiving packets
253 * will see the new packet type (until the next received packet).
256 void dev_add_pack(struct packet_type *pt)
260 spin_lock_bh(&ptype_lock);
261 if (pt->type == htons(ETH_P_ALL))
262 list_add_rcu(&pt->list, &ptype_all);
264 hash = ntohs(pt->type) & 15;
265 list_add_rcu(&pt->list, &ptype_base[hash]);
267 spin_unlock_bh(&ptype_lock);
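/*
 * A minimal usage sketch for dev_add_pack()/dev_remove_pack(); my_rcv and
 * my_packet_type are hypothetical, but real handlers in the tree (e.g.
 * af_packet, ip_rcv) follow the same pattern:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		...
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 *	static struct packet_type my_packet_type = {
 *		.type = __constant_htons(ETH_P_IP),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);
 *	...
 *	dev_remove_pack(&my_packet_type);
 */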
271 * __dev_remove_pack - remove packet handler
272 * @pt: packet type declaration
274 * Remove a protocol handler that was previously added to the kernel
275 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
276 * from the kernel lists and can be freed or reused once this function
279 * The packet type might still be in use by receivers
280 * and must not be freed until after all the CPUs have gone
281 * through a quiescent state.
283 void __dev_remove_pack(struct packet_type *pt)
285 struct list_head *head;
286 struct packet_type *pt1;
288 spin_lock_bh(&ptype_lock);
290 if (pt->type == htons(ETH_P_ALL))
293 head = &ptype_base[ntohs(pt->type) & 15];
295 list_for_each_entry(pt1, head, list) {
297 list_del_rcu(&pt->list);
302 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
304 spin_unlock_bh(&ptype_lock);
307 * dev_remove_pack - remove packet handler
308 * @pt: packet type declaration
310 * Remove a protocol handler that was previously added to the kernel
311 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
312 * from the kernel lists and can be freed or reused once this function
315 * This call sleeps to guarantee that no CPU is looking at the packet
318 void dev_remove_pack(struct packet_type *pt)
320 __dev_remove_pack(pt);
325 /******************************************************************************
327 Device Boot-time Settings Routines
329 *******************************************************************************/
331 /* Boot time configuration table */
332 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
335 * netdev_boot_setup_add - add new setup entry
336 * @name: name of the device
337 * @map: configured settings for the device
339 * Adds new setup entry to the dev_boot_setup list. The function
340 * returns 0 on error and 1 on success. This is a generic routine to
343 static int netdev_boot_setup_add(char *name, struct ifmap *map)
345 struct netdev_boot_setup *s;
349 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
350 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
351 memset(s[i].name, 0, sizeof(s[i].name));
352 strcpy(s[i].name, name);
353 memcpy(&s[i].map, map, sizeof(s[i].map));
358 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
362 * netdev_boot_setup_check - check boot time settings
363 * @dev: the netdevice
365 * Check boot time settings for the device.
366 * Any settings found are applied to the device for use
367 * later during device probing.
368 * Returns 0 if no settings were found, 1 if they were.
370 int netdev_boot_setup_check(struct net_device *dev)
372 struct netdev_boot_setup *s = dev_boot_setup;
375 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
376 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
377 !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
378 dev->irq = s[i].map.irq;
379 dev->base_addr = s[i].map.base_addr;
380 dev->mem_start = s[i].map.mem_start;
381 dev->mem_end = s[i].map.mem_end;
390 * netdev_boot_base - get address from boot time settings
391 * @prefix: prefix for network device
392 * @unit: id for network device
394 * Check boot time settings for the base address of the device.
395 * Any settings found are applied to the device for use
396 * later during device probing.
397 * Returns 0 if no settings were found.
399 unsigned long netdev_boot_base(const char *prefix, int unit)
401 const struct netdev_boot_setup *s = dev_boot_setup;
405 sprintf(name, "%s%d", prefix, unit);
408 * If the device is already registered then return a base of 1
409 * to indicate that this interface should not be probed
411 if (__dev_get_by_name(name))
414 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
415 if (!strcmp(name, s[i].name))
416 return s[i].map.base_addr;
421 * Saves at boot time configured settings for any netdevice.
423 int __init netdev_boot_setup(char *str)
428 str = get_options(str, ARRAY_SIZE(ints), ints);
433 memset(&map, 0, sizeof(map));
437 map.base_addr = ints[2];
439 map.mem_start = ints[3];
441 map.mem_end = ints[4];
443 /* Add new entry to the list */
444 return netdev_boot_setup_add(str, &map);
447 __setup("netdev=", netdev_boot_setup);
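/*
 * Example (values are illustrative only): booting with
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * stores irq 5 and I/O base 0x340 for "eth0" in dev_boot_setup, to be
 * picked up later by netdev_boot_setup_check() during device probing.
 */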
449 /*******************************************************************************
451 Device Interface Subroutines
453 *******************************************************************************/
456 * __dev_get_by_name - find a device by its name
457 * @name: name to find
459 * Find an interface by name. Must be called under RTNL semaphore
460 * or @dev_base_lock. If the name is found a pointer to the device
461 * is returned. If the name is not found then %NULL is returned. The
462 * reference counters are not incremented so the caller must be
463 * careful with locks.
466 struct net_device *__dev_get_by_name(const char *name)
468 struct hlist_node *p;
470 hlist_for_each(p, dev_name_hash(name)) {
471 struct net_device *dev
472 = hlist_entry(p, struct net_device, name_hlist);
473 if (!strncmp(dev->name, name, IFNAMSIZ))
480 * dev_get_by_name - find a device by its name
481 * @name: name to find
483 * Find an interface by name. This can be called from any
484 * context and does its own locking. The returned handle has
485 * the usage count incremented and the caller must use dev_put() to
486 * release it when it is no longer needed. %NULL is returned if no
487 * matching device is found.
490 struct net_device *dev_get_by_name(const char *name)
492 struct net_device *dev;
494 read_lock(&dev_base_lock);
495 dev = __dev_get_by_name(name);
498 read_unlock(&dev_base_lock);
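/*
 * A minimal usage sketch (the interface name is illustrative); the
 * reference obtained here must be dropped with dev_put() when done:
 *
 *	struct net_device *dev = dev_get_by_name("eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 */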
503 * __dev_get_by_index - find a device by its ifindex
504 * @ifindex: index of device
506 * Search for an interface by index. Returns %NULL if the device
507 * is not found or a pointer to the device. The device has not
508 * had its reference counter increased so the caller must be careful
509 * about locking. The caller must hold either the RTNL semaphore
513 struct net_device *__dev_get_by_index(int ifindex)
515 struct hlist_node *p;
517 hlist_for_each(p, dev_index_hash(ifindex)) {
518 struct net_device *dev
519 = hlist_entry(p, struct net_device, index_hlist);
520 if (dev->ifindex == ifindex)
528 * dev_get_by_index - find a device by its ifindex
529 * @ifindex: index of device
531 * Search for an interface by index. Returns NULL if the device
532 * is not found or a pointer to the device. The device returned has
533 * had a reference added and the pointer is safe until the user calls
534 * dev_put to indicate they have finished with it.
537 struct net_device *dev_get_by_index(int ifindex)
539 struct net_device *dev;
541 read_lock(&dev_base_lock);
542 dev = __dev_get_by_index(ifindex);
545 read_unlock(&dev_base_lock);
550 * dev_getbyhwaddr - find a device by its hardware address
551 * @type: media type of device
552 * @ha: hardware address
554 * Search for an interface by MAC address. Returns NULL if the device
555 * is not found or a pointer to the device. The caller must hold the
556 * rtnl semaphore. The returned device has not had its ref count increased
557 * and the caller must therefore be careful about locking
560 * If the API were consistent this would be __dev_get_by_hwaddr
563 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
565 struct net_device *dev;
570 if (dev->type == type &&
571 !memcmp(dev->dev_addr, ha, dev->addr_len))
577 EXPORT_SYMBOL(dev_getbyhwaddr);
579 struct net_device *__dev_getfirstbyhwtype(unsigned short type)
581 struct net_device *dev;
585 if (dev->type == type)
591 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
593 struct net_device *dev_getfirstbyhwtype(unsigned short type)
595 struct net_device *dev;
598 dev = __dev_getfirstbyhwtype(type);
605 EXPORT_SYMBOL(dev_getfirstbyhwtype);
608 * dev_get_by_flags - find any device with given flags
609 * @if_flags: IFF_* values
610 * @mask: bitmask of bits in if_flags to check
612 * Search for any interface with the given flags. Returns NULL if a device
613 * is not found or a pointer to the device. The device returned has
614 * had a reference added and the pointer is safe until the user calls
615 * dev_put to indicate they have finished with it.
618 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
620 struct net_device *dev, *ret;
623 read_lock(&dev_base_lock);
624 for_each_netdev(dev) {
625 if (((dev->flags ^ if_flags) & mask) == 0) {
631 read_unlock(&dev_base_lock);
636 * dev_valid_name - check if name is okay for network device
639 * Network device names need to be valid file names
640 * to allow sysfs to work. We also disallow any kind of
643 int dev_valid_name(const char *name)
647 if (strlen(name) >= IFNAMSIZ)
649 if (!strcmp(name, ".") || !strcmp(name, ".."))
653 if (*name == '/' || isspace(*name))
661 * dev_alloc_name - allocate a name for a device
663 * @name: name format string
665 * Passed a format string - e.g. "lt%d" - it will try to find a suitable
666 * id. It scans the list of devices to build up a free map, then chooses
667 * the first empty slot. The caller must hold the dev_base or rtnl lock
668 * while allocating the name and adding the device in order to avoid
670 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
671 * Returns the number of the unit assigned or a negative errno code.
674 int dev_alloc_name(struct net_device *dev, const char *name)
679 const int max_netdevices = 8*PAGE_SIZE;
681 struct net_device *d;
683 p = strnchr(name, IFNAMSIZ-1, '%');
686 * Verify the string as this thing may have come from
687 * the user. There must be either one "%d" and no other "%"
690 if (p[1] != 'd' || strchr(p + 2, '%'))
693 /* Use one page as a bit array of possible slots */
694 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
699 if (!sscanf(d->name, name, &i))
701 if (i < 0 || i >= max_netdevices)
704 /* avoid cases where sscanf is not exact inverse of printf */
705 snprintf(buf, sizeof(buf), name, i);
706 if (!strncmp(buf, d->name, IFNAMSIZ))
710 i = find_first_zero_bit(inuse, max_netdevices);
711 free_page((unsigned long) inuse);
714 snprintf(buf, sizeof(buf), name, i);
715 if (!__dev_get_by_name(buf)) {
716 strlcpy(dev->name, buf, IFNAMSIZ);
720 /* It is possible to run out of possible slots
721 * when the name is long and there isn't enough space left
722 * for the digits, or if all bits are used.
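/*
 * A minimal usage sketch, assuming a hypothetical driver that wants the
 * usual "eth%d" naming and already holds the rtnl lock as described above:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success err is the assigned unit number and dev->name holds the
 * expanded name (e.g. "eth0").
 */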
729 * dev_change_name - change name of a device
731 * @newname: name (or format string) must be at least IFNAMSIZ
733 * Change the name of a device; a format string such as "eth%d" may be passed.
736 int dev_change_name(struct net_device *dev, char *newname)
742 if (dev->flags & IFF_UP)
745 if (!dev_valid_name(newname))
748 if (strchr(newname, '%')) {
749 err = dev_alloc_name(dev, newname);
752 strcpy(newname, dev->name);
754 else if (__dev_get_by_name(newname))
757 strlcpy(dev->name, newname, IFNAMSIZ);
759 device_rename(&dev->dev, dev->name);
760 hlist_del(&dev->name_hlist);
761 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
762 raw_notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
768 * netdev_features_change - device changes features
769 * @dev: device to cause notification
771 * Called to indicate a device has changed features.
773 void netdev_features_change(struct net_device *dev)
775 raw_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
777 EXPORT_SYMBOL(netdev_features_change);
780 * netdev_state_change - device changes state
781 * @dev: device to cause notification
783 * Called to indicate a device has changed state. This function calls
784 * the notifier chains for netdev_chain and sends a NEWLINK message
785 * to the routing socket.
787 void netdev_state_change(struct net_device *dev)
789 if (dev->flags & IFF_UP) {
790 raw_notifier_call_chain(&netdev_chain,
792 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
797 * dev_load - load a network module
798 * @name: name of interface
800 * If a network interface is not present and the process has suitable
801 * privileges this function loads the module. If module loading is not
802 * available in this kernel then it becomes a nop.
805 void dev_load(const char *name)
807 struct net_device *dev;
809 read_lock(&dev_base_lock);
810 dev = __dev_get_by_name(name);
811 read_unlock(&dev_base_lock);
813 if (!dev && capable(CAP_SYS_MODULE))
814 request_module("%s", name);
817 static int default_rebuild_header(struct sk_buff *skb)
819 printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
820 skb->dev ? skb->dev->name : "NULL!!!");
826 * dev_open - prepare an interface for use.
827 * @dev: device to open
829 * Takes a device from down to up state. The device's private open
830 * function is invoked and then the multicast lists are loaded. Finally
831 * the device is moved into the up state and a %NETDEV_UP message is
832 * sent to the netdev notifier chain.
834 * Calling this function on an active interface is a nop. On a failure
835 * a negative errno code is returned.
837 int dev_open(struct net_device *dev)
845 if (dev->flags & IFF_UP)
849 * Is it even present?
851 if (!netif_device_present(dev))
855 * Call device private open method
857 set_bit(__LINK_STATE_START, &dev->state);
859 ret = dev->open(dev);
861 clear_bit(__LINK_STATE_START, &dev->state);
865 * If it went open OK then:
872 dev->flags |= IFF_UP;
875 * Initialize multicasting status
880 * Wakeup transmit queue engine
885 * ... and announce new interface.
887 raw_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
893 * dev_close - shutdown an interface.
894 * @dev: device to shutdown
896 * This function moves an active device into down state. A
897 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
898 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
901 int dev_close(struct net_device *dev)
903 if (!(dev->flags & IFF_UP))
907 * Tell people we are going down, so that they can
908 * prepare for it while the device is still operating.
910 raw_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
914 clear_bit(__LINK_STATE_START, &dev->state);
916 /* Synchronize with a scheduled poll. We cannot touch the poll list,
917 * it may even be on a different cpu. So just clear netif_running(),
918 * and wait until the poll really happens. Actually, the best place
919 * for this is inside dev->stop() after the device has stopped its irq
920 * engine, but this requires more changes in the drivers. */
922 smp_mb__after_clear_bit(); /* Commit netif_running(). */
923 while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
929 * Call the device specific close. This cannot fail.
930 * Only if device is UP
932 * We allow it to be called even after a DETACH hot-plug
939 * Device is now down.
942 dev->flags &= ~IFF_UP;
945 * Tell people we are down
947 raw_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
954 * Device change register/unregister. These are not inline or static
955 * as we export them to the world.
959 * register_netdevice_notifier - register a network notifier block
962 * Register a notifier to be called when network device events occur.
963 * The notifier passed is linked into the kernel structures and must
964 * not be reused until it has been unregistered. A negative errno code
965 * is returned on a failure.
967 * When registered, all registration and up events are replayed
968 * to the new notifier to give it a race-free
969 * view of the network device list.
972 int register_netdevice_notifier(struct notifier_block *nb)
974 struct net_device *dev;
978 err = raw_notifier_chain_register(&netdev_chain, nb);
980 for_each_netdev(dev) {
981 nb->notifier_call(nb, NETDEV_REGISTER, dev);
983 if (dev->flags & IFF_UP)
984 nb->notifier_call(nb, NETDEV_UP, dev);
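/*
 * A minimal notifier sketch (my_netdev_event and my_nb are hypothetical):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *
 * Because registration replays NETDEV_REGISTER/NETDEV_UP for existing
 * devices, the callback sees every device exactly once.
 */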
992 * unregister_netdevice_notifier - unregister a network notifier block
995 * Unregister a notifier previously registered by
996 * register_netdevice_notifier(). The notifier is unlinked from the
997 * kernel structures and may then be reused. A negative errno code
998 * is returned on a failure.
1001 int unregister_netdevice_notifier(struct notifier_block *nb)
1006 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1012 * call_netdevice_notifiers - call all network notifier blocks
1013 * @val: value passed unmodified to notifier function
1014 * @v: pointer passed unmodified to notifier function
1016 * Call all network notifier blocks. Parameters and return value
1017 * are as for raw_notifier_call_chain().
1020 int call_netdevice_notifiers(unsigned long val, void *v)
1022 return raw_notifier_call_chain(&netdev_chain, val, v);
1025 /* When > 0 there are consumers of rx skb time stamps */
1026 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1028 void net_enable_timestamp(void)
1030 atomic_inc(&netstamp_needed);
1033 void net_disable_timestamp(void)
1035 atomic_dec(&netstamp_needed);
1038 static inline void net_timestamp(struct sk_buff *skb)
1040 if (atomic_read(&netstamp_needed))
1041 __net_timestamp(skb);
1043 skb->tstamp.tv64 = 0;
1047 * Support routine. Sends outgoing frames to any network
1048 * taps currently in use.
1051 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1053 struct packet_type *ptype;
1058 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1059 /* Never send packets back to the socket
1062 if ((ptype->dev == dev || !ptype->dev) &&
1063 (ptype->af_packet_priv == NULL ||
1064 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1065 struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1069 /* skb->nh should be correctly
1070 set by sender, so that the second statement is
1071 just protection against buggy protocols.
1073 skb_reset_mac_header(skb2);
1075 if (skb_network_header(skb2) < skb2->data ||
1076 skb2->network_header > skb2->tail) {
1077 if (net_ratelimit())
1078 printk(KERN_CRIT "protocol %04x is "
1080 skb2->protocol, dev->name);
1081 skb_reset_network_header(skb2);
1084 skb2->transport_header = skb2->network_header;
1085 skb2->pkt_type = PACKET_OUTGOING;
1086 ptype->func(skb2, skb->dev, ptype, skb->dev);
1093 void __netif_schedule(struct net_device *dev)
1095 if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1096 unsigned long flags;
1097 struct softnet_data *sd;
1099 local_irq_save(flags);
1100 sd = &__get_cpu_var(softnet_data);
1101 dev->next_sched = sd->output_queue;
1102 sd->output_queue = dev;
1103 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1104 local_irq_restore(flags);
1107 EXPORT_SYMBOL(__netif_schedule);
1109 void __netif_rx_schedule(struct net_device *dev)
1111 unsigned long flags;
1113 local_irq_save(flags);
1115 list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1117 dev->quota += dev->weight;
1119 dev->quota = dev->weight;
1120 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1121 local_irq_restore(flags);
1123 EXPORT_SYMBOL(__netif_rx_schedule);
1125 void dev_kfree_skb_any(struct sk_buff *skb)
1127 if (in_irq() || irqs_disabled())
1128 dev_kfree_skb_irq(skb);
1132 EXPORT_SYMBOL(dev_kfree_skb_any);
1136 void netif_device_detach(struct net_device *dev)
1138 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1139 netif_running(dev)) {
1140 netif_stop_queue(dev);
1143 EXPORT_SYMBOL(netif_device_detach);
1145 void netif_device_attach(struct net_device *dev)
1147 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1148 netif_running(dev)) {
1149 netif_wake_queue(dev);
1150 __netdev_watchdog_up(dev);
1153 EXPORT_SYMBOL(netif_device_attach);
1157 * Invalidate hardware checksum when packet is to be mangled, and
1158 * complete checksum manually on outgoing path.
1160 int skb_checksum_help(struct sk_buff *skb)
1163 int ret = 0, offset;
1165 if (skb->ip_summed == CHECKSUM_COMPLETE)
1166 goto out_set_summed;
1168 if (unlikely(skb_shinfo(skb)->gso_size)) {
1169 /* Let GSO fix up the checksum. */
1170 goto out_set_summed;
1173 if (skb_cloned(skb)) {
1174 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1179 offset = skb->csum_start - skb_headroom(skb);
1180 BUG_ON(offset > (int)skb->len);
1181 csum = skb_checksum(skb, offset, skb->len-offset, 0);
1183 offset = skb_headlen(skb) - offset;
1184 BUG_ON(offset <= 0);
1185 BUG_ON(skb->csum_offset + 2 > offset);
1187 *(__sum16 *)(skb->head + skb->csum_start + skb->csum_offset) =
1190 skb->ip_summed = CHECKSUM_NONE;
1196 * skb_gso_segment - Perform segmentation on skb.
1197 * @skb: buffer to segment
1198 * @features: features for the output path (see dev->features)
1200 * This function segments the given skb and returns a list of segments.
1202 * It may return NULL if the skb requires no segmentation. This is
1203 * only possible when GSO is used for verifying header integrity.
1205 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1207 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1208 struct packet_type *ptype;
1209 __be16 type = skb->protocol;
1212 BUG_ON(skb_shinfo(skb)->frag_list);
1214 skb_reset_mac_header(skb);
1215 skb->mac_len = skb->network_header - skb->mac_header;
1216 __skb_pull(skb, skb->mac_len);
1218 if (WARN_ON(skb->ip_summed != CHECKSUM_PARTIAL)) {
1219 if (skb_header_cloned(skb) &&
1220 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1221 return ERR_PTR(err);
1225 list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & 15], list) {
1226 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1227 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1228 err = ptype->gso_send_check(skb);
1229 segs = ERR_PTR(err);
1230 if (err || skb_gso_ok(skb, features))
1232 __skb_push(skb, (skb->data -
1233 skb_network_header(skb)));
1235 segs = ptype->gso_segment(skb, features);
1241 __skb_push(skb, skb->data - skb_mac_header(skb));
1246 EXPORT_SYMBOL(skb_gso_segment);
1248 /* Take action when hardware reception checksum errors are detected. */
1250 void netdev_rx_csum_fault(struct net_device *dev)
1252 if (net_ratelimit()) {
1253 printk(KERN_ERR "%s: hw csum failure.\n",
1254 dev ? dev->name : "<unknown>");
1258 EXPORT_SYMBOL(netdev_rx_csum_fault);
1261 /* Actually, we should eliminate this check as soon as we know that:
1262 * 1. An IOMMU is present and can map all the memory.
1263 * 2. No high memory really exists on this machine.
1266 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1268 #ifdef CONFIG_HIGHMEM
1271 if (dev->features & NETIF_F_HIGHDMA)
1274 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1275 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1283 void (*destructor)(struct sk_buff *skb);
1286 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1288 static void dev_gso_skb_destructor(struct sk_buff *skb)
1290 struct dev_gso_cb *cb;
1293 struct sk_buff *nskb = skb->next;
1295 skb->next = nskb->next;
1298 } while (skb->next);
1300 cb = DEV_GSO_CB(skb);
1302 cb->destructor(skb);
1306 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1307 * @skb: buffer to segment
1309 * This function segments the given skb and stores the list of segments
1312 static int dev_gso_segment(struct sk_buff *skb)
1314 struct net_device *dev = skb->dev;
1315 struct sk_buff *segs;
1316 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1319 segs = skb_gso_segment(skb, features);
1321 /* Verifying header integrity only. */
1325 if (unlikely(IS_ERR(segs)))
1326 return PTR_ERR(segs);
1329 DEV_GSO_CB(skb)->destructor = skb->destructor;
1330 skb->destructor = dev_gso_skb_destructor;
1335 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
1337 if (likely(!skb->next)) {
1338 if (!list_empty(&ptype_all))
1339 dev_queue_xmit_nit(skb, dev);
1341 if (netif_needs_gso(dev, skb)) {
1342 if (unlikely(dev_gso_segment(skb)))
1348 return dev->hard_start_xmit(skb, dev);
1353 struct sk_buff *nskb = skb->next;
1356 skb->next = nskb->next;
1358 rc = dev->hard_start_xmit(nskb, dev);
1360 nskb->next = skb->next;
1364 if (unlikely(netif_queue_stopped(dev) && skb->next))
1365 return NETDEV_TX_BUSY;
1366 } while (skb->next);
1368 skb->destructor = DEV_GSO_CB(skb)->destructor;
1375 #define HARD_TX_LOCK(dev, cpu) { \
1376 if ((dev->features & NETIF_F_LLTX) == 0) { \
1377 netif_tx_lock(dev); \
1381 #define HARD_TX_UNLOCK(dev) { \
1382 if ((dev->features & NETIF_F_LLTX) == 0) { \
1383 netif_tx_unlock(dev); \
1388 * dev_queue_xmit - transmit a buffer
1389 * @skb: buffer to transmit
1391 * Queue a buffer for transmission to a network device. The caller must
1392 * have set the device and priority and built the buffer before calling
1393 * this function. The function can be called from an interrupt.
1395 * A negative errno code is returned on a failure. A success does not
1396 * guarantee the frame will be transmitted as it may be dropped due
1397 * to congestion or traffic shaping.
1399 * -----------------------------------------------------------------------------------
1400 * I notice this method can also return errors from the queue disciplines,
1401 * including NET_XMIT_DROP, which is a positive value. So, errors can also
1404 * Regardless of the return value, the skb is consumed, so it is currently
1405 * difficult to retry a send to this method. (You can bump the ref count
1406 * before sending to hold a reference for retry if you are careful.)
1408 * When calling this method, interrupts MUST be enabled. This is because
1409 * the BH enable code must have IRQs enabled so that it will not deadlock.
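/*
 * A minimal transmit sketch, assuming the caller already owns an skb with
 * the link-layer header built (dev and the priority value are illustrative):
 *
 *	skb->dev = dev;
 *	skb->priority = 0;
 *	rc = dev_queue_xmit(skb);
 *
 * Whatever rc is, the skb has been consumed and must not be resubmitted
 * unless an extra reference was taken beforehand.
 */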
1413 int dev_queue_xmit(struct sk_buff *skb)
1415 struct net_device *dev = skb->dev;
1419 /* GSO will handle the following emulations directly. */
1420 if (netif_needs_gso(dev, skb))
1423 if (skb_shinfo(skb)->frag_list &&
1424 !(dev->features & NETIF_F_FRAGLIST) &&
1425 __skb_linearize(skb))
1428 /* A fragmented skb is linearized if the device does not support SG,
1429 * or if at least one of the fragments is in highmem and the device
1430 * does not support DMA from it.
1432 if (skb_shinfo(skb)->nr_frags &&
1433 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1434 __skb_linearize(skb))
1437 /* If packet is not checksummed and device does not support
1438 * checksumming for this protocol, complete checksumming here.
1440 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1441 skb_set_transport_header(skb, skb->csum_start -
1444 if (!(dev->features & NETIF_F_GEN_CSUM) &&
1445 (!(dev->features & NETIF_F_IP_CSUM) ||
1446 skb->protocol != htons(ETH_P_IP)))
1447 if (skb_checksum_help(skb))
1452 spin_lock_prefetch(&dev->queue_lock);
1454 /* Disable soft irqs for various locks below. Also
1455 * stops preemption for RCU.
1459 /* Updates of qdisc are serialized by queue_lock.
1460 * The struct Qdisc which is pointed to by qdisc is now an
1461 * RCU structure - it may be accessed without acquiring
1462 * a lock (but the structure may be stale.) The freeing of the
1463 * qdisc will be deferred until it's known that there are no
1464 * more references to it.
1466 * If the qdisc has an enqueue function, we still need to
1467 * hold the queue_lock before calling it, since queue_lock
1468 * also serializes access to the device queue.
1471 q = rcu_dereference(dev->qdisc);
1472 #ifdef CONFIG_NET_CLS_ACT
1473 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1476 /* Grab device queue */
1477 spin_lock(&dev->queue_lock);
1480 rc = q->enqueue(skb, q);
1482 spin_unlock(&dev->queue_lock);
1484 rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1487 spin_unlock(&dev->queue_lock);
1490 /* The device has no queue. Common case for software devices:
1491 loopback, all sorts of tunnels...
1493 Really, it is unlikely that netif_tx_lock protection is necessary
1494 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1496 However, it is possible that they rely on protection
1499 Check this and take the lock. It is not prone to deadlocks.
1500 Taking it for the noqueue qdisc is even simpler 8)
1502 if (dev->flags & IFF_UP) {
1503 int cpu = smp_processor_id(); /* ok because BHs are off */
1505 if (dev->xmit_lock_owner != cpu) {
1507 HARD_TX_LOCK(dev, cpu);
1509 if (!netif_queue_stopped(dev)) {
1511 if (!dev_hard_start_xmit(skb, dev)) {
1512 HARD_TX_UNLOCK(dev);
1516 HARD_TX_UNLOCK(dev);
1517 if (net_ratelimit())
1518 printk(KERN_CRIT "Virtual device %s asks to "
1519 "queue packet!\n", dev->name);
1521 /* Recursion is detected! It is possible,
1523 if (net_ratelimit())
1524 printk(KERN_CRIT "Dead loop on virtual device "
1525 "%s, fix it urgently!\n", dev->name);
1530 rcu_read_unlock_bh();
1536 rcu_read_unlock_bh();
1541 /*=======================================================================
1543 =======================================================================*/
1545 int netdev_max_backlog __read_mostly = 1000;
1546 int netdev_budget __read_mostly = 300;
1547 int weight_p __read_mostly = 64; /* old backlog weight */
1549 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1553 * netif_rx - post buffer to the network code
1554 * @skb: buffer to post
1556 * This function receives a packet from a device driver and queues it for
1557 * the upper (protocol) levels to process. It always succeeds. The buffer
1558 * may be dropped during processing for congestion control or by the
1562 * NET_RX_SUCCESS (no congestion)
1563 * NET_RX_CN_LOW (low congestion)
1564 * NET_RX_CN_MOD (moderate congestion)
1565 * NET_RX_CN_HIGH (high congestion)
1566 * NET_RX_DROP (packet was dropped)
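/*
 * A minimal receive-path sketch for an interrupt handler (the
 * eth_type_trans() call assumes an Ethernet-style driver; names are
 * illustrative):
 *
 *	skb->dev = dev;
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * NAPI drivers call netif_receive_skb() from their ->poll() method instead.
 */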
1570 int netif_rx(struct sk_buff *skb)
1572 struct softnet_data *queue;
1573 unsigned long flags;
1575 /* if netpoll wants it, pretend we never saw it */
1576 if (netpoll_rx(skb))
1579 if (!skb->tstamp.tv64)
1583 * The code is rearranged so that the path is
1584 * shortest when the CPU is congested but still operating.
1586 local_irq_save(flags);
1587 queue = &__get_cpu_var(softnet_data);
1589 __get_cpu_var(netdev_rx_stat).total++;
1590 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1591 if (queue->input_pkt_queue.qlen) {
1594 __skb_queue_tail(&queue->input_pkt_queue, skb);
1595 local_irq_restore(flags);
1596 return NET_RX_SUCCESS;
1599 netif_rx_schedule(&queue->backlog_dev);
1603 __get_cpu_var(netdev_rx_stat).dropped++;
1604 local_irq_restore(flags);
1610 int netif_rx_ni(struct sk_buff *skb)
1615 err = netif_rx(skb);
1616 if (local_softirq_pending())
1623 EXPORT_SYMBOL(netif_rx_ni);
1625 static inline struct net_device *skb_bond(struct sk_buff *skb)
1627 struct net_device *dev = skb->dev;
1630 if (skb_bond_should_drop(skb)) {
1634 skb->dev = dev->master;
1640 static void net_tx_action(struct softirq_action *h)
1642 struct softnet_data *sd = &__get_cpu_var(softnet_data);
1644 if (sd->completion_queue) {
1645 struct sk_buff *clist;
1647 local_irq_disable();
1648 clist = sd->completion_queue;
1649 sd->completion_queue = NULL;
1653 struct sk_buff *skb = clist;
1654 clist = clist->next;
1656 BUG_TRAP(!atomic_read(&skb->users));
1661 if (sd->output_queue) {
1662 struct net_device *head;
1664 local_irq_disable();
1665 head = sd->output_queue;
1666 sd->output_queue = NULL;
1670 struct net_device *dev = head;
1671 head = head->next_sched;
1673 smp_mb__before_clear_bit();
1674 clear_bit(__LINK_STATE_SCHED, &dev->state);
1676 if (spin_trylock(&dev->queue_lock)) {
1678 spin_unlock(&dev->queue_lock);
1680 netif_schedule(dev);
1686 static inline int deliver_skb(struct sk_buff *skb,
1687 struct packet_type *pt_prev,
1688 struct net_device *orig_dev)
1690 atomic_inc(&skb->users);
1691 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1694 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
1695 /* These hooks defined here for ATM */
1697 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
1698 unsigned char *addr);
1699 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
1702 * If the bridge module is loaded, call the bridging hook.
1703 * Returns NULL if the packet was consumed.
1705 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
1706 struct sk_buff *skb) __read_mostly;
1707 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
1708 struct packet_type **pt_prev, int *ret,
1709 struct net_device *orig_dev)
1711 struct net_bridge_port *port;
1713 if (skb->pkt_type == PACKET_LOOPBACK ||
1714 (port = rcu_dereference(skb->dev->br_port)) == NULL)
1718 *ret = deliver_skb(skb, *pt_prev, orig_dev);
1722 return br_handle_frame_hook(port, skb);
1725 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
1728 #ifdef CONFIG_NET_CLS_ACT
1729 /* TODO: Maybe we should just force sch_ingress to be compiled in
1730 * when CONFIG_NET_CLS_ACT is? Otherwise there are some useless instructions:
1731 * a compare and 2 extra stores right now if we don't have it on
1732 * but have CONFIG_NET_CLS_ACT.
1733 * NOTE: This doesn't stop any functionality; if you don't have
1734 * the ingress scheduler, you just can't add policies on ingress.
1737 static int ing_filter(struct sk_buff *skb)
1740 struct net_device *dev = skb->dev;
1741 int result = TC_ACT_OK;
1743 if (dev->qdisc_ingress) {
1744 __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
1745 if (MAX_RED_LOOP < ttl++) {
1746 printk(KERN_WARNING "Redir loop detected Dropping packet (%d->%d)\n",
1747 skb->iif, skb->dev->ifindex);
1751 skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);
1753 skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);
1755 spin_lock(&dev->ingress_lock);
1756 if ((q = dev->qdisc_ingress) != NULL)
1757 result = q->enqueue(skb, q);
1758 spin_unlock(&dev->ingress_lock);
1766 int netif_receive_skb(struct sk_buff *skb)
1768 struct packet_type *ptype, *pt_prev;
1769 struct net_device *orig_dev;
1770 int ret = NET_RX_DROP;
1773 /* if we've gotten here through NAPI, check netpoll */
1774 if (skb->dev->poll && netpoll_rx(skb))
1777 if (!skb->tstamp.tv64)
1781 skb->iif = skb->dev->ifindex;
1783 orig_dev = skb_bond(skb);
1788 __get_cpu_var(netdev_rx_stat).total++;
1790 skb_reset_network_header(skb);
1791 skb_reset_transport_header(skb);
1792 skb->mac_len = skb->network_header - skb->mac_header;
1798 #ifdef CONFIG_NET_CLS_ACT
1799 if (skb->tc_verd & TC_NCLS) {
1800 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
1805 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1806 if (!ptype->dev || ptype->dev == skb->dev) {
1808 ret = deliver_skb(skb, pt_prev, orig_dev);
1813 #ifdef CONFIG_NET_CLS_ACT
1815 ret = deliver_skb(skb, pt_prev, orig_dev);
1816 pt_prev = NULL; /* noone else should process this after*/
1818 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1821 ret = ing_filter(skb);
1823 if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
1832 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
1836 type = skb->protocol;
1837 list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
1838 if (ptype->type == type &&
1839 (!ptype->dev || ptype->dev == skb->dev)) {
1841 ret = deliver_skb(skb, pt_prev, orig_dev);
1847 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1850 /* Jamal, now you will not be able to escape explaining
1851 * to me how you were going to use this. :-)
1861 static int process_backlog(struct net_device *backlog_dev, int *budget)
1864 int quota = min(backlog_dev->quota, *budget);
1865 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1866 unsigned long start_time = jiffies;
1868 backlog_dev->weight = weight_p;
1870 struct sk_buff *skb;
1871 struct net_device *dev;
1873 local_irq_disable();
1874 skb = __skb_dequeue(&queue->input_pkt_queue);
1881 netif_receive_skb(skb);
1887 if (work >= quota || jiffies - start_time > 1)
1892 backlog_dev->quota -= work;
1897 backlog_dev->quota -= work;
1900 list_del(&backlog_dev->poll_list);
1901 smp_mb__before_clear_bit();
1902 netif_poll_enable(backlog_dev);
1908 static void net_rx_action(struct softirq_action *h)
1910 struct softnet_data *queue = &__get_cpu_var(softnet_data);
1911 unsigned long start_time = jiffies;
1912 int budget = netdev_budget;
1915 local_irq_disable();
1917 while (!list_empty(&queue->poll_list)) {
1918 struct net_device *dev;
1920 if (budget <= 0 || jiffies - start_time > 1)
1925 dev = list_entry(queue->poll_list.next,
1926 struct net_device, poll_list);
1927 have = netpoll_poll_lock(dev);
1929 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1930 netpoll_poll_unlock(have);
1931 local_irq_disable();
1932 list_move_tail(&dev->poll_list, &queue->poll_list);
1934 dev->quota += dev->weight;
1936 dev->quota = dev->weight;
1938 netpoll_poll_unlock(have);
1940 local_irq_disable();
1944 #ifdef CONFIG_NET_DMA
1946 * There may not be any more sk_buffs coming right now, so push
1947 * any pending DMA copies to hardware
1949 if (net_dma_client) {
1950 struct dma_chan *chan;
1952 list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
1953 dma_async_memcpy_issue_pending(chan);
1961 __get_cpu_var(netdev_rx_stat).time_squeeze++;
1962 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1966 static gifconf_func_t * gifconf_list [NPROTO];
1969 * register_gifconf - register a SIOCGIF handler
1970 * @family: Address family
1971 * @gifconf: Function handler
1973 * Register protocol dependent address dumping routines. The handler
1974 * that is passed must not be freed or reused until it has been replaced
1975 * by another handler.
1977 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1979 if (family >= NPROTO)
1981 gifconf_list[family] = gifconf;
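/*
 * Usage sketch: address families register their dumper once at init time;
 * IPv4, for example, does the equivalent of
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 *
 * Passing a NULL buffer asks the handler only for the length it would need,
 * which is how dev_ifconf() below sizes the copy back to user space.
 */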
1987 * Map an interface index to its name (SIOCGIFNAME)
1991 * We need this ioctl for efficient implementation of the
1992 * if_indextoname() function required by the IPv6 API. Without
1993 * it, we would have to search all the interfaces to find a
1997 static int dev_ifname(struct ifreq __user *arg)
1999 struct net_device *dev;
2003 * Fetch the caller's info block.
2006 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2009 read_lock(&dev_base_lock);
2010 dev = __dev_get_by_index(ifr.ifr_ifindex);
2012 read_unlock(&dev_base_lock);
2016 strcpy(ifr.ifr_name, dev->name);
2017 read_unlock(&dev_base_lock);
2019 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2025 * Perform a SIOCGIFCONF call. This structure will change
2026 * size eventually, and there is nothing I can do about it.
2027 * Thus we will need a 'compatibility mode'.
2030 static int dev_ifconf(char __user *arg)
2033 struct net_device *dev;
2040 * Fetch the caller's info block.
2043 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2050 * Loop over the interfaces, and write an info block for each.
2054 for_each_netdev(dev) {
2055 for (i = 0; i < NPROTO; i++) {
2056 if (gifconf_list[i]) {
2059 done = gifconf_list[i](dev, NULL, 0);
2061 done = gifconf_list[i](dev, pos + total,
2071 * All done. Write the updated control block back to the caller.
2073 ifc.ifc_len = total;
2076 * Both BSD and Solaris return 0 here, so we do too.
2078 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2081 #ifdef CONFIG_PROC_FS
2083 * This is invoked by the /proc filesystem handler to display a device
2086 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2089 struct net_device *dev;
2091 read_lock(&dev_base_lock);
2093 return SEQ_START_TOKEN;
2096 for_each_netdev(dev)
2103 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2106 return v == SEQ_START_TOKEN ?
2107 first_net_device() : next_net_device((struct net_device *)v);
2110 void dev_seq_stop(struct seq_file *seq, void *v)
2112 read_unlock(&dev_base_lock);
2115 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2117 struct net_device_stats *stats = dev->get_stats(dev);
2119 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2120 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2121 dev->name, stats->rx_bytes, stats->rx_packets,
2123 stats->rx_dropped + stats->rx_missed_errors,
2124 stats->rx_fifo_errors,
2125 stats->rx_length_errors + stats->rx_over_errors +
2126 stats->rx_crc_errors + stats->rx_frame_errors,
2127 stats->rx_compressed, stats->multicast,
2128 stats->tx_bytes, stats->tx_packets,
2129 stats->tx_errors, stats->tx_dropped,
2130 stats->tx_fifo_errors, stats->collisions,
2131 stats->tx_carrier_errors +
2132 stats->tx_aborted_errors +
2133 stats->tx_window_errors +
2134 stats->tx_heartbeat_errors,
2135 stats->tx_compressed);
2139 * Called from the PROCfs module. This now uses the new arbitrary sized
2140 * /proc/net interface to create /proc/net/dev
2142 static int dev_seq_show(struct seq_file *seq, void *v)
2144 if (v == SEQ_START_TOKEN)
2145 seq_puts(seq, "Inter-| Receive "
2147 " face |bytes packets errs drop fifo frame "
2148 "compressed multicast|bytes packets errs "
2149 "drop fifo colls carrier compressed\n");
2151 dev_seq_printf_stats(seq, v);
2155 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2157 struct netif_rx_stats *rc = NULL;
2159 while (*pos < NR_CPUS)
2160 if (cpu_online(*pos)) {
2161 rc = &per_cpu(netdev_rx_stat, *pos);
2168 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2170 return softnet_get_online(pos);
2173 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2176 return softnet_get_online(pos);
2179 static void softnet_seq_stop(struct seq_file *seq, void *v)
2183 static int softnet_seq_show(struct seq_file *seq, void *v)
2185 struct netif_rx_stats *s = v;
2187 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2188 s->total, s->dropped, s->time_squeeze, 0,
2189 0, 0, 0, 0, /* was fastroute */
2194 static const struct seq_operations dev_seq_ops = {
2195 .start = dev_seq_start,
2196 .next = dev_seq_next,
2197 .stop = dev_seq_stop,
2198 .show = dev_seq_show,
2201 static int dev_seq_open(struct inode *inode, struct file *file)
2203 return seq_open(file, &dev_seq_ops);
2206 static const struct file_operations dev_seq_fops = {
2207 .owner = THIS_MODULE,
2208 .open = dev_seq_open,
2210 .llseek = seq_lseek,
2211 .release = seq_release,
2214 static const struct seq_operations softnet_seq_ops = {
2215 .start = softnet_seq_start,
2216 .next = softnet_seq_next,
2217 .stop = softnet_seq_stop,
2218 .show = softnet_seq_show,
2221 static int softnet_seq_open(struct inode *inode, struct file *file)
2223 return seq_open(file, &softnet_seq_ops);
2226 static const struct file_operations softnet_seq_fops = {
2227 .owner = THIS_MODULE,
2228 .open = softnet_seq_open,
2230 .llseek = seq_lseek,
2231 .release = seq_release,
2234 static void *ptype_get_idx(loff_t pos)
2236 struct packet_type *pt = NULL;
2240 list_for_each_entry_rcu(pt, &ptype_all, list) {
2246 for (t = 0; t < 16; t++) {
2247 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
2256 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
2259 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
2262 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2264 struct packet_type *pt;
2265 struct list_head *nxt;
2269 if (v == SEQ_START_TOKEN)
2270 return ptype_get_idx(0);
2273 nxt = pt->list.next;
2274 if (pt->type == htons(ETH_P_ALL)) {
2275 if (nxt != &ptype_all)
2278 nxt = ptype_base[0].next;
2280 hash = ntohs(pt->type) & 15;
2282 while (nxt == &ptype_base[hash]) {
2285 nxt = ptype_base[hash].next;
2288 return list_entry(nxt, struct packet_type, list);
2291 static void ptype_seq_stop(struct seq_file *seq, void *v)
2296 static void ptype_seq_decode(struct seq_file *seq, void *sym)
2298 #ifdef CONFIG_KALLSYMS
2299 unsigned long offset = 0, symsize;
2300 const char *symname;
2304 symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
2311 modname = delim = "";
2312 seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
2318 seq_printf(seq, "[%p]", sym);
2321 static int ptype_seq_show(struct seq_file *seq, void *v)
2323 struct packet_type *pt = v;
2325 if (v == SEQ_START_TOKEN)
2326 seq_puts(seq, "Type Device Function\n");
2328 if (pt->type == htons(ETH_P_ALL))
2329 seq_puts(seq, "ALL ");
2331 seq_printf(seq, "%04x", ntohs(pt->type));
2333 seq_printf(seq, " %-8s ",
2334 pt->dev ? pt->dev->name : "");
2335 ptype_seq_decode(seq, pt->func);
2336 seq_putc(seq, '\n');
2342 static const struct seq_operations ptype_seq_ops = {
2343 .start = ptype_seq_start,
2344 .next = ptype_seq_next,
2345 .stop = ptype_seq_stop,
2346 .show = ptype_seq_show,
2349 static int ptype_seq_open(struct inode *inode, struct file *file)
2351 return seq_open(file, &ptype_seq_ops);
2354 static const struct file_operations ptype_seq_fops = {
2355 .owner = THIS_MODULE,
2356 .open = ptype_seq_open,
2358 .llseek = seq_lseek,
2359 .release = seq_release,
2363 static int __init dev_proc_init(void)
2367 if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2369 if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2371 if (!proc_net_fops_create("ptype", S_IRUGO, &ptype_seq_fops))
2374 if (wext_proc_init())
2380 proc_net_remove("ptype");
2382 proc_net_remove("softnet_stat");
2384 proc_net_remove("dev");
2388 #define dev_proc_init() 0
2389 #endif /* CONFIG_PROC_FS */
2393 * netdev_set_master - set up master/slave pair
2394 * @slave: slave device
2395 * @master: new master device
2397 * Changes the master device of the slave. Pass %NULL to break the
2398 * bonding. The caller must hold the RTNL semaphore. On a failure
2399 * a negative errno code is returned. On success the reference counts
2400 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2401 * function returns zero.
2403 int netdev_set_master(struct net_device *slave, struct net_device *master)
2405 struct net_device *old = slave->master;
2415 slave->master = master;
2423 slave->flags |= IFF_SLAVE;
2425 slave->flags &= ~IFF_SLAVE;
2427 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2432 * dev_set_promiscuity - update promiscuity count on a device
2436 * Add or remove promiscuity from a device. While the count in the device
2437 * remains above zero the interface remains promiscuous. Once it hits zero
2438 * the device reverts to normal filtering operation. A negative inc
2439 * value is used to drop promiscuity on the device.
2441 void dev_set_promiscuity(struct net_device *dev, int inc)
2443 unsigned short old_flags = dev->flags;
2445 if ((dev->promiscuity += inc) == 0)
2446 dev->flags &= ~IFF_PROMISC;
2448 dev->flags |= IFF_PROMISC;
2449 if (dev->flags != old_flags) {
2451 printk(KERN_INFO "device %s %s promiscuous mode\n",
2452 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2454 audit_log(current->audit_context, GFP_ATOMIC,
2455 AUDIT_ANOM_PROMISCUOUS,
2456 "dev=%s prom=%d old_prom=%d auid=%u",
2457 dev->name, (dev->flags & IFF_PROMISC),
2458 (old_flags & IFF_PROMISC),
2459 audit_get_loginuid(current->audit_context));
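/*
 * A minimal usage sketch: callers bump the count while they need to see
 * every frame and drop it again afterwards (under the rtnl lock):
 *
 *	dev_set_promiscuity(dev, 1);	(e.g. when a packet socket binds)
 *	...
 *	dev_set_promiscuity(dev, -1);	(when it goes away)
 */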
2464 * dev_set_allmulti - update allmulti count on a device
2468 * Add or remove reception of all multicast frames to a device. While the
2469 * count in the device remains above zero the interface keeps listening
2470 * for all multicast frames. Once it hits zero the device reverts to normal
2471 * filtering operation. A negative @inc value is used to drop the counter
2472 * when releasing a resource needing all multicasts.
2475 void dev_set_allmulti(struct net_device *dev, int inc)
2477 unsigned short old_flags = dev->flags;
2479 dev->flags |= IFF_ALLMULTI;
2480 if ((dev->allmulti += inc) == 0)
2481 dev->flags &= ~IFF_ALLMULTI;
2482 if (dev->flags ^ old_flags)
2486 unsigned dev_get_flags(const struct net_device *dev)
2490 flags = (dev->flags & ~(IFF_PROMISC |
2495 (dev->gflags & (IFF_PROMISC |
2498 if (netif_running(dev)) {
2499 if (netif_oper_up(dev))
2500 flags |= IFF_RUNNING;
2501 if (netif_carrier_ok(dev))
2502 flags |= IFF_LOWER_UP;
2503 if (netif_dormant(dev))
2504 flags |= IFF_DORMANT;
2510 int dev_change_flags(struct net_device *dev, unsigned flags)
2513 int old_flags = dev->flags;
2516 * Set the flags on our device.
2519 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2520 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2522 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2526 * Load in the correct multicast list now the flags have changed.
2532 * Have we downed the interface? We handle IFF_UP ourselves
2533 * according to user attempts to set it, rather than blindly
2538 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
2539 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2545 if (dev->flags & IFF_UP &&
2546 ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2548 raw_notifier_call_chain(&netdev_chain,
2549 NETDEV_CHANGE, dev);
2551 if ((flags ^ dev->gflags) & IFF_PROMISC) {
2552 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2553 dev->gflags ^= IFF_PROMISC;
2554 dev_set_promiscuity(dev, inc);
2557 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2558 is important. Some (broken) drivers set IFF_PROMISC when
2559 IFF_ALLMULTI is requested, without asking us and without reporting it.
2561 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2562 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2563 dev->gflags ^= IFF_ALLMULTI;
2564 dev_set_allmulti(dev, inc);
2567 if (old_flags ^ dev->flags)
2568 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2573 int dev_set_mtu(struct net_device *dev, int new_mtu)
2577 if (new_mtu == dev->mtu)
2580 /* MTU must be positive. */
2584 if (!netif_device_present(dev))
2588 if (dev->change_mtu)
2589 err = dev->change_mtu(dev, new_mtu);
2592 if (!err && dev->flags & IFF_UP)
2593 raw_notifier_call_chain(&netdev_chain,
2594 NETDEV_CHANGEMTU, dev);
2598 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2602 if (!dev->set_mac_address)
2604 if (sa->sa_family != dev->type)
2606 if (!netif_device_present(dev))
2608 err = dev->set_mac_address(dev, sa);
2610 raw_notifier_call_chain(&netdev_chain,
2611 NETDEV_CHANGEADDR, dev);
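/*
 * Illustrative sketch (not compiled): building the struct sockaddr that
 * dev_set_mac_address() expects.  sa_family must match dev->type (e.g.
 * ARPHRD_ETHER) and sa_data carries the new hardware address.  Called under
 * RTNL; the my_set_hwaddr() name is hypothetical.
 */
#if 0
static int my_set_hwaddr(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);
	return dev_set_mac_address(dev, &sa);
}
#endif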
2616 * Perform the SIOCxIFxxx calls.
2618 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2621 struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2627 case SIOCGIFFLAGS: /* Get interface flags */
2628 ifr->ifr_flags = dev_get_flags(dev);
2631 case SIOCSIFFLAGS: /* Set interface flags */
2632 return dev_change_flags(dev, ifr->ifr_flags);
2634 case SIOCGIFMETRIC: /* Get the metric on the interface
2635 (currently unused) */
2636 ifr->ifr_metric = 0;
2639 case SIOCSIFMETRIC: /* Set the metric on the interface
2640 (currently unused) */
2643 case SIOCGIFMTU: /* Get the MTU of a device */
2644 ifr->ifr_mtu = dev->mtu;
2647 case SIOCSIFMTU: /* Set the MTU of a device */
2648 return dev_set_mtu(dev, ifr->ifr_mtu);
2652 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2654 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2655 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2656 ifr->ifr_hwaddr.sa_family = dev->type;
2660 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2662 case SIOCSIFHWBROADCAST:
2663 if (ifr->ifr_hwaddr.sa_family != dev->type)
2665 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2666 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2667 raw_notifier_call_chain(&netdev_chain,
2668 NETDEV_CHANGEADDR, dev);
2672 ifr->ifr_map.mem_start = dev->mem_start;
2673 ifr->ifr_map.mem_end = dev->mem_end;
2674 ifr->ifr_map.base_addr = dev->base_addr;
2675 ifr->ifr_map.irq = dev->irq;
2676 ifr->ifr_map.dma = dev->dma;
2677 ifr->ifr_map.port = dev->if_port;
2681 if (dev->set_config) {
2682 if (!netif_device_present(dev))
2684 return dev->set_config(dev, &ifr->ifr_map);
2689 if (!dev->set_multicast_list ||
2690 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2692 if (!netif_device_present(dev))
2694 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2698 if (!dev->set_multicast_list ||
2699 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2701 if (!netif_device_present(dev))
2703 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2707 ifr->ifr_ifindex = dev->ifindex;
2711 ifr->ifr_qlen = dev->tx_queue_len;
2715 if (ifr->ifr_qlen < 0)
2717 dev->tx_queue_len = ifr->ifr_qlen;
2721 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2722 return dev_change_name(dev, ifr->ifr_newname);
2725 * Unknown or private ioctl
2729 if ((cmd >= SIOCDEVPRIVATE &&
2730 cmd <= SIOCDEVPRIVATE + 15) ||
2731 cmd == SIOCBONDENSLAVE ||
2732 cmd == SIOCBONDRELEASE ||
2733 cmd == SIOCBONDSETHWADDR ||
2734 cmd == SIOCBONDSLAVEINFOQUERY ||
2735 cmd == SIOCBONDINFOQUERY ||
2736 cmd == SIOCBONDCHANGEACTIVE ||
2737 cmd == SIOCGMIIPHY ||
2738 cmd == SIOCGMIIREG ||
2739 cmd == SIOCSMIIREG ||
2740 cmd == SIOCBRADDIF ||
2741 cmd == SIOCBRDELIF ||
2742 cmd == SIOCWANDEV) {
2744 if (dev->do_ioctl) {
2745 if (netif_device_present(dev))
2746 err = dev->do_ioctl(dev, ifr, cmd);
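/*
 * Illustrative sketch (not compiled): the driver side of the private ioctl
 * range forwarded above.  A driver receives these through its dev->do_ioctl
 * hook; everything here except the hook signature and SIOCDEVPRIVATE is
 * hypothetical.
 */
#if 0
static int my_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	switch (cmd) {
	case SIOCDEVPRIVATE:
		/* Interpret ifr->ifr_data however the driver defines it. */
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}
#endif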
2759 * This function handles all "interface"-type I/O control requests. The actual
2760 * 'doing' part of this is dev_ifsioc above.
2764 * dev_ioctl - network device ioctl
2765 * @cmd: command to issue
2766 * @arg: pointer to a struct ifreq in user space
2768 * Issue ioctl functions to devices. This is normally called by the
2769 * user space syscall interfaces but can sometimes be useful for
2770 * other purposes. The return value is the return from the syscall if
2771 * positive or a negative errno code on error.
2774 int dev_ioctl(unsigned int cmd, void __user *arg)
2780 /* One special case: SIOCGIFCONF takes an ifconf argument
2781 and requires a shared lock, because it sleeps while writing the data. */
2785 if (cmd == SIOCGIFCONF) {
2787 ret = dev_ifconf((char __user *) arg);
2791 if (cmd == SIOCGIFNAME)
2792 return dev_ifname((struct ifreq __user *)arg);
2794 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2797 ifr.ifr_name[IFNAMSIZ-1] = 0;
2799 colon = strchr(ifr.ifr_name, ':');
2804 * See which interface the caller is talking about.
2809 * These ioctl calls:
2810 * - can be done by all.
2811 * - are atomic and do not require locking.
2822 dev_load(ifr.ifr_name);
2823 read_lock(&dev_base_lock);
2824 ret = dev_ifsioc(&ifr, cmd);
2825 read_unlock(&dev_base_lock);
2829 if (copy_to_user(arg, &ifr,
2830 sizeof(struct ifreq)))
2836 dev_load(ifr.ifr_name);
2838 ret = dev_ethtool(&ifr);
2843 if (copy_to_user(arg, &ifr,
2844 sizeof(struct ifreq)))
2850 * These ioctl calls:
2851 * - require superuser power.
2852 * - require strict serialization.
2858 if (!capable(CAP_NET_ADMIN))
2860 dev_load(ifr.ifr_name);
2862 ret = dev_ifsioc(&ifr, cmd);
2867 if (copy_to_user(arg, &ifr,
2868 sizeof(struct ifreq)))
2874 * These ioctl calls:
2875 * - require superuser power.
2876 * - require strict serialization.
2877 * - do not return a value
2887 case SIOCSIFHWBROADCAST:
2890 case SIOCBONDENSLAVE:
2891 case SIOCBONDRELEASE:
2892 case SIOCBONDSETHWADDR:
2893 case SIOCBONDCHANGEACTIVE:
2896 if (!capable(CAP_NET_ADMIN))
2899 case SIOCBONDSLAVEINFOQUERY:
2900 case SIOCBONDINFOQUERY:
2901 dev_load(ifr.ifr_name);
2903 ret = dev_ifsioc(&ifr, cmd);
2908 /* Get the per device memory space. We can add this but
2909 * currently do not support it */
2911 /* Set the per device memory buffer space.
2912 * Not applicable in our case */
2917 * Unknown or private ioctl.
2920 if (cmd == SIOCWANDEV ||
2921 (cmd >= SIOCDEVPRIVATE &&
2922 cmd <= SIOCDEVPRIVATE + 15)) {
2923 dev_load(ifr.ifr_name);
2925 ret = dev_ifsioc(&ifr, cmd);
2927 if (!ret && copy_to_user(arg, &ifr,
2928 sizeof(struct ifreq)))
2932 /* Take care of Wireless Extensions */
2933 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
2934 return wext_handle_ioctl(&ifr, cmd, arg);
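/*
 * Illustrative sketch (not compiled): the user-space view of the ioctl path
 * handled by dev_ioctl().  Any socket fd can issue SIOCGIFMTU with a struct
 * ifreq naming the interface; the kernel side fills in ifr_mtu in
 * dev_ifsioc().  The get_mtu() wrapper is hypothetical.
 */
#if 0
	/* user space */
	#include <sys/ioctl.h>
	#include <net/if.h>
	#include <string.h>

	int get_mtu(int sockfd, const char *ifname)
	{
		struct ifreq ifr;

		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		if (ioctl(sockfd, SIOCGIFMTU, &ifr) < 0)
			return -1;
		return ifr.ifr_mtu;
	}
#endif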
2941 * dev_new_index - allocate an ifindex
2943 * Returns a suitable unique value for a new device interface
2944 * number. The caller must hold the rtnl semaphore or the
2945 * dev_base_lock to be sure it remains unique.
2947 static int dev_new_index(void)
2953 if (!__dev_get_by_index(ifindex))
2958 static int dev_boot_phase = 1;
2960 /* Delayed registration/unregistration */
2961 static DEFINE_SPINLOCK(net_todo_list_lock);
2962 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2964 static void net_set_todo(struct net_device *dev)
2966 spin_lock(&net_todo_list_lock);
2967 list_add_tail(&dev->todo_list, &net_todo_list);
2968 spin_unlock(&net_todo_list_lock);
2972 * register_netdevice - register a network device
2973 * @dev: device to register
2975 * Take a completed network device structure and add it to the kernel
2976 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2977 * chain. 0 is returned on success. A negative errno code is returned
2978 * on a failure to set up the device, or if the name is a duplicate.
2980 * Callers must hold the rtnl semaphore. You may want
2981 * register_netdev() instead of this.
2984 * The locking appears insufficient to guarantee two parallel registers
2985 * will not get the same name.
2988 int register_netdevice(struct net_device *dev)
2990 struct hlist_head *head;
2991 struct hlist_node *p;
2994 BUG_ON(dev_boot_phase);
2999 /* When net_devices are persistent, this will be fatal. */
3000 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
3002 spin_lock_init(&dev->queue_lock);
3003 spin_lock_init(&dev->_xmit_lock);
3004 dev->xmit_lock_owner = -1;
3005 spin_lock_init(&dev->ingress_lock);
3009 /* Init, if this function is available */
3011 ret = dev->init(dev);
3019 if (!dev_valid_name(dev->name)) {
3024 dev->ifindex = dev_new_index();
3025 if (dev->iflink == -1)
3026 dev->iflink = dev->ifindex;
3028 /* Check for existence of name */
3029 head = dev_name_hash(dev->name);
3030 hlist_for_each(p, head) {
3031 struct net_device *d
3032 = hlist_entry(p, struct net_device, name_hlist);
3033 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
3039 /* Fix illegal SG+CSUM combinations. */
3040 if ((dev->features & NETIF_F_SG) &&
3041 !(dev->features & NETIF_F_ALL_CSUM)) {
3042 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n",
3044 dev->features &= ~NETIF_F_SG;
3047 /* TSO requires that SG is present as well. */
3048 if ((dev->features & NETIF_F_TSO) &&
3049 !(dev->features & NETIF_F_SG)) {
3050 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n",
3052 dev->features &= ~NETIF_F_TSO;
3054 if (dev->features & NETIF_F_UFO) {
3055 if (!(dev->features & NETIF_F_HW_CSUM)) {
3056 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3057 "NETIF_F_HW_CSUM feature.\n",
3059 dev->features &= ~NETIF_F_UFO;
3061 if (!(dev->features & NETIF_F_SG)) {
3062 printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
3063 "NETIF_F_SG feature.\n",
3065 dev->features &= ~NETIF_F_UFO;
3070 * Install a nil rebuild_header routine that should never be
3071 * called; it is used only as a bug trap.
3074 if (!dev->rebuild_header)
3075 dev->rebuild_header = default_rebuild_header;
3077 ret = netdev_register_sysfs(dev);
3080 dev->reg_state = NETREG_REGISTERED;
3083 * The default initial state at registration is that the
3084 * device is present.
3087 set_bit(__LINK_STATE_PRESENT, &dev->state);
3089 dev_init_scheduler(dev);
3090 write_lock_bh(&dev_base_lock);
3091 list_add_tail(&dev->dev_list, &dev_base_head);
3092 hlist_add_head(&dev->name_hlist, head);
3093 hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
3095 write_unlock_bh(&dev_base_lock);
3097 /* Notify protocols, that a new device appeared. */
3098 raw_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
3107 * register_netdev - register a network device
3108 * @dev: device to register
3110 * Take a completed network device structure and add it to the kernel
3111 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
3112 * chain. 0 is returned on success. A negative errno code is returned
3113 * on a failure to set up the device, or if the name is a duplicate.
3115 * This is a wrapper around register_netdevice that takes the rtnl semaphore
3116 * and expands the device name if you passed a format string to alloc_netdev().
3119 int register_netdev(struct net_device *dev)
3126 * If the name is a format string the caller wants us to do a name allocation.
3129 if (strchr(dev->name, '%')) {
3130 err = dev_alloc_name(dev, dev->name);
3135 err = register_netdevice(dev);
3140 EXPORT_SYMBOL(register_netdev);
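/*
 * Illustrative sketch (not compiled): a typical driver probe path using
 * register_netdev().  Passing a name containing '%d' (here the hypothetical
 * "mydev%d") lets the wrapper pick the first free unit number via
 * dev_alloc_name() before registering.  The my_probe() name is hypothetical.
 */
#if 0
static int my_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);
	if (!dev)
		return -ENOMEM;
	strcpy(dev->name, "mydev%d");
	err = register_netdev(dev);
	if (err)
		free_netdev(dev);
	return err;
}
#endif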
3143 * netdev_wait_allrefs - wait until all references are gone.
3145 * This is called when unregistering network devices.
3147 * Any protocol or device that holds a reference should register
3148 * for netdevice notification, and cleanup and put back the
3149 * reference if they receive an UNREGISTER event.
3150 * We can get stuck here if buggy protocols don't correctly call dev_put().
3153 static void netdev_wait_allrefs(struct net_device *dev)
3155 unsigned long rebroadcast_time, warning_time;
3157 rebroadcast_time = warning_time = jiffies;
3158 while (atomic_read(&dev->refcnt) != 0) {
3159 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
3162 /* Rebroadcast unregister notification */
3163 raw_notifier_call_chain(&netdev_chain,
3164 NETDEV_UNREGISTER, dev);
3166 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
3168 /* We must not have linkwatch events
3169 * pending on unregister. If this
3170 * happens, we simply run the queue
3171 * unscheduled, resulting in a noop for this device.
3174 linkwatch_run_queue();
3179 rebroadcast_time = jiffies;
3184 if (time_after(jiffies, warning_time + 10 * HZ)) {
3185 printk(KERN_EMERG "unregister_netdevice: "
3186 "waiting for %s to become free. Usage "
3188 dev->name, atomic_read(&dev->refcnt));
3189 warning_time = jiffies;
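/*
 * Illustrative sketch (not compiled): the cooperation netdev_wait_allrefs()
 * expects from reference holders.  A subsystem that keeps a dev_hold()ed
 * pointer registers a netdevice notifier and drops the reference when it sees
 * NETDEV_UNREGISTER.  The cached pointer and callback names are hypothetical;
 * in this version of the stack the notifier data pointer is the
 * struct net_device itself.
 */
#if 0
static struct net_device *my_cached_dev;	/* holds a dev_hold() reference */

static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == my_cached_dev) {
		dev_put(my_cached_dev);
		my_cached_dev = NULL;
	}
	return NOTIFY_DONE;
}

/* Registered at init time with register_netdevice_notifier(). */
static struct notifier_block my_netdev_notifier = {
	.notifier_call = my_netdev_event,
};
#endif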
3198 * A typical sequence is: rtnl_lock(); register_netdevice(x1);
3199 * register_netdevice(x2); ...
3201 * unregister_netdevice(y1); unregister_netdevice(y2); ...
3202 * rtnl_unlock(); free_netdev(y1); free_netdev(y2);
3208 * We are invoked by rtnl_unlock() after it drops the semaphore.
3209 * This allows us to deal with problems:
3210 * 1) We can delete sysfs objects which invoke hotplug
3211 * without deadlocking with linkwatch via keventd.
3212 * 2) Since we run with the RTNL semaphore not held, we can sleep
3213 * safely in order to wait for the netdev refcnt to drop to zero.
3215 static DEFINE_MUTEX(net_todo_run_mutex);
3216 void netdev_run_todo(void)
3218 struct list_head list;
3220 /* Need to guard against multiple CPUs getting out of order. */
3221 mutex_lock(&net_todo_run_mutex);
3223 /* Not safe to do outside the semaphore. We must not return
3224 * until all unregister events invoked by the local processor
3225 * have been completed (either by this todo run, or one on another CPU).
3228 if (list_empty(&net_todo_list))
3231 /* Snapshot list, allow later requests */
3232 spin_lock(&net_todo_list_lock);
3233 list_replace_init(&net_todo_list, &list);
3234 spin_unlock(&net_todo_list_lock);
3236 while (!list_empty(&list)) {
3237 struct net_device *dev
3238 = list_entry(list.next, struct net_device, todo_list);
3239 list_del(&dev->todo_list);
3241 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
3242 printk(KERN_ERR "network todo '%s' but state %d\n",
3243 dev->name, dev->reg_state);
3248 netdev_unregister_sysfs(dev);
3249 dev->reg_state = NETREG_UNREGISTERED;
3251 netdev_wait_allrefs(dev);
3254 BUG_ON(atomic_read(&dev->refcnt));
3255 BUG_TRAP(!dev->ip_ptr);
3256 BUG_TRAP(!dev->ip6_ptr);
3257 BUG_TRAP(!dev->dn_ptr);
3259 /* This must be the very last action;
3260 * after this point, 'dev' may point to freed memory.
3262 if (dev->destructor)
3263 dev->destructor(dev);
3267 mutex_unlock(&net_todo_run_mutex);
3270 static struct net_device_stats *internal_stats(struct net_device *dev) { return &dev->stats; }
3276 * alloc_netdev - allocate network device
3277 * @sizeof_priv: size of private data to allocate space for
3278 * @name: device name format string
3279 * @setup: callback to initialize device
3281 * Allocates a struct net_device with private data area for driver use
3282 * and performs basic initialization.
3284 struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3285 void (*setup)(struct net_device *))
3288 struct net_device *dev;
3291 BUG_ON(strlen(name) >= sizeof(dev->name));
3293 /* ensure 32-byte alignment of both the device and private area */
3294 alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3295 alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3297 p = kzalloc(alloc_size, GFP_KERNEL);
3299 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
3303 dev = (struct net_device *)
3304 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3305 dev->padded = (char *)dev - (char *)p;
3308 dev->priv = netdev_priv(dev);
3310 dev->get_stats = internal_stats;
3312 strcpy(dev->name, name);
3315 EXPORT_SYMBOL(alloc_netdev);
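/*
 * Illustrative sketch (not compiled): allocating a device with driver private
 * data.  The my_priv struct, my_setup() callback and "dummy%d" name are
 * hypothetical; netdev_priv() returns the area sized by sizeof_priv, which
 * alloc_netdev() placed right after the aligned net_device.
 */
#if 0
struct my_priv {
	int phy_id;
};

static void my_setup(struct net_device *dev)
{
	ether_setup(dev);		/* fill in Ethernet defaults */
	dev->tx_queue_len = 0;
}

static struct net_device *my_alloc(void)
{
	struct net_device *dev;
	struct my_priv *priv;

	dev = alloc_netdev(sizeof(struct my_priv), "dummy%d", my_setup);
	if (!dev)
		return NULL;
	priv = netdev_priv(dev);
	priv->phy_id = -1;
	return dev;
}
#endif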
3318 * free_netdev - free network device
3321 * This function does the last stage of destroying an allocated device
3322 * interface. The reference to the device object is released.
3323 * If this is the last reference then it will be freed.
3325 void free_netdev(struct net_device *dev)
3328 /* Compatibility with error handling in drivers */
3329 if (dev->reg_state == NETREG_UNINITIALIZED) {
3330 kfree((char *)dev - dev->padded);
3334 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3335 dev->reg_state = NETREG_RELEASED;
3337 /* will free via device release */
3338 put_device(&dev->dev);
3340 kfree((char *)dev - dev->padded);
3344 /* Synchronize with packet receive processing. */
3345 void synchronize_net(void)
3352 * unregister_netdevice - remove device from the kernel
3355 * This function shuts down a device interface and removes it
3356 * from the kernel tables. It does not return a value; final teardown
3357 * (freeing the struct net_device) is left to the caller via free_netdev().
3359 * Callers must hold the rtnl semaphore. You may want
3360 * unregister_netdev() instead of this.
3363 void unregister_netdevice(struct net_device *dev)
3365 BUG_ON(dev_boot_phase);
3368 /* Some devices call this without ever having registered, as part of initialization error unwind. */
3369 if (dev->reg_state == NETREG_UNINITIALIZED) {
3370 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3371 "was registered\n", dev->name, dev);
3377 BUG_ON(dev->reg_state != NETREG_REGISTERED);
3379 /* If device is running, close it first. */
3380 if (dev->flags & IFF_UP)
3383 /* And unlink it from device chain. */
3384 write_lock_bh(&dev_base_lock);
3385 list_del(&dev->dev_list);
3386 hlist_del(&dev->name_hlist);
3387 hlist_del(&dev->index_hlist);
3388 write_unlock_bh(&dev_base_lock);
3390 dev->reg_state = NETREG_UNREGISTERING;
3394 /* Shutdown queueing discipline. */
3398 /* Notify protocols that we are about to destroy
3399 this device. They should clean up all of their state.
3401 raw_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3404 * Flush the multicast chain
3406 dev_mc_discard(dev);
3411 /* Notifier chain MUST detach us from master device. */
3412 BUG_TRAP(!dev->master);
3414 /* Finish processing unregister after unlock */
3423 * unregister_netdev - remove device from the kernel
3426 * This function shuts down a device interface and removes it
3427 * from the kernel tables. It does not return a value.
3430 * This is just a wrapper for unregister_netdevice that takes
3431 * the rtnl semaphore. In general you want to use this and not
3432 * unregister_netdevice.
3434 void unregister_netdev(struct net_device *dev)
3437 unregister_netdevice(dev);
3441 EXPORT_SYMBOL(unregister_netdev);
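/*
 * Illustrative sketch (not compiled): the usual driver removal order.
 * unregister_netdev() takes the RTNL semaphore and, via netdev_run_todo(),
 * waits until all references are gone; only then is free_netdev() safe.
 * The my_remove() name is hypothetical.
 */
#if 0
static void my_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* blocks until the refcnt drops to 0 */
	free_netdev(dev);		/* releases the net_device itself */
}
#endif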
3443 static int dev_cpu_callback(struct notifier_block *nfb,
3444 unsigned long action,
3447 struct sk_buff **list_skb;
3448 struct net_device **list_net;
3449 struct sk_buff *skb;
3450 unsigned int cpu, oldcpu = (unsigned long)ocpu;
3451 struct softnet_data *sd, *oldsd;
3453 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
3456 local_irq_disable();
3457 cpu = smp_processor_id();
3458 sd = &per_cpu(softnet_data, cpu);
3459 oldsd = &per_cpu(softnet_data, oldcpu);
3461 /* Find end of our completion_queue. */
3462 list_skb = &sd->completion_queue;
3464 while (*list_skb) list_skb = &(*list_skb)->next;
3465 /* Append completion queue from offline CPU. */
3466 *list_skb = oldsd->completion_queue;
3467 oldsd->completion_queue = NULL;
3469 /* Find end of our output_queue. */
3470 list_net = &sd->output_queue;
3472 while (*list_net) list_net = &(*list_net)->next_sched;
3473 /* Append output queue from offline CPU. */
3474 *list_net = oldsd->output_queue;
3475 oldsd->output_queue = NULL;
3477 raise_softirq_irqoff(NET_TX_SOFTIRQ);
3480 /* Process offline CPU's input_pkt_queue */
3481 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) netif_rx(skb);
3487 #ifdef CONFIG_NET_DMA
3489 * net_dma_rebalance - redistribute the allocated DMA channels among CPUs
3490 * This is called when the number of channels allocated to the net_dma_client
3491 * changes. The net_dma_client tries to have one DMA channel per CPU.
3493 static void net_dma_rebalance(void)
3495 unsigned int cpu, i, n;
3496 struct dma_chan *chan;
3498 if (net_dma_count == 0) {
3499 for_each_online_cpu(cpu)
3500 rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL);
3505 cpu = first_cpu(cpu_online_map);
3508 list_for_each_entry(chan, &net_dma_client->channels, client_node) {
3509 n = ((num_online_cpus() / net_dma_count)
3510 + (i < (num_online_cpus() % net_dma_count) ? 1 : 0));
3513 per_cpu(softnet_data, cpu).net_dma = chan;
3514 cpu = next_cpu(cpu, cpu_online_map);
3523 * netdev_dma_event - event callback for the net_dma_client
3524 * @client: should always be net_dma_client
3525 * @chan: DMA channel for the event
3526 * @event: event type
3528 static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan,
3529 enum dma_event event)
3531 spin_lock(&net_dma_event_lock);
3533 case DMA_RESOURCE_ADDED:
3535 net_dma_rebalance();
3537 case DMA_RESOURCE_REMOVED:
3539 net_dma_rebalance();
3544 spin_unlock(&net_dma_event_lock);
3548 * netdev_dma_register - register the networking subsystem as a DMA client
3550 static int __init netdev_dma_register(void)
3552 spin_lock_init(&net_dma_event_lock);
3553 net_dma_client = dma_async_client_register(netdev_dma_event);
3554 if (net_dma_client == NULL)
3557 dma_async_client_chan_request(net_dma_client, num_online_cpus());
3562 static int __init netdev_dma_register(void) { return -ENODEV; }
3563 #endif /* CONFIG_NET_DMA */
3566 * Initialize the DEV module. At boot time this walks the device list and
3567 * unhooks any devices that fail to initialise (normally hardware not
3568 * present) and leaves us with a valid list of present and active devices.
3573 * This is called single threaded during boot, so no need
3574 * to take the rtnl semaphore.
3576 static int __init net_dev_init(void)
3578 int i, rc = -ENOMEM;
3580 BUG_ON(!dev_boot_phase);
3582 if (dev_proc_init())
3585 if (netdev_sysfs_init())
3588 INIT_LIST_HEAD(&ptype_all);
3589 for (i = 0; i < 16; i++)
3590 INIT_LIST_HEAD(&ptype_base[i]);
3592 for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3593 INIT_HLIST_HEAD(&dev_name_head[i]);
3595 for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3596 INIT_HLIST_HEAD(&dev_index_head[i]);
3599 * Initialise the packet receive queues.
3602 for_each_possible_cpu(i) {
3603 struct softnet_data *queue;
3605 queue = &per_cpu(softnet_data, i);
3606 skb_queue_head_init(&queue->input_pkt_queue);
3607 queue->completion_queue = NULL;
3608 INIT_LIST_HEAD(&queue->poll_list);
3609 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3610 queue->backlog_dev.weight = weight_p;
3611 queue->backlog_dev.poll = process_backlog;
3612 atomic_set(&queue->backlog_dev.refcnt, 1);
3615 netdev_dma_register();
3619 open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3620 open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3622 hotcpu_notifier(dev_cpu_callback, 0);
3630 subsys_initcall(net_dev_init);
3632 EXPORT_SYMBOL(__dev_get_by_index);
3633 EXPORT_SYMBOL(__dev_get_by_name);
3634 EXPORT_SYMBOL(__dev_remove_pack);
3635 EXPORT_SYMBOL(dev_valid_name);
3636 EXPORT_SYMBOL(dev_add_pack);
3637 EXPORT_SYMBOL(dev_alloc_name);
3638 EXPORT_SYMBOL(dev_close);
3639 EXPORT_SYMBOL(dev_get_by_flags);
3640 EXPORT_SYMBOL(dev_get_by_index);
3641 EXPORT_SYMBOL(dev_get_by_name);
3642 EXPORT_SYMBOL(dev_open);
3643 EXPORT_SYMBOL(dev_queue_xmit);
3644 EXPORT_SYMBOL(dev_remove_pack);
3645 EXPORT_SYMBOL(dev_set_allmulti);
3646 EXPORT_SYMBOL(dev_set_promiscuity);
3647 EXPORT_SYMBOL(dev_change_flags);
3648 EXPORT_SYMBOL(dev_set_mtu);
3649 EXPORT_SYMBOL(dev_set_mac_address);
3650 EXPORT_SYMBOL(free_netdev);
3651 EXPORT_SYMBOL(netdev_boot_setup_check);
3652 EXPORT_SYMBOL(netdev_set_master);
3653 EXPORT_SYMBOL(netdev_state_change);
3654 EXPORT_SYMBOL(netif_receive_skb);
3655 EXPORT_SYMBOL(netif_rx);
3656 EXPORT_SYMBOL(register_gifconf);
3657 EXPORT_SYMBOL(register_netdevice);
3658 EXPORT_SYMBOL(register_netdevice_notifier);
3659 EXPORT_SYMBOL(skb_checksum_help);
3660 EXPORT_SYMBOL(synchronize_net);
3661 EXPORT_SYMBOL(unregister_netdevice);
3662 EXPORT_SYMBOL(unregister_netdevice_notifier);
3663 EXPORT_SYMBOL(net_enable_timestamp);
3664 EXPORT_SYMBOL(net_disable_timestamp);
3665 EXPORT_SYMBOL(dev_get_flags);
3667 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3668 EXPORT_SYMBOL(br_handle_frame_hook);
3669 EXPORT_SYMBOL(br_fdb_get_hook);
3670 EXPORT_SYMBOL(br_fdb_put_hook);
3674 EXPORT_SYMBOL(dev_load);
3677 EXPORT_PER_CPU_SYMBOL(softnet_data);