]> Git Repo - J-linux.git/blob - drivers/net/bonding/bond_main.c
Merge tag 'vfs-6.13-rc7.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
[J-linux.git] / drivers / net / bonding / bond_main.c
1 // SPDX-License-Identifier: GPL-1.0+
2 /*
3  * originally based on the dummy device.
4  *
5  * Copyright 1999, Thomas Davis, tadavis@lbl.gov.
6  * Based on dummy.c, and eql.c devices.
7  *
8  * bonding.c: an Ethernet Bonding driver
9  *
10  * This is useful to talk to a Cisco EtherChannel compatible equipment:
11  *      Cisco 5500
12  *      Sun Trunking (Solaris)
13  *      Alteon AceDirector Trunks
14  *      Linux Bonding
15  *      and probably many L2 switches ...
16  *
17  * How it works:
18  *    ifconfig bond0 ipaddress netmask up
19  *      will setup a network device, with an ip address.  No mac address
20  *      will be assigned at this time.  The hw mac address will come from
21  *      the first slave bonded to the channel.  All slaves will then use
22  *      this hw mac address.
23  *
24  *    ifconfig bond0 down
25  *         will release all slaves, marking them as down.
26  *
27  *    ifenslave bond0 eth0
28  *      will attach eth0 to bond0 as a slave.  eth0 hw mac address will either
29  *      a: be used as initial mac address
30  *      b: if a hw mac address already is there, eth0's hw mac address
31  *         will then be set from bond0.
32  *
33  */
34
35 #include <linux/kernel.h>
36 #include <linux/module.h>
37 #include <linux/types.h>
38 #include <linux/fcntl.h>
39 #include <linux/filter.h>
40 #include <linux/interrupt.h>
41 #include <linux/ptrace.h>
42 #include <linux/ioport.h>
43 #include <linux/in.h>
44 #include <net/ip.h>
45 #include <linux/ip.h>
46 #include <linux/icmp.h>
47 #include <linux/icmpv6.h>
48 #include <linux/tcp.h>
49 #include <linux/udp.h>
50 #include <linux/slab.h>
51 #include <linux/string.h>
52 #include <linux/init.h>
53 #include <linux/timer.h>
54 #include <linux/socket.h>
55 #include <linux/ctype.h>
56 #include <linux/inet.h>
57 #include <linux/bitops.h>
58 #include <linux/io.h>
59 #include <asm/dma.h>
60 #include <linux/uaccess.h>
61 #include <linux/errno.h>
62 #include <linux/netdevice.h>
63 #include <linux/inetdevice.h>
64 #include <linux/igmp.h>
65 #include <linux/etherdevice.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/rtnetlink.h>
69 #include <linux/smp.h>
70 #include <linux/if_ether.h>
71 #include <net/arp.h>
72 #include <linux/mii.h>
73 #include <linux/ethtool.h>
74 #include <linux/if_vlan.h>
75 #include <linux/if_bonding.h>
76 #include <linux/phy.h>
77 #include <linux/jiffies.h>
78 #include <linux/preempt.h>
79 #include <net/route.h>
80 #include <net/net_namespace.h>
81 #include <net/netns/generic.h>
82 #include <net/pkt_sched.h>
83 #include <linux/rculist.h>
84 #include <net/flow_dissector.h>
85 #include <net/xfrm.h>
86 #include <net/bonding.h>
87 #include <net/bond_3ad.h>
88 #include <net/bond_alb.h>
89 #if IS_ENABLED(CONFIG_TLS_DEVICE)
90 #include <net/tls.h>
91 #endif
92 #include <net/ip6_route.h>
93 #include <net/xdp.h>
94
95 #include "bonding_priv.h"
96
97 /*---------------------------- Module parameters ----------------------------*/
98
99 /* monitor all links that often (in milliseconds). <=0 disables monitoring */
100
101 static int max_bonds    = BOND_DEFAULT_MAX_BONDS;
102 static int tx_queues    = BOND_DEFAULT_TX_QUEUES;
103 static int num_peer_notif = 1;
104 static int miimon;
105 static int updelay;
106 static int downdelay;
107 static int use_carrier  = 1;
108 static char *mode;
109 static char *primary;
110 static char *primary_reselect;
111 static char *lacp_rate;
112 static int min_links;
113 static char *ad_select;
114 static char *xmit_hash_policy;
115 static int arp_interval;
116 static char *arp_ip_target[BOND_MAX_ARP_TARGETS];
117 static char *arp_validate;
118 static char *arp_all_targets;
119 static char *fail_over_mac;
120 static int all_slaves_active;
121 static struct bond_params bonding_defaults;
122 static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
123 static int packets_per_slave = 1;
124 static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
125
126 module_param(max_bonds, int, 0);
127 MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
128 module_param(tx_queues, int, 0);
129 MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)");
130 module_param_named(num_grat_arp, num_peer_notif, int, 0644);
131 MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on "
132                                "failover event (alias of num_unsol_na)");
133 module_param_named(num_unsol_na, num_peer_notif, int, 0644);
134 MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on "
135                                "failover event (alias of num_grat_arp)");
136 module_param(miimon, int, 0);
137 MODULE_PARM_DESC(miimon, "Link check interval in milliseconds");
138 module_param(updelay, int, 0);
139 MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds");
140 module_param(downdelay, int, 0);
141 MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
142                             "in milliseconds");
143 module_param(use_carrier, int, 0);
144 MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
145                               "0 for off, 1 for on (default)");
146 module_param(mode, charp, 0);
147 MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
148                        "1 for active-backup, 2 for balance-xor, "
149                        "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
150                        "6 for balance-alb");
151 module_param(primary, charp, 0);
152 MODULE_PARM_DESC(primary, "Primary network device to use");
153 module_param(primary_reselect, charp, 0);
154 MODULE_PARM_DESC(primary_reselect, "Reselect primary slave "
155                                    "once it comes up; "
156                                    "0 for always (default), "
157                                    "1 for only if speed of primary is "
158                                    "better, "
159                                    "2 for only on active slave "
160                                    "failure");
161 module_param(lacp_rate, charp, 0);
162 MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; "
163                             "0 for slow, 1 for fast");
164 module_param(ad_select, charp, 0);
165 MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; "
166                             "0 for stable (default), 1 for bandwidth, "
167                             "2 for count");
168 module_param(min_links, int, 0);
169 MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");
170
171 module_param(xmit_hash_policy, charp, 0);
172 MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
173                                    "0 for layer 2 (default), 1 for layer 3+4, "
174                                    "2 for layer 2+3, 3 for encap layer 2+3, "
175                                    "4 for encap layer 3+4, 5 for vlan+srcmac");
176 module_param(arp_interval, int, 0);
177 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
178 module_param_array(arp_ip_target, charp, NULL, 0);
179 MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");
180 module_param(arp_validate, charp, 0);
181 MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; "
182                                "0 for none (default), 1 for active, "
183                                "2 for backup, 3 for all");
184 module_param(arp_all_targets, charp, 0);
185 MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all");
186 module_param(fail_over_mac, charp, 0);
187 MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to "
188                                 "the same MAC; 0 for none (default), "
189                                 "1 for active, 2 for follow");
190 module_param(all_slaves_active, int, 0);
191 MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface "
192                                      "by setting active flag for all slaves; "
193                                      "0 for never (default), 1 for always.");
194 module_param(resend_igmp, int, 0);
195 MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on "
196                               "link failure");
197 module_param(packets_per_slave, int, 0);
198 MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr "
199                                     "mode; 0 for a random slave, 1 packet per "
200                                     "slave (default), >1 packets per slave.");
201 module_param(lp_interval, uint, 0);
202 MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where "
203                               "the bonding driver sends learning packets to "
204                               "each slaves peer switch. The default is 1.");
205
206 /*----------------------------- Global variables ----------------------------*/
207
208 #ifdef CONFIG_NET_POLL_CONTROLLER
209 atomic_t netpoll_block_tx = ATOMIC_INIT(0);
210 #endif
211
212 unsigned int bond_net_id __read_mostly;
213
214 static const struct flow_dissector_key flow_keys_bonding_keys[] = {
215         {
216                 .key_id = FLOW_DISSECTOR_KEY_CONTROL,
217                 .offset = offsetof(struct flow_keys, control),
218         },
219         {
220                 .key_id = FLOW_DISSECTOR_KEY_BASIC,
221                 .offset = offsetof(struct flow_keys, basic),
222         },
223         {
224                 .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
225                 .offset = offsetof(struct flow_keys, addrs.v4addrs),
226         },
227         {
228                 .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
229                 .offset = offsetof(struct flow_keys, addrs.v6addrs),
230         },
231         {
232                 .key_id = FLOW_DISSECTOR_KEY_TIPC,
233                 .offset = offsetof(struct flow_keys, addrs.tipckey),
234         },
235         {
236                 .key_id = FLOW_DISSECTOR_KEY_PORTS,
237                 .offset = offsetof(struct flow_keys, ports),
238         },
239         {
240                 .key_id = FLOW_DISSECTOR_KEY_ICMP,
241                 .offset = offsetof(struct flow_keys, icmp),
242         },
243         {
244                 .key_id = FLOW_DISSECTOR_KEY_VLAN,
245                 .offset = offsetof(struct flow_keys, vlan),
246         },
247         {
248                 .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
249                 .offset = offsetof(struct flow_keys, tags),
250         },
251         {
252                 .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
253                 .offset = offsetof(struct flow_keys, keyid),
254         },
255 };
256
257 static struct flow_dissector flow_keys_bonding __read_mostly;
258
259 /*-------------------------- Forward declarations ---------------------------*/
260
261 static int bond_init(struct net_device *bond_dev);
262 static void bond_uninit(struct net_device *bond_dev);
263 static void bond_get_stats(struct net_device *bond_dev,
264                            struct rtnl_link_stats64 *stats);
265 static void bond_slave_arr_handler(struct work_struct *work);
266 static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
267                                   int mod);
268 static void bond_netdev_notify_work(struct work_struct *work);
269
270 /*---------------------------- General routines -----------------------------*/
271
272 const char *bond_mode_name(int mode)
273 {
274         static const char *names[] = {
275                 [BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)",
276                 [BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)",
277                 [BOND_MODE_XOR] = "load balancing (xor)",
278                 [BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)",
279                 [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
280                 [BOND_MODE_TLB] = "transmit load balancing",
281                 [BOND_MODE_ALB] = "adaptive load balancing",
282         };
283
284         if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
285                 return "unknown";
286
287         return names[mode];
288 }
289
290 /**
291  * bond_dev_queue_xmit - Prepare skb for xmit.
292  *
293  * @bond: bond device that got this skb for tx.
294  * @skb: hw accel VLAN tagged skb to transmit
295  * @slave_dev: slave that is supposed to xmit this skbuff
296  */
297 netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
298                         struct net_device *slave_dev)
299 {
300         skb->dev = slave_dev;
301
302         BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
303                      sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
304         skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
305
306         if (unlikely(netpoll_tx_running(bond->dev)))
307                 return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);
308
309         return dev_queue_xmit(skb);
310 }
311
312 static bool bond_sk_check(struct bonding *bond)
313 {
314         switch (BOND_MODE(bond)) {
315         case BOND_MODE_8023AD:
316         case BOND_MODE_XOR:
317                 if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34)
318                         return true;
319                 fallthrough;
320         default:
321                 return false;
322         }
323 }
324
325 static bool bond_xdp_check(struct bonding *bond)
326 {
327         switch (BOND_MODE(bond)) {
328         case BOND_MODE_ROUNDROBIN:
329         case BOND_MODE_ACTIVEBACKUP:
330                 return true;
331         case BOND_MODE_8023AD:
332         case BOND_MODE_XOR:
333                 /* vlan+srcmac is not supported with XDP as in most cases the 802.1q
334                  * payload is not in the packet due to hardware offload.
335                  */
336                 if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC)
337                         return true;
338                 fallthrough;
339         default:
340                 return false;
341         }
342 }
343
344 /*---------------------------------- VLAN -----------------------------------*/
345
346 /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
347  * We don't protect the slave list iteration with a lock because:
348  * a. This operation is performed in IOCTL context,
349  * b. The operation is protected by the RTNL semaphore in the 8021q code,
350  * c. Holding a lock with BH disabled while directly calling a base driver
351  *    entry point is generally a BAD idea.
352  *
353  * The design of synchronization/protection for this operation in the 8021q
354  * module is good for one or more VLAN devices over a single physical device
355  * and cannot be extended for a teaming solution like bonding, so there is a
356  * potential race condition here where a net device from the vlan group might
357  * be referenced (either by a base driver or the 8021q code) while it is being
358  * removed from the system. However, it turns out we're not making matters
359  * worse, and if it works for regular VLAN usage it will work here too.
360 */
361
362 /**
363  * bond_vlan_rx_add_vid - Propagates adding an id to slaves
364  * @bond_dev: bonding net device that got called
365  * @proto: network protocol ID
366  * @vid: vlan id being added
367  */
368 static int bond_vlan_rx_add_vid(struct net_device *bond_dev,
369                                 __be16 proto, u16 vid)
370 {
371         struct bonding *bond = netdev_priv(bond_dev);
372         struct slave *slave, *rollback_slave;
373         struct list_head *iter;
374         int res;
375
376         bond_for_each_slave(bond, slave, iter) {
377                 res = vlan_vid_add(slave->dev, proto, vid);
378                 if (res)
379                         goto unwind;
380         }
381
382         return 0;
383
384 unwind:
385         /* unwind to the slave that failed */
386         bond_for_each_slave(bond, rollback_slave, iter) {
387                 if (rollback_slave == slave)
388                         break;
389
390                 vlan_vid_del(rollback_slave->dev, proto, vid);
391         }
392
393         return res;
394 }
395
396 /**
397  * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
398  * @bond_dev: bonding net device that got called
399  * @proto: network protocol ID
400  * @vid: vlan id being removed
401  */
402 static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
403                                  __be16 proto, u16 vid)
404 {
405         struct bonding *bond = netdev_priv(bond_dev);
406         struct list_head *iter;
407         struct slave *slave;
408
409         bond_for_each_slave(bond, slave, iter)
410                 vlan_vid_del(slave->dev, proto, vid);
411
412         if (bond_is_lb(bond))
413                 bond_alb_clear_vlan(bond, vid);
414
415         return 0;
416 }
417
418 /*---------------------------------- XFRM -----------------------------------*/
419
420 #ifdef CONFIG_XFRM_OFFLOAD
421 /**
422  * bond_ipsec_dev - Get active device for IPsec offload
423  * @xs: pointer to transformer state struct
424  *
425  * Context: caller must hold rcu_read_lock.
426  *
427  * Return: the device for ipsec offload, or NULL if not exist.
428  **/
429 static struct net_device *bond_ipsec_dev(struct xfrm_state *xs)
430 {
431         struct net_device *bond_dev = xs->xso.dev;
432         struct bonding *bond;
433         struct slave *slave;
434
435         if (!bond_dev)
436                 return NULL;
437
438         bond = netdev_priv(bond_dev);
439         if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
440                 return NULL;
441
442         slave = rcu_dereference(bond->curr_active_slave);
443         if (!slave)
444                 return NULL;
445
446         if (!xs->xso.real_dev)
447                 return NULL;
448
449         if (xs->xso.real_dev != slave->dev)
450                 pr_warn_ratelimited("%s: (slave %s): not same with IPsec offload real dev %s\n",
451                                     bond_dev->name, slave->dev->name, xs->xso.real_dev->name);
452
453         return slave->dev;
454 }
455
456 /**
457  * bond_ipsec_add_sa - program device with a security association
458  * @xs: pointer to transformer state struct
459  * @extack: extack point to fill failure reason
460  **/
461 static int bond_ipsec_add_sa(struct xfrm_state *xs,
462                              struct netlink_ext_ack *extack)
463 {
464         struct net_device *bond_dev = xs->xso.dev;
465         struct net_device *real_dev;
466         netdevice_tracker tracker;
467         struct bond_ipsec *ipsec;
468         struct bonding *bond;
469         struct slave *slave;
470         int err;
471
472         if (!bond_dev)
473                 return -EINVAL;
474
475         rcu_read_lock();
476         bond = netdev_priv(bond_dev);
477         slave = rcu_dereference(bond->curr_active_slave);
478         real_dev = slave ? slave->dev : NULL;
479         netdev_hold(real_dev, &tracker, GFP_ATOMIC);
480         rcu_read_unlock();
481         if (!real_dev) {
482                 err = -ENODEV;
483                 goto out;
484         }
485
486         if (!real_dev->xfrmdev_ops ||
487             !real_dev->xfrmdev_ops->xdo_dev_state_add ||
488             netif_is_bond_master(real_dev)) {
489                 NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload");
490                 err = -EINVAL;
491                 goto out;
492         }
493
494         ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL);
495         if (!ipsec) {
496                 err = -ENOMEM;
497                 goto out;
498         }
499
500         xs->xso.real_dev = real_dev;
501         err = real_dev->xfrmdev_ops->xdo_dev_state_add(xs, extack);
502         if (!err) {
503                 ipsec->xs = xs;
504                 INIT_LIST_HEAD(&ipsec->list);
505                 mutex_lock(&bond->ipsec_lock);
506                 list_add(&ipsec->list, &bond->ipsec_list);
507                 mutex_unlock(&bond->ipsec_lock);
508         } else {
509                 kfree(ipsec);
510         }
511 out:
512         netdev_put(real_dev, &tracker);
513         return err;
514 }
515
516 static void bond_ipsec_add_sa_all(struct bonding *bond)
517 {
518         struct net_device *bond_dev = bond->dev;
519         struct net_device *real_dev;
520         struct bond_ipsec *ipsec;
521         struct slave *slave;
522
523         slave = rtnl_dereference(bond->curr_active_slave);
524         real_dev = slave ? slave->dev : NULL;
525         if (!real_dev)
526                 return;
527
528         mutex_lock(&bond->ipsec_lock);
529         if (!real_dev->xfrmdev_ops ||
530             !real_dev->xfrmdev_ops->xdo_dev_state_add ||
531             netif_is_bond_master(real_dev)) {
532                 if (!list_empty(&bond->ipsec_list))
533                         slave_warn(bond_dev, real_dev,
534                                    "%s: no slave xdo_dev_state_add\n",
535                                    __func__);
536                 goto out;
537         }
538
539         list_for_each_entry(ipsec, &bond->ipsec_list, list) {
540                 /* If new state is added before ipsec_lock acquired */
541                 if (ipsec->xs->xso.real_dev == real_dev)
542                         continue;
543
544                 ipsec->xs->xso.real_dev = real_dev;
545                 if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
546                         slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
547                         ipsec->xs->xso.real_dev = NULL;
548                 }
549         }
550 out:
551         mutex_unlock(&bond->ipsec_lock);
552 }
553
554 /**
555  * bond_ipsec_del_sa - clear out this specific SA
556  * @xs: pointer to transformer state struct
557  **/
558 static void bond_ipsec_del_sa(struct xfrm_state *xs)
559 {
560         struct net_device *bond_dev = xs->xso.dev;
561         struct net_device *real_dev;
562         netdevice_tracker tracker;
563         struct bond_ipsec *ipsec;
564         struct bonding *bond;
565         struct slave *slave;
566
567         if (!bond_dev)
568                 return;
569
570         rcu_read_lock();
571         bond = netdev_priv(bond_dev);
572         slave = rcu_dereference(bond->curr_active_slave);
573         real_dev = slave ? slave->dev : NULL;
574         netdev_hold(real_dev, &tracker, GFP_ATOMIC);
575         rcu_read_unlock();
576
577         if (!slave)
578                 goto out;
579
580         if (!xs->xso.real_dev)
581                 goto out;
582
583         WARN_ON(xs->xso.real_dev != real_dev);
584
585         if (!real_dev->xfrmdev_ops ||
586             !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
587             netif_is_bond_master(real_dev)) {
588                 slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__);
589                 goto out;
590         }
591
592         real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
593 out:
594         netdev_put(real_dev, &tracker);
595         mutex_lock(&bond->ipsec_lock);
596         list_for_each_entry(ipsec, &bond->ipsec_list, list) {
597                 if (ipsec->xs == xs) {
598                         list_del(&ipsec->list);
599                         kfree(ipsec);
600                         break;
601                 }
602         }
603         mutex_unlock(&bond->ipsec_lock);
604 }
605
606 static void bond_ipsec_del_sa_all(struct bonding *bond)
607 {
608         struct net_device *bond_dev = bond->dev;
609         struct net_device *real_dev;
610         struct bond_ipsec *ipsec;
611         struct slave *slave;
612
613         slave = rtnl_dereference(bond->curr_active_slave);
614         real_dev = slave ? slave->dev : NULL;
615         if (!real_dev)
616                 return;
617
618         mutex_lock(&bond->ipsec_lock);
619         list_for_each_entry(ipsec, &bond->ipsec_list, list) {
620                 if (!ipsec->xs->xso.real_dev)
621                         continue;
622
623                 if (!real_dev->xfrmdev_ops ||
624                     !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
625                     netif_is_bond_master(real_dev)) {
626                         slave_warn(bond_dev, real_dev,
627                                    "%s: no slave xdo_dev_state_delete\n",
628                                    __func__);
629                 } else {
630                         real_dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs);
631                         if (real_dev->xfrmdev_ops->xdo_dev_state_free)
632                                 real_dev->xfrmdev_ops->xdo_dev_state_free(ipsec->xs);
633                 }
634         }
635         mutex_unlock(&bond->ipsec_lock);
636 }
637
638 static void bond_ipsec_free_sa(struct xfrm_state *xs)
639 {
640         struct net_device *bond_dev = xs->xso.dev;
641         struct net_device *real_dev;
642         netdevice_tracker tracker;
643         struct bonding *bond;
644         struct slave *slave;
645
646         if (!bond_dev)
647                 return;
648
649         rcu_read_lock();
650         bond = netdev_priv(bond_dev);
651         slave = rcu_dereference(bond->curr_active_slave);
652         real_dev = slave ? slave->dev : NULL;
653         netdev_hold(real_dev, &tracker, GFP_ATOMIC);
654         rcu_read_unlock();
655
656         if (!slave)
657                 goto out;
658
659         if (!xs->xso.real_dev)
660                 goto out;
661
662         WARN_ON(xs->xso.real_dev != real_dev);
663
664         if (real_dev && real_dev->xfrmdev_ops &&
665             real_dev->xfrmdev_ops->xdo_dev_state_free)
666                 real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
667 out:
668         netdev_put(real_dev, &tracker);
669 }
670
671 /**
672  * bond_ipsec_offload_ok - can this packet use the xfrm hw offload
673  * @skb: current data packet
674  * @xs: pointer to transformer state struct
675  **/
676 static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
677 {
678         struct net_device *real_dev;
679         bool ok = false;
680
681         rcu_read_lock();
682         real_dev = bond_ipsec_dev(xs);
683         if (!real_dev)
684                 goto out;
685
686         if (!real_dev->xfrmdev_ops ||
687             !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
688             netif_is_bond_master(real_dev))
689                 goto out;
690
691         ok = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
692 out:
693         rcu_read_unlock();
694         return ok;
695 }
696
697 /**
698  * bond_advance_esn_state - ESN support for IPSec HW offload
699  * @xs: pointer to transformer state struct
700  **/
701 static void bond_advance_esn_state(struct xfrm_state *xs)
702 {
703         struct net_device *real_dev;
704
705         rcu_read_lock();
706         real_dev = bond_ipsec_dev(xs);
707         if (!real_dev)
708                 goto out;
709
710         if (!real_dev->xfrmdev_ops ||
711             !real_dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
712                 pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_advance_esn\n", __func__, real_dev->name);
713                 goto out;
714         }
715
716         real_dev->xfrmdev_ops->xdo_dev_state_advance_esn(xs);
717 out:
718         rcu_read_unlock();
719 }
720
721 /**
722  * bond_xfrm_update_stats - Update xfrm state
723  * @xs: pointer to transformer state struct
724  **/
725 static void bond_xfrm_update_stats(struct xfrm_state *xs)
726 {
727         struct net_device *real_dev;
728
729         rcu_read_lock();
730         real_dev = bond_ipsec_dev(xs);
731         if (!real_dev)
732                 goto out;
733
734         if (!real_dev->xfrmdev_ops ||
735             !real_dev->xfrmdev_ops->xdo_dev_state_update_stats) {
736                 pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_update_stats\n", __func__, real_dev->name);
737                 goto out;
738         }
739
740         real_dev->xfrmdev_ops->xdo_dev_state_update_stats(xs);
741 out:
742         rcu_read_unlock();
743 }
744
745 static const struct xfrmdev_ops bond_xfrmdev_ops = {
746         .xdo_dev_state_add = bond_ipsec_add_sa,
747         .xdo_dev_state_delete = bond_ipsec_del_sa,
748         .xdo_dev_state_free = bond_ipsec_free_sa,
749         .xdo_dev_offload_ok = bond_ipsec_offload_ok,
750         .xdo_dev_state_advance_esn = bond_advance_esn_state,
751         .xdo_dev_state_update_stats = bond_xfrm_update_stats,
752 };
753 #endif /* CONFIG_XFRM_OFFLOAD */
754
755 /*------------------------------- Link status -------------------------------*/
756
757 /* Set the carrier state for the master according to the state of its
758  * slaves.  If any slaves are up, the master is up.  In 802.3ad mode,
759  * do special 802.3ad magic.
760  *
761  * Returns zero if carrier state does not change, nonzero if it does.
762  */
763 int bond_set_carrier(struct bonding *bond)
764 {
765         struct list_head *iter;
766         struct slave *slave;
767
768         if (!bond_has_slaves(bond))
769                 goto down;
770
771         if (BOND_MODE(bond) == BOND_MODE_8023AD)
772                 return bond_3ad_set_carrier(bond);
773
774         bond_for_each_slave(bond, slave, iter) {
775                 if (slave->link == BOND_LINK_UP) {
776                         if (!netif_carrier_ok(bond->dev)) {
777                                 netif_carrier_on(bond->dev);
778                                 return 1;
779                         }
780                         return 0;
781                 }
782         }
783
784 down:
785         if (netif_carrier_ok(bond->dev)) {
786                 netif_carrier_off(bond->dev);
787                 return 1;
788         }
789         return 0;
790 }
791
792 /* Get link speed and duplex from the slave's base driver
793  * using ethtool. If for some reason the call fails or the
794  * values are invalid, set speed and duplex to -1,
795  * and return. Return 1 if speed or duplex settings are
796  * UNKNOWN; 0 otherwise.
797  */
798 static int bond_update_speed_duplex(struct slave *slave)
799 {
800         struct net_device *slave_dev = slave->dev;
801         struct ethtool_link_ksettings ecmd;
802         int res;
803
804         slave->speed = SPEED_UNKNOWN;
805         slave->duplex = DUPLEX_UNKNOWN;
806
807         res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
808         if (res < 0)
809                 return 1;
810         if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
811                 return 1;
812         switch (ecmd.base.duplex) {
813         case DUPLEX_FULL:
814         case DUPLEX_HALF:
815                 break;
816         default:
817                 return 1;
818         }
819
820         slave->speed = ecmd.base.speed;
821         slave->duplex = ecmd.base.duplex;
822
823         return 0;
824 }
825
826 const char *bond_slave_link_status(s8 link)
827 {
828         switch (link) {
829         case BOND_LINK_UP:
830                 return "up";
831         case BOND_LINK_FAIL:
832                 return "going down";
833         case BOND_LINK_DOWN:
834                 return "down";
835         case BOND_LINK_BACK:
836                 return "going back";
837         default:
838                 return "unknown";
839         }
840 }
841
842 /* if <dev> supports MII link status reporting, check its link status.
843  *
844  * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(),
845  * depending upon the setting of the use_carrier parameter.
846  *
847  * Return either BMSR_LSTATUS, meaning that the link is up (or we
848  * can't tell and just pretend it is), or 0, meaning that the link is
849  * down.
850  *
851  * If reporting is non-zero, instead of faking link up, return -1 if
852  * both ETHTOOL and MII ioctls fail (meaning the device does not
853  * support them).  If use_carrier is set, return whatever it says.
854  * It'd be nice if there was a good way to tell if a driver supports
855  * netif_carrier, but there really isn't.
856  */
857 static int bond_check_dev_link(struct bonding *bond,
858                                struct net_device *slave_dev, int reporting)
859 {
860         const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
861         int (*ioctl)(struct net_device *, struct ifreq *, int);
862         struct ifreq ifr;
863         struct mii_ioctl_data *mii;
864
865         if (!reporting && !netif_running(slave_dev))
866                 return 0;
867
868         if (bond->params.use_carrier)
869                 return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0;
870
871         /* Try to get link status using Ethtool first. */
872         if (slave_dev->ethtool_ops->get_link)
873                 return slave_dev->ethtool_ops->get_link(slave_dev) ?
874                         BMSR_LSTATUS : 0;
875
876         /* Ethtool can't be used, fallback to MII ioctls. */
877         ioctl = slave_ops->ndo_eth_ioctl;
878         if (ioctl) {
879                 /* TODO: set pointer to correct ioctl on a per team member
880                  *       bases to make this more efficient. that is, once
881                  *       we determine the correct ioctl, we will always
882                  *       call it and not the others for that team
883                  *       member.
884                  */
885
886                 /* We cannot assume that SIOCGMIIPHY will also read a
887                  * register; not all network drivers (e.g., e100)
888                  * support that.
889                  */
890
891                 /* Yes, the mii is overlaid on the ifreq.ifr_ifru */
892                 strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
893                 mii = if_mii(&ifr);
894                 if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
895                         mii->reg_num = MII_BMSR;
896                         if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
897                                 return mii->val_out & BMSR_LSTATUS;
898                 }
899         }
900
901         /* If reporting, report that either there's no ndo_eth_ioctl,
902          * or both SIOCGMIIREG and get_link failed (meaning that we
903          * cannot report link status).  If not reporting, pretend
904          * we're ok.
905          */
906         return reporting ? -1 : BMSR_LSTATUS;
907 }
908
909 /*----------------------------- Multicast list ------------------------------*/
910
911 /* Push the promiscuity flag down to appropriate slaves */
912 static int bond_set_promiscuity(struct bonding *bond, int inc)
913 {
914         struct list_head *iter;
915         int err = 0;
916
917         if (bond_uses_primary(bond)) {
918                 struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
919
920                 if (curr_active)
921                         err = dev_set_promiscuity(curr_active->dev, inc);
922         } else {
923                 struct slave *slave;
924
925                 bond_for_each_slave(bond, slave, iter) {
926                         err = dev_set_promiscuity(slave->dev, inc);
927                         if (err)
928                                 return err;
929                 }
930         }
931         return err;
932 }
933
934 /* Push the allmulti flag down to all slaves */
935 static int bond_set_allmulti(struct bonding *bond, int inc)
936 {
937         struct list_head *iter;
938         int err = 0;
939
940         if (bond_uses_primary(bond)) {
941                 struct slave *curr_active = rtnl_dereference(bond->curr_active_slave);
942
943                 if (curr_active)
944                         err = dev_set_allmulti(curr_active->dev, inc);
945         } else {
946                 struct slave *slave;
947
948                 bond_for_each_slave(bond, slave, iter) {
949                         err = dev_set_allmulti(slave->dev, inc);
950                         if (err)
951                                 return err;
952                 }
953         }
954         return err;
955 }
956
957 /* Retrieve the list of registered multicast addresses for the bonding
958  * device and retransmit an IGMP JOIN request to the current active
959  * slave.
960  */
961 static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
962 {
963         struct bonding *bond = container_of(work, struct bonding,
964                                             mcast_work.work);
965
966         if (!rtnl_trylock()) {
967                 queue_delayed_work(bond->wq, &bond->mcast_work, 1);
968                 return;
969         }
970         call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);
971
972         if (bond->igmp_retrans > 1) {
973                 bond->igmp_retrans--;
974                 queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
975         }
976         rtnl_unlock();
977 }
978
979 /* Flush bond's hardware addresses from slave */
980 static void bond_hw_addr_flush(struct net_device *bond_dev,
981                                struct net_device *slave_dev)
982 {
983         struct bonding *bond = netdev_priv(bond_dev);
984
985         dev_uc_unsync(slave_dev, bond_dev);
986         dev_mc_unsync(slave_dev, bond_dev);
987
988         if (BOND_MODE(bond) == BOND_MODE_8023AD)
989                 dev_mc_del(slave_dev, lacpdu_mcast_addr);
990 }
991
992 /*--------------------------- Active slave change ---------------------------*/
993
994 /* Update the hardware address list and promisc/allmulti for the new and
995  * old active slaves (if any).  Modes that are not using primary keep all
996  * slaves up date at all times; only the modes that use primary need to call
997  * this function to swap these settings during a failover.
998  */
999 static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
1000                               struct slave *old_active)
1001 {
1002         if (old_active) {
1003                 if (bond->dev->flags & IFF_PROMISC)
1004                         dev_set_promiscuity(old_active->dev, -1);
1005
1006                 if (bond->dev->flags & IFF_ALLMULTI)
1007                         dev_set_allmulti(old_active->dev, -1);
1008
1009                 if (bond->dev->flags & IFF_UP)
1010                         bond_hw_addr_flush(bond->dev, old_active->dev);
1011
1012                 bond_slave_ns_maddrs_add(bond, old_active);
1013         }
1014
1015         if (new_active) {
1016                 /* FIXME: Signal errors upstream. */
1017                 if (bond->dev->flags & IFF_PROMISC)
1018                         dev_set_promiscuity(new_active->dev, 1);
1019
1020                 if (bond->dev->flags & IFF_ALLMULTI)
1021                         dev_set_allmulti(new_active->dev, 1);
1022
1023                 if (bond->dev->flags & IFF_UP) {
1024                         netif_addr_lock_bh(bond->dev);
1025                         dev_uc_sync(new_active->dev, bond->dev);
1026                         dev_mc_sync(new_active->dev, bond->dev);
1027                         netif_addr_unlock_bh(bond->dev);
1028                 }
1029
1030                 bond_slave_ns_maddrs_del(bond, new_active);
1031         }
1032 }
1033
1034 /**
1035  * bond_set_dev_addr - clone slave's address to bond
1036  * @bond_dev: bond net device
1037  * @slave_dev: slave net device
1038  *
1039  * Should be called with RTNL held.
1040  */
1041 static int bond_set_dev_addr(struct net_device *bond_dev,
1042                              struct net_device *slave_dev)
1043 {
1044         int err;
1045
1046         slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n",
1047                   bond_dev, slave_dev, slave_dev->addr_len);
1048         err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
1049         if (err)
1050                 return err;
1051
1052         __dev_addr_set(bond_dev, slave_dev->dev_addr, slave_dev->addr_len);
1053         bond_dev->addr_assign_type = NET_ADDR_STOLEN;
1054         call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev);
1055         return 0;
1056 }
1057
1058 static struct slave *bond_get_old_active(struct bonding *bond,
1059                                          struct slave *new_active)
1060 {
1061         struct slave *slave;
1062         struct list_head *iter;
1063
1064         bond_for_each_slave(bond, slave, iter) {
1065                 if (slave == new_active)
1066                         continue;
1067
1068                 if (ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr))
1069                         return slave;
1070         }
1071
1072         return NULL;
1073 }
1074
1075 /* bond_do_fail_over_mac
1076  *
1077  * Perform special MAC address swapping for fail_over_mac settings
1078  *
1079  * Called with RTNL
1080  */
1081 static void bond_do_fail_over_mac(struct bonding *bond,
1082                                   struct slave *new_active,
1083                                   struct slave *old_active)
1084 {
1085         u8 tmp_mac[MAX_ADDR_LEN];
1086         struct sockaddr_storage ss;
1087         int rv;
1088
1089         switch (bond->params.fail_over_mac) {
1090         case BOND_FOM_ACTIVE:
1091                 if (new_active) {
1092                         rv = bond_set_dev_addr(bond->dev, new_active->dev);
1093                         if (rv)
1094                                 slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n",
1095                                           -rv);
1096                 }
1097                 break;
1098         case BOND_FOM_FOLLOW:
1099                 /* if new_active && old_active, swap them
1100                  * if just old_active, do nothing (going to no active slave)
1101                  * if just new_active, set new_active to bond's MAC
1102                  */
1103                 if (!new_active)
1104                         return;
1105
1106                 if (!old_active)
1107                         old_active = bond_get_old_active(bond, new_active);
1108
1109                 if (old_active) {
1110                         bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr,
1111                                           new_active->dev->addr_len);
1112                         bond_hw_addr_copy(ss.__data,
1113                                           old_active->dev->dev_addr,
1114                                           old_active->dev->addr_len);
1115                         ss.ss_family = new_active->dev->type;
1116                 } else {
1117                         bond_hw_addr_copy(ss.__data, bond->dev->dev_addr,
1118                                           bond->dev->addr_len);
1119                         ss.ss_family = bond->dev->type;
1120                 }
1121
1122                 rv = dev_set_mac_address(new_active->dev,
1123                                          (struct sockaddr *)&ss, NULL);
1124                 if (rv) {
1125                         slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n",
1126                                   -rv);
1127                         goto out;
1128                 }
1129
1130                 if (!old_active)
1131                         goto out;
1132
1133                 bond_hw_addr_copy(ss.__data, tmp_mac,
1134                                   new_active->dev->addr_len);
1135                 ss.ss_family = old_active->dev->type;
1136
1137                 rv = dev_set_mac_address(old_active->dev,
1138                                          (struct sockaddr *)&ss, NULL);
1139                 if (rv)
1140                         slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n",
1141                                   -rv);
1142 out:
1143                 break;
1144         default:
1145                 netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n",
1146                            bond->params.fail_over_mac);
1147                 break;
1148         }
1149
1150 }
1151
1152 /**
1153  * bond_choose_primary_or_current - select the primary or high priority slave
1154  * @bond: our bonding struct
1155  *
1156  * - Check if there is a primary link. If the primary link was set and is up,
1157  *   go on and do link reselection.
1158  *
1159  * - If primary link is not set or down, find the highest priority link.
1160  *   If the highest priority link is not current slave, set it as primary
1161  *   link and do link reselection.
1162  */
1163 static struct slave *bond_choose_primary_or_current(struct bonding *bond)
1164 {
1165         struct slave *prim = rtnl_dereference(bond->primary_slave);
1166         struct slave *curr = rtnl_dereference(bond->curr_active_slave);
1167         struct slave *slave, *hprio = NULL;
1168         struct list_head *iter;
1169
1170         if (!prim || prim->link != BOND_LINK_UP) {
1171                 bond_for_each_slave(bond, slave, iter) {
1172                         if (slave->link == BOND_LINK_UP) {
1173                                 hprio = hprio ?: slave;
1174                                 if (slave->prio > hprio->prio)
1175                                         hprio = slave;
1176                         }
1177                 }
1178
1179                 if (hprio && hprio != curr) {
1180                         prim = hprio;
1181                         goto link_reselect;
1182                 }
1183
1184                 if (!curr || curr->link != BOND_LINK_UP)
1185                         return NULL;
1186                 return curr;
1187         }
1188
1189         if (bond->force_primary) {
1190                 bond->force_primary = false;
1191                 return prim;
1192         }
1193
1194 link_reselect:
1195         if (!curr || curr->link != BOND_LINK_UP)
1196                 return prim;
1197
1198         /* At this point, prim and curr are both up */
1199         switch (bond->params.primary_reselect) {
1200         case BOND_PRI_RESELECT_ALWAYS:
1201                 return prim;
1202         case BOND_PRI_RESELECT_BETTER:
1203                 if (prim->speed < curr->speed)
1204                         return curr;
1205                 if (prim->speed == curr->speed && prim->duplex <= curr->duplex)
1206                         return curr;
1207                 return prim;
1208         case BOND_PRI_RESELECT_FAILURE:
1209                 return curr;
1210         default:
1211                 netdev_err(bond->dev, "impossible primary_reselect %d\n",
1212                            bond->params.primary_reselect);
1213                 return curr;
1214         }
1215 }
1216
1217 /**
1218  * bond_find_best_slave - select the best available slave to be the active one
1219  * @bond: our bonding struct
1220  */
1221 static struct slave *bond_find_best_slave(struct bonding *bond)
1222 {
1223         struct slave *slave, *bestslave = NULL;
1224         struct list_head *iter;
1225         int mintime = bond->params.updelay;
1226
1227         slave = bond_choose_primary_or_current(bond);
1228         if (slave)
1229                 return slave;
1230
1231         bond_for_each_slave(bond, slave, iter) {
1232                 if (slave->link == BOND_LINK_UP)
1233                         return slave;
1234                 if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) &&
1235                     slave->delay < mintime) {
1236                         mintime = slave->delay;
1237                         bestslave = slave;
1238                 }
1239         }
1240
1241         return bestslave;
1242 }
1243
1244 /* must be called in RCU critical section or with RTNL held */
1245 static bool bond_should_notify_peers(struct bonding *bond)
1246 {
1247         struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave);
1248
1249         if (!slave || !bond->send_peer_notif ||
1250             bond->send_peer_notif %
1251             max(1, bond->params.peer_notif_delay) != 0 ||
1252             !netif_carrier_ok(bond->dev) ||
1253             test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
1254                 return false;
1255
1256         netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n",
1257                    slave ? slave->dev->name : "NULL");
1258
1259         return true;
1260 }
1261
1262 /**
1263  * bond_change_active_slave - change the active slave into the specified one
1264  * @bond: our bonding struct
1265  * @new_active: the new slave to make the active one
1266  *
1267  * Set the new slave to the bond's settings and unset them on the old
1268  * curr_active_slave.
1269  * Setting include flags, mc-list, promiscuity, allmulti, etc.
1270  *
1271  * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP,
1272  * because it is apparently the best available slave we have, even though its
1273  * updelay hasn't timed out yet.
1274  *
1275  * Caller must hold RTNL.
1276  */
1277 void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
1278 {
1279         struct slave *old_active;
1280
1281         ASSERT_RTNL();
1282
1283         old_active = rtnl_dereference(bond->curr_active_slave);
1284
1285         if (old_active == new_active)
1286                 return;
1287
1288 #ifdef CONFIG_XFRM_OFFLOAD
1289         bond_ipsec_del_sa_all(bond);
1290 #endif /* CONFIG_XFRM_OFFLOAD */
1291
1292         if (new_active) {
1293                 new_active->last_link_up = jiffies;
1294
1295                 if (new_active->link == BOND_LINK_BACK) {
1296                         if (bond_uses_primary(bond)) {
1297                                 slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n",
1298                                            (bond->params.updelay - new_active->delay) * bond->params.miimon);
1299                         }
1300
1301                         new_active->delay = 0;
1302                         bond_set_slave_link_state(new_active, BOND_LINK_UP,
1303                                                   BOND_SLAVE_NOTIFY_NOW);
1304
1305                         if (BOND_MODE(bond) == BOND_MODE_8023AD)
1306                                 bond_3ad_handle_link_change(new_active, BOND_LINK_UP);
1307
1308                         if (bond_is_lb(bond))
1309                                 bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP);
1310                 } else {
1311                         if (bond_uses_primary(bond))
1312                                 slave_info(bond->dev, new_active->dev, "making interface the new active one\n");
1313                 }
1314         }
1315
1316         if (bond_uses_primary(bond))
1317                 bond_hw_addr_swap(bond, new_active, old_active);
1318
1319         if (bond_is_lb(bond)) {
1320                 bond_alb_handle_active_change(bond, new_active);
1321                 if (old_active)
1322                         bond_set_slave_inactive_flags(old_active,
1323                                                       BOND_SLAVE_NOTIFY_NOW);
1324                 if (new_active)
1325                         bond_set_slave_active_flags(new_active,
1326                                                     BOND_SLAVE_NOTIFY_NOW);
1327         } else {
1328                 rcu_assign_pointer(bond->curr_active_slave, new_active);
1329         }
1330
1331         if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
1332                 if (old_active)
1333                         bond_set_slave_inactive_flags(old_active,
1334                                                       BOND_SLAVE_NOTIFY_NOW);
1335
1336                 if (new_active) {
1337                         bool should_notify_peers = false;
1338
1339                         bond_set_slave_active_flags(new_active,
1340                                                     BOND_SLAVE_NOTIFY_NOW);
1341
1342                         if (bond->params.fail_over_mac)
1343                                 bond_do_fail_over_mac(bond, new_active,
1344                                                       old_active);
1345
1346                         if (netif_running(bond->dev)) {
1347                                 bond->send_peer_notif =
1348                                         bond->params.num_peer_notif *
1349                                         max(1, bond->params.peer_notif_delay);
1350                                 should_notify_peers =
1351                                         bond_should_notify_peers(bond);
1352                         }
1353
1354                         call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev);
1355                         if (should_notify_peers) {
1356                                 bond->send_peer_notif--;
1357                                 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
1358                                                          bond->dev);
1359                         }
1360                 }
1361         }
1362
1363 #ifdef CONFIG_XFRM_OFFLOAD
1364         bond_ipsec_add_sa_all(bond);
1365 #endif /* CONFIG_XFRM_OFFLOAD */
1366
1367         /* resend IGMP joins since active slave has changed or
1368          * all were sent on curr_active_slave.
1369          * resend only if bond is brought up with the affected
1370          * bonding modes and the retransmission is enabled
1371          */
1372         if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
1373             ((bond_uses_primary(bond) && new_active) ||
1374              BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
1375                 bond->igmp_retrans = bond->params.resend_igmp;
1376                 queue_delayed_work(bond->wq, &bond->mcast_work, 1);
1377         }
1378 }
1379
1380 /**
1381  * bond_select_active_slave - select a new active slave, if needed
1382  * @bond: our bonding struct
1383  *
1384  * This functions should be called when one of the following occurs:
1385  * - The old curr_active_slave has been released or lost its link.
1386  * - The primary_slave has got its link back.
1387  * - A slave has got its link back and there's no old curr_active_slave.
1388  *
1389  * Caller must hold RTNL.
1390  */
1391 void bond_select_active_slave(struct bonding *bond)
1392 {
1393         struct slave *best_slave;
1394         int rv;
1395
1396         ASSERT_RTNL();
1397
1398         best_slave = bond_find_best_slave(bond);
1399         if (best_slave != rtnl_dereference(bond->curr_active_slave)) {
1400                 bond_change_active_slave(bond, best_slave);
1401                 rv = bond_set_carrier(bond);
1402                 if (!rv)
1403                         return;
1404
1405                 if (netif_carrier_ok(bond->dev))
1406                         netdev_info(bond->dev, "active interface up!\n");
1407                 else
1408                         netdev_info(bond->dev, "now running without any active interface!\n");
1409         }
1410 }
1411
1412 #ifdef CONFIG_NET_POLL_CONTROLLER
1413 static inline int slave_enable_netpoll(struct slave *slave)
1414 {
1415         struct netpoll *np;
1416         int err = 0;
1417
1418         np = kzalloc(sizeof(*np), GFP_KERNEL);
1419         err = -ENOMEM;
1420         if (!np)
1421                 goto out;
1422
1423         err = __netpoll_setup(np, slave->dev);
1424         if (err) {
1425                 kfree(np);
1426                 goto out;
1427         }
1428         slave->np = np;
1429 out:
1430         return err;
1431 }
1432 static inline void slave_disable_netpoll(struct slave *slave)
1433 {
1434         struct netpoll *np = slave->np;
1435
1436         if (!np)
1437                 return;
1438
1439         slave->np = NULL;
1440
1441         __netpoll_free(np);
1442 }
1443
1444 static void bond_poll_controller(struct net_device *bond_dev)
1445 {
1446         struct bonding *bond = netdev_priv(bond_dev);
1447         struct slave *slave = NULL;
1448         struct list_head *iter;
1449         struct ad_info ad_info;
1450
1451         if (BOND_MODE(bond) == BOND_MODE_8023AD)
1452                 if (bond_3ad_get_active_agg_info(bond, &ad_info))
1453                         return;
1454
1455         bond_for_each_slave_rcu(bond, slave, iter) {
1456                 if (!bond_slave_is_up(slave))
1457                         continue;
1458
1459                 if (BOND_MODE(bond) == BOND_MODE_8023AD) {
1460                         struct aggregator *agg =
1461                             SLAVE_AD_INFO(slave)->port.aggregator;
1462
1463                         if (agg &&
1464                             agg->aggregator_identifier != ad_info.aggregator_id)
1465                                 continue;
1466                 }
1467
1468                 netpoll_poll_dev(slave->dev);
1469         }
1470 }
1471
1472 static void bond_netpoll_cleanup(struct net_device *bond_dev)
1473 {
1474         struct bonding *bond = netdev_priv(bond_dev);
1475         struct list_head *iter;
1476         struct slave *slave;
1477
1478         bond_for_each_slave(bond, slave, iter)
1479                 if (bond_slave_is_up(slave))
1480                         slave_disable_netpoll(slave);
1481 }
1482
1483 static int bond_netpoll_setup(struct net_device *dev)
1484 {
1485         struct bonding *bond = netdev_priv(dev);
1486         struct list_head *iter;
1487         struct slave *slave;
1488         int err = 0;
1489
1490         bond_for_each_slave(bond, slave, iter) {
1491                 err = slave_enable_netpoll(slave);
1492                 if (err) {
1493                         bond_netpoll_cleanup(dev);
1494                         break;
1495                 }
1496         }
1497         return err;
1498 }
1499 #else
/* Without CONFIG_NET_POLL_CONTROLLER the netpoll helpers are no-ops;
 * enabling always reports success.
 */
static inline int slave_enable_netpoll(struct slave *slave) { return 0; }
static inline void slave_disable_netpoll(struct slave *slave) { }
static void bond_netpoll_cleanup(struct net_device *bond_dev) { }
1510 #endif
1511
1512 /*---------------------------------- IOCTL ----------------------------------*/
1513
1514 static netdev_features_t bond_fix_features(struct net_device *dev,
1515                                            netdev_features_t features)
1516 {
1517         struct bonding *bond = netdev_priv(dev);
1518         struct list_head *iter;
1519         netdev_features_t mask;
1520         struct slave *slave;
1521
1522         mask = features;
1523         features = netdev_base_features(features);
1524
1525         bond_for_each_slave(bond, slave, iter) {
1526                 features = netdev_increment_features(features,
1527                                                      slave->dev->features,
1528                                                      mask);
1529         }
1530         features = netdev_add_tso_features(features, mask);
1531
1532         return features;
1533 }
1534
/* Starting value and mask used when combining the slaves' vlan_features. */
#define BOND_VLAN_FEATURES	(NETIF_F_HW_CSUM | \
				 NETIF_F_SG | \
				 NETIF_F_FRAGLIST | \
				 NETIF_F_GSO_SOFTWARE | \
				 NETIF_F_GSO_ENCAP_ALL | \
				 NETIF_F_HIGHDMA | \
				 NETIF_F_LRO)

/* Starting value and mask used when combining the slaves' hw_enc_features. */
#define BOND_ENC_FEATURES	(NETIF_F_HW_CSUM | \
				 NETIF_F_SG | \
				 NETIF_F_RXCSUM | \
				 NETIF_F_GSO_SOFTWARE)

/* Starting value and mask used when combining the slaves' mpls_features. */
#define BOND_MPLS_FEATURES	(NETIF_F_HW_CSUM | \
				 NETIF_F_SG | \
				 NETIF_F_GSO_SOFTWARE)
1545
1546
1547 static void bond_compute_features(struct bonding *bond)
1548 {
1549         unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
1550                                         IFF_XMIT_DST_RELEASE_PERM;
1551         netdev_features_t gso_partial_features = NETIF_F_GSO_ESP;
1552         netdev_features_t vlan_features = BOND_VLAN_FEATURES;
1553         netdev_features_t enc_features  = BOND_ENC_FEATURES;
1554 #ifdef CONFIG_XFRM_OFFLOAD
1555         netdev_features_t xfrm_features  = BOND_XFRM_FEATURES;
1556 #endif /* CONFIG_XFRM_OFFLOAD */
1557         netdev_features_t mpls_features  = BOND_MPLS_FEATURES;
1558         struct net_device *bond_dev = bond->dev;
1559         struct list_head *iter;
1560         struct slave *slave;
1561         unsigned short max_hard_header_len = ETH_HLEN;
1562         unsigned int tso_max_size = TSO_MAX_SIZE;
1563         u16 tso_max_segs = TSO_MAX_SEGS;
1564
1565         if (!bond_has_slaves(bond))
1566                 goto done;
1567
1568         vlan_features = netdev_base_features(vlan_features);
1569         mpls_features = netdev_base_features(mpls_features);
1570
1571         bond_for_each_slave(bond, slave, iter) {
1572                 vlan_features = netdev_increment_features(vlan_features,
1573                         slave->dev->vlan_features, BOND_VLAN_FEATURES);
1574
1575                 enc_features = netdev_increment_features(enc_features,
1576                                                          slave->dev->hw_enc_features,
1577                                                          BOND_ENC_FEATURES);
1578
1579 #ifdef CONFIG_XFRM_OFFLOAD
1580                 xfrm_features = netdev_increment_features(xfrm_features,
1581                                                           slave->dev->hw_enc_features,
1582                                                           BOND_XFRM_FEATURES);
1583 #endif /* CONFIG_XFRM_OFFLOAD */
1584
1585                 if (slave->dev->hw_enc_features & NETIF_F_GSO_PARTIAL)
1586                         gso_partial_features &= slave->dev->gso_partial_features;
1587
1588                 mpls_features = netdev_increment_features(mpls_features,
1589                                                           slave->dev->mpls_features,
1590                                                           BOND_MPLS_FEATURES);
1591
1592                 dst_release_flag &= slave->dev->priv_flags;
1593                 if (slave->dev->hard_header_len > max_hard_header_len)
1594                         max_hard_header_len = slave->dev->hard_header_len;
1595
1596                 tso_max_size = min(tso_max_size, slave->dev->tso_max_size);
1597                 tso_max_segs = min(tso_max_segs, slave->dev->tso_max_segs);
1598         }
1599         bond_dev->hard_header_len = max_hard_header_len;
1600
1601         if (gso_partial_features & NETIF_F_GSO_ESP)
1602                 bond_dev->gso_partial_features |= NETIF_F_GSO_ESP;
1603         else
1604                 bond_dev->gso_partial_features &= ~NETIF_F_GSO_ESP;
1605
1606 done:
1607         bond_dev->vlan_features = vlan_features;
1608         bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
1609                                     NETIF_F_HW_VLAN_CTAG_TX |
1610                                     NETIF_F_HW_VLAN_STAG_TX;
1611 #ifdef CONFIG_XFRM_OFFLOAD
1612         bond_dev->hw_enc_features |= xfrm_features;
1613 #endif /* CONFIG_XFRM_OFFLOAD */
1614         bond_dev->mpls_features = mpls_features;
1615         netif_set_tso_max_segs(bond_dev, tso_max_segs);
1616         netif_set_tso_max_size(bond_dev, tso_max_size);
1617
1618         bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1619         if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
1620             dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
1621                 bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;
1622
1623         netdev_change_features(bond_dev);
1624 }
1625
1626 static void bond_setup_by_slave(struct net_device *bond_dev,
1627                                 struct net_device *slave_dev)
1628 {
1629         bool was_up = !!(bond_dev->flags & IFF_UP);
1630
1631         dev_close(bond_dev);
1632
1633         bond_dev->header_ops        = slave_dev->header_ops;
1634
1635         bond_dev->type              = slave_dev->type;
1636         bond_dev->hard_header_len   = slave_dev->hard_header_len;
1637         bond_dev->needed_headroom   = slave_dev->needed_headroom;
1638         bond_dev->addr_len          = slave_dev->addr_len;
1639
1640         memcpy(bond_dev->broadcast, slave_dev->broadcast,
1641                 slave_dev->addr_len);
1642
1643         if (slave_dev->flags & IFF_POINTOPOINT) {
1644                 bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST);
1645                 bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP);
1646         }
1647         if (was_up)
1648                 dev_open(bond_dev, NULL);
1649 }
1650
1651 /* On bonding slaves other than the currently active slave, suppress
1652  * duplicates except for alb non-mcast/bcast.
1653  */
1654 static bool bond_should_deliver_exact_match(struct sk_buff *skb,
1655                                             struct slave *slave,
1656                                             struct bonding *bond)
1657 {
1658         if (bond_is_slave_inactive(slave)) {
1659                 if (BOND_MODE(bond) == BOND_MODE_ALB &&
1660                     skb->pkt_type != PACKET_BROADCAST &&
1661                     skb->pkt_type != PACKET_MULTICAST)
1662                         return false;
1663                 return true;
1664         }
1665         return false;
1666 }
1667
1668 static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
1669 {
1670         struct sk_buff *skb = *pskb;
1671         struct slave *slave;
1672         struct bonding *bond;
1673         int (*recv_probe)(const struct sk_buff *, struct bonding *,
1674                           struct slave *);
1675         int ret = RX_HANDLER_ANOTHER;
1676
1677         skb = skb_share_check(skb, GFP_ATOMIC);
1678         if (unlikely(!skb))
1679                 return RX_HANDLER_CONSUMED;
1680
1681         *pskb = skb;
1682
1683         slave = bond_slave_get_rcu(skb->dev);
1684         bond = slave->bond;
1685
1686         recv_probe = READ_ONCE(bond->recv_probe);
1687         if (recv_probe) {
1688                 ret = recv_probe(skb, bond, slave);
1689                 if (ret == RX_HANDLER_CONSUMED) {
1690                         consume_skb(skb);
1691                         return ret;
1692                 }
1693         }
1694
1695         /*
1696          * For packets determined by bond_should_deliver_exact_match() call to
1697          * be suppressed we want to make an exception for link-local packets.
1698          * This is necessary for e.g. LLDP daemons to be able to monitor
1699          * inactive slave links without being forced to bind to them
1700          * explicitly.
1701          *
1702          * At the same time, packets that are passed to the bonding master
1703          * (including link-local ones) can have their originating interface
1704          * determined via PACKET_ORIGDEV socket option.
1705          */
1706         if (bond_should_deliver_exact_match(skb, slave, bond)) {
1707                 if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
1708                         return RX_HANDLER_PASS;
1709                 return RX_HANDLER_EXACT;
1710         }
1711
1712         skb->dev = bond->dev;
1713
1714         if (BOND_MODE(bond) == BOND_MODE_ALB &&
1715             netif_is_bridge_port(bond->dev) &&
1716             skb->pkt_type == PACKET_HOST) {
1717
1718                 if (unlikely(skb_cow_head(skb,
1719                                           skb->data - skb_mac_header(skb)))) {
1720                         kfree_skb(skb);
1721                         return RX_HANDLER_CONSUMED;
1722                 }
1723                 bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr,
1724                                   bond->dev->addr_len);
1725         }
1726
1727         return ret;
1728 }
1729
1730 static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
1731 {
1732         switch (BOND_MODE(bond)) {
1733         case BOND_MODE_ROUNDROBIN:
1734                 return NETDEV_LAG_TX_TYPE_ROUNDROBIN;
1735         case BOND_MODE_ACTIVEBACKUP:
1736                 return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
1737         case BOND_MODE_BROADCAST:
1738                 return NETDEV_LAG_TX_TYPE_BROADCAST;
1739         case BOND_MODE_XOR:
1740         case BOND_MODE_8023AD:
1741                 return NETDEV_LAG_TX_TYPE_HASH;
1742         default:
1743                 return NETDEV_LAG_TX_TYPE_UNKNOWN;
1744         }
1745 }
1746
1747 static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
1748                                                enum netdev_lag_tx_type type)
1749 {
1750         if (type != NETDEV_LAG_TX_TYPE_HASH)
1751                 return NETDEV_LAG_HASH_NONE;
1752
1753         switch (bond->params.xmit_policy) {
1754         case BOND_XMIT_POLICY_LAYER2:
1755                 return NETDEV_LAG_HASH_L2;
1756         case BOND_XMIT_POLICY_LAYER34:
1757                 return NETDEV_LAG_HASH_L34;
1758         case BOND_XMIT_POLICY_LAYER23:
1759                 return NETDEV_LAG_HASH_L23;
1760         case BOND_XMIT_POLICY_ENCAP23:
1761                 return NETDEV_LAG_HASH_E23;
1762         case BOND_XMIT_POLICY_ENCAP34:
1763                 return NETDEV_LAG_HASH_E34;
1764         case BOND_XMIT_POLICY_VLAN_SRCMAC:
1765                 return NETDEV_LAG_HASH_VLAN_SRCMAC;
1766         default:
1767                 return NETDEV_LAG_HASH_UNKNOWN;
1768         }
1769 }
1770
1771 static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
1772                                       struct netlink_ext_ack *extack)
1773 {
1774         struct netdev_lag_upper_info lag_upper_info;
1775         enum netdev_lag_tx_type type;
1776         int err;
1777
1778         type = bond_lag_tx_type(bond);
1779         lag_upper_info.tx_type = type;
1780         lag_upper_info.hash_type = bond_lag_hash_type(bond, type);
1781
1782         err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
1783                                            &lag_upper_info, extack);
1784         if (err)
1785                 return err;
1786
1787         slave->dev->flags |= IFF_SLAVE;
1788         return 0;
1789 }
1790
1791 static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave)
1792 {
1793         netdev_upper_dev_unlink(slave->dev, bond->dev);
1794         slave->dev->flags &= ~IFF_SLAVE;
1795 }
1796
1797 static void slave_kobj_release(struct kobject *kobj)
1798 {
1799         struct slave *slave = to_slave(kobj);
1800         struct bonding *bond = bond_get_bond_by_slave(slave);
1801
1802         cancel_delayed_work_sync(&slave->notify_work);
1803         if (BOND_MODE(bond) == BOND_MODE_8023AD)
1804                 kfree(SLAVE_AD_INFO(slave));
1805
1806         kfree(slave);
1807 }
1808
1809 static struct kobj_type slave_ktype = {
1810         .release = slave_kobj_release,
1811 #ifdef CONFIG_SYSFS
1812         .sysfs_ops = &slave_sysfs_ops,
1813 #endif
1814 };
1815
1816 static int bond_kobj_init(struct slave *slave)
1817 {
1818         int err;
1819
1820         err = kobject_init_and_add(&slave->kobj, &slave_ktype,
1821                                    &(slave->dev->dev.kobj), "bonding_slave");
1822         if (err)
1823                 kobject_put(&slave->kobj);
1824
1825         return err;
1826 }
1827
1828 static struct slave *bond_alloc_slave(struct bonding *bond,
1829                                       struct net_device *slave_dev)
1830 {
1831         struct slave *slave = NULL;
1832
1833         slave = kzalloc(sizeof(*slave), GFP_KERNEL);
1834         if (!slave)
1835                 return NULL;
1836
1837         slave->bond = bond;
1838         slave->dev = slave_dev;
1839         INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work);
1840
1841         if (bond_kobj_init(slave))
1842                 return NULL;
1843
1844         if (BOND_MODE(bond) == BOND_MODE_8023AD) {
1845                 SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info),
1846                                                GFP_KERNEL);
1847                 if (!SLAVE_AD_INFO(slave)) {
1848                         kobject_put(&slave->kobj);
1849                         return NULL;
1850                 }
1851         }
1852
1853         return slave;
1854 }
1855
1856 static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info)
1857 {
1858         info->bond_mode = BOND_MODE(bond);
1859         info->miimon = bond->params.miimon;
1860         info->num_slaves = bond->slave_cnt;
1861 }
1862
1863 static void bond_fill_ifslave(struct slave *slave, struct ifslave *info)
1864 {
1865         strcpy(info->slave_name, slave->dev->name);
1866         info->link = slave->link;
1867         info->state = bond_slave_state(slave);
1868         info->link_failure_count = slave->link_failure_count;
1869 }
1870
1871 static void bond_netdev_notify_work(struct work_struct *_work)
1872 {
1873         struct slave *slave = container_of(_work, struct slave,
1874                                            notify_work.work);
1875
1876         if (rtnl_trylock()) {
1877                 struct netdev_bonding_info binfo;
1878
1879                 bond_fill_ifslave(slave, &binfo.slave);
1880                 bond_fill_ifbond(slave->bond, &binfo.master);
1881                 netdev_bonding_info_change(slave->dev, &binfo);
1882                 rtnl_unlock();
1883         } else {
1884                 queue_delayed_work(slave->bond->wq, &slave->notify_work, 1);
1885         }
1886 }
1887
1888 void bond_queue_slave_event(struct slave *slave)
1889 {
1890         queue_delayed_work(slave->bond->wq, &slave->notify_work, 0);
1891 }
1892
1893 void bond_lower_state_changed(struct slave *slave)
1894 {
1895         struct netdev_lag_lower_state_info info;
1896
1897         info.link_up = slave->link == BOND_LINK_UP ||
1898                        slave->link == BOND_LINK_FAIL;
1899         info.tx_enabled = bond_is_active_slave(slave);
1900         netdev_lower_state_changed(slave->dev, &info);
1901 }
1902
/* Report @errmsg for @bond_dev: through the netlink extended ack when one
 * was supplied, otherwise to the kernel log.
 */
#define BOND_NL_ERR(bond_dev, extack, errmsg) do {		\
	if (!(extack))						\
		netdev_err(bond_dev, "Error: %s\n", errmsg);	\
	else							\
		NL_SET_ERR_MSG(extack, errmsg);			\
} while (0)
1909
/* Report @errmsg for @slave_dev of @bond_dev: through the netlink extended
 * ack when one was supplied, otherwise to the kernel log.
 */
#define SLAVE_NL_ERR(bond_dev, slave_dev, extack, errmsg) do {		\
	if (!(extack))							\
		slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg);	\
	else								\
		NL_SET_ERR_MSG(extack, errmsg);				\
} while (0)
1916
1917 /* The bonding driver uses ether_setup() to convert a master bond device
1918  * to ARPHRD_ETHER, that resets the target netdevice's flags so we always
1919  * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP
1920  * if they were set
1921  */
1922 static void bond_ether_setup(struct net_device *bond_dev)
1923 {
1924         unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP);
1925
1926         ether_setup(bond_dev);
1927         bond_dev->flags |= IFF_MASTER | flags;
1928         bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1929 }
1930
1931 void bond_xdp_set_features(struct net_device *bond_dev)
1932 {
1933         struct bonding *bond = netdev_priv(bond_dev);
1934         xdp_features_t val = NETDEV_XDP_ACT_MASK;
1935         struct list_head *iter;
1936         struct slave *slave;
1937
1938         ASSERT_RTNL();
1939
1940         if (!bond_xdp_check(bond) || !bond_has_slaves(bond)) {
1941                 xdp_clear_features_flag(bond_dev);
1942                 return;
1943         }
1944
1945         bond_for_each_slave(bond, slave, iter)
1946                 val &= slave->dev->xdp_features;
1947
1948         val &= ~NETDEV_XDP_ACT_XSK_ZEROCOPY;
1949
1950         xdp_set_features_flag(bond_dev, val);
1951 }
1952
1953 /* enslave device <slave> to bond device <master> */
1954 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
1955                  struct netlink_ext_ack *extack)
1956 {
1957         struct bonding *bond = netdev_priv(bond_dev);
1958         const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
1959         struct slave *new_slave = NULL, *prev_slave;
1960         struct sockaddr_storage ss;
1961         int link_reporting;
1962         int res = 0, i;
1963
1964         if (slave_dev->flags & IFF_MASTER &&
1965             !netif_is_bond_master(slave_dev)) {
1966                 BOND_NL_ERR(bond_dev, extack,
1967                             "Device type (master device) cannot be enslaved");
1968                 return -EPERM;
1969         }
1970
1971         if (!bond->params.use_carrier &&
1972             slave_dev->ethtool_ops->get_link == NULL &&
1973             slave_ops->ndo_eth_ioctl == NULL) {
1974                 slave_warn(bond_dev, slave_dev, "no link monitoring support\n");
1975         }
1976
1977         /* already in-use? */
1978         if (netdev_is_rx_handler_busy(slave_dev)) {
1979                 SLAVE_NL_ERR(bond_dev, slave_dev, extack,
1980                              "Device is in use and cannot be enslaved");
1981                 return -EBUSY;
1982         }
1983
1984         if (bond_dev == slave_dev) {
1985                 BOND_NL_ERR(bond_dev, extack, "Cannot enslave bond to itself.");
1986                 return -EPERM;
1987         }
1988
1989         /* vlan challenged mutual exclusion */
1990         /* no need to lock since we're protected by rtnl_lock */
1991         if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
1992                 slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n");
1993                 if (vlan_uses_dev(bond_dev)) {
1994                         SLAVE_NL_ERR(bond_dev, slave_dev, extack,
1995                                      "Can not enslave VLAN challenged device to VLAN enabled bond");
1996                         return -EPERM;
1997                 } else {
1998                         slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n");
1999                 }
2000         } else {
2001                 slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n");
2002         }
2003
2004         if (slave_dev->features & NETIF_F_HW_ESP)
2005                 slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n");
2006
2007         /* Old ifenslave binaries are no longer supported.  These can
2008          * be identified with moderate accuracy by the state of the slave:
2009          * the current ifenslave will set the interface down prior to
2010          * enslaving it; the old ifenslave will not.
2011          */
2012         if (slave_dev->flags & IFF_UP) {
2013                 SLAVE_NL_ERR(bond_dev, slave_dev, extack,
2014                              "Device can not be enslaved while up");
2015                 return -EPERM;
2016         }
2017
2018         /* set bonding device ether type by slave - bonding netdevices are
2019          * created with ether_setup, so when the slave type is not ARPHRD_ETHER
2020          * there is a need to override some of the type dependent attribs/funcs.
2021          *
2022          * bond ether type mutual exclusion - don't allow slaves of dissimilar
2023          * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
2024          */
2025         if (!bond_has_slaves(bond)) {
2026                 if (bond_dev->type != slave_dev->type) {
2027                         slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n",
2028                                   bond_dev->type, slave_dev->type);
2029
2030                         res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
2031                                                        bond_dev);
2032                         res = notifier_to_errno(res);
2033                         if (res) {
2034                                 slave_err(bond_dev, slave_dev, "refused to change device type\n");
2035                                 return -EBUSY;
2036                         }
2037
2038                         /* Flush unicast and multicast addresses */
2039                         dev_uc_flush(bond_dev);
2040                         dev_mc_flush(bond_dev);
2041
2042                         if (slave_dev->type != ARPHRD_ETHER)
2043                                 bond_setup_by_slave(bond_dev, slave_dev);
2044                         else
2045                                 bond_ether_setup(bond_dev);
2046
2047                         call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
2048                                                  bond_dev);
2049                 }
2050         } else if (bond_dev->type != slave_dev->type) {
2051                 SLAVE_NL_ERR(bond_dev, slave_dev, extack,
2052                              "Device type is different from other slaves");
2053                 return -EINVAL;
2054         }
2055
2056         if (slave_dev->type == ARPHRD_INFINIBAND &&
2057             BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2058                 SLAVE_NL_ERR(bond_dev, slave_dev, extack,
2059                              "Only active-backup mode is supported for infiniband slaves");
2060                 res = -EOPNOTSUPP;
2061                 goto err_undo_flags;
2062         }
2063
2064         if (!slave_ops->ndo_set_mac_address ||
2065             slave_dev->type == ARPHRD_INFINIBAND) {
2066                 slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n");
2067                 if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
2068                     bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
2069                         if (!bond_has_slaves(bond)) {
2070                                 bond->params.fail_over_mac = BOND_FOM_ACTIVE;
2071                                 slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n");
2072                         } else {
2073                                 SLAVE_NL_ERR(bond_dev, slave_dev, extack,
2074                                              "Slave device does not support setting the MAC address, but fail_over_mac is not set to active");
2075                                 res = -EOPNOTSUPP;
2076                                 goto err_undo_flags;
2077                         }
2078                 }
2079         }
2080
2081         call_netdevice_notifiers(NETDEV_JOIN, slave_dev);
2082
2083         /* If this is the first slave, then we need to set the master's hardware
2084          * address to be the same as the slave's.
2085          */
2086         if (!bond_has_slaves(bond) &&
2087             bond->dev->addr_assign_type == NET_ADDR_RANDOM) {
2088                 res = bond_set_dev_addr(bond->dev, slave_dev);
2089                 if (res)
2090                         goto err_undo_flags;
2091         }
2092
2093         new_slave = bond_alloc_slave(bond, slave_dev);
2094         if (!new_slave) {
2095                 res = -ENOMEM;
2096                 goto err_undo_flags;
2097         }
2098
2099         /* Set the new_slave's queue_id to be zero.  Queue ID mapping
2100          * is set via sysfs or module option if desired.
2101          */
2102         new_slave->queue_id = 0;
2103
2104         /* Save slave's original mtu and then set it to match the bond */
2105         new_slave->original_mtu = slave_dev->mtu;
2106         res = dev_set_mtu(slave_dev, bond->dev->mtu);
2107         if (res) {
2108                 slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res);
2109                 goto err_free;
2110         }
2111
2112         /* Save slave's original ("permanent") mac address for modes
2113          * that need it, and for restoring it upon release, and then
2114          * set it to the master's address
2115          */
2116         bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr,
2117                           slave_dev->addr_len);
2118
2119         if (!bond->params.fail_over_mac ||
2120             BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2121                 /* Set slave to master's mac address.  The application already
2122                  * set the master's mac address to that of the first slave
2123                  */
2124                 memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
2125                 ss.ss_family = slave_dev->type;
2126                 res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss,
2127                                           extack);
2128                 if (res) {
2129                         slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res);
2130                         goto err_restore_mtu;
2131                 }
2132         }
2133
2134         /* set no_addrconf flag before open to prevent IPv6 addrconf */
2135         slave_dev->priv_flags |= IFF_NO_ADDRCONF;
2136
2137         /* open the slave since the application closed it */
2138         res = dev_open(slave_dev, extack);
2139         if (res) {
2140                 slave_err(bond_dev, slave_dev, "Opening slave failed\n");
2141                 goto err_restore_mac;
2142         }
2143
2144         slave_dev->priv_flags |= IFF_BONDING;
2145         /* initialize slave stats */
2146         dev_get_stats(new_slave->dev, &new_slave->slave_stats);
2147
2148         if (bond_is_lb(bond)) {
2149                 /* bond_alb_init_slave() must be called before all other stages since
2150                  * it might fail and we do not want to have to undo everything
2151                  */
2152                 res = bond_alb_init_slave(bond, new_slave);
2153                 if (res)
2154                         goto err_close;
2155         }
2156
2157         res = vlan_vids_add_by_dev(slave_dev, bond_dev);
2158         if (res) {
2159                 slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n");
2160                 goto err_close;
2161         }
2162
2163         prev_slave = bond_last_slave(bond);
2164
2165         new_slave->delay = 0;
2166         new_slave->link_failure_count = 0;
2167
2168         if (bond_update_speed_duplex(new_slave) &&
2169             bond_needs_speed_duplex(bond))
2170                 new_slave->link = BOND_LINK_DOWN;
2171
2172         new_slave->last_rx = jiffies -
2173                 (msecs_to_jiffies(bond->params.arp_interval) + 1);
2174         for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
2175                 new_slave->target_last_arp_rx[i] = new_slave->last_rx;
2176
2177         new_slave->last_tx = new_slave->last_rx;
2178
2179         if (bond->params.miimon && !bond->params.use_carrier) {
2180                 link_reporting = bond_check_dev_link(bond, slave_dev, 1);
2181
2182                 if ((link_reporting == -1) && !bond->params.arp_interval) {
2183                         /* miimon is set but a bonded network driver
2184                          * does not support ETHTOOL/MII and
2185                          * arp_interval is not set.  Note: if
2186                          * use_carrier is enabled, we will never go
2187                          * here (because netif_carrier is always
2188                          * supported); thus, we don't need to change
2189                          * the messages for netif_carrier.
2190                          */
2191                         slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n");
2192                 } else if (link_reporting == -1) {
2193                         /* unable get link status using mii/ethtool */
2194                         slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n");
2195                 }
2196         }
2197
2198         /* check for initial state */
2199         new_slave->link = BOND_LINK_NOCHANGE;
2200         if (bond->params.miimon) {
2201                 if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) {
2202                         if (bond->params.updelay) {
2203                                 bond_set_slave_link_state(new_slave,
2204                                                           BOND_LINK_BACK,
2205                                                           BOND_SLAVE_NOTIFY_NOW);
2206                                 new_slave->delay = bond->params.updelay;
2207                         } else {
2208                                 bond_set_slave_link_state(new_slave,
2209                                                           BOND_LINK_UP,
2210                                                           BOND_SLAVE_NOTIFY_NOW);
2211                         }
2212                 } else {
2213                         bond_set_slave_link_state(new_slave, BOND_LINK_DOWN,
2214                                                   BOND_SLAVE_NOTIFY_NOW);
2215                 }
2216         } else if (bond->params.arp_interval) {
2217                 bond_set_slave_link_state(new_slave,
2218                                           (netif_carrier_ok(slave_dev) ?
2219                                           BOND_LINK_UP : BOND_LINK_DOWN),
2220                                           BOND_SLAVE_NOTIFY_NOW);
2221         } else {
2222                 bond_set_slave_link_state(new_slave, BOND_LINK_UP,
2223                                           BOND_SLAVE_NOTIFY_NOW);
2224         }
2225
2226         if (new_slave->link != BOND_LINK_DOWN)
2227                 new_slave->last_link_up = jiffies;
2228         slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n",
2229                   new_slave->link == BOND_LINK_DOWN ? "DOWN" :
2230                   (new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));
2231
2232         if (bond_uses_primary(bond) && bond->params.primary[0]) {
2233                 /* if there is a primary slave, remember it */
2234                 if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
2235                         rcu_assign_pointer(bond->primary_slave, new_slave);
2236                         bond->force_primary = true;
2237                 }
2238         }
2239
2240         switch (BOND_MODE(bond)) {
2241         case BOND_MODE_ACTIVEBACKUP:
2242                 bond_set_slave_inactive_flags(new_slave,
2243                                               BOND_SLAVE_NOTIFY_NOW);
2244                 break;
2245         case BOND_MODE_8023AD:
2246                 /* in 802.3ad mode, the internal mechanism
2247                  * will activate the slaves in the selected
2248                  * aggregator
2249                  */
2250                 bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
2251                 /* if this is the first slave */
2252                 if (!prev_slave) {
2253                         SLAVE_AD_INFO(new_slave)->id = 1;
2254                         /* Initialize AD with the number of times that the AD timer is called in 1 second
2255                          * can be called only after the mac address of the bond is set
2256                          */
2257                         bond_3ad_initialize(bond);
2258                 } else {
2259                         SLAVE_AD_INFO(new_slave)->id =
2260                                 SLAVE_AD_INFO(prev_slave)->id + 1;
2261                 }
2262
2263                 bond_3ad_bind_slave(new_slave);
2264                 break;
2265         case BOND_MODE_TLB:
2266         case BOND_MODE_ALB:
2267                 bond_set_active_slave(new_slave);
2268                 bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
2269                 break;
2270         default:
2271                 slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n");
2272
2273                 /* always active in trunk mode */
2274                 bond_set_active_slave(new_slave);
2275
2276                 /* In trunking mode there is little meaning to curr_active_slave
2277                  * anyway (it holds no special properties of the bond device),
2278                  * so we can change it without calling change_active_interface()
2279                  */
2280                 if (!rcu_access_pointer(bond->curr_active_slave) &&
2281                     new_slave->link == BOND_LINK_UP)
2282                         rcu_assign_pointer(bond->curr_active_slave, new_slave);
2283
2284                 break;
2285         } /* switch(bond_mode) */
2286
2287 #ifdef CONFIG_NET_POLL_CONTROLLER
2288         if (bond->dev->npinfo) {
2289                 if (slave_enable_netpoll(new_slave)) {
2290                         slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n");
2291                         res = -EBUSY;
2292                         goto err_detach;
2293                 }
2294         }
2295 #endif
2296
2297         if (!(bond_dev->features & NETIF_F_LRO))
2298                 dev_disable_lro(slave_dev);
2299
2300         res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
2301                                          new_slave);
2302         if (res) {
2303                 slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res);
2304                 goto err_detach;
2305         }
2306
2307         res = bond_master_upper_dev_link(bond, new_slave, extack);
2308         if (res) {
2309                 slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res);
2310                 goto err_unregister;
2311         }
2312
2313         bond_lower_state_changed(new_slave);
2314
2315         res = bond_sysfs_slave_add(new_slave);
2316         if (res) {
2317                 slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res);
2318                 goto err_upper_unlink;
2319         }
2320
2321         /* If the mode uses primary, then the following is handled by
2322          * bond_change_active_slave().
2323          */
2324         if (!bond_uses_primary(bond)) {
2325                 /* set promiscuity level to new slave */
2326                 if (bond_dev->flags & IFF_PROMISC) {
2327                         res = dev_set_promiscuity(slave_dev, 1);
2328                         if (res)
2329                                 goto err_sysfs_del;
2330                 }
2331
2332                 /* set allmulti level to new slave */
2333                 if (bond_dev->flags & IFF_ALLMULTI) {
2334                         res = dev_set_allmulti(slave_dev, 1);
2335                         if (res) {
2336                                 if (bond_dev->flags & IFF_PROMISC)
2337                                         dev_set_promiscuity(slave_dev, -1);
2338                                 goto err_sysfs_del;
2339                         }
2340                 }
2341
2342                 if (bond_dev->flags & IFF_UP) {
2343                         netif_addr_lock_bh(bond_dev);
2344                         dev_mc_sync_multiple(slave_dev, bond_dev);
2345                         dev_uc_sync_multiple(slave_dev, bond_dev);
2346                         netif_addr_unlock_bh(bond_dev);
2347
2348                         if (BOND_MODE(bond) == BOND_MODE_8023AD)
2349                                 dev_mc_add(slave_dev, lacpdu_mcast_addr);
2350                 }
2351         }
2352
2353         bond->slave_cnt++;
2354         bond_compute_features(bond);
2355         bond_set_carrier(bond);
2356
2357         /* Needs to be called before bond_select_active_slave(), which will
2358          * remove the maddrs if the slave is selected as active slave.
2359          */
2360         bond_slave_ns_maddrs_add(bond, new_slave);
2361
2362         if (bond_uses_primary(bond)) {
2363                 block_netpoll_tx();
2364                 bond_select_active_slave(bond);
2365                 unblock_netpoll_tx();
2366         }
2367
2368         if (bond_mode_can_use_xmit_hash(bond))
2369                 bond_update_slave_arr(bond, NULL);
2370
2371         if (!slave_dev->netdev_ops->ndo_bpf ||
2372             !slave_dev->netdev_ops->ndo_xdp_xmit) {
2373                 if (bond->xdp_prog) {
2374                         SLAVE_NL_ERR(bond_dev, slave_dev, extack,
2375                                      "Slave does not support XDP");
2376                         res = -EOPNOTSUPP;
2377                         goto err_sysfs_del;
2378                 }
2379         } else if (bond->xdp_prog) {
2380                 struct netdev_bpf xdp = {
2381                         .command = XDP_SETUP_PROG,
2382                         .flags   = 0,
2383                         .prog    = bond->xdp_prog,
2384                         .extack  = extack,
2385                 };
2386
2387                 if (dev_xdp_prog_count(slave_dev) > 0) {
2388                         SLAVE_NL_ERR(bond_dev, slave_dev, extack,
2389                                      "Slave has XDP program loaded, please unload before enslaving");
2390                         res = -EOPNOTSUPP;
2391                         goto err_sysfs_del;
2392                 }
2393
2394                 res = dev_xdp_propagate(slave_dev, &xdp);
2395                 if (res < 0) {
2396                         /* ndo_bpf() sets extack error message */
2397                         slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
2398                         goto err_sysfs_del;
2399                 }
2400                 if (bond->xdp_prog)
2401                         bpf_prog_inc(bond->xdp_prog);
2402         }
2403
2404         bond_xdp_set_features(bond_dev);
2405
2406         slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
2407                    bond_is_active_slave(new_slave) ? "an active" : "a backup",
2408                    new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");
2409
2410         /* enslave is successful */
2411         bond_queue_slave_event(new_slave);
2412         return 0;
2413
2414 /* Undo stages on error */
2415 err_sysfs_del:
2416         bond_sysfs_slave_del(new_slave);
2417
2418 err_upper_unlink:
2419         bond_upper_dev_unlink(bond, new_slave);
2420
2421 err_unregister:
2422         netdev_rx_handler_unregister(slave_dev);
2423
2424 err_detach:
2425         vlan_vids_del_by_dev(slave_dev, bond_dev);
2426         if (rcu_access_pointer(bond->primary_slave) == new_slave)
2427                 RCU_INIT_POINTER(bond->primary_slave, NULL);
2428         if (rcu_access_pointer(bond->curr_active_slave) == new_slave) {
2429                 block_netpoll_tx();
2430                 bond_change_active_slave(bond, NULL);
2431                 bond_select_active_slave(bond);
2432                 unblock_netpoll_tx();
2433         }
2434         /* either primary_slave or curr_active_slave might've changed */
2435         synchronize_rcu();
2436         slave_disable_netpoll(new_slave);
2437
2438 err_close:
2439         if (!netif_is_bond_master(slave_dev))
2440                 slave_dev->priv_flags &= ~IFF_BONDING;
2441         dev_close(slave_dev);
2442
2443 err_restore_mac:
2444         slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;
2445         if (!bond->params.fail_over_mac ||
2446             BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2447                 /* XXX TODO - fom follow mode needs to change master's
2448                  * MAC if this slave's MAC is in use by the bond, or at
2449                  * least print a warning.
2450                  */
2451                 bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr,
2452                                   new_slave->dev->addr_len);
2453                 ss.ss_family = slave_dev->type;
2454                 dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
2455         }
2456
2457 err_restore_mtu:
2458         dev_set_mtu(slave_dev, new_slave->original_mtu);
2459
2460 err_free:
2461         kobject_put(&new_slave->kobj);
2462
2463 err_undo_flags:
2464         /* Enslave of first slave has failed and we need to fix master's mac */
2465         if (!bond_has_slaves(bond)) {
2466                 if (ether_addr_equal_64bits(bond_dev->dev_addr,
2467                                             slave_dev->dev_addr))
2468                         eth_hw_addr_random(bond_dev);
2469                 if (bond_dev->type != ARPHRD_ETHER) {
2470                         dev_close(bond_dev);
2471                         bond_ether_setup(bond_dev);
2472                 }
2473         }
2474
2475         return res;
2476 }
2477
2478 /* Try to release the slave device <slave> from the bond device <master>
2479  * It is legal to access curr_active_slave without a lock because all the function
2480  * is RTNL-locked. If "all" is true it means that the function is being called
2481  * while destroying a bond interface and all slaves are being released.
2482  *
2483  * The rules for slave state should be:
2484  *   for Active/Backup:
2485  *     Active stays on all backups go down
2486  *   for Bonded connections:
2487  *     The first up interface should be left on and all others downed.
2488  */
2489 static int __bond_release_one(struct net_device *bond_dev,
2490                               struct net_device *slave_dev,
2491                               bool all, bool unregister)
2492 {
2493         struct bonding *bond = netdev_priv(bond_dev);
2494         struct slave *slave, *oldcurrent;
2495         struct sockaddr_storage ss;
2496         int old_flags = bond_dev->flags;
2497         netdev_features_t old_features = bond_dev->features;
2498
2499         /* slave is not a slave or master is not master of this slave */
2500         if (!(slave_dev->flags & IFF_SLAVE) ||
2501             !netdev_has_upper_dev(slave_dev, bond_dev)) {
2502                 slave_dbg(bond_dev, slave_dev, "cannot release slave\n");
2503                 return -EINVAL;
2504         }
2505
2506         block_netpoll_tx();
2507
2508         slave = bond_get_slave_by_dev(bond, slave_dev);
2509         if (!slave) {
2510                 /* not a slave of this bond */
2511                 slave_info(bond_dev, slave_dev, "interface not enslaved\n");
2512                 unblock_netpoll_tx();
2513                 return -EINVAL;
2514         }
2515
2516         bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);
2517
2518         bond_sysfs_slave_del(slave);
2519
2520         /* recompute stats just before removing the slave */
2521         bond_get_stats(bond->dev, &bond->bond_stats);
2522
2523         if (bond->xdp_prog) {
2524                 struct netdev_bpf xdp = {
2525                         .command = XDP_SETUP_PROG,
2526                         .flags   = 0,
2527                         .prog    = NULL,
2528                         .extack  = NULL,
2529                 };
2530                 if (dev_xdp_propagate(slave_dev, &xdp))
2531                         slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
2532         }
2533
2534         /* unregister rx_handler early so bond_handle_frame wouldn't be called
2535          * for this slave anymore.
2536          */
2537         netdev_rx_handler_unregister(slave_dev);
2538
2539         if (BOND_MODE(bond) == BOND_MODE_8023AD)
2540                 bond_3ad_unbind_slave(slave);
2541
2542         bond_upper_dev_unlink(bond, slave);
2543
2544         if (bond_mode_can_use_xmit_hash(bond))
2545                 bond_update_slave_arr(bond, slave);
2546
2547         slave_info(bond_dev, slave_dev, "Releasing %s interface\n",
2548                     bond_is_active_slave(slave) ? "active" : "backup");
2549
2550         oldcurrent = rcu_access_pointer(bond->curr_active_slave);
2551
2552         RCU_INIT_POINTER(bond->current_arp_slave, NULL);
2553
2554         if (!all && (!bond->params.fail_over_mac ||
2555                      BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
2556                 if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) &&
2557                     bond_has_slaves(bond))
2558                         slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n",
2559                                    slave->perm_hwaddr);
2560         }
2561
2562         if (rtnl_dereference(bond->primary_slave) == slave)
2563                 RCU_INIT_POINTER(bond->primary_slave, NULL);
2564
2565         if (oldcurrent == slave)
2566                 bond_change_active_slave(bond, NULL);
2567
2568         /* Must be called after bond_change_active_slave () as the slave
2569          * might change from an active slave to a backup slave. Then it is
2570          * necessary to clear the maddrs on the backup slave.
2571          */
2572         bond_slave_ns_maddrs_del(bond, slave);
2573
2574         if (bond_is_lb(bond)) {
2575                 /* Must be called only after the slave has been
2576                  * detached from the list and the curr_active_slave
2577                  * has been cleared (if our_slave == old_current),
2578                  * but before a new active slave is selected.
2579                  */
2580                 bond_alb_deinit_slave(bond, slave);
2581         }
2582
2583         if (all) {
2584                 RCU_INIT_POINTER(bond->curr_active_slave, NULL);
2585         } else if (oldcurrent == slave) {
2586                 /* Note that we hold RTNL over this sequence, so there
2587                  * is no concern that another slave add/remove event
2588                  * will interfere.
2589                  */
2590                 bond_select_active_slave(bond);
2591         }
2592
2593         bond_set_carrier(bond);
2594         if (!bond_has_slaves(bond))
2595                 eth_hw_addr_random(bond_dev);
2596
2597         unblock_netpoll_tx();
2598         synchronize_rcu();
2599         bond->slave_cnt--;
2600
2601         if (!bond_has_slaves(bond)) {
2602                 call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev);
2603                 call_netdevice_notifiers(NETDEV_RELEASE, bond->dev);
2604         }
2605
2606         bond_compute_features(bond);
2607         if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) &&
2608             (old_features & NETIF_F_VLAN_CHALLENGED))
2609                 slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n");
2610
2611         vlan_vids_del_by_dev(slave_dev, bond_dev);
2612
2613         /* If the mode uses primary, then this case was handled above by
2614          * bond_change_active_slave(..., NULL)
2615          */
2616         if (!bond_uses_primary(bond)) {
2617                 /* unset promiscuity level from slave
2618                  * NOTE: The NETDEV_CHANGEADDR call above may change the value
2619                  * of the IFF_PROMISC flag in the bond_dev, but we need the
2620                  * value of that flag before that change, as that was the value
2621                  * when this slave was attached, so we cache at the start of the
2622                  * function and use it here. Same goes for ALLMULTI below
2623                  */
2624                 if (old_flags & IFF_PROMISC)
2625                         dev_set_promiscuity(slave_dev, -1);
2626
2627                 /* unset allmulti level from slave */
2628                 if (old_flags & IFF_ALLMULTI)
2629                         dev_set_allmulti(slave_dev, -1);
2630
2631                 if (old_flags & IFF_UP)
2632                         bond_hw_addr_flush(bond_dev, slave_dev);
2633         }
2634
2635         slave_disable_netpoll(slave);
2636
2637         /* close slave before restoring its mac address */
2638         dev_close(slave_dev);
2639
2640         slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;
2641
2642         if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
2643             BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2644                 /* restore original ("permanent") mac address */
2645                 bond_hw_addr_copy(ss.__data, slave->perm_hwaddr,
2646                                   slave->dev->addr_len);
2647                 ss.ss_family = slave_dev->type;
2648                 dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
2649         }
2650
2651         if (unregister)
2652                 __dev_set_mtu(slave_dev, slave->original_mtu);
2653         else
2654                 dev_set_mtu(slave_dev, slave->original_mtu);
2655
2656         if (!netif_is_bond_master(slave_dev))
2657                 slave_dev->priv_flags &= ~IFF_BONDING;
2658
2659         bond_xdp_set_features(bond_dev);
2660         kobject_put(&slave->kobj);
2661
2662         return 0;
2663 }
2664
2665 /* A wrapper used because of ndo_del_link */
2666 int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
2667 {
2668         return __bond_release_one(bond_dev, slave_dev, false, false);
2669 }
2670
2671 /* First release a slave and then destroy the bond if no more slaves are left.
2672  * Must be under rtnl_lock when this function is called.
2673  */
2674 static int bond_release_and_destroy(struct net_device *bond_dev,
2675                                     struct net_device *slave_dev)
2676 {
2677         struct bonding *bond = netdev_priv(bond_dev);
2678         int ret;
2679
2680         ret = __bond_release_one(bond_dev, slave_dev, false, true);
2681         if (ret == 0 && !bond_has_slaves(bond) &&
2682             bond_dev->reg_state != NETREG_UNREGISTERING) {
2683                 bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
2684                 netdev_info(bond_dev, "Destroying bond\n");
2685                 bond_remove_proc_entry(bond);
2686                 unregister_netdevice(bond_dev);
2687         }
2688         return ret;
2689 }
2690
/* Fill <info> with the ifbond data of the bond behind <bond_dev>. */
static void bond_info_query(struct net_device *bond_dev, struct ifbond *info)
{
        bond_fill_ifbond(netdev_priv(bond_dev), info);
}
2697
2698 static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info)
2699 {
2700         struct bonding *bond = netdev_priv(bond_dev);
2701         struct list_head *iter;
2702         int i = 0, res = -ENODEV;
2703         struct slave *slave;
2704
2705         bond_for_each_slave(bond, slave, iter) {
2706                 if (i++ == (int)info->slave_id) {
2707                         res = 0;
2708                         bond_fill_ifslave(slave, info);
2709                         break;
2710                 }
2711         }
2712
2713         return res;
2714 }
2715
2716 /*-------------------------------- Monitoring -------------------------------*/
2717
2718 /* called with rcu_read_lock() */
2719 static int bond_miimon_inspect(struct bonding *bond)
2720 {
2721         bool ignore_updelay = false;
2722         int link_state, commit = 0;
2723         struct list_head *iter;
2724         struct slave *slave;
2725
2726         if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
2727                 ignore_updelay = !rcu_dereference(bond->curr_active_slave);
2728         } else {
2729                 struct bond_up_slave *usable_slaves;
2730
2731                 usable_slaves = rcu_dereference(bond->usable_slaves);
2732
2733                 if (usable_slaves && usable_slaves->count == 0)
2734                         ignore_updelay = true;
2735         }
2736
2737         bond_for_each_slave_rcu(bond, slave, iter) {
2738                 bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
2739
2740                 link_state = bond_check_dev_link(bond, slave->dev, 0);
2741
2742                 switch (slave->link) {
2743                 case BOND_LINK_UP:
2744                         if (link_state)
2745                                 continue;
2746
2747                         bond_propose_link_state(slave, BOND_LINK_FAIL);
2748                         commit++;
2749                         slave->delay = bond->params.downdelay;
2750                         if (slave->delay && net_ratelimit()) {
2751                                 slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
2752                                            (BOND_MODE(bond) ==
2753                                             BOND_MODE_ACTIVEBACKUP) ?
2754                                             (bond_is_active_slave(slave) ?
2755                                              "active " : "backup ") : "",
2756                                            bond->params.downdelay * bond->params.miimon);
2757                         }
2758                         fallthrough;
2759                 case BOND_LINK_FAIL:
2760                         if (link_state) {
2761                                 /* recovered before downdelay expired */
2762                                 bond_propose_link_state(slave, BOND_LINK_UP);
2763                                 slave->last_link_up = jiffies;
2764                                 if (net_ratelimit())
2765                                         slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
2766                                                    (bond->params.downdelay - slave->delay) *
2767                                                    bond->params.miimon);
2768                                 commit++;
2769                                 continue;
2770                         }
2771
2772                         if (slave->delay <= 0) {
2773                                 bond_propose_link_state(slave, BOND_LINK_DOWN);
2774                                 commit++;
2775                                 continue;
2776                         }
2777
2778                         slave->delay--;
2779                         break;
2780
2781                 case BOND_LINK_DOWN:
2782                         if (!link_state)
2783                                 continue;
2784
2785                         bond_propose_link_state(slave, BOND_LINK_BACK);
2786                         commit++;
2787                         slave->delay = bond->params.updelay;
2788
2789                         if (slave->delay && net_ratelimit()) {
2790                                 slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
2791                                            ignore_updelay ? 0 :
2792                                            bond->params.updelay *
2793                                            bond->params.miimon);
2794                         }
2795                         fallthrough;
2796                 case BOND_LINK_BACK:
2797                         if (!link_state) {
2798                                 bond_propose_link_state(slave, BOND_LINK_DOWN);
2799                                 if (net_ratelimit())
2800                                         slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
2801                                                    (bond->params.updelay - slave->delay) *
2802                                                    bond->params.miimon);
2803                                 commit++;
2804                                 continue;
2805                         }
2806
2807                         if (ignore_updelay)
2808                                 slave->delay = 0;
2809
2810                         if (slave->delay <= 0) {
2811                                 bond_propose_link_state(slave, BOND_LINK_UP);
2812                                 commit++;
2813                                 ignore_updelay = false;
2814                                 continue;
2815                         }
2816
2817                         slave->delay--;
2818                         break;
2819                 }
2820         }
2821
2822         return commit;
2823 }
2824
2825 static void bond_miimon_link_change(struct bonding *bond,
2826                                     struct slave *slave,
2827                                     char link)
2828 {
2829         switch (BOND_MODE(bond)) {
2830         case BOND_MODE_8023AD:
2831                 bond_3ad_handle_link_change(slave, link);
2832                 break;
2833         case BOND_MODE_TLB:
2834         case BOND_MODE_ALB:
2835                 bond_alb_handle_link_change(bond, slave, link);
2836                 break;
2837         case BOND_MODE_XOR:
2838                 bond_update_slave_arr(bond, NULL);
2839                 break;
2840         }
2841 }
2842
2843 static void bond_miimon_commit(struct bonding *bond)
2844 {
2845         struct slave *slave, *primary, *active;
2846         bool do_failover = false;
2847         struct list_head *iter;
2848
2849         ASSERT_RTNL();
2850
2851         bond_for_each_slave(bond, slave, iter) {
2852                 switch (slave->link_new_state) {
2853                 case BOND_LINK_NOCHANGE:
2854                         /* For 802.3ad mode, check current slave speed and
2855                          * duplex again in case its port was disabled after
2856                          * invalid speed/duplex reporting but recovered before
2857                          * link monitoring could make a decision on the actual
2858                          * link status
2859                          */
2860                         if (BOND_MODE(bond) == BOND_MODE_8023AD &&
2861                             slave->link == BOND_LINK_UP)
2862                                 bond_3ad_adapter_speed_duplex_changed(slave);
2863                         continue;
2864
2865                 case BOND_LINK_UP:
2866                         if (bond_update_speed_duplex(slave) &&
2867                             bond_needs_speed_duplex(bond)) {
2868                                 slave->link = BOND_LINK_DOWN;
2869                                 if (net_ratelimit())
2870                                         slave_warn(bond->dev, slave->dev,
2871                                                    "failed to get link speed/duplex\n");
2872                                 continue;
2873                         }
2874                         bond_set_slave_link_state(slave, BOND_LINK_UP,
2875                                                   BOND_SLAVE_NOTIFY_NOW);
2876                         slave->last_link_up = jiffies;
2877
2878                         primary = rtnl_dereference(bond->primary_slave);
2879                         if (BOND_MODE(bond) == BOND_MODE_8023AD) {
2880                                 /* prevent it from being the active one */
2881                                 bond_set_backup_slave(slave);
2882                         } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
2883                                 /* make it immediately active */
2884                                 bond_set_active_slave(slave);
2885                         }
2886
2887                         slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n",
2888                                    slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
2889                                    slave->duplex ? "full" : "half");
2890
2891                         bond_miimon_link_change(bond, slave, BOND_LINK_UP);
2892
2893                         active = rtnl_dereference(bond->curr_active_slave);
2894                         if (!active || slave == primary || slave->prio > active->prio)
2895                                 do_failover = true;
2896
2897                         continue;
2898
2899                 case BOND_LINK_DOWN:
2900                         if (slave->link_failure_count < UINT_MAX)
2901                                 slave->link_failure_count++;
2902
2903                         bond_set_slave_link_state(slave, BOND_LINK_DOWN,
2904                                                   BOND_SLAVE_NOTIFY_NOW);
2905
2906                         if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
2907                             BOND_MODE(bond) == BOND_MODE_8023AD)
2908                                 bond_set_slave_inactive_flags(slave,
2909                                                               BOND_SLAVE_NOTIFY_NOW);
2910
2911                         slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");
2912
2913                         bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);
2914
2915                         if (slave == rcu_access_pointer(bond->curr_active_slave))
2916                                 do_failover = true;
2917
2918                         continue;
2919
2920                 default:
2921                         slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n",
2922                                   slave->link_new_state);
2923                         bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
2924
2925                         continue;
2926                 }
2927         }
2928
2929         if (do_failover) {
2930                 block_netpoll_tx();
2931                 bond_select_active_slave(bond);
2932                 unblock_netpoll_tx();
2933         }
2934
2935         bond_set_carrier(bond);
2936 }
2937
2938 /* bond_mii_monitor
2939  *
2940  * Really a wrapper that splits the mii monitor into two phases: an
2941  * inspection, then (if inspection indicates something needs to be done)
2942  * an acquisition of appropriate locks followed by a commit phase to
2943  * implement whatever link state changes are indicated.
2944  */
2945 static void bond_mii_monitor(struct work_struct *work)
2946 {
2947         struct bonding *bond = container_of(work, struct bonding,
2948                                             mii_work.work);
2949         bool should_notify_peers = false;
2950         bool commit;
2951         unsigned long delay;
2952         struct slave *slave;
2953         struct list_head *iter;
2954
2955         delay = msecs_to_jiffies(bond->params.miimon);
2956
2957         if (!bond_has_slaves(bond))
2958                 goto re_arm;
2959
2960         rcu_read_lock();
2961         should_notify_peers = bond_should_notify_peers(bond);
2962         commit = !!bond_miimon_inspect(bond);
2963         if (bond->send_peer_notif) {
2964                 rcu_read_unlock();
2965                 if (rtnl_trylock()) {
2966                         bond->send_peer_notif--;
2967                         rtnl_unlock();
2968                 }
2969         } else {
2970                 rcu_read_unlock();
2971         }
2972
2973         if (commit) {
2974                 /* Race avoidance with bond_close cancel of workqueue */
2975                 if (!rtnl_trylock()) {
2976                         delay = 1;
2977                         should_notify_peers = false;
2978                         goto re_arm;
2979                 }
2980
2981                 bond_for_each_slave(bond, slave, iter) {
2982                         bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
2983                 }
2984                 bond_miimon_commit(bond);
2985
2986                 rtnl_unlock();  /* might sleep, hold no other locks */
2987         }
2988
2989 re_arm:
2990         if (bond->params.miimon)
2991                 queue_delayed_work(bond->wq, &bond->mii_work, delay);
2992
2993         if (should_notify_peers) {
2994                 if (!rtnl_trylock())
2995                         return;
2996                 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
2997                 rtnl_unlock();
2998         }
2999 }
3000
3001 static int bond_upper_dev_walk(struct net_device *upper,
3002                                struct netdev_nested_priv *priv)
3003 {
3004         __be32 ip = *(__be32 *)priv->data;
3005
3006         return ip == bond_confirm_addr(upper, 0, ip);
3007 }
3008
3009 static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
3010 {
3011         struct netdev_nested_priv priv = {
3012                 .data = (void *)&ip,
3013         };
3014         bool ret = false;
3015
3016         if (ip == bond_confirm_addr(bond->dev, 0, ip))
3017                 return true;
3018
3019         rcu_read_lock();
3020         if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv))
3021                 ret = true;
3022         rcu_read_unlock();
3023
3024         return ret;
3025 }
3026
3027 #define BOND_VLAN_PROTO_NONE cpu_to_be16(0xffff)
3028
3029 static bool bond_handle_vlan(struct slave *slave, struct bond_vlan_tag *tags,
3030                              struct sk_buff *skb)
3031 {
3032         struct net_device *bond_dev = slave->bond->dev;
3033         struct net_device *slave_dev = slave->dev;
3034         struct bond_vlan_tag *outer_tag = tags;
3035
3036         if (!tags || tags->vlan_proto == BOND_VLAN_PROTO_NONE)
3037                 return true;
3038
3039         tags++;
3040
3041         /* Go through all the tags backwards and add them to the packet */
3042         while (tags->vlan_proto != BOND_VLAN_PROTO_NONE) {
3043                 if (!tags->vlan_id) {
3044                         tags++;
3045                         continue;
3046                 }
3047
3048                 slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n",
3049                           ntohs(outer_tag->vlan_proto), tags->vlan_id);
3050                 skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto,
3051                                                 tags->vlan_id);
3052                 if (!skb) {
3053                         net_err_ratelimited("failed to insert inner VLAN tag\n");
3054                         return false;
3055                 }
3056
3057                 tags++;
3058         }
3059         /* Set the outer tag */
3060         if (outer_tag->vlan_id) {
3061                 slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n",
3062                           ntohs(outer_tag->vlan_proto), outer_tag->vlan_id);
3063                 __vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto,
3064                                        outer_tag->vlan_id);
3065         }
3066
3067         return true;
3068 }
3069
3070 /* We go to the (large) trouble of VLAN tagging ARP frames because
3071  * switches in VLAN mode (especially if ports are configured as
3072  * "native" to a VLAN) might not pass non-tagged frames.
3073  */
3074 static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip,
3075                           __be32 src_ip, struct bond_vlan_tag *tags)
3076 {
3077         struct net_device *bond_dev = slave->bond->dev;
3078         struct net_device *slave_dev = slave->dev;
3079         struct sk_buff *skb;
3080
3081         slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n",
3082                   arp_op, &dest_ip, &src_ip);
3083
3084         skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip,
3085                          NULL, slave_dev->dev_addr, NULL);
3086
3087         if (!skb) {
3088                 net_err_ratelimited("ARP packet allocation failed\n");
3089                 return;
3090         }
3091
3092         if (bond_handle_vlan(slave, tags, skb)) {
3093                 slave_update_last_tx(slave);
3094                 arp_xmit(skb);
3095         }
3096
3097         return;
3098 }
3099
3100 /* Validate the device path between the @start_dev and the @end_dev.
3101  * The path is valid if the @end_dev is reachable through device
3102  * stacking.
3103  * When the path is validated, collect any vlan information in the
3104  * path.
3105  */
3106 struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
3107                                               struct net_device *end_dev,
3108                                               int level)
3109 {
3110         struct bond_vlan_tag *tags;
3111         struct net_device *upper;
3112         struct list_head  *iter;
3113
3114         if (start_dev == end_dev) {
3115                 tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC);
3116                 if (!tags)
3117                         return ERR_PTR(-ENOMEM);
3118                 tags[level].vlan_proto = BOND_VLAN_PROTO_NONE;
3119                 return tags;
3120         }
3121
3122         netdev_for_each_upper_dev_rcu(start_dev, upper, iter) {
3123                 tags = bond_verify_device_path(upper, end_dev, level + 1);
3124                 if (IS_ERR_OR_NULL(tags)) {
3125                         if (IS_ERR(tags))
3126                                 return tags;
3127                         continue;
3128                 }
3129                 if (is_vlan_dev(upper)) {
3130                         tags[level].vlan_proto = vlan_dev_vlan_proto(upper);
3131                         tags[level].vlan_id = vlan_dev_vlan_id(upper);
3132                 }
3133
3134                 return tags;
3135         }
3136
3137         return NULL;
3138 }
3139
3140 static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
3141 {
3142         struct rtable *rt;
3143         struct bond_vlan_tag *tags;
3144         __be32 *targets = bond->params.arp_targets, addr;
3145         int i;
3146
3147         for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
3148                 slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n",
3149                           __func__, &targets[i]);
3150                 tags = NULL;
3151
3152                 /* Find out through which dev should the packet go */
3153                 rt = ip_route_output(dev_net(bond->dev), targets[i], 0, 0, 0,
3154                                      RT_SCOPE_LINK);
3155                 if (IS_ERR(rt)) {
3156                         /* there's no route to target - try to send arp
3157                          * probe to generate any traffic (arp_validate=0)
3158                          */
3159                         if (bond->params.arp_validate)
3160                                 pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
3161                                              bond->dev->name,
3162                                              &targets[i]);
3163                         bond_arp_send(slave, ARPOP_REQUEST, targets[i],
3164                                       0, tags);
3165                         continue;
3166                 }
3167
3168                 /* bond device itself */
3169                 if (rt->dst.dev == bond->dev)
3170                         goto found;
3171
3172                 rcu_read_lock();
3173                 tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
3174                 rcu_read_unlock();
3175
3176                 if (!IS_ERR_OR_NULL(tags))
3177                         goto found;
3178
3179                 /* Not our device - skip */
3180                 slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
3181                            &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL");
3182
3183                 ip_rt_put(rt);
3184                 continue;
3185
3186 found:
3187                 addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
3188                 ip_rt_put(rt);
3189                 bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags);
3190                 kfree(tags);
3191         }
3192 }
3193
3194 static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip)
3195 {
3196         int i;
3197
3198         if (!sip || !bond_has_this_ip(bond, tip)) {
3199                 slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n",
3200                            __func__, &sip, &tip);
3201                 return;
3202         }
3203
3204         i = bond_get_targets_ip(bond->params.arp_targets, sip);
3205         if (i == -1) {
3206                 slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n",
3207                            __func__, &sip);
3208                 return;
3209         }
3210         slave->last_rx = jiffies;
3211         slave->target_last_arp_rx[i] = jiffies;
3212 }
3213
3214 static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
3215                         struct slave *slave)
3216 {
3217         struct arphdr *arp = (struct arphdr *)skb->data;
3218         struct slave *curr_active_slave, *curr_arp_slave;
3219         unsigned char *arp_ptr;
3220         __be32 sip, tip;
3221         unsigned int alen;
3222
3223         alen = arp_hdr_len(bond->dev);
3224
3225         if (alen > skb_headlen(skb)) {
3226                 arp = kmalloc(alen, GFP_ATOMIC);
3227                 if (!arp)
3228                         goto out_unlock;
3229                 if (skb_copy_bits(skb, 0, arp, alen) < 0)
3230                         goto out_unlock;
3231         }
3232
3233         if (arp->ar_hln != bond->dev->addr_len ||
3234             skb->pkt_type == PACKET_OTHERHOST ||
3235             skb->pkt_type == PACKET_LOOPBACK ||
3236             arp->ar_hrd != htons(ARPHRD_ETHER) ||
3237             arp->ar_pro != htons(ETH_P_IP) ||
3238             arp->ar_pln != 4)
3239                 goto out_unlock;
3240
3241         arp_ptr = (unsigned char *)(arp + 1);
3242         arp_ptr += bond->dev->addr_len;
3243         memcpy(&sip, arp_ptr, 4);
3244         arp_ptr += 4 + bond->dev->addr_len;
3245         memcpy(&tip, arp_ptr, 4);
3246
3247         slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n",
3248                   __func__, slave->dev->name, bond_slave_state(slave),
3249                   bond->params.arp_validate, slave_do_arp_validate(bond, slave),
3250                   &sip, &tip);
3251
3252         curr_active_slave = rcu_dereference(bond->curr_active_slave);
3253         curr_arp_slave = rcu_dereference(bond->current_arp_slave);
3254
3255         /* We 'trust' the received ARP enough to validate it if:
3256          *
3257          * (a) the slave receiving the ARP is active (which includes the
3258          * current ARP slave, if any), or
3259          *
3260          * (b) the receiving slave isn't active, but there is a currently
3261          * active slave and it received valid arp reply(s) after it became
3262          * the currently active slave, or
3263          *
3264          * (c) there is an ARP slave that sent an ARP during the prior ARP
3265          * interval, and we receive an ARP reply on any slave.  We accept
3266          * these because switch FDB update delays may deliver the ARP
3267          * reply to a slave other than the sender of the ARP request.
3268          *
3269          * Note: for (b), backup slaves are receiving the broadcast ARP
3270          * request, not a reply.  This request passes from the sending
3271          * slave through the L2 switch(es) to the receiving slave.  Since
3272          * this is checking the request, sip/tip are swapped for
3273          * validation.
3274          *
3275          * This is done to avoid endless looping when we can't reach the
3276          * arp_ip_target and fool ourselves with our own arp requests.
3277          */
3278         if (bond_is_active_slave(slave))
3279                 bond_validate_arp(bond, slave, sip, tip);
3280         else if (curr_active_slave &&
3281                  time_after(slave_last_rx(bond, curr_active_slave),
3282                             curr_active_slave->last_link_up))
3283                 bond_validate_arp(bond, slave, tip, sip);
3284         else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
3285                  bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
3286                 bond_validate_arp(bond, slave, sip, tip);
3287
3288 out_unlock:
3289         if (arp != (struct arphdr *)skb->data)
3290                 kfree(arp);
3291         return RX_HANDLER_ANOTHER;
3292 }
3293
3294 #if IS_ENABLED(CONFIG_IPV6)
3295 static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr,
3296                          const struct in6_addr *saddr, struct bond_vlan_tag *tags)
3297 {
3298         struct net_device *bond_dev = slave->bond->dev;
3299         struct net_device *slave_dev = slave->dev;
3300         struct in6_addr mcaddr;
3301         struct sk_buff *skb;
3302
3303         slave_dbg(bond_dev, slave_dev, "NS on slave: dst %pI6c src %pI6c\n",
3304                   daddr, saddr);
3305
3306         skb = ndisc_ns_create(slave_dev, daddr, saddr, 0);
3307         if (!skb) {
3308                 net_err_ratelimited("NS packet allocation failed\n");
3309                 return;
3310         }
3311
3312         addrconf_addr_solict_mult(daddr, &mcaddr);
3313         if (bond_handle_vlan(slave, tags, skb)) {
3314                 slave_update_last_tx(slave);
3315                 ndisc_send_skb(skb, &mcaddr, saddr);
3316         }
3317 }
3318
3319 static void bond_ns_send_all(struct bonding *bond, struct slave *slave)
3320 {
3321         struct in6_addr *targets = bond->params.ns_targets;
3322         struct bond_vlan_tag *tags;
3323         struct dst_entry *dst;
3324         struct in6_addr saddr;
3325         struct flowi6 fl6;
3326         int i;
3327
3328         for (i = 0; i < BOND_MAX_NS_TARGETS && !ipv6_addr_any(&targets[i]); i++) {
3329                 slave_dbg(bond->dev, slave->dev, "%s: target %pI6c\n",
3330                           __func__, &targets[i]);
3331                 tags = NULL;
3332
3333                 /* Find out through which dev should the packet go */
3334                 memset(&fl6, 0, sizeof(struct flowi6));
3335                 fl6.daddr = targets[i];
3336                 fl6.flowi6_oif = bond->dev->ifindex;
3337
3338                 dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6);
3339                 if (dst->error) {
3340                         dst_release(dst);
3341                         /* there's no route to target - try to send arp
3342                          * probe to generate any traffic (arp_validate=0)
3343                          */
3344                         if (bond->params.arp_validate)
3345                                 pr_warn_once("%s: no route to ns_ip6_target %pI6c and arp_validate is set\n",
3346                                              bond->dev->name,
3347                                              &targets[i]);
3348                         bond_ns_send(slave, &targets[i], &in6addr_any, tags);
3349                         continue;
3350                 }
3351
3352                 /* bond device itself */
3353                 if (dst->dev == bond->dev)
3354                         goto found;
3355
3356                 rcu_read_lock();
3357                 tags = bond_verify_device_path(bond->dev, dst->dev, 0);
3358                 rcu_read_unlock();
3359
3360                 if (!IS_ERR_OR_NULL(tags))
3361                         goto found;
3362
3363                 /* Not our device - skip */
3364                 slave_dbg(bond->dev, slave->dev, "no path to ns_ip6_target %pI6c via dst->dev %s\n",
3365                           &targets[i], dst->dev ? dst->dev->name : "NULL");
3366
3367                 dst_release(dst);
3368                 continue;
3369
3370 found:
3371                 if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr))
3372                         bond_ns_send(slave, &targets[i], &saddr, tags);
3373                 else
3374                         bond_ns_send(slave, &targets[i], &in6addr_any, tags);
3375
3376                 dst_release(dst);
3377                 kfree(tags);
3378         }
3379 }
3380
3381 static int bond_confirm_addr6(struct net_device *dev,
3382                               struct netdev_nested_priv *priv)
3383 {
3384         struct in6_addr *addr = (struct in6_addr *)priv->data;
3385
3386         return ipv6_chk_addr(dev_net(dev), addr, dev, 0);
3387 }
3388
3389 static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr)
3390 {
3391         struct netdev_nested_priv priv = {
3392                 .data = addr,
3393         };
3394         int ret = false;
3395
3396         if (bond_confirm_addr6(bond->dev, &priv))
3397                 return true;
3398
3399         rcu_read_lock();
3400         if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_confirm_addr6, &priv))
3401                 ret = true;
3402         rcu_read_unlock();
3403
3404         return ret;
3405 }
3406
3407 static void bond_validate_na(struct bonding *bond, struct slave *slave,
3408                              struct in6_addr *saddr, struct in6_addr *daddr)
3409 {
3410         int i;
3411
3412         /* Ignore NAs that:
3413          * 1. Source address is unspecified address.
3414          * 2. Dest address is neither all-nodes multicast address nor
3415          *    exist on bond interface.
3416          */
3417         if (ipv6_addr_any(saddr) ||
3418             (!ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) &&
3419              !bond_has_this_ip6(bond, daddr))) {
3420                 slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n",
3421                           __func__, saddr, daddr);
3422                 return;
3423         }
3424
3425         i = bond_get_targets_ip6(bond->params.ns_targets, saddr);
3426         if (i == -1) {
3427                 slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c not found in targets\n",
3428                           __func__, saddr);
3429                 return;
3430         }
3431         slave->last_rx = jiffies;
3432         slave->target_last_arp_rx[i] = jiffies;
3433 }
3434
3435 static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond,
3436                        struct slave *slave)
3437 {
3438         struct slave *curr_active_slave, *curr_arp_slave;
3439         struct in6_addr *saddr, *daddr;
3440         struct {
3441                 struct ipv6hdr ip6;
3442                 struct icmp6hdr icmp6;
3443         } *combined, _combined;
3444
3445         if (skb->pkt_type == PACKET_OTHERHOST ||
3446             skb->pkt_type == PACKET_LOOPBACK)
3447                 goto out;
3448
3449         combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined);
3450         if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP ||
3451             (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
3452              combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
3453                 goto out;
3454
3455         saddr = &combined->ip6.saddr;
3456         daddr = &combined->ip6.daddr;
3457
3458         slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n",
3459                   __func__, slave->dev->name, bond_slave_state(slave),
3460                   bond->params.arp_validate, slave_do_arp_validate(bond, slave),
3461                   saddr, daddr);
3462
3463         curr_active_slave = rcu_dereference(bond->curr_active_slave);
3464         curr_arp_slave = rcu_dereference(bond->current_arp_slave);
3465
3466         /* We 'trust' the received ARP enough to validate it if:
3467          * see bond_arp_rcv().
3468          */
3469         if (bond_is_active_slave(slave))
3470                 bond_validate_na(bond, slave, saddr, daddr);
3471         else if (curr_active_slave &&
3472                  time_after(slave_last_rx(bond, curr_active_slave),
3473                             curr_active_slave->last_link_up))
3474                 bond_validate_na(bond, slave, daddr, saddr);
3475         else if (curr_arp_slave &&
3476                  bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
3477                 bond_validate_na(bond, slave, saddr, daddr);
3478
3479 out:
3480         return RX_HANDLER_ANOTHER;
3481 }
3482 #endif
3483
3484 int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond,
3485                       struct slave *slave)
3486 {
3487 #if IS_ENABLED(CONFIG_IPV6)
3488         bool is_ipv6 = skb->protocol == __cpu_to_be16(ETH_P_IPV6);
3489 #endif
3490         bool is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);
3491
3492         slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n",
3493                   __func__, skb->dev->name);
3494
3495         /* Use arp validate logic for both ARP and NS */
3496         if (!slave_do_arp_validate(bond, slave)) {
3497                 if ((slave_do_arp_validate_only(bond) && is_arp) ||
3498 #if IS_ENABLED(CONFIG_IPV6)
3499                     (slave_do_arp_validate_only(bond) && is_ipv6) ||
3500 #endif
3501                     !slave_do_arp_validate_only(bond))
3502                         slave->last_rx = jiffies;
3503                 return RX_HANDLER_ANOTHER;
3504         } else if (is_arp) {
3505                 return bond_arp_rcv(skb, bond, slave);
3506 #if IS_ENABLED(CONFIG_IPV6)
3507         } else if (is_ipv6) {
3508                 return bond_na_rcv(skb, bond, slave);
3509 #endif
3510         } else {
3511                 return RX_HANDLER_ANOTHER;
3512         }
3513 }
3514
3515 static void bond_send_validate(struct bonding *bond, struct slave *slave)
3516 {
3517         bond_arp_send_all(bond, slave);
3518 #if IS_ENABLED(CONFIG_IPV6)
3519         bond_ns_send_all(bond, slave);
3520 #endif
3521 }
3522
3523 /* function to verify if we're in the arp_interval timeslice, returns true if
3524  * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval +
3525  * arp_interval/2) . the arp_interval/2 is needed for really fast networks.
3526  */
3527 static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
3528                                   int mod)
3529 {
3530         int delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
3531
3532         return time_in_range(jiffies,
3533                              last_act - delta_in_ticks,
3534                              last_act + mod * delta_in_ticks + delta_in_ticks/2);
3535 }
3536
3537 /* This function is called regularly to monitor each slave's link
3538  * ensuring that traffic is being sent and received when arp monitoring
3539  * is used in load-balancing mode. if the adapter has been dormant, then an
3540  * arp is transmitted to generate traffic. see activebackup_arp_monitor for
3541  * arp monitoring in active backup mode.
3542  */
3543 static void bond_loadbalance_arp_mon(struct bonding *bond)
3544 {
3545         struct slave *slave, *oldcurrent;
3546         struct list_head *iter;
3547         int do_failover = 0, slave_state_changed = 0;
3548
3549         if (!bond_has_slaves(bond))
3550                 goto re_arm;
3551
3552         rcu_read_lock();
3553
3554         oldcurrent = rcu_dereference(bond->curr_active_slave);
3555         /* see if any of the previous devices are up now (i.e. they have
3556          * xmt and rcv traffic). the curr_active_slave does not come into
3557          * the picture unless it is null. also, slave->last_link_up is not
3558          * needed here because we send an arp on each slave and give a slave
3559          * as long as it needs to get the tx/rx within the delta.
3560          * TODO: what about up/down delay in arp mode? it wasn't here before
3561          *       so it can wait
3562          */
3563         bond_for_each_slave_rcu(bond, slave, iter) {
3564                 unsigned long last_tx = slave_last_tx(slave);
3565
3566                 bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
3567
3568                 if (slave->link != BOND_LINK_UP) {
3569                         if (bond_time_in_interval(bond, last_tx, 1) &&
3570                             bond_time_in_interval(bond, slave->last_rx, 1)) {
3571
3572                                 bond_propose_link_state(slave, BOND_LINK_UP);
3573                                 slave_state_changed = 1;
3574
3575                                 /* primary_slave has no meaning in round-robin
3576                                  * mode. the window of a slave being up and
3577                                  * curr_active_slave being null after enslaving
3578                                  * is closed.
3579                                  */
3580                                 if (!oldcurrent) {
3581                                         slave_info(bond->dev, slave->dev, "link status definitely up\n");
3582                                         do_failover = 1;
3583                                 } else {
3584                                         slave_info(bond->dev, slave->dev, "interface is now up\n");
3585                                 }
3586                         }
3587                 } else {
3588                         /* slave->link == BOND_LINK_UP */
3589
3590                         /* not all switches will respond to an arp request
3591                          * when the source ip is 0, so don't take the link down
3592                          * if we don't know our ip yet
3593                          */
3594                         if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
3595                             !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) {
3596
3597                                 bond_propose_link_state(slave, BOND_LINK_DOWN);
3598                                 slave_state_changed = 1;
3599
3600                                 if (slave->link_failure_count < UINT_MAX)
3601                                         slave->link_failure_count++;
3602
3603                                 slave_info(bond->dev, slave->dev, "interface is now down\n");
3604
3605                                 if (slave == oldcurrent)
3606                                         do_failover = 1;
3607                         }
3608                 }
3609
3610                 /* note: if switch is in round-robin mode, all links
3611                  * must tx arp to ensure all links rx an arp - otherwise
3612                  * links may oscillate or not come up at all; if switch is
3613                  * in something like xor mode, there is nothing we can
3614                  * do - all replies will be rx'ed on same link causing slaves
3615                  * to be unstable during low/no traffic periods
3616                  */
3617                 if (bond_slave_is_up(slave))
3618                         bond_send_validate(bond, slave);
3619         }
3620
3621         rcu_read_unlock();
3622
3623         if (do_failover || slave_state_changed) {
3624                 if (!rtnl_trylock())
3625                         goto re_arm;
3626
3627                 bond_for_each_slave(bond, slave, iter) {
3628                         if (slave->link_new_state != BOND_LINK_NOCHANGE)
3629                                 slave->link = slave->link_new_state;
3630                 }
3631
3632                 if (slave_state_changed) {
3633                         bond_slave_state_change(bond);
3634                         if (BOND_MODE(bond) == BOND_MODE_XOR)
3635                                 bond_update_slave_arr(bond, NULL);
3636                 }
3637                 if (do_failover) {
3638                         block_netpoll_tx();
3639                         bond_select_active_slave(bond);
3640                         unblock_netpoll_tx();
3641                 }
3642                 rtnl_unlock();
3643         }
3644
3645 re_arm:
3646         if (bond->params.arp_interval)
3647                 queue_delayed_work(bond->wq, &bond->arp_work,
3648                                    msecs_to_jiffies(bond->params.arp_interval));
3649 }
3650
3651 /* Called to inspect slaves for active-backup mode ARP monitor link state
3652  * changes.  Sets proposed link state in slaves to specify what action
3653  * should take place for the slave.  Returns 0 if no changes are found, >0
3654  * if changes to link states must be committed.
3655  *
3656  * Called with rcu_read_lock held.
3657  */
3658 static int bond_ab_arp_inspect(struct bonding *bond)
3659 {
3660         unsigned long last_tx, last_rx;
3661         struct list_head *iter;
3662         struct slave *slave;
3663         int commit = 0;
3664
3665         bond_for_each_slave_rcu(bond, slave, iter) {
3666                 bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
3667                 last_rx = slave_last_rx(bond, slave);
3668
3669                 if (slave->link != BOND_LINK_UP) {
3670                         if (bond_time_in_interval(bond, last_rx, 1)) {
3671                                 bond_propose_link_state(slave, BOND_LINK_UP);
3672                                 commit++;
3673                         } else if (slave->link == BOND_LINK_BACK) {
3674                                 bond_propose_link_state(slave, BOND_LINK_FAIL);
3675                                 commit++;
3676                         }
3677                         continue;
3678                 }
3679
3680                 /* Give slaves 2*delta after being enslaved or made
3681                  * active.  This avoids bouncing, as the last receive
3682                  * times need a full ARP monitor cycle to be updated.
3683                  */
3684                 if (bond_time_in_interval(bond, slave->last_link_up, 2))
3685                         continue;
3686
3687                 /* Backup slave is down if:
3688                  * - No current_arp_slave AND
3689                  * - more than (missed_max+1)*delta since last receive AND
3690                  * - the bond has an IP address
3691                  *
3692                  * Note: a non-null current_arp_slave indicates
3693                  * the curr_active_slave went down and we are
3694                  * searching for a new one; under this condition
3695                  * we only take the curr_active_slave down - this
3696                  * gives each slave a chance to tx/rx traffic
3697                  * before being taken out
3698                  */
3699                 if (!bond_is_active_slave(slave) &&
3700                     !rcu_access_pointer(bond->current_arp_slave) &&
3701                     !bond_time_in_interval(bond, last_rx, bond->params.missed_max + 1)) {
3702                         bond_propose_link_state(slave, BOND_LINK_DOWN);
3703                         commit++;
3704                 }
3705
3706                 /* Active slave is down if:
3707                  * - more than missed_max*delta since transmitting OR
3708                  * - (more than missed_max*delta since receive AND
3709                  *    the bond has an IP address)
3710                  */
3711                 last_tx = slave_last_tx(slave);
3712                 if (bond_is_active_slave(slave) &&
3713                     (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
3714                      !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) {
3715                         bond_propose_link_state(slave, BOND_LINK_DOWN);
3716                         commit++;
3717                 }
3718         }
3719
3720         return commit;
3721 }
3722
3723 /* Called to commit link state changes noted by inspection step of
3724  * active-backup mode ARP monitor.
3725  *
3726  * Called with RTNL hold.
3727  */
3728 static void bond_ab_arp_commit(struct bonding *bond)
3729 {
3730         bool do_failover = false;
3731         struct list_head *iter;
3732         unsigned long last_tx;
3733         struct slave *slave;
3734
3735         bond_for_each_slave(bond, slave, iter) {
3736                 switch (slave->link_new_state) {
3737                 case BOND_LINK_NOCHANGE:
3738                         continue;
3739
3740                 case BOND_LINK_UP:
3741                         last_tx = slave_last_tx(slave);
3742                         if (rtnl_dereference(bond->curr_active_slave) != slave ||
3743                             (!rtnl_dereference(bond->curr_active_slave) &&
3744                              bond_time_in_interval(bond, last_tx, 1))) {
3745                                 struct slave *current_arp_slave;
3746
3747                                 current_arp_slave = rtnl_dereference(bond->current_arp_slave);
3748                                 bond_set_slave_link_state(slave, BOND_LINK_UP,
3749                                                           BOND_SLAVE_NOTIFY_NOW);
3750                                 if (current_arp_slave) {
3751                                         bond_set_slave_inactive_flags(
3752                                                 current_arp_slave,
3753                                                 BOND_SLAVE_NOTIFY_NOW);
3754                                         RCU_INIT_POINTER(bond->current_arp_slave, NULL);
3755                                 }
3756
3757                                 slave_info(bond->dev, slave->dev, "link status definitely up\n");
3758
3759                                 if (!rtnl_dereference(bond->curr_active_slave) ||
3760                                     slave == rtnl_dereference(bond->primary_slave) ||
3761                                     slave->prio > rtnl_dereference(bond->curr_active_slave)->prio)
3762                                         do_failover = true;
3763
3764                         }
3765
3766                         continue;
3767
3768                 case BOND_LINK_DOWN:
3769                         if (slave->link_failure_count < UINT_MAX)
3770                                 slave->link_failure_count++;
3771
3772                         bond_set_slave_link_state(slave, BOND_LINK_DOWN,
3773                                                   BOND_SLAVE_NOTIFY_NOW);
3774                         bond_set_slave_inactive_flags(slave,
3775                                                       BOND_SLAVE_NOTIFY_NOW);
3776
3777                         slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");
3778
3779                         if (slave == rtnl_dereference(bond->curr_active_slave)) {
3780                                 RCU_INIT_POINTER(bond->current_arp_slave, NULL);
3781                                 do_failover = true;
3782                         }
3783
3784                         continue;
3785
3786                 case BOND_LINK_FAIL:
3787                         bond_set_slave_link_state(slave, BOND_LINK_FAIL,
3788                                                   BOND_SLAVE_NOTIFY_NOW);
3789                         bond_set_slave_inactive_flags(slave,
3790                                                       BOND_SLAVE_NOTIFY_NOW);
3791
3792                         /* A slave has just been enslaved and has become
3793                          * the current active slave.
3794                          */
3795                         if (rtnl_dereference(bond->curr_active_slave))
3796                                 RCU_INIT_POINTER(bond->current_arp_slave, NULL);
3797                         continue;
3798
3799                 default:
3800                         slave_err(bond->dev, slave->dev,
3801                                   "impossible: link_new_state %d on slave\n",
3802                                   slave->link_new_state);
3803                         continue;
3804                 }
3805         }
3806
3807         if (do_failover) {
3808                 block_netpoll_tx();
3809                 bond_select_active_slave(bond);
3810                 unblock_netpoll_tx();
3811         }
3812
3813         bond_set_carrier(bond);
3814 }
3815
3816 /* Send ARP probes for active-backup mode ARP monitor.
3817  *
3818  * Called with rcu_read_lock held.
3819  */
3820 static bool bond_ab_arp_probe(struct bonding *bond)
3821 {
3822         struct slave *slave, *before = NULL, *new_slave = NULL,
3823                      *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
3824                      *curr_active_slave = rcu_dereference(bond->curr_active_slave);
3825         struct list_head *iter;
3826         bool found = false;
3827         bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER;
3828
3829         if (curr_arp_slave && curr_active_slave)
3830                 netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n",
3831                             curr_arp_slave->dev->name,
3832                             curr_active_slave->dev->name);
3833
3834         if (curr_active_slave) {
3835                 bond_send_validate(bond, curr_active_slave);
3836                 return should_notify_rtnl;
3837         }
3838
3839         /* if we don't have a curr_active_slave, search for the next available
3840          * backup slave from the current_arp_slave and make it the candidate
3841          * for becoming the curr_active_slave
3842          */
3843
3844         if (!curr_arp_slave) {
3845                 curr_arp_slave = bond_first_slave_rcu(bond);
3846                 if (!curr_arp_slave)
3847                         return should_notify_rtnl;
3848         }
3849
3850         bond_for_each_slave_rcu(bond, slave, iter) {
3851                 if (!found && !before && bond_slave_is_up(slave))
3852                         before = slave;
3853
3854                 if (found && !new_slave && bond_slave_is_up(slave))
3855                         new_slave = slave;
3856                 /* if the link state is up at this point, we
3857                  * mark it down - this can happen if we have
3858                  * simultaneous link failures and
3859                  * reselect_active_interface doesn't make this
3860                  * one the current slave so it is still marked
3861                  * up when it is actually down
3862                  */
3863                 if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
3864                         bond_set_slave_link_state(slave, BOND_LINK_DOWN,
3865                                                   BOND_SLAVE_NOTIFY_LATER);
3866                         if (slave->link_failure_count < UINT_MAX)
3867                                 slave->link_failure_count++;
3868
3869                         bond_set_slave_inactive_flags(slave,
3870                                                       BOND_SLAVE_NOTIFY_LATER);
3871
3872                         slave_info(bond->dev, slave->dev, "backup interface is now down\n");
3873                 }
3874                 if (slave == curr_arp_slave)
3875                         found = true;
3876         }
3877
3878         if (!new_slave && before)
3879                 new_slave = before;
3880
3881         if (!new_slave)
3882                 goto check_state;
3883
3884         bond_set_slave_link_state(new_slave, BOND_LINK_BACK,
3885                                   BOND_SLAVE_NOTIFY_LATER);
3886         bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER);
3887         bond_send_validate(bond, new_slave);
3888         new_slave->last_link_up = jiffies;
3889         rcu_assign_pointer(bond->current_arp_slave, new_slave);
3890
3891 check_state:
3892         bond_for_each_slave_rcu(bond, slave, iter) {
3893                 if (slave->should_notify || slave->should_notify_link) {
3894                         should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW;
3895                         break;
3896                 }
3897         }
3898         return should_notify_rtnl;
3899 }
3900
3901 static void bond_activebackup_arp_mon(struct bonding *bond)
3902 {
3903         bool should_notify_peers = false;
3904         bool should_notify_rtnl = false;
3905         int delta_in_ticks;
3906
3907         delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);
3908
3909         if (!bond_has_slaves(bond))
3910                 goto re_arm;
3911
3912         rcu_read_lock();
3913
3914         should_notify_peers = bond_should_notify_peers(bond);
3915
3916         if (bond_ab_arp_inspect(bond)) {
3917                 rcu_read_unlock();
3918
3919                 /* Race avoidance with bond_close flush of workqueue */
3920                 if (!rtnl_trylock()) {
3921                         delta_in_ticks = 1;
3922                         should_notify_peers = false;
3923                         goto re_arm;
3924                 }
3925
3926                 bond_ab_arp_commit(bond);
3927
3928                 rtnl_unlock();
3929                 rcu_read_lock();
3930         }
3931
3932         should_notify_rtnl = bond_ab_arp_probe(bond);
3933         rcu_read_unlock();
3934
3935 re_arm:
3936         if (bond->params.arp_interval)
3937                 queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);
3938
3939         if (should_notify_peers || should_notify_rtnl) {
3940                 if (!rtnl_trylock())
3941                         return;
3942
3943                 if (should_notify_peers) {
3944                         bond->send_peer_notif--;
3945                         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
3946                                                  bond->dev);
3947                 }
3948                 if (should_notify_rtnl) {
3949                         bond_slave_state_notify(bond);
3950                         bond_slave_link_notify(bond);
3951                 }
3952
3953                 rtnl_unlock();
3954         }
3955 }
3956
3957 static void bond_arp_monitor(struct work_struct *work)
3958 {
3959         struct bonding *bond = container_of(work, struct bonding,
3960                                             arp_work.work);
3961
3962         if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
3963                 bond_activebackup_arp_mon(bond);
3964         else
3965                 bond_loadbalance_arp_mon(bond);
3966 }
3967
3968 /*-------------------------- netdev event handling --------------------------*/
3969
/* Change device name.
 *
 * Drops and re-creates the bond's proc entry and re-registers its debug
 * entry; presumably both are keyed on the device name -- confirm against
 * the helpers.  Always returns NOTIFY_DONE.
 */
static int bond_event_changename(struct bonding *bond)
{
        bond_remove_proc_entry(bond);
        bond_create_proc_entry(bond);

        bond_debug_reregister(bond);

        return NOTIFY_DONE;
}
3980
3981 static int bond_master_netdev_event(unsigned long event,
3982                                     struct net_device *bond_dev)
3983 {
3984         struct bonding *event_bond = netdev_priv(bond_dev);
3985
3986         netdev_dbg(bond_dev, "%s called\n", __func__);
3987
3988         switch (event) {
3989         case NETDEV_CHANGENAME:
3990                 return bond_event_changename(event_bond);
3991         case NETDEV_UNREGISTER:
3992                 bond_remove_proc_entry(event_bond);
3993 #ifdef CONFIG_XFRM_OFFLOAD
3994                 xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true);
3995 #endif /* CONFIG_XFRM_OFFLOAD */
3996                 break;
3997         case NETDEV_REGISTER:
3998                 bond_create_proc_entry(event_bond);
3999                 break;
4000         default:
4001                 break;
4002         }
4003
4004         return NOTIFY_DONE;
4005 }
4006
4007 static int bond_slave_netdev_event(unsigned long event,
4008                                    struct net_device *slave_dev)
4009 {
4010         struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary;
4011         struct bonding *bond;
4012         struct net_device *bond_dev;
4013
4014         /* A netdev event can be generated while enslaving a device
4015          * before netdev_rx_handler_register is called in which case
4016          * slave will be NULL
4017          */
4018         if (!slave) {
4019                 netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__);
4020                 return NOTIFY_DONE;
4021         }
4022
4023         bond_dev = slave->bond->dev;
4024         bond = slave->bond;
4025         primary = rtnl_dereference(bond->primary_slave);
4026
4027         slave_dbg(bond_dev, slave_dev, "%s called\n", __func__);
4028
4029         switch (event) {
4030         case NETDEV_UNREGISTER:
4031                 if (bond_dev->type != ARPHRD_ETHER)
4032                         bond_release_and_destroy(bond_dev, slave_dev);
4033                 else
4034                         __bond_release_one(bond_dev, slave_dev, false, true);
4035                 break;
4036         case NETDEV_UP:
4037         case NETDEV_CHANGE:
4038                 /* For 802.3ad mode only:
4039                  * Getting invalid Speed/Duplex values here will put slave
4040                  * in weird state. Mark it as link-fail if the link was
4041                  * previously up or link-down if it hasn't yet come up, and
4042                  * let link-monitoring (miimon) set it right when correct
4043                  * speeds/duplex are available.
4044                  */
4045                 if (bond_update_speed_duplex(slave) &&
4046                     BOND_MODE(bond) == BOND_MODE_8023AD) {
4047                         if (slave->last_link_up)
4048                                 slave->link = BOND_LINK_FAIL;
4049                         else
4050                                 slave->link = BOND_LINK_DOWN;
4051                 }
4052
4053                 if (BOND_MODE(bond) == BOND_MODE_8023AD)
4054                         bond_3ad_adapter_speed_duplex_changed(slave);
4055                 fallthrough;
4056         case NETDEV_DOWN:
4057                 /* Refresh slave-array if applicable!
4058                  * If the setup does not use miimon or arpmon (mode-specific!),
4059                  * then these events will not cause the slave-array to be
4060                  * refreshed. This will cause xmit to use a slave that is not
4061                  * usable. Avoid such situation by refeshing the array at these
4062                  * events. If these (miimon/arpmon) parameters are configured
4063                  * then array gets refreshed twice and that should be fine!
4064                  */
4065                 if (bond_mode_can_use_xmit_hash(bond))
4066                         bond_update_slave_arr(bond, NULL);
4067                 break;
4068         case NETDEV_CHANGEMTU:
4069                 /* TODO: Should slaves be allowed to
4070                  * independently alter their MTU?  For
4071                  * an active-backup bond, slaves need
4072                  * not be the same type of device, so
4073                  * MTUs may vary.  For other modes,
4074                  * slaves arguably should have the
4075                  * same MTUs. To do this, we'd need to
4076                  * take over the slave's change_mtu
4077                  * function for the duration of their
4078                  * servitude.
4079                  */
4080                 break;
4081         case NETDEV_CHANGENAME:
4082                 /* we don't care if we don't have primary set */
4083                 if (!bond_uses_primary(bond) ||
4084                     !bond->params.primary[0])
4085                         break;
4086
4087                 if (slave == primary) {
4088                         /* slave's name changed - he's no longer primary */
4089                         RCU_INIT_POINTER(bond->primary_slave, NULL);
4090                 } else if (!strcmp(slave_dev->name, bond->params.primary)) {
4091                         /* we have a new primary slave */
4092                         rcu_assign_pointer(bond->primary_slave, slave);
4093                 } else { /* we didn't change primary - exit */
4094                         break;
4095                 }
4096
4097                 netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n",
4098                             primary ? slave_dev->name : "none");
4099
4100                 block_netpoll_tx();
4101                 bond_select_active_slave(bond);
4102                 unblock_netpoll_tx();
4103                 break;
4104         case NETDEV_FEAT_CHANGE:
4105                 if (!bond->notifier_ctx) {
4106                         bond->notifier_ctx = true;
4107                         bond_compute_features(bond);
4108                         bond->notifier_ctx = false;
4109                 }
4110                 break;
4111         case NETDEV_RESEND_IGMP:
4112                 /* Propagate to master device */
4113                 call_netdevice_notifiers(event, slave->bond->dev);
4114                 break;
4115         case NETDEV_XDP_FEAT_CHANGE:
4116                 bond_xdp_set_features(bond_dev);
4117                 break;
4118         default:
4119                 break;
4120         }
4121
4122         return NOTIFY_DONE;
4123 }
4124
4125 /* bond_netdev_event: handle netdev notifier chain events.
4126  *
4127  * This function receives events for the netdev chain.  The caller (an
4128  * ioctl handler calling blocking_notifier_call_chain) holds the necessary
4129  * locks for us to safely manipulate the slave devices (RTNL lock,
4130  * dev_probe_lock).
4131  */
4132 static int bond_netdev_event(struct notifier_block *this,
4133                              unsigned long event, void *ptr)
4134 {
4135         struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
4136
4137         netdev_dbg(event_dev, "%s received %s\n",
4138                    __func__, netdev_cmd_to_name(event));
4139
4140         if (!(event_dev->priv_flags & IFF_BONDING))
4141                 return NOTIFY_DONE;
4142
4143         if (event_dev->flags & IFF_MASTER) {
4144                 int ret;
4145
4146                 ret = bond_master_netdev_event(event, event_dev);
4147                 if (ret != NOTIFY_DONE)
4148                         return ret;
4149         }
4150
4151         if (event_dev->flags & IFF_SLAVE)
4152                 return bond_slave_netdev_event(event, event_dev);
4153
4154         return NOTIFY_DONE;
4155 }
4156
/* Notifier block that routes netdev chain events to bond_netdev_event() */
static struct notifier_block bond_netdev_notifier = {
        .notifier_call = bond_netdev_event,
};
4160
4161 /*---------------------------- Hashing Policies -----------------------------*/
4162
4163 /* Helper to access data in a packet, with or without a backing skb.
4164  * If skb is given the data is linearized if necessary via pskb_may_pull.
4165  */
4166 static inline const void *bond_pull_data(struct sk_buff *skb,
4167                                          const void *data, int hlen, int n)
4168 {
4169         if (likely(n <= hlen))
4170                 return data;
4171         else if (skb && likely(pskb_may_pull(skb, n)))
4172                 return skb->data;
4173
4174         return NULL;
4175 }
4176
4177 /* L2 hash helper */
4178 static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
4179 {
4180         struct ethhdr *ep;
4181
4182         data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
4183         if (!data)
4184                 return 0;
4185
4186         ep = (struct ethhdr *)(data + mhoff);
4187         return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto);
4188 }
4189
4190 static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data,
4191                          int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34)
4192 {
4193         const struct ipv6hdr *iph6;
4194         const struct iphdr *iph;
4195
4196         if (l2_proto == htons(ETH_P_IP)) {
4197                 data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph));
4198                 if (!data)
4199                         return false;
4200
4201                 iph = (const struct iphdr *)(data + *nhoff);
4202                 iph_to_flow_copy_v4addrs(fk, iph);
4203                 *nhoff += iph->ihl << 2;
4204                 if (!ip_is_fragment(iph))
4205                         *ip_proto = iph->protocol;
4206         } else if (l2_proto == htons(ETH_P_IPV6)) {
4207                 data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6));
4208                 if (!data)
4209                         return false;
4210
4211                 iph6 = (const struct ipv6hdr *)(data + *nhoff);
4212                 iph_to_flow_copy_v6addrs(fk, iph6);
4213                 *nhoff += sizeof(*iph6);
4214                 *ip_proto = iph6->nexthdr;
4215         } else {
4216                 return false;
4217         }
4218
4219         if (l34 && *ip_proto >= 0)
4220                 fk->ports.ports = __skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen);
4221
4222         return true;
4223 }
4224
4225 static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
4226 {
4227         u32 srcmac_vendor = 0, srcmac_dev = 0;
4228         struct ethhdr *mac_hdr;
4229         u16 vlan = 0;
4230         int i;
4231
4232         data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
4233         if (!data)
4234                 return 0;
4235         mac_hdr = (struct ethhdr *)(data + mhoff);
4236
4237         for (i = 0; i < 3; i++)
4238                 srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i];
4239
4240         for (i = 3; i < ETH_ALEN; i++)
4241                 srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i];
4242
4243         if (skb && skb_vlan_tag_present(skb))
4244                 vlan = skb_vlan_tag_get(skb);
4245
4246         return vlan ^ srcmac_vendor ^ srcmac_dev;
4247 }
4248
4249 /* Extract the appropriate headers based on bond's xmit policy */
4250 static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data,
4251                               __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk)
4252 {
4253         bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
4254         int ip_proto = -1;
4255
4256         switch (bond->params.xmit_policy) {
4257         case BOND_XMIT_POLICY_ENCAP23:
4258         case BOND_XMIT_POLICY_ENCAP34:
4259                 memset(fk, 0, sizeof(*fk));
4260                 return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
4261                                           fk, data, l2_proto, nhoff, hlen, 0);
4262         default:
4263                 break;
4264         }
4265
4266         fk->ports.ports = 0;
4267         memset(&fk->icmp, 0, sizeof(fk->icmp));
4268         if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34))
4269                 return false;
4270
4271         /* ICMP error packets contains at least 8 bytes of the header
4272          * of the packet which generated the error. Use this information
4273          * to correlate ICMP error packets within the same flow which
4274          * generated the error.
4275          */
4276         if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) {
4277                 skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen);
4278                 if (ip_proto == IPPROTO_ICMP) {
4279                         if (!icmp_is_err(fk->icmp.type))
4280                                 return true;
4281
4282                         nhoff += sizeof(struct icmphdr);
4283                 } else if (ip_proto == IPPROTO_ICMPV6) {
4284                         if (!icmpv6_is_err(fk->icmp.type))
4285                                 return true;
4286
4287                         nhoff += sizeof(struct icmp6hdr);
4288                 }
4289                 return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34);
4290         }
4291
4292         return true;
4293 }
4294
4295 static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy)
4296 {
4297         hash ^= (__force u32)flow_get_u32_dst(flow) ^
4298                 (__force u32)flow_get_u32_src(flow);
4299         hash ^= (hash >> 16);
4300         hash ^= (hash >> 8);
4301
4302         /* discard lowest hash bit to deal with the common even ports pattern */
4303         if (xmit_policy == BOND_XMIT_POLICY_LAYER34 ||
4304                 xmit_policy == BOND_XMIT_POLICY_ENCAP34)
4305                 return hash >> 1;
4306
4307         return hash;
4308 }
4309
4310 /* Generate hash based on xmit policy. If @skb is given it is used to linearize
4311  * the data as required, but this function can be used without it if the data is
4312  * known to be linear (e.g. with xdp_buff).
4313  */
4314 static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data,
4315                             __be16 l2_proto, int mhoff, int nhoff, int hlen)
4316 {
4317         struct flow_keys flow;
4318         u32 hash;
4319
4320         if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
4321                 return bond_vlan_srcmac_hash(skb, data, mhoff, hlen);
4322
4323         if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 ||
4324             !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow))
4325                 return bond_eth_hash(skb, data, mhoff, hlen);
4326
4327         if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
4328             bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
4329                 hash = bond_eth_hash(skb, data, mhoff, hlen);
4330         } else {
4331                 if (flow.icmp.id)
4332                         memcpy(&hash, &flow.icmp, sizeof(hash));
4333                 else
4334                         memcpy(&hash, &flow.ports.ports, sizeof(hash));
4335         }
4336
4337         return bond_ip_hash(hash, &flow, bond->params.xmit_policy);
4338 }
4339
4340 /**
4341  * bond_xmit_hash - generate a hash value based on the xmit policy
4342  * @bond: bonding device
4343  * @skb: buffer to use for headers
4344  *
4345  * This function will extract the necessary headers from the skb buffer and use
4346  * them to generate a hash based on the xmit_policy set in the bonding device
4347  */
4348 u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
4349 {
4350         if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
4351             skb->l4_hash)
4352                 return skb->hash;
4353
4354         return __bond_xmit_hash(bond, skb, skb->data, skb->protocol,
4355                                 0, skb_network_offset(skb),
4356                                 skb_headlen(skb));
4357 }
4358
4359 /**
4360  * bond_xmit_hash_xdp - generate a hash value based on the xmit policy
4361  * @bond: bonding device
4362  * @xdp: buffer to use for headers
4363  *
4364  * The XDP variant of bond_xmit_hash.
4365  */
4366 static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
4367 {
4368         struct ethhdr *eth;
4369
4370         if (xdp->data + sizeof(struct ethhdr) > xdp->data_end)
4371                 return 0;
4372
4373         eth = (struct ethhdr *)xdp->data;
4374
4375         return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
4376                                 sizeof(struct ethhdr), xdp->data_end - xdp->data);
4377 }
4378
4379 /*-------------------------- Device entry points ----------------------------*/
4380
/* Bind each of the bond's delayed work items to its handler function. */
void bond_work_init_all(struct bonding *bond)
{
        INIT_DELAYED_WORK(&bond->mcast_work,
                          bond_resend_igmp_join_requests_delayed);
        INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
        INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
        INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
        INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
        INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
}
4391
4392 static void bond_work_cancel_all(struct bonding *bond)
4393 {
4394         cancel_delayed_work_sync(&bond->mii_work);
4395         cancel_delayed_work_sync(&bond->arp_work);
4396         cancel_delayed_work_sync(&bond->alb_work);
4397         cancel_delayed_work_sync(&bond->ad_work);
4398         cancel_delayed_work_sync(&bond->mcast_work);
4399         cancel_delayed_work_sync(&bond->slave_arr_work);
4400 }
4401
4402 static int bond_open(struct net_device *bond_dev)
4403 {
4404         struct bonding *bond = netdev_priv(bond_dev);
4405         struct list_head *iter;
4406         struct slave *slave;
4407
4408         if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) {
4409                 bond->rr_tx_counter = alloc_percpu(u32);
4410                 if (!bond->rr_tx_counter)
4411                         return -ENOMEM;
4412         }
4413
4414         /* reset slave->backup and slave->inactive */
4415         if (bond_has_slaves(bond)) {
4416                 bond_for_each_slave(bond, slave, iter) {
4417                         if (bond_uses_primary(bond) &&
4418                             slave != rcu_access_pointer(bond->curr_active_slave)) {
4419                                 bond_set_slave_inactive_flags(slave,
4420                                                               BOND_SLAVE_NOTIFY_NOW);
4421                         } else if (BOND_MODE(bond) != BOND_MODE_8023AD) {
4422                                 bond_set_slave_active_flags(slave,
4423                                                             BOND_SLAVE_NOTIFY_NOW);
4424                         }
4425                 }
4426         }
4427
4428         if (bond_is_lb(bond)) {
4429                 /* bond_alb_initialize must be called before the timer
4430                  * is started.
4431                  */
4432                 if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
4433                         return -ENOMEM;
4434                 if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
4435                         queue_delayed_work(bond->wq, &bond->alb_work, 0);
4436         }
4437
4438         if (bond->params.miimon)  /* link check interval, in milliseconds. */
4439                 queue_delayed_work(bond->wq, &bond->mii_work, 0);
4440
4441         if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
4442                 queue_delayed_work(bond->wq, &bond->arp_work, 0);
4443                 bond->recv_probe = bond_rcv_validate;
4444         }
4445
4446         if (BOND_MODE(bond) == BOND_MODE_8023AD) {
4447                 queue_delayed_work(bond->wq, &bond->ad_work, 0);
4448                 /* register to receive LACPDUs */
4449                 bond->recv_probe = bond_3ad_lacpdu_recv;
4450                 bond_3ad_initiate_agg_selection(bond, 1);
4451
4452                 bond_for_each_slave(bond, slave, iter)
4453                         dev_mc_add(slave->dev, lacpdu_mcast_addr);
4454         }
4455
4456         if (bond_mode_can_use_xmit_hash(bond))
4457                 bond_update_slave_arr(bond, NULL);
4458
4459         return 0;
4460 }
4461
4462 static int bond_close(struct net_device *bond_dev)
4463 {
4464         struct bonding *bond = netdev_priv(bond_dev);
4465         struct slave *slave;
4466
4467         bond_work_cancel_all(bond);
4468         bond->send_peer_notif = 0;
4469         if (bond_is_lb(bond))
4470                 bond_alb_deinitialize(bond);
4471         bond->recv_probe = NULL;
4472
4473         if (bond_uses_primary(bond)) {
4474                 rcu_read_lock();
4475                 slave = rcu_dereference(bond->curr_active_slave);
4476                 if (slave)
4477                         bond_hw_addr_flush(bond_dev, slave->dev);
4478                 rcu_read_unlock();
4479         } else {
4480                 struct list_head *iter;
4481
4482                 bond_for_each_slave(bond, slave, iter)
4483                         bond_hw_addr_flush(bond_dev, slave->dev);
4484         }
4485
4486         return 0;
4487 }
4488
4489 /* fold stats, assuming all rtnl_link_stats64 fields are u64, but
4490  * that some drivers can provide 32bit values only.
4491  */
4492 static void bond_fold_stats(struct rtnl_link_stats64 *_res,
4493                             const struct rtnl_link_stats64 *_new,
4494                             const struct rtnl_link_stats64 *_old)
4495 {
4496         const u64 *new = (const u64 *)_new;
4497         const u64 *old = (const u64 *)_old;
4498         u64 *res = (u64 *)_res;
4499         int i;
4500
4501         for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
4502                 u64 nv = new[i];
4503                 u64 ov = old[i];
4504                 s64 delta = nv - ov;
4505
4506                 /* detects if this particular field is 32bit only */
4507                 if (((nv | ov) >> 32) == 0)
4508                         delta = (s64)(s32)((u32)nv - (u32)ov);
4509
4510                 /* filter anomalies, some drivers reset their stats
4511                  * at down/up events.
4512                  */
4513                 if (delta > 0)
4514                         res[i] += delta;
4515         }
4516 }
4517
#ifdef CONFIG_LOCKDEP
/* Walk the lower-device adjacency of @dev depth-first, using explicit
 * stacks instead of recursion, and return the deepest nesting level
 * reached (0 when @dev has no lower devices).
 */
static int bond_get_lowest_level_rcu(struct net_device *dev)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *iter = &dev->adj_list.lower;
        struct net_device *now = dev;
        struct net_device *lower;
        int depth = 0, deepest = 0;

        for (;;) {
                lower = netdev_next_lower_dev_rcu(now, &iter);
                if (lower) {
                        /* descend: remember where to resume in the parent */
                        dev_stack[depth] = now;
                        iter_stack[depth++] = iter;
                        if (depth > deepest)
                                deepest = depth;

                        now = lower;
                        iter = &lower->adj_list.lower;
                        continue;
                }

                /* this level is exhausted; done once back at the root */
                if (!depth)
                        return deepest;

                now = dev_stack[--depth];
                iter = iter_stack[depth];
        }
}
#endif
4558
4559 static void bond_get_stats(struct net_device *bond_dev,
4560                            struct rtnl_link_stats64 *stats)
4561 {
4562         struct bonding *bond = netdev_priv(bond_dev);
4563         struct rtnl_link_stats64 temp;
4564         struct list_head *iter;
4565         struct slave *slave;
4566         int nest_level = 0;
4567
4568
4569         rcu_read_lock();
4570 #ifdef CONFIG_LOCKDEP
4571         nest_level = bond_get_lowest_level_rcu(bond_dev);
4572 #endif
4573
4574         spin_lock_nested(&bond->stats_lock, nest_level);
4575         memcpy(stats, &bond->bond_stats, sizeof(*stats));
4576
4577         bond_for_each_slave_rcu(bond, slave, iter) {
4578                 const struct rtnl_link_stats64 *new =
4579                         dev_get_stats(slave->dev, &temp);
4580
4581                 bond_fold_stats(stats, new, &slave->slave_stats);
4582
4583                 /* save off the slave stats for the next run */
4584                 memcpy(&slave->slave_stats, new, sizeof(*new));
4585         }
4586
4587         memcpy(&bond->bond_stats, stats, sizeof(*stats));
4588         spin_unlock(&bond->stats_lock);
4589         rcu_read_unlock();
4590 }
4591
4592 static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
4593 {
4594         struct bonding *bond = netdev_priv(bond_dev);
4595         struct mii_ioctl_data *mii = NULL;
4596
4597         netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd);
4598
4599         switch (cmd) {
4600         case SIOCGMIIPHY:
4601                 mii = if_mii(ifr);
4602                 if (!mii)
4603                         return -EINVAL;
4604
4605                 mii->phy_id = 0;
4606                 fallthrough;
4607         case SIOCGMIIREG:
4608                 /* We do this again just in case we were called by SIOCGMIIREG
4609                  * instead of SIOCGMIIPHY.
4610                  */
4611                 mii = if_mii(ifr);
4612                 if (!mii)
4613                         return -EINVAL;
4614
4615                 if (mii->reg_num == 1) {
4616                         mii->val_out = 0;
4617                         if (netif_carrier_ok(bond->dev))
4618                                 mii->val_out = BMSR_LSTATUS;
4619                 }
4620
4621                 break;
4622         default:
4623                 return -EOPNOTSUPP;
4624         }
4625
4626         return 0;
4627 }
4628
4629 static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
4630 {
4631         struct bonding *bond = netdev_priv(bond_dev);
4632         struct net_device *slave_dev = NULL;
4633         struct ifbond k_binfo;
4634         struct ifbond __user *u_binfo = NULL;
4635         struct ifslave k_sinfo;
4636         struct ifslave __user *u_sinfo = NULL;
4637         struct bond_opt_value newval;
4638         struct net *net;
4639         int res = 0;
4640
4641         netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);
4642
4643         switch (cmd) {
4644         case SIOCBONDINFOQUERY:
4645                 u_binfo = (struct ifbond __user *)ifr->ifr_data;
4646
4647                 if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond)))
4648                         return -EFAULT;
4649
4650                 bond_info_query(bond_dev, &k_binfo);
4651                 if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond)))
4652                         return -EFAULT;
4653
4654                 return 0;
4655         case SIOCBONDSLAVEINFOQUERY:
4656                 u_sinfo = (struct ifslave __user *)ifr->ifr_data;
4657
4658                 if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave)))
4659                         return -EFAULT;
4660
4661                 res = bond_slave_info_query(bond_dev, &k_sinfo);
4662                 if (res == 0 &&
4663                     copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave)))
4664                         return -EFAULT;
4665
4666                 return res;
4667         default:
4668                 break;
4669         }
4670
4671         net = dev_net(bond_dev);
4672
4673         if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4674                 return -EPERM;
4675
4676         slave_dev = __dev_get_by_name(net, ifr->ifr_slave);
4677
4678         slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev);
4679
4680         if (!slave_dev)
4681                 return -ENODEV;
4682
4683         switch (cmd) {
4684         case SIOCBONDENSLAVE:
4685                 res = bond_enslave(bond_dev, slave_dev, NULL);
4686                 break;
4687         case SIOCBONDRELEASE:
4688                 res = bond_release(bond_dev, slave_dev);
4689                 break;
4690         case SIOCBONDSETHWADDR:
4691                 res = bond_set_dev_addr(bond_dev, slave_dev);
4692                 break;
4693         case SIOCBONDCHANGEACTIVE:
4694                 bond_opt_initstr(&newval, slave_dev->name);
4695                 res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE,
4696                                             &newval);
4697                 break;
4698         default:
4699                 res = -EOPNOTSUPP;
4700         }
4701
4702         return res;
4703 }
4704
4705 static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr,
4706                                void __user *data, int cmd)
4707 {
4708         struct ifreq ifrdata = { .ifr_data = data };
4709
4710         switch (cmd) {
4711         case BOND_INFO_QUERY_OLD:
4712                 return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDINFOQUERY);
4713         case BOND_SLAVE_INFO_QUERY_OLD:
4714                 return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDSLAVEINFOQUERY);
4715         case BOND_ENSLAVE_OLD:
4716                 return bond_do_ioctl(bond_dev, ifr, SIOCBONDENSLAVE);
4717         case BOND_RELEASE_OLD:
4718                 return bond_do_ioctl(bond_dev, ifr, SIOCBONDRELEASE);
4719         case BOND_SETHWADDR_OLD:
4720                 return bond_do_ioctl(bond_dev, ifr, SIOCBONDSETHWADDR);
4721         case BOND_CHANGE_ACTIVE_OLD:
4722                 return bond_do_ioctl(bond_dev, ifr, SIOCBONDCHANGEACTIVE);
4723         }
4724
4725         return -EOPNOTSUPP;
4726 }
4727
4728 static void bond_change_rx_flags(struct net_device *bond_dev, int change)
4729 {
4730         struct bonding *bond = netdev_priv(bond_dev);
4731
4732         if (change & IFF_PROMISC)
4733                 bond_set_promiscuity(bond,
4734                                      bond_dev->flags & IFF_PROMISC ? 1 : -1);
4735
4736         if (change & IFF_ALLMULTI)
4737                 bond_set_allmulti(bond,
4738                                   bond_dev->flags & IFF_ALLMULTI ? 1 : -1);
4739 }
4740
4741 static void bond_set_rx_mode(struct net_device *bond_dev)
4742 {
4743         struct bonding *bond = netdev_priv(bond_dev);
4744         struct list_head *iter;
4745         struct slave *slave;
4746
4747         rcu_read_lock();
4748         if (bond_uses_primary(bond)) {
4749                 slave = rcu_dereference(bond->curr_active_slave);
4750                 if (slave) {
4751                         dev_uc_sync(slave->dev, bond_dev);
4752                         dev_mc_sync(slave->dev, bond_dev);
4753                 }
4754         } else {
4755                 bond_for_each_slave_rcu(bond, slave, iter) {
4756                         dev_uc_sync_multiple(slave->dev, bond_dev);
4757                         dev_mc_sync_multiple(slave->dev, bond_dev);
4758                 }
4759         }
4760         rcu_read_unlock();
4761 }
4762
4763 static int bond_neigh_init(struct neighbour *n)
4764 {
4765         struct bonding *bond = netdev_priv(n->dev);
4766         const struct net_device_ops *slave_ops;
4767         struct neigh_parms parms;
4768         struct slave *slave;
4769         int ret = 0;
4770
4771         rcu_read_lock();
4772         slave = bond_first_slave_rcu(bond);
4773         if (!slave)
4774                 goto out;
4775         slave_ops = slave->dev->netdev_ops;
4776         if (!slave_ops->ndo_neigh_setup)
4777                 goto out;
4778
4779         /* TODO: find another way [1] to implement this.
4780          * Passing a zeroed structure is fragile,
4781          * but at least we do not pass garbage.
4782          *
4783          * [1] One way would be that ndo_neigh_setup() never touch
4784          *     struct neigh_parms, but propagate the new neigh_setup()
4785          *     back to ___neigh_create() / neigh_parms_alloc()
4786          */
4787         memset(&parms, 0, sizeof(parms));
4788         ret = slave_ops->ndo_neigh_setup(slave->dev, &parms);
4789
4790         if (ret)
4791                 goto out;
4792
4793         if (parms.neigh_setup)
4794                 ret = parms.neigh_setup(n);
4795 out:
4796         rcu_read_unlock();
4797         return ret;
4798 }
4799
4800 /* The bonding ndo_neigh_setup is called at init time beofre any
4801  * slave exists. So we must declare proxy setup function which will
4802  * be used at run time to resolve the actual slave neigh param setup.
4803  *
4804  * It's also called by master devices (such as vlans) to setup their
4805  * underlying devices. In that case - do nothing, we're already set up from
4806  * our init.
4807  */
4808 static int bond_neigh_setup(struct net_device *dev,
4809                             struct neigh_parms *parms)
4810 {
4811         /* modify only our neigh_parms */
4812         if (parms->dev == dev)
4813                 parms->neigh_setup = bond_neigh_init;
4814
4815         return 0;
4816 }
4817
4818 /* Change the MTU of all of a master's slaves to match the master */
4819 static int bond_change_mtu(struct net_device *bond_dev, int new_mtu)
4820 {
4821         struct bonding *bond = netdev_priv(bond_dev);
4822         struct slave *slave, *rollback_slave;
4823         struct list_head *iter;
4824         int res = 0;
4825
4826         netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu);
4827
4828         bond_for_each_slave(bond, slave, iter) {
4829                 slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n",
4830                            slave, slave->dev->netdev_ops->ndo_change_mtu);
4831
4832                 res = dev_set_mtu(slave->dev, new_mtu);
4833
4834                 if (res) {
4835                         /* If we failed to set the slave's mtu to the new value
4836                          * we must abort the operation even in ACTIVE_BACKUP
4837                          * mode, because if we allow the backup slaves to have
4838                          * different mtu values than the active slave we'll
4839                          * need to change their mtu when doing a failover. That
4840                          * means changing their mtu from timer context, which
4841                          * is probably not a good idea.
4842                          */
4843                         slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n",
4844                                   res, new_mtu);
4845                         goto unwind;
4846                 }
4847         }
4848
4849         WRITE_ONCE(bond_dev->mtu, new_mtu);
4850
4851         return 0;
4852
4853 unwind:
4854         /* unwind from head to the slave that failed */
4855         bond_for_each_slave(bond, rollback_slave, iter) {
4856                 int tmp_res;
4857
4858                 if (rollback_slave == slave)
4859                         break;
4860
4861                 tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu);
4862                 if (tmp_res)
4863                         slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n",
4864                                   tmp_res);
4865         }
4866
4867         return res;
4868 }
4869
4870 /* Change HW address
4871  *
4872  * Note that many devices must be down to change the HW address, and
4873  * downing the master releases all slaves.  We can make bonds full of
4874  * bonding devices to test this, however.
4875  */
4876 static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
4877 {
4878         struct bonding *bond = netdev_priv(bond_dev);
4879         struct slave *slave, *rollback_slave;
4880         struct sockaddr_storage *ss = addr, tmp_ss;
4881         struct list_head *iter;
4882         int res = 0;
4883
4884         if (BOND_MODE(bond) == BOND_MODE_ALB)
4885                 return bond_alb_set_mac_address(bond_dev, addr);
4886
4887
4888         netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond);
4889
4890         /* If fail_over_mac is enabled, do nothing and return success.
4891          * Returning an error causes ifenslave to fail.
4892          */
4893         if (bond->params.fail_over_mac &&
4894             BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
4895                 return 0;
4896
4897         if (!is_valid_ether_addr(ss->__data))
4898                 return -EADDRNOTAVAIL;
4899
4900         bond_for_each_slave(bond, slave, iter) {
4901                 slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n",
4902                           __func__, slave);
4903                 res = dev_set_mac_address(slave->dev, addr, NULL);
4904                 if (res) {
4905                         /* TODO: consider downing the slave
4906                          * and retry ?
4907                          * User should expect communications
4908                          * breakage anyway until ARP finish
4909                          * updating, so...
4910                          */
4911                         slave_dbg(bond_dev, slave->dev, "%s: err %d\n",
4912                                   __func__, res);
4913                         goto unwind;
4914                 }
4915         }
4916
4917         /* success */
4918         dev_addr_set(bond_dev, ss->__data);
4919         return 0;
4920
4921 unwind:
4922         memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
4923         tmp_ss.ss_family = bond_dev->type;
4924
4925         /* unwind from head to the slave that failed */
4926         bond_for_each_slave(bond, rollback_slave, iter) {
4927                 int tmp_res;
4928
4929                 if (rollback_slave == slave)
4930                         break;
4931
4932                 tmp_res = dev_set_mac_address(rollback_slave->dev,
4933                                               (struct sockaddr *)&tmp_ss, NULL);
4934                 if (tmp_res) {
4935                         slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n",
4936                                    __func__, tmp_res);
4937                 }
4938         }
4939
4940         return res;
4941 }
4942
4943 /**
4944  * bond_get_slave_by_id - get xmit slave with slave_id
4945  * @bond: bonding device that is transmitting
4946  * @slave_id: slave id up to slave_cnt-1 through which to transmit
4947  *
4948  * This function tries to get slave with slave_id but in case
4949  * it fails, it tries to find the first available slave for transmission.
4950  */
4951 static struct slave *bond_get_slave_by_id(struct bonding *bond,
4952                                           int slave_id)
4953 {
4954         struct list_head *iter;
4955         struct slave *slave;
4956         int i = slave_id;
4957
4958         /* Here we start from the slave with slave_id */
4959         bond_for_each_slave_rcu(bond, slave, iter) {
4960                 if (--i < 0) {
4961                         if (bond_slave_can_tx(slave))
4962                                 return slave;
4963                 }
4964         }
4965
4966         /* Here we start from the first slave up to slave_id */
4967         i = slave_id;
4968         bond_for_each_slave_rcu(bond, slave, iter) {
4969                 if (--i < 0)
4970                         break;
4971                 if (bond_slave_can_tx(slave))
4972                         return slave;
4973         }
4974         /* no slave that can tx has been found */
4975         return NULL;
4976 }
4977
4978 /**
4979  * bond_rr_gen_slave_id - generate slave id based on packets_per_slave
4980  * @bond: bonding device to use
4981  *
4982  * Based on the value of the bonding device's packets_per_slave parameter
4983  * this function generates a slave id, which is usually used as the next
4984  * slave to transmit through.
4985  */
4986 static u32 bond_rr_gen_slave_id(struct bonding *bond)
4987 {
4988         u32 slave_id;
4989         struct reciprocal_value reciprocal_packets_per_slave;
4990         int packets_per_slave = bond->params.packets_per_slave;
4991
4992         switch (packets_per_slave) {
4993         case 0:
4994                 slave_id = get_random_u32();
4995                 break;
4996         case 1:
4997                 slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
4998                 break;
4999         default:
5000                 reciprocal_packets_per_slave =
5001                         bond->params.reciprocal_packets_per_slave;
5002                 slave_id = this_cpu_inc_return(*bond->rr_tx_counter);
5003                 slave_id = reciprocal_divide(slave_id,
5004                                              reciprocal_packets_per_slave);
5005                 break;
5006         }
5007
5008         return slave_id;
5009 }
5010
5011 static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
5012                                                     struct sk_buff *skb)
5013 {
5014         struct slave *slave;
5015         int slave_cnt;
5016         u32 slave_id;
5017
5018         /* Start with the curr_active_slave that joined the bond as the
5019          * default for sending IGMP traffic.  For failover purposes one
5020          * needs to maintain some consistency for the interface that will
5021          * send the join/membership reports.  The curr_active_slave found
5022          * will send all of this type of traffic.
5023          */
5024         if (skb->protocol == htons(ETH_P_IP)) {
5025                 int noff = skb_network_offset(skb);
5026                 struct iphdr *iph;
5027
5028                 if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
5029                         goto non_igmp;
5030
5031                 iph = ip_hdr(skb);
5032                 if (iph->protocol == IPPROTO_IGMP) {
5033                         slave = rcu_dereference(bond->curr_active_slave);
5034                         if (slave)
5035                                 return slave;
5036                         return bond_get_slave_by_id(bond, 0);
5037                 }
5038         }
5039
5040 non_igmp:
5041         slave_cnt = READ_ONCE(bond->slave_cnt);
5042         if (likely(slave_cnt)) {
5043                 slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
5044                 return bond_get_slave_by_id(bond, slave_id);
5045         }
5046         return NULL;
5047 }
5048
5049 static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
5050                                                         struct xdp_buff *xdp)
5051 {
5052         struct slave *slave;
5053         int slave_cnt;
5054         u32 slave_id;
5055         const struct ethhdr *eth;
5056         void *data = xdp->data;
5057
5058         if (data + sizeof(struct ethhdr) > xdp->data_end)
5059                 goto non_igmp;
5060
5061         eth = (struct ethhdr *)data;
5062         data += sizeof(struct ethhdr);
5063
5064         /* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
5065         if (eth->h_proto == htons(ETH_P_IP)) {
5066                 const struct iphdr *iph;
5067
5068                 if (data + sizeof(struct iphdr) > xdp->data_end)
5069                         goto non_igmp;
5070
5071                 iph = (struct iphdr *)data;
5072
5073                 if (iph->protocol == IPPROTO_IGMP) {
5074                         slave = rcu_dereference(bond->curr_active_slave);
5075                         if (slave)
5076                                 return slave;
5077                         return bond_get_slave_by_id(bond, 0);
5078                 }
5079         }
5080
5081 non_igmp:
5082         slave_cnt = READ_ONCE(bond->slave_cnt);
5083         if (likely(slave_cnt)) {
5084                 slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
5085                 return bond_get_slave_by_id(bond, slave_id);
5086         }
5087         return NULL;
5088 }
5089
5090 static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
5091                                         struct net_device *bond_dev)
5092 {
5093         struct bonding *bond = netdev_priv(bond_dev);
5094         struct slave *slave;
5095
5096         slave = bond_xmit_roundrobin_slave_get(bond, skb);
5097         if (likely(slave))
5098                 return bond_dev_queue_xmit(bond, skb, slave->dev);
5099
5100         return bond_tx_drop(bond_dev, skb);
5101 }
5102
5103 static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond)
5104 {
5105         return rcu_dereference(bond->curr_active_slave);
5106 }
5107
5108 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
5109  * the bond has a usable interface.
5110  */
5111 static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
5112                                           struct net_device *bond_dev)
5113 {
5114         struct bonding *bond = netdev_priv(bond_dev);
5115         struct slave *slave;
5116
5117         slave = bond_xmit_activebackup_slave_get(bond);
5118         if (slave)
5119                 return bond_dev_queue_xmit(bond, skb, slave->dev);
5120
5121         return bond_tx_drop(bond_dev, skb);
5122 }
5123
5124 /* Use this to update slave_array when (a) it's not appropriate to update
5125  * slave_array right away (note that update_slave_array() may sleep)
5126  * and / or (b) RTNL is not held.
5127  */
5128 void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay)
5129 {
5130         queue_delayed_work(bond->wq, &bond->slave_arr_work, delay);
5131 }
5132
5133 /* Slave array work handler. Holds only RTNL */
5134 static void bond_slave_arr_handler(struct work_struct *work)
5135 {
5136         struct bonding *bond = container_of(work, struct bonding,
5137                                             slave_arr_work.work);
5138         int ret;
5139
5140         if (!rtnl_trylock())
5141                 goto err;
5142
5143         ret = bond_update_slave_arr(bond, NULL);
5144         rtnl_unlock();
5145         if (ret) {
5146                 pr_warn_ratelimited("Failed to update slave array from WT\n");
5147                 goto err;
5148         }
5149         return;
5150
5151 err:
5152         bond_slave_arr_work_rearm(bond, 1);
5153 }
5154
5155 static void bond_skip_slave(struct bond_up_slave *slaves,
5156                             struct slave *skipslave)
5157 {
5158         int idx;
5159
5160         /* Rare situation where caller has asked to skip a specific
5161          * slave but allocation failed (most likely!). BTW this is
5162          * only possible when the call is initiated from
5163          * __bond_release_one(). In this situation; overwrite the
5164          * skipslave entry in the array with the last entry from the
5165          * array to avoid a situation where the xmit path may choose
5166          * this to-be-skipped slave to send a packet out.
5167          */
5168         for (idx = 0; slaves && idx < slaves->count; idx++) {
5169                 if (skipslave == slaves->arr[idx]) {
5170                         slaves->arr[idx] =
5171                                 slaves->arr[slaves->count - 1];
5172                         slaves->count--;
5173                         break;
5174                 }
5175         }
5176 }
5177
5178 static void bond_set_slave_arr(struct bonding *bond,
5179                                struct bond_up_slave *usable_slaves,
5180                                struct bond_up_slave *all_slaves)
5181 {
5182         struct bond_up_slave *usable, *all;
5183
5184         usable = rtnl_dereference(bond->usable_slaves);
5185         rcu_assign_pointer(bond->usable_slaves, usable_slaves);
5186         kfree_rcu(usable, rcu);
5187
5188         all = rtnl_dereference(bond->all_slaves);
5189         rcu_assign_pointer(bond->all_slaves, all_slaves);
5190         kfree_rcu(all, rcu);
5191 }
5192
5193 static void bond_reset_slave_arr(struct bonding *bond)
5194 {
5195         bond_set_slave_arr(bond, NULL, NULL);
5196 }
5197
5198 /* Build the usable slaves array in control path for modes that use xmit-hash
5199  * to determine the slave interface -
5200  * (a) BOND_MODE_8023AD
5201  * (b) BOND_MODE_XOR
5202  * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
5203  *
5204  * The caller is expected to hold RTNL only and NO other lock!
5205  */
5206 int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
5207 {
5208         struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL;
5209         struct slave *slave;
5210         struct list_head *iter;
5211         int agg_id = 0;
5212         int ret = 0;
5213
5214         might_sleep();
5215
5216         usable_slaves = kzalloc(struct_size(usable_slaves, arr,
5217                                             bond->slave_cnt), GFP_KERNEL);
5218         all_slaves = kzalloc(struct_size(all_slaves, arr,
5219                                          bond->slave_cnt), GFP_KERNEL);
5220         if (!usable_slaves || !all_slaves) {
5221                 ret = -ENOMEM;
5222                 goto out;
5223         }
5224         if (BOND_MODE(bond) == BOND_MODE_8023AD) {
5225                 struct ad_info ad_info;
5226
5227                 spin_lock_bh(&bond->mode_lock);
5228                 if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
5229                         spin_unlock_bh(&bond->mode_lock);
5230                         pr_debug("bond_3ad_get_active_agg_info failed\n");
5231                         /* No active aggragator means it's not safe to use
5232                          * the previous array.
5233                          */
5234                         bond_reset_slave_arr(bond);
5235                         goto out;
5236                 }
5237                 spin_unlock_bh(&bond->mode_lock);
5238                 agg_id = ad_info.aggregator_id;
5239         }
5240         bond_for_each_slave(bond, slave, iter) {
5241                 if (skipslave == slave)
5242                         continue;
5243
5244                 all_slaves->arr[all_slaves->count++] = slave;
5245                 if (BOND_MODE(bond) == BOND_MODE_8023AD) {
5246                         struct aggregator *agg;
5247
5248                         agg = SLAVE_AD_INFO(slave)->port.aggregator;
5249                         if (!agg || agg->aggregator_identifier != agg_id)
5250                                 continue;
5251                 }
5252                 if (!bond_slave_can_tx(slave))
5253                         continue;
5254
5255                 slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n",
5256                           usable_slaves->count);
5257
5258                 usable_slaves->arr[usable_slaves->count++] = slave;
5259         }
5260
5261         bond_set_slave_arr(bond, usable_slaves, all_slaves);
5262         return ret;
5263 out:
5264         if (ret != 0 && skipslave) {
5265                 bond_skip_slave(rtnl_dereference(bond->all_slaves),
5266                                 skipslave);
5267                 bond_skip_slave(rtnl_dereference(bond->usable_slaves),
5268                                 skipslave);
5269         }
5270         kfree_rcu(all_slaves, rcu);
5271         kfree_rcu(usable_slaves, rcu);
5272
5273         return ret;
5274 }
5275
5276 static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
5277                                                  struct sk_buff *skb,
5278                                                  struct bond_up_slave *slaves)
5279 {
5280         struct slave *slave;
5281         unsigned int count;
5282         u32 hash;
5283
5284         hash = bond_xmit_hash(bond, skb);
5285         count = slaves ? READ_ONCE(slaves->count) : 0;
5286         if (unlikely(!count))
5287                 return NULL;
5288
5289         slave = slaves->arr[hash % count];
5290         return slave;
5291 }
5292
5293 static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
5294                                                      struct xdp_buff *xdp)
5295 {
5296         struct bond_up_slave *slaves;
5297         unsigned int count;
5298         u32 hash;
5299
5300         hash = bond_xmit_hash_xdp(bond, xdp);
5301         slaves = rcu_dereference(bond->usable_slaves);
5302         count = slaves ? READ_ONCE(slaves->count) : 0;
5303         if (unlikely(!count))
5304                 return NULL;
5305
5306         return slaves->arr[hash % count];
5307 }
5308
5309 /* Use this Xmit function for 3AD as well as XOR modes. The current
5310  * usable slave array is formed in the control path. The xmit function
5311  * just calculates hash and sends the packet out.
5312  */
5313 static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
5314                                      struct net_device *dev)
5315 {
5316         struct bonding *bond = netdev_priv(dev);
5317         struct bond_up_slave *slaves;
5318         struct slave *slave;
5319
5320         slaves = rcu_dereference(bond->usable_slaves);
5321         slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves);
5322         if (likely(slave))
5323                 return bond_dev_queue_xmit(bond, skb, slave->dev);
5324
5325         return bond_tx_drop(dev, skb);
5326 }
5327
5328 /* in broadcast mode, we send everything to all usable interfaces. */
5329 static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
5330                                        struct net_device *bond_dev)
5331 {
5332         struct bonding *bond = netdev_priv(bond_dev);
5333         struct slave *slave = NULL;
5334         struct list_head *iter;
5335         bool xmit_suc = false;
5336         bool skb_used = false;
5337
5338         bond_for_each_slave_rcu(bond, slave, iter) {
5339                 struct sk_buff *skb2;
5340
5341                 if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP))
5342                         continue;
5343
5344                 if (bond_is_last_slave(bond, slave)) {
5345                         skb2 = skb;
5346                         skb_used = true;
5347                 } else {
5348                         skb2 = skb_clone(skb, GFP_ATOMIC);
5349                         if (!skb2) {
5350                                 net_err_ratelimited("%s: Error: %s: skb_clone() failed\n",
5351                                                     bond_dev->name, __func__);
5352                                 continue;
5353                         }
5354                 }
5355
5356                 if (bond_dev_queue_xmit(bond, skb2, slave->dev) == NETDEV_TX_OK)
5357                         xmit_suc = true;
5358         }
5359
5360         if (!skb_used)
5361                 dev_kfree_skb_any(skb);
5362
5363         if (xmit_suc)
5364                 return NETDEV_TX_OK;
5365
5366         dev_core_stats_tx_dropped_inc(bond_dev);
5367         return NET_XMIT_DROP;
5368 }
5369
5370 /*------------------------- Device initialization ---------------------------*/
5371
5372 /* Lookup the slave that corresponds to a qid */
5373 static inline int bond_slave_override(struct bonding *bond,
5374                                       struct sk_buff *skb)
5375 {
5376         struct slave *slave = NULL;
5377         struct list_head *iter;
5378
5379         if (!skb_rx_queue_recorded(skb))
5380                 return 1;
5381
5382         /* Find out if any slaves have the same mapping as this skb. */
5383         bond_for_each_slave_rcu(bond, slave, iter) {
5384                 if (READ_ONCE(slave->queue_id) == skb_get_queue_mapping(skb)) {
5385                         if (bond_slave_is_up(slave) &&
5386                             slave->link == BOND_LINK_UP) {
5387                                 bond_dev_queue_xmit(bond, skb, slave->dev);
5388                                 return 0;
5389                         }
5390                         /* If the slave isn't UP, use default transmit policy. */
5391                         break;
5392                 }
5393         }
5394
5395         return 1;
5396 }
5397
5398
5399 static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
5400                              struct net_device *sb_dev)
5401 {
5402         /* This helper function exists to help dev_pick_tx get the correct
5403          * destination queue.  Using a helper function skips a call to
5404          * skb_tx_hash and will put the skbs in the queue we expect on their
5405          * way down to the bonding driver.
5406          */
5407         u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
5408
5409         /* Save the original txq to restore before passing to the driver */
5410         qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);
5411
5412         if (unlikely(txq >= dev->real_num_tx_queues)) {
5413                 do {
5414                         txq -= dev->real_num_tx_queues;
5415                 } while (txq >= dev->real_num_tx_queues);
5416         }
5417         return txq;
5418 }
5419
5420 static struct net_device *bond_xmit_get_slave(struct net_device *master_dev,
5421                                               struct sk_buff *skb,
5422                                               bool all_slaves)
5423 {
5424         struct bonding *bond = netdev_priv(master_dev);
5425         struct bond_up_slave *slaves;
5426         struct slave *slave = NULL;
5427
5428         switch (BOND_MODE(bond)) {
5429         case BOND_MODE_ROUNDROBIN:
5430                 slave = bond_xmit_roundrobin_slave_get(bond, skb);
5431                 break;
5432         case BOND_MODE_ACTIVEBACKUP:
5433                 slave = bond_xmit_activebackup_slave_get(bond);
5434                 break;
5435         case BOND_MODE_8023AD:
5436         case BOND_MODE_XOR:
5437                 if (all_slaves)
5438                         slaves = rcu_dereference(bond->all_slaves);
5439                 else
5440                         slaves = rcu_dereference(bond->usable_slaves);
5441                 slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves);
5442                 break;
5443         case BOND_MODE_BROADCAST:
5444                 break;
5445         case BOND_MODE_ALB:
5446                 slave = bond_xmit_alb_slave_get(bond, skb);
5447                 break;
5448         case BOND_MODE_TLB:
5449                 slave = bond_xmit_tlb_slave_get(bond, skb);
5450                 break;
5451         default:
5452                 /* Should never happen, mode already checked */
5453                 WARN_ONCE(true, "Unknown bonding mode");
5454                 break;
5455         }
5456
5457         if (slave)
5458                 return slave->dev;
5459         return NULL;
5460 }
5461
5462 static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow)
5463 {
5464         switch (sk->sk_family) {
5465 #if IS_ENABLED(CONFIG_IPV6)
5466         case AF_INET6:
5467                 if (ipv6_only_sock(sk) ||
5468                     ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) {
5469                         flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
5470                         flow->addrs.v6addrs.src = inet6_sk(sk)->saddr;
5471                         flow->addrs.v6addrs.dst = sk->sk_v6_daddr;
5472                         break;
5473                 }
5474                 fallthrough;
5475 #endif
5476         default: /* AF_INET */
5477                 flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
5478                 flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr;
5479                 flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr;
5480                 break;
5481         }
5482
5483         flow->ports.src = inet_sk(sk)->inet_sport;
5484         flow->ports.dst = inet_sk(sk)->inet_dport;
5485 }
5486
5487 /**
5488  * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields
5489  * @sk: socket to use for headers
5490  *
5491  * This function will extract the necessary field from the socket and use
5492  * them to generate a hash based on the LAYER34 xmit_policy.
5493  * Assumes that sk is a TCP or UDP socket.
5494  */
5495 static u32 bond_sk_hash_l34(struct sock *sk)
5496 {
5497         struct flow_keys flow;
5498         u32 hash;
5499
5500         bond_sk_to_flow(sk, &flow);
5501
5502         /* L4 */
5503         memcpy(&hash, &flow.ports.ports, sizeof(hash));
5504         /* L3 */
5505         return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34);
5506 }
5507
5508 static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond,
5509                                                   struct sock *sk)
5510 {
5511         struct bond_up_slave *slaves;
5512         struct slave *slave;
5513         unsigned int count;
5514         u32 hash;
5515
5516         slaves = rcu_dereference(bond->usable_slaves);
5517         count = slaves ? READ_ONCE(slaves->count) : 0;
5518         if (unlikely(!count))
5519                 return NULL;
5520
5521         hash = bond_sk_hash_l34(sk);
5522         slave = slaves->arr[hash % count];
5523
5524         return slave->dev;
5525 }
5526
5527 static struct net_device *bond_sk_get_lower_dev(struct net_device *dev,
5528                                                 struct sock *sk)
5529 {
5530         struct bonding *bond = netdev_priv(dev);
5531         struct net_device *lower = NULL;
5532
5533         rcu_read_lock();
5534         if (bond_sk_check(bond))
5535                 lower = __bond_sk_get_lower_dev(bond, sk);
5536         rcu_read_unlock();
5537
5538         return lower;
5539 }
5540
5541 #if IS_ENABLED(CONFIG_TLS_DEVICE)
5542 static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb,
5543                                         struct net_device *dev)
5544 {
5545         struct net_device *tls_netdev = rcu_dereference(tls_get_ctx(skb->sk)->netdev);
5546
5547         /* tls_netdev might become NULL, even if tls_is_skb_tx_device_offloaded
5548          * was true, if tls_device_down is running in parallel, but it's OK,
5549          * because bond_get_slave_by_dev has a NULL check.
5550          */
5551         if (likely(bond_get_slave_by_dev(bond, tls_netdev)))
5552                 return bond_dev_queue_xmit(bond, skb, tls_netdev);
5553         return bond_tx_drop(dev, skb);
5554 }
5555 #endif
5556
5557 static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
5558 {
5559         struct bonding *bond = netdev_priv(dev);
5560
5561         if (bond_should_override_tx_queue(bond) &&
5562             !bond_slave_override(bond, skb))
5563                 return NETDEV_TX_OK;
5564
5565 #if IS_ENABLED(CONFIG_TLS_DEVICE)
5566         if (tls_is_skb_tx_device_offloaded(skb))
5567                 return bond_tls_device_xmit(bond, skb, dev);
5568 #endif
5569
5570         switch (BOND_MODE(bond)) {
5571         case BOND_MODE_ROUNDROBIN:
5572                 return bond_xmit_roundrobin(skb, dev);
5573         case BOND_MODE_ACTIVEBACKUP:
5574                 return bond_xmit_activebackup(skb, dev);
5575         case BOND_MODE_8023AD:
5576         case BOND_MODE_XOR:
5577                 return bond_3ad_xor_xmit(skb, dev);
5578         case BOND_MODE_BROADCAST:
5579                 return bond_xmit_broadcast(skb, dev);
5580         case BOND_MODE_ALB:
5581                 return bond_alb_xmit(skb, dev);
5582         case BOND_MODE_TLB:
5583                 return bond_tlb_xmit(skb, dev);
5584         default:
5585                 /* Should never happen, mode already checked */
5586                 netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond));
5587                 WARN_ON_ONCE(1);
5588                 return bond_tx_drop(dev, skb);
5589         }
5590 }
5591
5592 static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
5593 {
5594         struct bonding *bond = netdev_priv(dev);
5595         netdev_tx_t ret = NETDEV_TX_OK;
5596
5597         /* If we risk deadlock from transmitting this in the
5598          * netpoll path, tell netpoll to queue the frame for later tx
5599          */
5600         if (unlikely(is_netpoll_tx_blocked(dev)))
5601                 return NETDEV_TX_BUSY;
5602
5603         rcu_read_lock();
5604         if (bond_has_slaves(bond))
5605                 ret = __bond_start_xmit(skb, dev);
5606         else
5607                 ret = bond_tx_drop(dev, skb);
5608         rcu_read_unlock();
5609
5610         return ret;
5611 }
5612
5613 static struct net_device *
5614 bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp)
5615 {
5616         struct bonding *bond = netdev_priv(bond_dev);
5617         struct slave *slave;
5618
5619         /* Caller needs to hold rcu_read_lock() */
5620
5621         switch (BOND_MODE(bond)) {
5622         case BOND_MODE_ROUNDROBIN:
5623                 slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp);
5624                 break;
5625
5626         case BOND_MODE_ACTIVEBACKUP:
5627                 slave = bond_xmit_activebackup_slave_get(bond);
5628                 break;
5629
5630         case BOND_MODE_8023AD:
5631         case BOND_MODE_XOR:
5632                 slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp);
5633                 break;
5634
5635         default:
5636                 if (net_ratelimit())
5637                         netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n",
5638                                    BOND_MODE(bond));
5639                 return NULL;
5640         }
5641
5642         if (slave)
5643                 return slave->dev;
5644
5645         return NULL;
5646 }
5647
5648 static int bond_xdp_xmit(struct net_device *bond_dev,
5649                          int n, struct xdp_frame **frames, u32 flags)
5650 {
5651         int nxmit, err = -ENXIO;
5652
5653         rcu_read_lock();
5654
5655         for (nxmit = 0; nxmit < n; nxmit++) {
5656                 struct xdp_frame *frame = frames[nxmit];
5657                 struct xdp_frame *frames1[] = {frame};
5658                 struct net_device *slave_dev;
5659                 struct xdp_buff xdp;
5660
5661                 xdp_convert_frame_to_buff(frame, &xdp);
5662
5663                 slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp);
5664                 if (!slave_dev) {
5665                         err = -ENXIO;
5666                         break;
5667                 }
5668
5669                 err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags);
5670                 if (err < 1)
5671                         break;
5672         }
5673
5674         rcu_read_unlock();
5675
5676         /* If error happened on the first frame then we can pass the error up, otherwise
5677          * report the number of frames that were xmitted.
5678          */
5679         if (err < 0)
5680                 return (nxmit == 0 ? err : nxmit);
5681
5682         return nxmit;
5683 }
5684
5685 static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
5686                         struct netlink_ext_ack *extack)
5687 {
5688         struct bonding *bond = netdev_priv(dev);
5689         struct list_head *iter;
5690         struct slave *slave, *rollback_slave;
5691         struct bpf_prog *old_prog;
5692         struct netdev_bpf xdp = {
5693                 .command = XDP_SETUP_PROG,
5694                 .flags   = 0,
5695                 .prog    = prog,
5696                 .extack  = extack,
5697         };
5698         int err;
5699
5700         ASSERT_RTNL();
5701
5702         if (!bond_xdp_check(bond)) {
5703                 BOND_NL_ERR(dev, extack,
5704                             "No native XDP support for the current bonding mode");
5705                 return -EOPNOTSUPP;
5706         }
5707
5708         old_prog = bond->xdp_prog;
5709         bond->xdp_prog = prog;
5710
5711         bond_for_each_slave(bond, slave, iter) {
5712                 struct net_device *slave_dev = slave->dev;
5713
5714                 if (!slave_dev->netdev_ops->ndo_bpf ||
5715                     !slave_dev->netdev_ops->ndo_xdp_xmit) {
5716                         SLAVE_NL_ERR(dev, slave_dev, extack,
5717                                      "Slave device does not support XDP");
5718                         err = -EOPNOTSUPP;
5719                         goto err;
5720                 }
5721
5722                 if (dev_xdp_prog_count(slave_dev) > 0) {
5723                         SLAVE_NL_ERR(dev, slave_dev, extack,
5724                                      "Slave has XDP program loaded, please unload before enslaving");
5725                         err = -EOPNOTSUPP;
5726                         goto err;
5727                 }
5728
5729                 err = dev_xdp_propagate(slave_dev, &xdp);
5730                 if (err < 0) {
5731                         /* ndo_bpf() sets extack error message */
5732                         slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err);
5733                         goto err;
5734                 }
5735                 if (prog)
5736                         bpf_prog_inc(prog);
5737         }
5738
5739         if (prog) {
5740                 static_branch_inc(&bpf_master_redirect_enabled_key);
5741         } else if (old_prog) {
5742                 bpf_prog_put(old_prog);
5743                 static_branch_dec(&bpf_master_redirect_enabled_key);
5744         }
5745
5746         return 0;
5747
5748 err:
5749         /* unwind the program changes */
5750         bond->xdp_prog = old_prog;
5751         xdp.prog = old_prog;
5752         xdp.extack = NULL; /* do not overwrite original error */
5753
5754         bond_for_each_slave(bond, rollback_slave, iter) {
5755                 struct net_device *slave_dev = rollback_slave->dev;
5756                 int err_unwind;
5757
5758                 if (slave == rollback_slave)
5759                         break;
5760
5761                 err_unwind = dev_xdp_propagate(slave_dev, &xdp);
5762                 if (err_unwind < 0)
5763                         slave_err(dev, slave_dev,
5764                                   "Error %d when unwinding XDP program change\n", err_unwind);
5765                 else if (xdp.prog)
5766                         bpf_prog_inc(xdp.prog);
5767         }
5768         return err;
5769 }
5770
5771 static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp)
5772 {
5773         switch (xdp->command) {
5774         case XDP_SETUP_PROG:
5775                 return bond_xdp_set(dev, xdp->prog, xdp->extack);
5776         default:
5777                 return -EINVAL;
5778         }
5779 }
5780
5781 static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
5782 {
5783         if (speed == 0 || speed == SPEED_UNKNOWN)
5784                 speed = slave->speed;
5785         else
5786                 speed = min(speed, slave->speed);
5787
5788         return speed;
5789 }
5790
5791 /* Set the BOND_PHC_INDEX flag to notify user space */
5792 static int bond_set_phc_index_flag(struct kernel_hwtstamp_config *kernel_cfg)
5793 {
5794         struct ifreq *ifr = kernel_cfg->ifr;
5795         struct hwtstamp_config cfg;
5796
5797         if (kernel_cfg->copied_to_user) {
5798                 /* Lower device has a legacy implementation */
5799                 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
5800                         return -EFAULT;
5801
5802                 cfg.flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX;
5803                 if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
5804                         return -EFAULT;
5805         } else {
5806                 kernel_cfg->flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX;
5807         }
5808
5809         return 0;
5810 }
5811
5812 static int bond_hwtstamp_get(struct net_device *dev,
5813                              struct kernel_hwtstamp_config *cfg)
5814 {
5815         struct bonding *bond = netdev_priv(dev);
5816         struct net_device *real_dev;
5817         int err;
5818
5819         real_dev = bond_option_active_slave_get_rcu(bond);
5820         if (!real_dev)
5821                 return -EOPNOTSUPP;
5822
5823         err = generic_hwtstamp_get_lower(real_dev, cfg);
5824         if (err)
5825                 return err;
5826
5827         return bond_set_phc_index_flag(cfg);
5828 }
5829
5830 static int bond_hwtstamp_set(struct net_device *dev,
5831                              struct kernel_hwtstamp_config *cfg,
5832                              struct netlink_ext_ack *extack)
5833 {
5834         struct bonding *bond = netdev_priv(dev);
5835         struct net_device *real_dev;
5836         int err;
5837
5838         if (!(cfg->flags & HWTSTAMP_FLAG_BONDED_PHC_INDEX))
5839                 return -EOPNOTSUPP;
5840
5841         real_dev = bond_option_active_slave_get_rcu(bond);
5842         if (!real_dev)
5843                 return -EOPNOTSUPP;
5844
5845         err = generic_hwtstamp_set_lower(real_dev, cfg, extack);
5846         if (err)
5847                 return err;
5848
5849         return bond_set_phc_index_flag(cfg);
5850 }
5851
5852 static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
5853                                            struct ethtool_link_ksettings *cmd)
5854 {
5855         struct bonding *bond = netdev_priv(bond_dev);
5856         struct list_head *iter;
5857         struct slave *slave;
5858         u32 speed = 0;
5859
5860         cmd->base.duplex = DUPLEX_UNKNOWN;
5861         cmd->base.port = PORT_OTHER;
5862
5863         /* Since bond_slave_can_tx returns false for all inactive or down slaves, we
5864          * do not need to check mode.  Though link speed might not represent
5865          * the true receive or transmit bandwidth (not all modes are symmetric)
5866          * this is an accurate maximum.
5867          */
5868         bond_for_each_slave(bond, slave, iter) {
5869                 if (bond_slave_can_tx(slave)) {
5870                         bond_update_speed_duplex(slave);
5871                         if (slave->speed != SPEED_UNKNOWN) {
5872                                 if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
5873                                         speed = bond_mode_bcast_speed(slave,
5874                                                                       speed);
5875                                 else
5876                                         speed += slave->speed;
5877                         }
5878                         if (cmd->base.duplex == DUPLEX_UNKNOWN &&
5879                             slave->duplex != DUPLEX_UNKNOWN)
5880                                 cmd->base.duplex = slave->duplex;
5881                 }
5882         }
5883         cmd->base.speed = speed ? : SPEED_UNKNOWN;
5884
5885         return 0;
5886 }
5887
5888 static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
5889                                      struct ethtool_drvinfo *drvinfo)
5890 {
5891         strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
5892         snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d",
5893                  BOND_ABI_VERSION);
5894 }
5895
5896 static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
5897                                     struct kernel_ethtool_ts_info *info)
5898 {
5899         struct bonding *bond = netdev_priv(bond_dev);
5900         struct kernel_ethtool_ts_info ts_info;
5901         struct net_device *real_dev;
5902         bool sw_tx_support = false;
5903         struct list_head *iter;
5904         struct slave *slave;
5905         int ret = 0;
5906
5907         rcu_read_lock();
5908         real_dev = bond_option_active_slave_get_rcu(bond);
5909         dev_hold(real_dev);
5910         rcu_read_unlock();
5911
5912         if (real_dev) {
5913                 ret = ethtool_get_ts_info_by_layer(real_dev, info);
5914         } else {
5915                 /* Check if all slaves support software tx timestamping */
5916                 rcu_read_lock();
5917                 bond_for_each_slave_rcu(bond, slave, iter) {
5918                         ret = ethtool_get_ts_info_by_layer(slave->dev, &ts_info);
5919                         if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) {
5920                                 sw_tx_support = true;
5921                                 continue;
5922                         }
5923
5924                         sw_tx_support = false;
5925                         break;
5926                 }
5927                 rcu_read_unlock();
5928         }
5929
5930         if (sw_tx_support)
5931                 info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
5932
5933         dev_put(real_dev);
5934         return ret;
5935 }
5936
5937 static const struct ethtool_ops bond_ethtool_ops = {
5938         .get_drvinfo            = bond_ethtool_get_drvinfo,
5939         .get_link               = ethtool_op_get_link,
5940         .get_link_ksettings     = bond_ethtool_get_link_ksettings,
5941         .get_ts_info            = bond_ethtool_get_ts_info,
5942 };
5943
5944 static const struct net_device_ops bond_netdev_ops = {
5945         .ndo_init               = bond_init,
5946         .ndo_uninit             = bond_uninit,
5947         .ndo_open               = bond_open,
5948         .ndo_stop               = bond_close,
5949         .ndo_start_xmit         = bond_start_xmit,
5950         .ndo_select_queue       = bond_select_queue,
5951         .ndo_get_stats64        = bond_get_stats,
5952         .ndo_eth_ioctl          = bond_eth_ioctl,
5953         .ndo_siocbond           = bond_do_ioctl,
5954         .ndo_siocdevprivate     = bond_siocdevprivate,
5955         .ndo_change_rx_flags    = bond_change_rx_flags,
5956         .ndo_set_rx_mode        = bond_set_rx_mode,
5957         .ndo_change_mtu         = bond_change_mtu,
5958         .ndo_set_mac_address    = bond_set_mac_address,
5959         .ndo_neigh_setup        = bond_neigh_setup,
5960         .ndo_vlan_rx_add_vid    = bond_vlan_rx_add_vid,
5961         .ndo_vlan_rx_kill_vid   = bond_vlan_rx_kill_vid,
5962 #ifdef CONFIG_NET_POLL_CONTROLLER
5963         .ndo_netpoll_setup      = bond_netpoll_setup,
5964         .ndo_netpoll_cleanup    = bond_netpoll_cleanup,
5965         .ndo_poll_controller    = bond_poll_controller,
5966 #endif
5967         .ndo_add_slave          = bond_enslave,
5968         .ndo_del_slave          = bond_release,
5969         .ndo_fix_features       = bond_fix_features,
5970         .ndo_features_check     = passthru_features_check,
5971         .ndo_get_xmit_slave     = bond_xmit_get_slave,
5972         .ndo_sk_get_lower_dev   = bond_sk_get_lower_dev,
5973         .ndo_bpf                = bond_xdp,
5974         .ndo_xdp_xmit           = bond_xdp_xmit,
5975         .ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave,
5976         .ndo_hwtstamp_get       = bond_hwtstamp_get,
5977         .ndo_hwtstamp_set       = bond_hwtstamp_set,
5978 };
5979
5980 static const struct device_type bond_type = {
5981         .name = "bond",
5982 };
5983
5984 static void bond_destructor(struct net_device *bond_dev)
5985 {
5986         struct bonding *bond = netdev_priv(bond_dev);
5987
5988         if (bond->wq)
5989                 destroy_workqueue(bond->wq);
5990
5991         free_percpu(bond->rr_tx_counter);
5992 }
5993
5994 void bond_setup(struct net_device *bond_dev)
5995 {
5996         struct bonding *bond = netdev_priv(bond_dev);
5997
5998         spin_lock_init(&bond->mode_lock);
5999         bond->params = bonding_defaults;
6000
6001         /* Initialize pointers */
6002         bond->dev = bond_dev;
6003
6004         /* Initialize the device entry points */
6005         ether_setup(bond_dev);
6006         bond_dev->max_mtu = ETH_MAX_MTU;
6007         bond_dev->netdev_ops = &bond_netdev_ops;
6008         bond_dev->ethtool_ops = &bond_ethtool_ops;
6009
6010         bond_dev->needs_free_netdev = true;
6011         bond_dev->priv_destructor = bond_destructor;
6012
6013         SET_NETDEV_DEVTYPE(bond_dev, &bond_type);
6014
6015         /* Initialize the device options */
6016         bond_dev->flags |= IFF_MASTER;
6017         bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE;
6018         bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
6019
6020 #ifdef CONFIG_XFRM_OFFLOAD
6021         /* set up xfrm device ops (only supported in active-backup right now) */
6022         bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
6023         INIT_LIST_HEAD(&bond->ipsec_list);
6024         mutex_init(&bond->ipsec_lock);
6025 #endif /* CONFIG_XFRM_OFFLOAD */
6026
6027         /* don't acquire bond device's netif_tx_lock when transmitting */
6028         bond_dev->lltx = true;
6029
6030         /* Don't allow bond devices to change network namespaces. */
6031         bond_dev->netns_local = true;
6032
6033         /* By default, we declare the bond to be fully
6034          * VLAN hardware accelerated capable. Special
6035          * care is taken in the various xmit functions
6036          * when there are slaves that are not hw accel
6037          * capable
6038          */
6039
6040         bond_dev->hw_features = BOND_VLAN_FEATURES |
6041                                 NETIF_F_HW_VLAN_CTAG_RX |
6042                                 NETIF_F_HW_VLAN_CTAG_FILTER |
6043                                 NETIF_F_HW_VLAN_STAG_RX |
6044                                 NETIF_F_HW_VLAN_STAG_FILTER;
6045
6046         bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
6047         bond_dev->features |= bond_dev->hw_features;
6048         bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
6049 #ifdef CONFIG_XFRM_OFFLOAD
6050         bond_dev->hw_features |= BOND_XFRM_FEATURES;
6051         /* Only enable XFRM features if this is an active-backup config */
6052         if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
6053                 bond_dev->features |= BOND_XFRM_FEATURES;
6054 #endif /* CONFIG_XFRM_OFFLOAD */
6055 }
6056
6057 /* Destroy a bonding device.
6058  * Must be under rtnl_lock when this function is called.
6059  */
6060 static void bond_uninit(struct net_device *bond_dev)
6061 {
6062         struct bonding *bond = netdev_priv(bond_dev);
6063         struct list_head *iter;
6064         struct slave *slave;
6065
6066         bond_netpoll_cleanup(bond_dev);
6067
6068         /* Release the bonded slaves */
6069         bond_for_each_slave(bond, slave, iter)
6070                 __bond_release_one(bond_dev, slave->dev, true, true);
6071         netdev_info(bond_dev, "Released all slaves\n");
6072
6073 #ifdef CONFIG_XFRM_OFFLOAD
6074         mutex_destroy(&bond->ipsec_lock);
6075 #endif /* CONFIG_XFRM_OFFLOAD */
6076
6077         bond_set_slave_arr(bond, NULL, NULL);
6078
6079         list_del_rcu(&bond->bond_list);
6080
6081         bond_debug_unregister(bond);
6082 }
6083
6084 /*------------------------- Module initialization ---------------------------*/
6085
6086 static int __init bond_check_params(struct bond_params *params)
6087 {
6088         int arp_validate_value, fail_over_mac_value, primary_reselect_value, i;
6089         struct bond_opt_value newval;
6090         const struct bond_opt_value *valptr;
6091         int arp_all_targets_value = 0;
6092         u16 ad_actor_sys_prio = 0;
6093         u16 ad_user_port_key = 0;
6094         __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 };
6095         int arp_ip_count;
6096         int bond_mode   = BOND_MODE_ROUNDROBIN;
6097         int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
6098         int lacp_fast = 0;
6099         int tlb_dynamic_lb;
6100
6101         /* Convert string parameters. */
6102         if (mode) {
6103                 bond_opt_initstr(&newval, mode);
6104                 valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval);
6105                 if (!valptr) {
6106                         pr_err("Error: Invalid bonding mode \"%s\"\n", mode);
6107                         return -EINVAL;
6108                 }
6109                 bond_mode = valptr->value;
6110         }
6111
6112         if (xmit_hash_policy) {
6113                 if (bond_mode == BOND_MODE_ROUNDROBIN ||
6114                     bond_mode == BOND_MODE_ACTIVEBACKUP ||
6115                     bond_mode == BOND_MODE_BROADCAST) {
6116                         pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
6117                                 bond_mode_name(bond_mode));
6118                 } else {
6119                         bond_opt_initstr(&newval, xmit_hash_policy);
6120                         valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH),
6121                                                 &newval);
6122                         if (!valptr) {
6123                                 pr_err("Error: Invalid xmit_hash_policy \"%s\"\n",
6124                                        xmit_hash_policy);
6125                                 return -EINVAL;
6126                         }
6127                         xmit_hashtype = valptr->value;
6128                 }
6129         }
6130
6131         if (lacp_rate) {
6132                 if (bond_mode != BOND_MODE_8023AD) {
6133                         pr_info("lacp_rate param is irrelevant in mode %s\n",
6134                                 bond_mode_name(bond_mode));
6135                 } else {
6136                         bond_opt_initstr(&newval, lacp_rate);
6137                         valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE),
6138                                                 &newval);
6139                         if (!valptr) {
6140                                 pr_err("Error: Invalid lacp rate \"%s\"\n",
6141                                        lacp_rate);
6142                                 return -EINVAL;
6143                         }
6144                         lacp_fast = valptr->value;
6145                 }
6146         }
6147
6148         if (ad_select) {
6149                 bond_opt_initstr(&newval, ad_select);
6150                 valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT),
6151                                         &newval);
6152                 if (!valptr) {
6153                         pr_err("Error: Invalid ad_select \"%s\"\n", ad_select);
6154                         return -EINVAL;
6155                 }
6156                 params->ad_select = valptr->value;
6157                 if (bond_mode != BOND_MODE_8023AD)
6158                         pr_warn("ad_select param only affects 802.3ad mode\n");
6159         } else {
6160                 params->ad_select = BOND_AD_STABLE;
6161         }
6162
6163         if (max_bonds < 0) {
6164                 pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n",
6165                         max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS);
6166                 max_bonds = BOND_DEFAULT_MAX_BONDS;
6167         }
6168
6169         if (miimon < 0) {
6170                 pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n",
6171                         miimon, INT_MAX);
6172                 miimon = 0;
6173         }
6174
6175         if (updelay < 0) {
6176                 pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
6177                         updelay, INT_MAX);
6178                 updelay = 0;
6179         }
6180
6181         if (downdelay < 0) {
6182                 pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
6183                         downdelay, INT_MAX);
6184                 downdelay = 0;
6185         }
6186
6187         if ((use_carrier != 0) && (use_carrier != 1)) {
6188                 pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
6189                         use_carrier);
6190                 use_carrier = 1;
6191         }
6192
6193         if (num_peer_notif < 0 || num_peer_notif > 255) {
6194                 pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n",
6195                         num_peer_notif);
6196                 num_peer_notif = 1;
6197         }
6198
6199         /* reset values for 802.3ad/TLB/ALB */
6200         if (!bond_mode_uses_arp(bond_mode)) {
6201                 if (!miimon) {
6202                         pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n");
6203                         pr_warn("Forcing miimon to 100msec\n");
6204                         miimon = BOND_DEFAULT_MIIMON;
6205                 }
6206         }
6207
6208         if (tx_queues < 1 || tx_queues > 255) {
6209                 pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n",
6210                         tx_queues, BOND_DEFAULT_TX_QUEUES);
6211                 tx_queues = BOND_DEFAULT_TX_QUEUES;
6212         }
6213
6214         if ((all_slaves_active != 0) && (all_slaves_active != 1)) {
6215                 pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n",
6216                         all_slaves_active);
6217                 all_slaves_active = 0;
6218         }
6219
6220         if (resend_igmp < 0 || resend_igmp > 255) {
6221                 pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n",
6222                         resend_igmp, BOND_DEFAULT_RESEND_IGMP);
6223                 resend_igmp = BOND_DEFAULT_RESEND_IGMP;
6224         }
6225
6226         bond_opt_initval(&newval, packets_per_slave);
6227         if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) {
6228                 pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n",
6229                         packets_per_slave, USHRT_MAX);
6230                 packets_per_slave = 1;
6231         }
6232
6233         if (bond_mode == BOND_MODE_ALB) {
6234                 pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n",
6235                           updelay);
6236         }
6237
6238         if (!miimon) {
6239                 if (updelay || downdelay) {
6240                         /* just warn the user the up/down delay will have
6241                          * no effect since miimon is zero...
6242                          */
6243                         pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n",
6244                                 updelay, downdelay);
6245                 }
6246         } else {
6247                 /* don't allow arp monitoring */
6248                 if (arp_interval) {
6249                         pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n",
6250                                 miimon, arp_interval);
6251                         arp_interval = 0;
6252                 }
6253
6254                 if ((updelay % miimon) != 0) {
6255                         pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n",
6256                                 updelay, miimon, (updelay / miimon) * miimon);
6257                 }
6258
6259                 updelay /= miimon;
6260
6261                 if ((downdelay % miimon) != 0) {
6262                         pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n",
6263                                 downdelay, miimon,
6264                                 (downdelay / miimon) * miimon);
6265                 }
6266
6267                 downdelay /= miimon;
6268         }
6269
6270         if (arp_interval < 0) {
6271                 pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n",
6272                         arp_interval, INT_MAX);
6273                 arp_interval = 0;
6274         }
6275
6276         for (arp_ip_count = 0, i = 0;
6277              (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) {
6278                 __be32 ip;
6279
6280                 /* not a complete check, but good enough to catch mistakes */
6281                 if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) ||
6282                     !bond_is_ip_target_ok(ip)) {
6283                         pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n",
6284                                 arp_ip_target[i]);
6285                         arp_interval = 0;
6286                 } else {
6287                         if (bond_get_targets_ip(arp_target, ip) == -1)
6288                                 arp_target[arp_ip_count++] = ip;
6289                         else
6290                                 pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n",
6291                                         &ip);
6292                 }
6293         }
6294
6295         if (arp_interval && !arp_ip_count) {
6296                 /* don't allow arping if no arp_ip_target given... */
6297                 pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n",
6298                         arp_interval);
6299                 arp_interval = 0;
6300         }
6301
6302         if (arp_validate) {
6303                 if (!arp_interval) {
6304                         pr_err("arp_validate requires arp_interval\n");
6305                         return -EINVAL;
6306                 }
6307
6308                 bond_opt_initstr(&newval, arp_validate);
6309                 valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE),
6310                                         &newval);
6311                 if (!valptr) {
6312                         pr_err("Error: invalid arp_validate \"%s\"\n",
6313                                arp_validate);
6314                         return -EINVAL;
6315                 }
6316                 arp_validate_value = valptr->value;
6317         } else {
6318                 arp_validate_value = 0;
6319         }
6320
6321         if (arp_all_targets) {
6322                 bond_opt_initstr(&newval, arp_all_targets);
6323                 valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS),
6324                                         &newval);
6325                 if (!valptr) {
6326                         pr_err("Error: invalid arp_all_targets_value \"%s\"\n",
6327                                arp_all_targets);
6328                         arp_all_targets_value = 0;
6329                 } else {
6330                         arp_all_targets_value = valptr->value;
6331                 }
6332         }
6333
6334         if (miimon) {
6335                 pr_info("MII link monitoring set to %d ms\n", miimon);
6336         } else if (arp_interval) {
6337                 valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE,
6338                                           arp_validate_value);
6339                 pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):",
6340                         arp_interval, valptr->string, arp_ip_count);
6341
6342                 for (i = 0; i < arp_ip_count; i++)
6343                         pr_cont(" %s", arp_ip_target[i]);
6344
6345                 pr_cont("\n");
6346
6347         } else if (max_bonds) {
6348                 /* miimon and arp_interval not set, we need one so things
6349                  * work as expected, see bonding.txt for details
6350                  */
6351                 pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n");
6352         }
6353
6354         if (primary && !bond_mode_uses_primary(bond_mode)) {
6355                 /* currently, using a primary only makes sense
6356                  * in active backup, TLB or ALB modes
6357                  */
6358                 pr_warn("Warning: %s primary device specified but has no effect in %s mode\n",
6359                         primary, bond_mode_name(bond_mode));
6360                 primary = NULL;
6361         }
6362
6363         if (primary && primary_reselect) {
6364                 bond_opt_initstr(&newval, primary_reselect);
6365                 valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT),
6366                                         &newval);
6367                 if (!valptr) {
6368                         pr_err("Error: Invalid primary_reselect \"%s\"\n",
6369                                primary_reselect);
6370                         return -EINVAL;
6371                 }
6372                 primary_reselect_value = valptr->value;
6373         } else {
6374                 primary_reselect_value = BOND_PRI_RESELECT_ALWAYS;
6375         }
6376
6377         if (fail_over_mac) {
6378                 bond_opt_initstr(&newval, fail_over_mac);
6379                 valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC),
6380                                         &newval);
6381                 if (!valptr) {
6382                         pr_err("Error: invalid fail_over_mac \"%s\"\n",
6383                                fail_over_mac);
6384                         return -EINVAL;
6385                 }
6386                 fail_over_mac_value = valptr->value;
6387                 if (bond_mode != BOND_MODE_ACTIVEBACKUP)
6388                         pr_warn("Warning: fail_over_mac only affects active-backup mode\n");
6389         } else {
6390                 fail_over_mac_value = BOND_FOM_NONE;
6391         }
6392
6393         bond_opt_initstr(&newval, "default");
6394         valptr = bond_opt_parse(
6395                         bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO),
6396                                      &newval);
6397         if (!valptr) {
6398                 pr_err("Error: No ad_actor_sys_prio default value");
6399                 return -EINVAL;
6400         }
6401         ad_actor_sys_prio = valptr->value;
6402
6403         valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY),
6404                                 &newval);
6405         if (!valptr) {
6406                 pr_err("Error: No ad_user_port_key default value");
6407                 return -EINVAL;
6408         }
6409         ad_user_port_key = valptr->value;
6410
6411         bond_opt_initstr(&newval, "default");
6412         valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval);
6413         if (!valptr) {
6414                 pr_err("Error: No tlb_dynamic_lb default value");
6415                 return -EINVAL;
6416         }
6417         tlb_dynamic_lb = valptr->value;
6418
6419         if (lp_interval == 0) {
6420                 pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n",
6421                         INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
6422                 lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
6423         }
6424
6425         /* fill params struct with the proper values */
6426         params->mode = bond_mode;
6427         params->xmit_policy = xmit_hashtype;
6428         params->miimon = miimon;
6429         params->num_peer_notif = num_peer_notif;
6430         params->arp_interval = arp_interval;
6431         params->arp_validate = arp_validate_value;
6432         params->arp_all_targets = arp_all_targets_value;
6433         params->missed_max = 2;
6434         params->updelay = updelay;
6435         params->downdelay = downdelay;
6436         params->peer_notif_delay = 0;
6437         params->use_carrier = use_carrier;
6438         params->lacp_active = 1;
6439         params->lacp_fast = lacp_fast;
6440         params->primary[0] = 0;
6441         params->primary_reselect = primary_reselect_value;
6442         params->fail_over_mac = fail_over_mac_value;
6443         params->tx_queues = tx_queues;
6444         params->all_slaves_active = all_slaves_active;
6445         params->resend_igmp = resend_igmp;
6446         params->min_links = min_links;
6447         params->lp_interval = lp_interval;
6448         params->packets_per_slave = packets_per_slave;
6449         params->tlb_dynamic_lb = tlb_dynamic_lb;
6450         params->ad_actor_sys_prio = ad_actor_sys_prio;
6451         eth_zero_addr(params->ad_actor_system);
6452         params->ad_user_port_key = ad_user_port_key;
6453         params->coupled_control = 1;
6454         if (packets_per_slave > 0) {
6455                 params->reciprocal_packets_per_slave =
6456                         reciprocal_value(packets_per_slave);
6457         } else {
6458                 /* reciprocal_packets_per_slave is unused if
6459                  * packets_per_slave is 0 or 1, just initialize it
6460                  */
6461                 params->reciprocal_packets_per_slave =
6462                         (struct reciprocal_value) { 0 };
6463         }
6464
6465         if (primary)
6466                 strscpy_pad(params->primary, primary, sizeof(params->primary));
6467
6468         memcpy(params->arp_targets, arp_target, sizeof(arp_target));
6469 #if IS_ENABLED(CONFIG_IPV6)
6470         memset(params->ns_targets, 0, sizeof(struct in6_addr) * BOND_MAX_NS_TARGETS);
6471 #endif
6472
6473         return 0;
6474 }
6475
6476 /* Called from registration process */
6477 static int bond_init(struct net_device *bond_dev)
6478 {
6479         struct bonding *bond = netdev_priv(bond_dev);
6480         struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
6481
6482         netdev_dbg(bond_dev, "Begin bond_init\n");
6483
6484         bond->wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM,
6485                                            bond_dev->name);
6486         if (!bond->wq)
6487                 return -ENOMEM;
6488
6489         bond->notifier_ctx = false;
6490
6491         spin_lock_init(&bond->stats_lock);
6492         netdev_lockdep_set_classes(bond_dev);
6493
6494         list_add_tail_rcu(&bond->bond_list, &bn->dev_list);
6495
6496         bond_prepare_sysfs_group(bond);
6497
6498         bond_debug_register(bond);
6499
6500         /* Ensure valid dev_addr */
6501         if (is_zero_ether_addr(bond_dev->dev_addr) &&
6502             bond_dev->addr_assign_type == NET_ADDR_PERM)
6503                 eth_hw_addr_random(bond_dev);
6504
6505         return 0;
6506 }
6507
6508 unsigned int bond_get_num_tx_queues(void)
6509 {
6510         return tx_queues;
6511 }
6512
6513 /* Create a new bond based on the specified name and bonding parameters.
6514  * If name is NULL, obtain a suitable "bond%d" name for us.
6515  * Caller must NOT hold rtnl_lock; we need to release it here before we
6516  * set up our sysfs entries.
6517  */
6518 int bond_create(struct net *net, const char *name)
6519 {
6520         struct net_device *bond_dev;
6521         struct bonding *bond;
6522         int res = -ENOMEM;
6523
6524         rtnl_lock();
6525
6526         bond_dev = alloc_netdev_mq(sizeof(struct bonding),
6527                                    name ? name : "bond%d", NET_NAME_UNKNOWN,
6528                                    bond_setup, tx_queues);
6529         if (!bond_dev)
6530                 goto out;
6531
6532         bond = netdev_priv(bond_dev);
6533         dev_net_set(bond_dev, net);
6534         bond_dev->rtnl_link_ops = &bond_link_ops;
6535
6536         res = register_netdevice(bond_dev);
6537         if (res < 0) {
6538                 free_netdev(bond_dev);
6539                 goto out;
6540         }
6541
6542         netif_carrier_off(bond_dev);
6543
6544         bond_work_init_all(bond);
6545
6546 out:
6547         rtnl_unlock();
6548         return res;
6549 }
6550
6551 static int __net_init bond_net_init(struct net *net)
6552 {
6553         struct bond_net *bn = net_generic(net, bond_net_id);
6554
6555         bn->net = net;
6556         INIT_LIST_HEAD(&bn->dev_list);
6557
6558         bond_create_proc_dir(bn);
6559         bond_create_sysfs(bn);
6560
6561         return 0;
6562 }
6563
6564 /* According to commit 69b0216ac255 ("bonding: fix bonding_masters
6565  * race condition in bond unloading") we need to remove sysfs files
6566  * before we remove our devices (done later in bond_net_exit_batch_rtnl())
6567  */
6568 static void __net_exit bond_net_pre_exit(struct net *net)
6569 {
6570         struct bond_net *bn = net_generic(net, bond_net_id);
6571
6572         bond_destroy_sysfs(bn);
6573 }
6574
6575 static void __net_exit bond_net_exit_batch_rtnl(struct list_head *net_list,
6576                                                 struct list_head *dev_kill_list)
6577 {
6578         struct bond_net *bn;
6579         struct net *net;
6580
6581         /* Kill off any bonds created after unregistering bond rtnl ops */
6582         list_for_each_entry(net, net_list, exit_list) {
6583                 struct bonding *bond, *tmp_bond;
6584
6585                 bn = net_generic(net, bond_net_id);
6586                 list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list)
6587                         unregister_netdevice_queue(bond->dev, dev_kill_list);
6588         }
6589 }
6590
6591 /* According to commit 23fa5c2caae0 ("bonding: destroy proc directory
6592  * only after all bonds are gone") bond_destroy_proc_dir() is called
6593  * after bond_net_exit_batch_rtnl() has completed.
6594  */
6595 static void __net_exit bond_net_exit_batch(struct list_head *net_list)
6596 {
6597         struct bond_net *bn;
6598         struct net *net;
6599
6600         list_for_each_entry(net, net_list, exit_list) {
6601                 bn = net_generic(net, bond_net_id);
6602                 bond_destroy_proc_dir(bn);
6603         }
6604 }
6605
6606 static struct pernet_operations bond_net_ops = {
6607         .init = bond_net_init,
6608         .pre_exit = bond_net_pre_exit,
6609         .exit_batch_rtnl = bond_net_exit_batch_rtnl,
6610         .exit_batch = bond_net_exit_batch,
6611         .id   = &bond_net_id,
6612         .size = sizeof(struct bond_net),
6613 };
6614
6615 static int __init bonding_init(void)
6616 {
6617         int i;
6618         int res;
6619
6620         res = bond_check_params(&bonding_defaults);
6621         if (res)
6622                 goto out;
6623
6624         bond_create_debugfs();
6625
6626         res = register_pernet_subsys(&bond_net_ops);
6627         if (res)
6628                 goto err_net_ops;
6629
6630         res = bond_netlink_init();
6631         if (res)
6632                 goto err_link;
6633
6634         for (i = 0; i < max_bonds; i++) {
6635                 res = bond_create(&init_net, NULL);
6636                 if (res)
6637                         goto err;
6638         }
6639
6640         skb_flow_dissector_init(&flow_keys_bonding,
6641                                 flow_keys_bonding_keys,
6642                                 ARRAY_SIZE(flow_keys_bonding_keys));
6643
6644         register_netdevice_notifier(&bond_netdev_notifier);
6645 out:
6646         return res;
6647 err:
6648         bond_netlink_fini();
6649 err_link:
6650         unregister_pernet_subsys(&bond_net_ops);
6651 err_net_ops:
6652         bond_destroy_debugfs();
6653         goto out;
6654
6655 }
6656
6657 static void __exit bonding_exit(void)
6658 {
6659         unregister_netdevice_notifier(&bond_netdev_notifier);
6660
6661         bond_netlink_fini();
6662         unregister_pernet_subsys(&bond_net_ops);
6663
6664         bond_destroy_debugfs();
6665
6666 #ifdef CONFIG_NET_POLL_CONTROLLER
6667         /* Make sure we don't have an imbalance on our netpoll blocking */
6668         WARN_ON(atomic_read(&netpoll_block_tx));
6669 #endif
6670 }
6671
6672 module_init(bonding_init);
6673 module_exit(bonding_exit);
6674 MODULE_LICENSE("GPL");
6675 MODULE_DESCRIPTION(DRV_DESCRIPTION);
6676 MODULE_AUTHOR("Thomas Davis, [email protected] and many others");
This page took 0.398683 seconds and 4 git commands to generate.