/*
 *	Linux INET6 implementation
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable; otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/sysctl.h>

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
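
/* Defining CREATE_TRACE_POINTS before pulling in the trace header
 * instantiates the fib6 tracepoint bodies in this file (exactly once in
 * the kernel); it is #undef'd right after so that any later inclusion of
 * trace headers only declares them.
 */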
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);

struct uncached_list {
	struct list_head	head;

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
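
/* A note on the per-cpu uncached lists: they track rt6_info dsts that are
 * not owned by the fib6 tree (e.g. special cached clones), so that
 * rt6_uncached_list_flush_dev() below can re-home their device and idev
 * references to the loopback device when a netdevice is unregistered.
 */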
void rt6_uncached_list_add(struct rt6_info *rt)
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);

void rt6_uncached_list_del(struct rt6_info *rt)
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
	struct net_device *loopback_dev = net->loopback_dev;

	if (dev == loopback_dev)

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);

			rt->dst.dev = loopback_dev;
			dev_hold(rt->dst.dev);
		spin_unlock_bh(&ul->lock);

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
	if (!ipv6_addr_any(p))
		return (const void *) p;
	return &ipv6_hdr(skb)->daddr;

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);

	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))

	__ipv6_confirm_neigh(dev, daddr);

static struct dst_ops ip6_dst_ops_template = {
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,

static const struct rt6_info ip6_null_entry_template = {
	.__refcnt	= ATOMIC_INIT(1),
	.obsolete	= DST_OBSOLETE_FORCE_CHK,
	.error		= -ENETUNREACH,
	.input		= ip6_pkt_discard,
	.output		= ip6_pkt_discard_out,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.__refcnt	= ATOMIC_INIT(1),
	.obsolete	= DST_OBSOLETE_FORCE_CHK,
	.input		= ip6_pkt_prohibit,
	.output		= ip6_pkt_prohibit_out,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),

static const struct rt6_info ip6_blk_hole_entry_template = {
	.__refcnt	= ATOMIC_INIT(1),
	.obsolete	= DST_OBSOLETE_FORCE_CHK,
	.input		= dst_discard,
	.output		= dst_discard_out,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),

static void rt6_info_init(struct rt6_info *rt)
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	rt->rt6i_idev = NULL;

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
			rt->rt6i_idev = loopback_idev;

static bool __rt6_check_expired(const struct rt6_info *rt)
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);

static bool rt6_check_expired(const struct rt6_info *rt)
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
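
/* In effect, a cached route is treated as expired either when its own
 * RTF_EXPIRES timestamp (e.g. one set from an RA lifetime or a PMTU
 * exception timeout) has passed, or when the fib6_info it was cloned
 * from has itself expired.
 */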
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
		const struct fib6_nh *nh = &sibling->fib6_nh;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)

	res->nh = &match->fib6_nh;
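
/* The selection above is hash-threshold multipath (in the spirit of
 * RFC 2992): each sibling nexthop is assigned an upper bound in hash
 * space proportional to its weight, and the flow hash picks the first
 * sibling whose bound it does not exceed. Packets of one flow therefore
 * stick to one path, while distinct flows spread across all siblings.
 */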
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)

	dev = nh->fib_nh_dev;
		if (dev->ifindex == oif)
	if (ipv6_chk_addr(net, saddr, dev,
			  flags & RT6_LOOKUP_F_IFACE))

static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;

	if (!oif && ipv6_addr_any(saddr)) {
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;

	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;

static void rt6_probe_deferred(struct work_struct *w)
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);

static void rt6_probe(struct fib6_nh *fib6_nh)
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check if it
	 * is really so, aka Router Reachability Probing.
	 *
	 * Router Reachability Probes MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (fib6_nh->fib_nh_gw_family)

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
		if (neigh->nud_state & NUD_VALID)

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
				__neigh_set_probe_once(neigh);
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		schedule_work(&work->work);

	rcu_read_unlock_bh();

static inline void rt6_probe(struct fib6_nh *fib6_nh)

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
			ret = RT6_NUD_FAIL_PROBE;
		read_unlock(&neigh->lock);
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	rcu_read_unlock_bh();
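
/* rt6_score_route() below condenses router selection into a small score:
 * a match on the requested outgoing interface raises it, the
 * RA-advertised preference bits (CONFIG_IPV6_ROUTER_PREF) are folded in
 * shifted left by two, and the negative RT6_NUD_* results from
 * rt6_check_neigh() are passed through so callers can fall back to
 * round-robin or probing.
 */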
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
	if (!oif || nh->fib_nh_dev->ifindex == oif)

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;

	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);

static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
	bool match_do_rr = false;

	if (nh->fib_nh_flags & RTNH_F_DEAD)

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {

	if (strict & RT6_LOOKUP_F_REACHABLE)

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
		*do_rr = match_do_rr;

static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {

		if (cont && f6i->fib6_metric != metric) {

		if (fib6_check_expired(f6i))

		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;

static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);

static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;

	/* make sure this function or its helpers sets f6i */

	if (!leaf || leaf == net->ipv6.fib6_null_entry)

	rt0 = rcu_dereference(fn->rr_ptr);

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
	if (fn->fn_bit != key_plen)

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)

		spin_lock_bh(&leaf->fib6_table->tb6_lock);
		/* make sure next is not being deleted from the tree */
		rcu_assign_pointer(fn->rr_ptr, next);
		spin_unlock_bh(&leaf->fib6_table->tb6_lock);

	res->f6i = net->ipv6.fib6_null_entry;
	res->nh = &res->f6i->fib6_nh;
	res->fib6_flags = res->f6i->fib6_flags;
	res->fib6_type = res->f6i->fib6_type;

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;

#ifdef CONFIG_IPV6_ROUTE_INFO
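/* rt6_route_rcv() handles the RFC 4191 Route Information option received
 * in Router Advertisements. The option length field is in units of
 * 8 octets: the first 8 octets are the header, a length of 2 adds 8
 * octets of prefix (enough for prefixes up to /64) and a length of 3
 * carries the full 16-byte prefix. The sanity checks below reject
 * combinations of length and prefix_len that cannot match.
 */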
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
	} else if (rinfo->prefix_len > 128) {
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
		prefix = &prefix_buf;

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,

	if (rt && !lifetime) {

		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device by default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is when netif_is_l3_master(dev) is true,
		 * in which case dev itself is what we want returned
		 */

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_XRESOLVE]	= -EINVAL,

static int ip6_rt_type_to_error(u8 fib6_type)
	return fib6_prop[fib6_type];

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
	unsigned short flags = 0;

		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;

static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
	case RTN_UNREACHABLE:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;

static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);

	rt->dst.output = ip6_output;
	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
		rt->dst.input = ip6_forward;

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);

	rt->dst.lastuse = jiffies;

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
	struct fib6_node *pn, *sn;
		if (fn->fn_flags & RTN_TL_ROOT)
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
			fn = fib6_node_lookup(sn, NULL, saddr);
		if (fn->fn_flags & RTN_RTINFO)
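
/* fib6_backtrack() above walks back up the trie after a failed match: it
 * climbs to the parent node, descends into the parent's source-address
 * subtree when one exists (CONFIG_IPV6_SUBTREES), and stops either at a
 * node that carries routes (RTN_RTINFO) or at the table root
 * (RTN_TL_ROOT).
 */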
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
	rt = net->ipv6.ip6_null_entry;

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
		fib6_info_release(f6i);

	ip6_rt_copy_init(nrt, res);

	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     const struct sk_buff *skb,
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	res.f6i = rcu_dereference(fn->leaf);
		res.f6i = net->ipv6.fib6_null_entry;
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		rt = net->ipv6.ip6_null_entry;

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
		rt = ip6_create_rt_rcu(&res);

	trace_fib6_table_lookup(net, &res, table, fl6);

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
	struct flowi6 fl6 = {
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);

static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	if (!fib6_info_hold_safe(f6i))

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
		fib6_info_release(f6i);

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;

static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))

	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
		fib6_info_release(f6i);

	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
		ip6_hold_safe(NULL, &pcpu_rt);

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);
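
/* Each fib6_info can hang a small hash table of "exception" routes off
 * rt6i_exception_bucket: RTF_CACHE clones recording destination-specific
 * state learned at run time, such as a PMTU from an ICMPv6 Packet Too Big
 * or a redirected gateway. Buckets are selected by a jhash of the
 * destination address (and of the source address too when subtree,
 * i.e. source, routing is in use).
 */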
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
	struct fib6_info *from;

	if (!bucket || !rt6_ex)

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
	struct rt6_exception *rt6_ex, *oldest = NULL;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
	rt6_remove_exception(bucket, oldest);

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
	static u32 seed __read_mostly;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
		val = jhash(src, sizeof(*src), val);
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

static unsigned int fib6_mtu(const struct fib6_result *res)
	const struct fib6_nh *nh = res->nh;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);

static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;

	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);

	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
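
/* Bumping the node's sernum above invalidates the cookie carried by
 * every dst cached from this subtree, so their next ip6_dst_check()
 * fails and callers relookup the route, picking up the freshly
 * inserted exception.
 */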
void rt6_flush_exceptions(struct fib6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);

	spin_unlock_bh(&rt6_exception_lock);

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)

	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	    !(rt->rt6i_flags & RTF_CACHE))

	if (!rcu_access_pointer(from->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;

	rt6_ex = __rt6_find_exception_spinlock(&bucket,
		rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);

/* Find the rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;

	rt6_ex = __rt6_find_exception_rcu(&bucket,
		rt6_ex->stamp = jiffies;

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * account for them.
	 */
	if (dst_mtu(&rt->dst) >= mtu)

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
			    RTF_CACHE_GATEWAY &&
			    ipv6_addr_equal(gateway,
					    &entry->rt6i_gateway)) {
				rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
			rt6_remove_exception(bucket, rt6_ex);

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			rt6_age_examine_exception(bucket, rt6_ex,

	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();

/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			goto redo_rt6_select;

	trace_fib6_table_lookup(net, res, table, fl6);

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
	struct fib6_result res = {};
	struct rt6_info *rt;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);

		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		pcpu_rt = rt6_get_pcpu_route(&res);

			pcpu_rt = rt6_make_pcpu_route(net, &res);

EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    const struct sk_buff *skb,
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 const struct sk_buff *skb,
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);

	key_iph = inner_iph;

		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;

		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
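
/* For ICMPv6 errors, the flow keys are thus taken from the inner
 * (offending) header rather than the outer one, so that the error is
 * hashed, and therefore routed, the same way as the flow that
 * triggered it.
 */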
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
	struct flow_keys hash_keys;

	switch (ip6_multipath_hash_policy(net)) {
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;

			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

				skb_flow_dissect_flow_keys(skb, &keys, flag);

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;

			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;

	mhash = flow_hash_from_keys(&hash_keys);

void ip6_route_input(struct sk_buff *skb)
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     const struct sk_buff *skb,
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

		flags |= RT6_LOOKUP_F_HAS_SADDR;
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
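
/* The blackhole dst built above keeps the original route's metrics and
 * addresses but silently drops anything sent through it (both input and
 * output are dst_discard); presumably for callers that must keep a valid
 * dst attached while forbidding further transmission through it.
 */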
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)

	if (fib6_check_expired(f6i))

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)

	if (rt6_check_expired(rt))

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	/* All IPV6 dsts are created with ->obsolete set to
	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
	 * down into this function.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
		dst_ret = rt6_check(rt, from, cookie);

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt->rt6i_flags & RTF_CACHE) {
		if (rt6_check_expired(rt)) {
			rt6_remove_exception_rt(rt);

static void ip6_link_failure(struct sk_buff *skb)
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		from = rcu_dereference(rt0->from);
			rt0->dst.expires = from->expires;

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
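
/* PMTU updates below are clamped to no less than IPV6_MIN_MTU (1280) and
 * only ever lower the current dst MTU. A dst that is already
 * destination-specific (RTF_CACHE or uncached) is updated in place;
 * otherwise a new RTF_CACHE clone is created and inserted as an
 * exception, so the learned MTU affects only this destination.
 */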
2328 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2329 const struct ipv6hdr *iph, u32 mtu)
2331 const struct in6_addr *daddr, *saddr;
2332 struct rt6_info *rt6 = (struct rt6_info *)dst;
2334 if (dst_metric_locked(dst, RTAX_MTU))
2338 daddr = &iph->daddr;
2339 saddr = &iph->saddr;
2341 daddr = &sk->sk_v6_daddr;
2342 saddr = &inet6_sk(sk)->saddr;
2347 dst_confirm_neigh(dst, daddr);
2348 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2349 if (mtu >= dst_mtu(dst))
2352 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2353 rt6_do_update_pmtu(rt6, mtu);
2354 /* update rt6_ex->stamp for cache */
2355 if (rt6->rt6i_flags & RTF_CACHE)
2356 rt6_update_exception_stamp_rt(rt6);
2358 struct fib6_result res = {};
2359 struct rt6_info *nrt6;
2362 res.f6i = rcu_dereference(rt6->from);
2367 res.nh = &res.f6i->fib6_nh;
2368 res.fib6_flags = res.f6i->fib6_flags;
2369 res.fib6_type = res.f6i->fib6_type;
2371 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2373 rt6_do_update_pmtu(nrt6, mtu);
2374 if (rt6_insert_exception(nrt6, &res))
2375 dst_release_immediate(&nrt6->dst);
2381 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2382 struct sk_buff *skb, u32 mtu)
2384 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2387 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2388 int oif, u32 mark, kuid_t uid)
2390 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2391 struct dst_entry *dst;
2392 struct flowi6 fl6 = {
2394 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2395 .daddr = iph->daddr,
2396 .saddr = iph->saddr,
2397 .flowlabel = ip6_flowinfo(iph),
2401 dst = ip6_route_output(net, NULL, &fl6);
2403 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2406 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2408 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2410 int oif = sk->sk_bound_dev_if;
2411 struct dst_entry *dst;
2413 if (!oif && skb->dev)
2414 oif = l3mdev_master_ifindex(skb->dev);
2416 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2418 dst = __sk_dst_get(sk);
2419 if (!dst || !dst->obsolete ||
2420 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2424 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2425 ip6_datagram_dst_update(sk, false);
2428 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2430 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2431 const struct flowi6 *fl6)
2433 #ifdef CONFIG_IPV6_SUBTREES
2434 struct ipv6_pinfo *np = inet6_sk(sk);
2437 ip6_dst_store(sk, dst,
2438 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2439 &sk->sk_v6_daddr : NULL,
2440 #ifdef CONFIG_IPV6_SUBTREES
2441 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2447 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2449 const struct in6_addr *gw,
2450 struct rt6_info **ret)
2452 const struct fib6_nh *nh = res->nh;
2454 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2455 fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2458 /* rt_cache's gateway might be different from its 'parent'
2459 * in the case of an ip redirect.
2460 * So we keep searching in the exception table if the gateway
2463 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2464 struct rt6_info *rt_cache;
2466 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2468 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2477 /* Handle redirects */
2478 struct ip6rd_flowi {
2480 struct in6_addr gateway;
2483 static struct rt6_info *__ip6_route_redirect(struct net *net,
2484 struct fib6_table *table,
2486 const struct sk_buff *skb,
2489 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2490 struct rt6_info *ret = NULL;
2491 struct fib6_result res = {};
2492 struct fib6_info *rt;
2493 struct fib6_node *fn;
2495 /* Get the "current" route for this destination and
2496 * check if the redirect has come from appropriate router.
2498 * RFC 4861 specifies that redirects should only be
2499 * accepted if they come from the nexthop to the target.
2500 * Due to the way the routes are chosen, this notion
2501 * is a bit fuzzy and one might need to check all possible
2506 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2508 for_each_fib6_node_rt_rcu(fn) {
2510 res.nh = &rt->fib6_nh;
2512 if (fib6_check_expired(rt))
2514 if (rt->fib6_flags & RTF_REJECT)
2516 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2521 rt = net->ipv6.fib6_null_entry;
2522 else if (rt->fib6_flags & RTF_REJECT) {
2523 ret = net->ipv6.ip6_null_entry;
2527 if (rt == net->ipv6.fib6_null_entry) {
2528 fn = fib6_backtrack(fn, &fl6->saddr);
2534 res.nh = &rt->fib6_nh;
2537 ip6_hold_safe(net, &ret);
2539 res.fib6_flags = res.f6i->fib6_flags;
2540 res.fib6_type = res.f6i->fib6_type;
2541 ret = ip6_create_rt_rcu(&res);
2546 trace_fib6_table_lookup(net, &res, table, fl6);
2550 static struct dst_entry *ip6_route_redirect(struct net *net,
2551 const struct flowi6 *fl6,
2552 const struct sk_buff *skb,
2553 const struct in6_addr *gateway)
2555 int flags = RT6_LOOKUP_F_HAS_SADDR;
2556 struct ip6rd_flowi rdfl;
2559 rdfl.gateway = *gateway;
2561 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2562 flags, __ip6_route_redirect);
2565 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2568 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2569 struct dst_entry *dst;
2570 struct flowi6 fl6 = {
2571 .flowi6_iif = LOOPBACK_IFINDEX,
2573 .flowi6_mark = mark,
2574 .daddr = iph->daddr,
2575 .saddr = iph->saddr,
2576 .flowlabel = ip6_flowinfo(iph),
2580 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2581 rt6_do_redirect(dst, NULL, skb);
2584 EXPORT_SYMBOL_GPL(ip6_redirect);
2586 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2588 const struct ipv6hdr *iph = ipv6_hdr(skb);
2589 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2590 struct dst_entry *dst;
2591 struct flowi6 fl6 = {
2592 .flowi6_iif = LOOPBACK_IFINDEX,
2595 .saddr = iph->daddr,
2596 .flowi6_uid = sock_net_uid(net, NULL),
2599 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2600 rt6_do_redirect(dst, NULL, skb);
2604 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2606 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2609 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2611 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2613 struct net_device *dev = dst->dev;
2614 unsigned int mtu = dst_mtu(dst);
2615 struct net *net = dev_net(dev);
2617 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2619 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2620 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2623 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2624 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2625 * IPV6_MAXPLEN is also valid and means: "any MSS,
2626 * rely only on pmtu discovery"
2628 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
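/* Worked example: on a standard Ethernet link (MTU 1500) the advertised
 * MSS comes out to 1500 - 40 (ipv6hdr) - 20 (tcphdr) = 1440. The clamp
 * above only matters on jumbo-capable links, where an MSS of
 * IPV6_MAXPLEN - sizeof(struct tcphdr) means "any MSS, rely on PMTUD".
 */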
2633 static unsigned int ip6_mtu(const struct dst_entry *dst)
2635 struct inet6_dev *idev;
2638 mtu = dst_metric_raw(dst, RTAX_MTU);
2645 idev = __in6_dev_get(dst->dev);
2647 mtu = idev->cnf.mtu6;
2651 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2653 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2657 * 1. mtu on route is locked - use it
2658 * 2. mtu from nexthop exception
2659 * 3. mtu from egress device
2661 * based on ip6_dst_mtu_forward and exception logic of
2662 * rt6_find_cached_rt; called with rcu_read_lock
2664 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2665 const struct in6_addr *daddr,
2666 const struct in6_addr *saddr)
2668 struct rt6_exception_bucket *bucket;
2669 const struct fib6_nh *nh = res->nh;
2670 struct fib6_info *f6i = res->f6i;
2671 const struct in6_addr *src_key;
2672 struct rt6_exception *rt6_ex;
2673 struct inet6_dev *idev;
2676 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2677 mtu = f6i->fib6_pmtu;
2683 #ifdef CONFIG_IPV6_SUBTREES
2684 if (f6i->fib6_src.plen)
2688 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2689 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2690 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2691 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2694 struct net_device *dev = nh->fib_nh_dev;
2697 idev = __in6_dev_get(dev);
2698 if (idev && idev->cnf.mtu6 > mtu)
2699 mtu = idev->cnf.mtu6;
2702 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2704 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
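/* Illustrative caller (hedged; the real call sites live outside this
 * file, e.g. the BPF FIB lookup helper):
 *
 *	rcu_read_lock();
 *	err = fib6_lookup(net, oif, &fl6, &res, flags);
 *	if (!err)
 *		mtu = ip6_mtu_from_fib6(&res, &fl6.daddr, &fl6.saddr);
 *	rcu_read_unlock();
 */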
2707 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2710 struct dst_entry *dst;
2711 struct rt6_info *rt;
2712 struct inet6_dev *idev = in6_dev_get(dev);
2713 struct net *net = dev_net(dev);
2715 if (unlikely(!idev))
2716 return ERR_PTR(-ENODEV);
2718 rt = ip6_dst_alloc(net, dev, 0);
2719 if (unlikely(!rt)) {
2721 dst = ERR_PTR(-ENOMEM);
2725 rt->dst.flags |= DST_HOST;
2726 rt->dst.input = ip6_input;
2727 rt->dst.output = ip6_output;
2728 rt->rt6i_gateway = fl6->daddr;
2729 rt->rt6i_dst.addr = fl6->daddr;
2730 rt->rt6i_dst.plen = 128;
2731 rt->rt6i_idev = idev;
2732 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2734 /* Add this dst into uncached_list so that rt6_disable_ip() can
2735 * do proper release of the net_device
2737 rt6_uncached_list_add(rt);
2738 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2740 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2746 static int ip6_dst_gc(struct dst_ops *ops)
2748 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2749 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2750 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2751 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2752 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2753 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2756 entries = dst_entries_get_fast(ops);
2757 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2758 entries <= rt_max_size)
2761 net->ipv6.ip6_rt_gc_expire++;
2762 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2763 entries = dst_entries_get_slow(ops);
2764 if (entries < ops->gc_thresh)
2765 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2767 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2768 return entries > rt_max_size;
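/* Note on the arithmetic above: ip6_rt_gc_expire decays by
 * expire >> gc_elasticity on every invocation (with the default
 * elasticity of 9 that is roughly 0.2% per call), and is reset to
 * gc_timeout / 2 once the entry count drops below gc_thresh, so the
 * aggressiveness accumulated under memory pressure drains away slowly.
 */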
2771 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2772 struct fib6_config *cfg,
2773 const struct in6_addr *gw_addr,
2774 u32 tbid, int flags)
2776 struct flowi6 fl6 = {
2777 .flowi6_oif = cfg->fc_ifindex,
2779 .saddr = cfg->fc_prefsrc,
2781 struct fib6_table *table;
2782 struct rt6_info *rt;
2784 table = fib6_get_table(net, tbid);
2788 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2789 flags |= RT6_LOOKUP_F_HAS_SADDR;
2791 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2792 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2794 /* if table lookup failed, fall back to full lookup */
2795 if (rt == net->ipv6.ip6_null_entry) {
2803 static int ip6_route_check_nh_onlink(struct net *net,
2804 struct fib6_config *cfg,
2805 const struct net_device *dev,
2806 struct netlink_ext_ack *extack)
2808 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2809 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2810 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2811 struct fib6_info *from;
2812 struct rt6_info *grt;
2816 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2819 from = rcu_dereference(grt->from);
2820 if (!grt->dst.error &&
2821 /* ignore match if it is the default route */
2822 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2823 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2824 NL_SET_ERR_MSG(extack,
2825 "Nexthop has invalid gateway or device mismatch");
2836 static int ip6_route_check_nh(struct net *net,
2837 struct fib6_config *cfg,
2838 struct net_device **_dev,
2839 struct inet6_dev **idev)
2841 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2842 struct net_device *dev = _dev ? *_dev : NULL;
2843 struct rt6_info *grt = NULL;
2844 int err = -EHOSTUNREACH;
2846 if (cfg->fc_table) {
2847 int flags = RT6_LOOKUP_F_IFACE;
2849 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2850 cfg->fc_table, flags);
2852 if (grt->rt6i_flags & RTF_GATEWAY ||
2853 (dev && dev != grt->dst.dev)) {
2861 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2867 if (dev != grt->dst.dev) {
2872 *_dev = dev = grt->dst.dev;
2873 *idev = grt->rt6i_idev;
2875 in6_dev_hold(grt->rt6i_idev);
2878 if (!(grt->rt6i_flags & RTF_GATEWAY))
2887 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2888 struct net_device **_dev, struct inet6_dev **idev,
2889 struct netlink_ext_ack *extack)
2891 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2892 int gwa_type = ipv6_addr_type(gw_addr);
2893 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2894 const struct net_device *dev = *_dev;
2895 bool need_addr_check = !dev;
2898 /* if gw_addr is local we will fail to detect this in case
2899 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2900 * will return the already-added prefix route via the interface that
2901 * the prefix route was assigned to, which might be non-loopback.
2904 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2905 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2909 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2910 /* IPv6 strictly inhibits using non-link-local
2911 * addresses as nexthop addresses.
2912 * Otherwise, a router will not be able to send redirects.
2913 * It is very good, but in some (rare!) circumstances
2914 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2915 * some exceptions. --ANK
2916 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2919 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2920 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2924 if (cfg->fc_flags & RTNH_F_ONLINK)
2925 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2927 err = ip6_route_check_nh(net, cfg, _dev, idev);
2933 /* reload in case device was changed */
2938 NL_SET_ERR_MSG(extack, "Egress device not specified");
2940 } else if (dev->flags & IFF_LOOPBACK) {
2941 NL_SET_ERR_MSG(extack,
2942 "Egress device can not be loopback device for this route");
2946 /* if we did not check gw_addr above, do so now that the
2947 * egress device has been resolved.
2949 if (need_addr_check &&
2950 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2951 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
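/* Example of the policy enforced above (illustrative commands):
 *
 *	ip -6 route add 2001:db8::/32 via fe80::1 dev eth0  # link-local gw, ok
 *	ip -6 route add 2001:db8::/32 via 2001:db8:1::1     # global unicast, ok
 *
 * whereas a multicast or otherwise non-unicast, non-IPv4-mapped gateway
 * is rejected with "Invalid gateway address".
 */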
2960 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2962 if ((flags & RTF_REJECT) ||
2963 (dev && (dev->flags & IFF_LOOPBACK) &&
2964 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2965 !(flags & RTF_LOCAL)))
2971 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2972 struct fib6_config *cfg, gfp_t gfp_flags,
2973 struct netlink_ext_ack *extack)
2975 struct net_device *dev = NULL;
2976 struct inet6_dev *idev = NULL;
2980 fib6_nh->fib_nh_family = AF_INET6;
2983 if (cfg->fc_ifindex) {
2984 dev = dev_get_by_index(net, cfg->fc_ifindex);
2987 idev = in6_dev_get(dev);
2992 if (cfg->fc_flags & RTNH_F_ONLINK) {
2994 NL_SET_ERR_MSG(extack,
2995 "Nexthop device required for onlink");
2999 if (!(dev->flags & IFF_UP)) {
3000 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3005 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3008 fib6_nh->fib_nh_weight = 1;
3010 /* We cannot add true routes via loopback here; they would
3011 * result in kernel looping, so promote them to reject routes.
3013 addr_type = ipv6_addr_type(&cfg->fc_dst);
3014 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3015 /* hold loopback dev/idev if we haven't done so. */
3016 if (dev != net->loopback_dev) {
3021 dev = net->loopback_dev;
3023 idev = in6_dev_get(dev);
3032 if (cfg->fc_flags & RTF_GATEWAY) {
3033 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3037 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3038 fib6_nh->fib_nh_gw_family = AF_INET6;
3045 if (idev->cnf.disable_ipv6) {
3046 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3051 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3052 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3057 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3058 !netif_carrier_ok(dev))
3059 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3061 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3062 cfg->fc_encap_type, cfg, gfp_flags, extack);
3066 fib6_nh->fib_nh_dev = dev;
3067 fib6_nh->fib_nh_oif = dev->ifindex;
3074 lwtstate_put(fib6_nh->fib_nh_lws);
3075 fib6_nh->fib_nh_lws = NULL;
3083 void fib6_nh_release(struct fib6_nh *fib6_nh)
3085 fib_nh_common_release(&fib6_nh->nh_common);
3088 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3090 struct netlink_ext_ack *extack)
3092 struct net *net = cfg->fc_nlinfo.nl_net;
3093 struct fib6_info *rt = NULL;
3094 struct fib6_table *table;
3098 /* RTF_PCPU is an internal flag; can not be set by userspace */
3099 if (cfg->fc_flags & RTF_PCPU) {
3100 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3104 /* RTF_CACHE is an internal flag; can not be set by userspace */
3105 if (cfg->fc_flags & RTF_CACHE) {
3106 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3110 if (cfg->fc_type > RTN_MAX) {
3111 NL_SET_ERR_MSG(extack, "Invalid route type");
3115 if (cfg->fc_dst_len > 128) {
3116 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3119 if (cfg->fc_src_len > 128) {
3120 NL_SET_ERR_MSG(extack, "Invalid source address length");
3123 #ifndef CONFIG_IPV6_SUBTREES
3124 if (cfg->fc_src_len) {
3125 NL_SET_ERR_MSG(extack,
3126 "Specifying source address requires IPV6_SUBTREES to be enabled");
3132 if (cfg->fc_nlinfo.nlh &&
3133 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3134 table = fib6_get_table(net, cfg->fc_table);
3136 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3137 table = fib6_new_table(net, cfg->fc_table);
3140 table = fib6_new_table(net, cfg->fc_table);
3147 rt = fib6_info_alloc(gfp_flags);
3151 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3153 if (IS_ERR(rt->fib6_metrics)) {
3154 err = PTR_ERR(rt->fib6_metrics);
3155 /* Do not leave garbage there. */
3156 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3160 if (cfg->fc_flags & RTF_ADDRCONF)
3161 rt->dst_nocount = true;
3163 if (cfg->fc_flags & RTF_EXPIRES)
3164 fib6_set_expires(rt, jiffies +
3165 clock_t_to_jiffies(cfg->fc_expires));
3167 fib6_clean_expires(rt);
3169 if (cfg->fc_protocol == RTPROT_UNSPEC)
3170 cfg->fc_protocol = RTPROT_BOOT;
3171 rt->fib6_protocol = cfg->fc_protocol;
3173 rt->fib6_table = table;
3174 rt->fib6_metric = cfg->fc_metric;
3175 rt->fib6_type = cfg->fc_type;
3176 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3178 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3179 rt->fib6_dst.plen = cfg->fc_dst_len;
3180 if (rt->fib6_dst.plen == 128)
3181 rt->dst_host = true;
3183 #ifdef CONFIG_IPV6_SUBTREES
3184 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3185 rt->fib6_src.plen = cfg->fc_src_len;
3187 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3191 /* We cannot add true routes via loopback here; they would
3192 * result in kernel looping, so promote them to reject routes
3194 addr_type = ipv6_addr_type(&cfg->fc_dst);
3195 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3196 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3198 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3199 struct net_device *dev = fib6_info_nh_dev(rt);
3201 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3202 NL_SET_ERR_MSG(extack, "Invalid source address");
3206 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3207 rt->fib6_prefsrc.plen = 128;
3209 rt->fib6_prefsrc.plen = 0;
3213 fib6_info_release(rt);
3214 return ERR_PTR(err);
3217 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3218 struct netlink_ext_ack *extack)
3220 struct fib6_info *rt;
3223 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3227 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3228 fib6_info_release(rt);
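/* Illustrative in-kernel usage (a hedged sketch): adding a simple onlink
 * unicast route from a fib6_config, the way addrconf and the rtnetlink
 * handlers further below do:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst		= prefix,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_type	= RTN_UNICAST,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 */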
3233 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3235 struct net *net = info->nl_net;
3236 struct fib6_table *table;
3239 if (rt == net->ipv6.fib6_null_entry) {
3244 table = rt->fib6_table;
3245 spin_lock_bh(&table->tb6_lock);
3246 err = fib6_del(rt, info);
3247 spin_unlock_bh(&table->tb6_lock);
3250 fib6_info_release(rt);
3254 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3256 struct nl_info info = { .nl_net = net };
3258 return __ip6_del_rt(rt, &info);
3261 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3263 struct nl_info *info = &cfg->fc_nlinfo;
3264 struct net *net = info->nl_net;
3265 struct sk_buff *skb = NULL;
3266 struct fib6_table *table;
3269 if (rt == net->ipv6.fib6_null_entry)
3271 table = rt->fib6_table;
3272 spin_lock_bh(&table->tb6_lock);
3274 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3275 struct fib6_info *sibling, *next_sibling;
3277 /* prefer to send a single notification with all hops */
3278 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3280 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3282 if (rt6_fill_node(net, skb, rt, NULL,
3283 NULL, NULL, 0, RTM_DELROUTE,
3284 info->portid, seq, 0) < 0) {
3288 info->skip_notify = 1;
3291 list_for_each_entry_safe(sibling, next_sibling,
3294 err = fib6_del(sibling, info);
3300 err = fib6_del(rt, info);
3302 spin_unlock_bh(&table->tb6_lock);
3304 fib6_info_release(rt);
3307 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3308 info->nlh, gfp_any());
3313 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3317 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3320 if (cfg->fc_flags & RTF_GATEWAY &&
3321 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3324 rc = rt6_remove_exception_rt(rt);
3329 static int ip6_route_del(struct fib6_config *cfg,
3330 struct netlink_ext_ack *extack)
3332 struct rt6_info *rt_cache;
3333 struct fib6_table *table;
3334 struct fib6_info *rt;
3335 struct fib6_node *fn;
3338 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3340 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3346 fn = fib6_locate(&table->tb6_root,
3347 &cfg->fc_dst, cfg->fc_dst_len,
3348 &cfg->fc_src, cfg->fc_src_len,
3349 !(cfg->fc_flags & RTF_CACHE));
3352 for_each_fib6_node_rt_rcu(fn) {
3355 if (cfg->fc_flags & RTF_CACHE) {
3356 struct fib6_result res = {
3361 rt_cache = rt6_find_cached_rt(&res,
3365 rc = ip6_del_cached_rt(rt_cache, cfg);
3375 if (cfg->fc_ifindex &&
3377 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3379 if (cfg->fc_flags & RTF_GATEWAY &&
3380 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3382 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3384 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3386 if (!fib6_info_hold_safe(rt))
3390 /* if a gateway was specified, only delete the one hop */
3391 if (cfg->fc_flags & RTF_GATEWAY)
3392 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3394 return __ip6_del_rt_siblings(rt, cfg);
3402 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3404 struct netevent_redirect netevent;
3405 struct rt6_info *rt, *nrt = NULL;
3406 struct fib6_result res = {};
3407 struct ndisc_options ndopts;
3408 struct inet6_dev *in6_dev;
3409 struct neighbour *neigh;
3411 int optlen, on_link;
3414 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3415 optlen -= sizeof(*msg);
3418 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3422 msg = (struct rd_msg *)icmp6_hdr(skb);
3424 if (ipv6_addr_is_multicast(&msg->dest)) {
3425 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3430 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3432 } else if (ipv6_addr_type(&msg->target) !=
3433 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3434 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3438 in6_dev = __in6_dev_get(skb->dev);
3441 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3445 * The IP source address of the Redirect MUST be the same as the current
3446 * first-hop router for the specified ICMP Destination Address.
3449 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3450 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3455 if (ndopts.nd_opts_tgt_lladdr) {
3456 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3459 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3464 rt = (struct rt6_info *) dst;
3465 if (rt->rt6i_flags & RTF_REJECT) {
3466 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3470 /* Redirect received -> path was valid.
3471 * Look, redirects are sent only in response to data packets,
3472 * so that this nexthop apparently is reachable. --ANK
3474 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3476 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3481 * We have finally decided to accept it.
3484 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3485 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3486 NEIGH_UPDATE_F_OVERRIDE|
3487 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3488 NEIGH_UPDATE_F_ISROUTER)),
3489 NDISC_REDIRECT, &ndopts);
3492 res.f6i = rcu_dereference(rt->from);
3496 res.nh = &res.f6i->fib6_nh;
3497 res.fib6_flags = res.f6i->fib6_flags;
3498 res.fib6_type = res.f6i->fib6_type;
3499 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3503 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3505 nrt->rt6i_flags &= ~RTF_GATEWAY;
3507 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3509 /* rt6_insert_exception() will take care of duplicated exceptions */
3510 if (rt6_insert_exception(nrt, &res)) {
3511 dst_release_immediate(&nrt->dst);
3515 netevent.old = &rt->dst;
3516 netevent.new = &nrt->dst;
3517 netevent.daddr = &msg->dest;
3518 netevent.neigh = neigh;
3519 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3523 neigh_release(neigh);
3526 #ifdef CONFIG_IPV6_ROUTE_INFO
3527 static struct fib6_info *rt6_get_route_info(struct net *net,
3528 const struct in6_addr *prefix, int prefixlen,
3529 const struct in6_addr *gwaddr,
3530 struct net_device *dev)
3532 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3533 int ifindex = dev->ifindex;
3534 struct fib6_node *fn;
3535 struct fib6_info *rt = NULL;
3536 struct fib6_table *table;
3538 table = fib6_get_table(net, tb_id);
3543 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3547 for_each_fib6_node_rt_rcu(fn) {
3548 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3550 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3551 !rt->fib6_nh.fib_nh_gw_family)
3553 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3555 if (!fib6_info_hold_safe(rt))
3564 static struct fib6_info *rt6_add_route_info(struct net *net,
3565 const struct in6_addr *prefix, int prefixlen,
3566 const struct in6_addr *gwaddr,
3567 struct net_device *dev,
3570 struct fib6_config cfg = {
3571 .fc_metric = IP6_RT_PRIO_USER,
3572 .fc_ifindex = dev->ifindex,
3573 .fc_dst_len = prefixlen,
3574 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3575 RTF_UP | RTF_PREF(pref),
3576 .fc_protocol = RTPROT_RA,
3577 .fc_type = RTN_UNICAST,
3578 .fc_nlinfo.portid = 0,
3579 .fc_nlinfo.nlh = NULL,
3580 .fc_nlinfo.nl_net = net,
3583 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3584 cfg.fc_dst = *prefix;
3585 cfg.fc_gateway = *gwaddr;
3587 /* We should treat it as a default route if prefix length is 0. */
3589 cfg.fc_flags |= RTF_DEFAULT;
3591 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3593 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3597 struct fib6_info *rt6_get_dflt_router(struct net *net,
3598 const struct in6_addr *addr,
3599 struct net_device *dev)
3601 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3602 struct fib6_info *rt;
3603 struct fib6_table *table;
3605 table = fib6_get_table(net, tb_id);
3610 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3611 struct fib6_nh *nh = &rt->fib6_nh;
3613 if (dev == nh->fib_nh_dev &&
3614 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3615 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3618 if (rt && !fib6_info_hold_safe(rt))
3624 struct fib6_info *rt6_add_dflt_router(struct net *net,
3625 const struct in6_addr *gwaddr,
3626 struct net_device *dev,
3629 struct fib6_config cfg = {
3630 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3631 .fc_metric = IP6_RT_PRIO_USER,
3632 .fc_ifindex = dev->ifindex,
3633 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3634 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3635 .fc_protocol = RTPROT_RA,
3636 .fc_type = RTN_UNICAST,
3637 .fc_nlinfo.portid = 0,
3638 .fc_nlinfo.nlh = NULL,
3639 .fc_nlinfo.nl_net = net,
3642 cfg.fc_gateway = *gwaddr;
3644 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3645 struct fib6_table *table;
3647 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3649 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3652 return rt6_get_dflt_router(net, gwaddr, dev);
3655 static void __rt6_purge_dflt_routers(struct net *net,
3656 struct fib6_table *table)
3658 struct fib6_info *rt;
3662 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3663 struct net_device *dev = fib6_info_nh_dev(rt);
3664 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3666 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3667 (!idev || idev->cnf.accept_ra != 2) &&
3668 fib6_info_hold_safe(rt)) {
3670 ip6_del_rt(net, rt);
3676 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3679 void rt6_purge_dflt_routers(struct net *net)
3681 struct fib6_table *table;
3682 struct hlist_head *head;
3687 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3688 head = &net->ipv6.fib_table_hash[h];
3689 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3690 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3691 __rt6_purge_dflt_routers(net, table);
3698 static void rtmsg_to_fib6_config(struct net *net,
3699 struct in6_rtmsg *rtmsg,
3700 struct fib6_config *cfg)
3702 *cfg = (struct fib6_config){
3703 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3705 .fc_ifindex = rtmsg->rtmsg_ifindex,
3706 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3707 .fc_expires = rtmsg->rtmsg_info,
3708 .fc_dst_len = rtmsg->rtmsg_dst_len,
3709 .fc_src_len = rtmsg->rtmsg_src_len,
3710 .fc_flags = rtmsg->rtmsg_flags,
3711 .fc_type = rtmsg->rtmsg_type,
3713 .fc_nlinfo.nl_net = net,
3715 .fc_dst = rtmsg->rtmsg_dst,
3716 .fc_src = rtmsg->rtmsg_src,
3717 .fc_gateway = rtmsg->rtmsg_gateway,
3721 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3723 struct fib6_config cfg;
3724 struct in6_rtmsg rtmsg;
3728 case SIOCADDRT: /* Add a route */
3729 case SIOCDELRT: /* Delete a route */
3730 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3732 err = copy_from_user(&rtmsg, arg,
3733 sizeof(struct in6_rtmsg));
3737 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3742 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3745 err = ip6_route_del(&cfg, NULL);
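/* Userspace view of the ioctl path above (illustrative):
 *
 *	struct in6_rtmsg rtmsg = { ... };
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *	ioctl(fd, SIOCADDRT, &rtmsg);	// needs CAP_NET_ADMIN
 *
 * Modern tooling uses rtnetlink (RTM_NEWROUTE / RTM_DELROUTE) instead,
 * handled by inet6_rtm_newroute() / inet6_rtm_delroute() below.
 */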
3759 * Drop the packet on the floor
3762 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3764 struct dst_entry *dst = skb_dst(skb);
3765 struct net *net = dev_net(dst->dev);
3766 struct inet6_dev *idev;
3769 if (netif_is_l3_master(skb->dev) &&
3770 dst->dev == net->loopback_dev)
3771 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3773 idev = ip6_dst_idev(dst);
3775 switch (ipstats_mib_noroutes) {
3776 case IPSTATS_MIB_INNOROUTES:
3777 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3778 if (type == IPV6_ADDR_ANY) {
3779 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3783 case IPSTATS_MIB_OUTNOROUTES:
3784 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3788 /* Start over by dropping the dst for l3mdev case */
3789 if (netif_is_l3_master(skb->dev))
3792 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3797 static int ip6_pkt_discard(struct sk_buff *skb)
3799 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3802 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3804 skb->dev = skb_dst(skb)->dev;
3805 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3808 static int ip6_pkt_prohibit(struct sk_buff *skb)
3810 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3813 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3815 skb->dev = skb_dst(skb)->dev;
3816 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3820 * Allocate a dst for local (unicast / anycast) address.
3823 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3824 struct inet6_dev *idev,
3825 const struct in6_addr *addr,
3826 bool anycast, gfp_t gfp_flags)
3828 struct fib6_config cfg = {
3829 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3830 .fc_ifindex = idev->dev->ifindex,
3831 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3834 .fc_protocol = RTPROT_KERNEL,
3835 .fc_nlinfo.nl_net = net,
3836 .fc_ignore_dev_down = true,
3840 cfg.fc_type = RTN_ANYCAST;
3841 cfg.fc_flags |= RTF_ANYCAST;
3843 cfg.fc_type = RTN_LOCAL;
3844 cfg.fc_flags |= RTF_LOCAL;
3847 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3850 /* remove deleted IP from prefsrc entries */
3851 struct arg_dev_net_ip {
3852 struct net_device *dev;
3854 struct in6_addr *addr;
3857 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3859 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3860 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3861 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3863 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3864 rt != net->ipv6.fib6_null_entry &&
3865 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3866 spin_lock_bh(&rt6_exception_lock);
3867 /* remove prefsrc entry */
3868 rt->fib6_prefsrc.plen = 0;
3869 spin_unlock_bh(&rt6_exception_lock);
3874 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3876 struct net *net = dev_net(ifp->idev->dev);
3877 struct arg_dev_net_ip adni = {
3878 .dev = ifp->idev->dev,
3882 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3885 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
3887 /* Remove routers and update dst entries when a gateway turns into a host. */
3888 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3890 struct in6_addr *gateway = (struct in6_addr *)arg;
3892 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3893 rt->fib6_nh.fib_nh_gw_family &&
3894 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3898 /* Further clean up cached routes in exception table.
3899 * This is needed because a cached route may have a different
3900 * gateway than its 'parent' in the case of an ip redirect.
3902 rt6_exceptions_clean_tohost(rt, gateway);
3907 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3909 fib6_clean_all(net, fib6_clean_tohost, gateway);
3912 struct arg_netdev_event {
3913 const struct net_device *dev;
3915 unsigned char nh_flags;
3916 unsigned long event;
3920 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3922 struct fib6_info *iter;
3923 struct fib6_node *fn;
3925 fn = rcu_dereference_protected(rt->fib6_node,
3926 lockdep_is_held(&rt->fib6_table->tb6_lock));
3927 iter = rcu_dereference_protected(fn->leaf,
3928 lockdep_is_held(&rt->fib6_table->tb6_lock));
3930 if (iter->fib6_metric == rt->fib6_metric &&
3931 rt6_qualify_for_ecmp(iter))
3933 iter = rcu_dereference_protected(iter->fib6_next,
3934 lockdep_is_held(&rt->fib6_table->tb6_lock));
3940 static bool rt6_is_dead(const struct fib6_info *rt)
3942 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3943 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3944 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3950 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3952 struct fib6_info *iter;
3955 if (!rt6_is_dead(rt))
3956 total += rt->fib6_nh.fib_nh_weight;
3958 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3959 if (!rt6_is_dead(iter))
3960 total += iter->fib6_nh.fib_nh_weight;
3966 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3968 int upper_bound = -1;
3970 if (!rt6_is_dead(rt)) {
3971 *weight += rt->fib6_nh.fib_nh_weight;
3972 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3975 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
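/* Worked example of hash-threshold ECMP (RFC 2992): two live siblings
 * with weights 1 and 3 give total = 4. The running *weight is 1 after
 * the first nexthop and 4 after the second, so the upper bounds are
 * (1 << 31) / 4 - 1 and (4 << 31) / 4 - 1, i.e. the first nexthop owns
 * the bottom quarter of the 31-bit hash space and the second the rest.
 */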
3978 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3980 struct fib6_info *iter;
3983 rt6_upper_bound_set(rt, &weight, total);
3985 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3986 rt6_upper_bound_set(iter, &weight, total);
3989 void rt6_multipath_rebalance(struct fib6_info *rt)
3991 struct fib6_info *first;
3994 /* If the entire multipath route was marked for flushing, there is
3995 * no need to rebalance upon the removal of every sibling route.
3998 if (!rt->fib6_nsiblings || rt->should_flush)
4001 /* During lookup routes are evaluated in order, so we need to
4002 * make sure upper bounds are assigned from the first sibling onwards.
4005 first = rt6_multipath_first_sibling(rt);
4006 if (WARN_ON_ONCE(!first))
4009 total = rt6_multipath_total_weight(first);
4010 rt6_multipath_upper_bound_set(first, total);
4013 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4015 const struct arg_netdev_event *arg = p_arg;
4016 struct net *net = dev_net(arg->dev);
4018 if (rt != net->ipv6.fib6_null_entry &&
4019 rt->fib6_nh.fib_nh_dev == arg->dev) {
4020 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4021 fib6_update_sernum_upto_root(net, rt);
4022 rt6_multipath_rebalance(rt);
4028 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4030 struct arg_netdev_event arg = {
4033 .nh_flags = nh_flags,
4037 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4038 arg.nh_flags |= RTNH_F_LINKDOWN;
4040 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4043 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4044 const struct net_device *dev)
4046 struct fib6_info *iter;
4048 if (rt->fib6_nh.fib_nh_dev == dev)
4050 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4051 if (iter->fib6_nh.fib_nh_dev == dev)
4057 static void rt6_multipath_flush(struct fib6_info *rt)
4059 struct fib6_info *iter;
4061 rt->should_flush = 1;
4062 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063 iter->should_flush = 1;
4066 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4067 const struct net_device *down_dev)
4069 struct fib6_info *iter;
4070 unsigned int dead = 0;
4072 if (rt->fib6_nh.fib_nh_dev == down_dev ||
4073 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4075 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4077 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4083 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4084 const struct net_device *dev,
4085 unsigned char nh_flags)
4087 struct fib6_info *iter;
4089 if (rt->fib6_nh.fib_nh_dev == dev)
4090 rt->fib6_nh.fib_nh_flags |= nh_flags;
4091 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4092 if (iter->fib6_nh.fib_nh_dev == dev)
4093 iter->fib6_nh.fib_nh_flags |= nh_flags;
4096 /* called with write lock held for table with rt */
4097 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4099 const struct arg_netdev_event *arg = p_arg;
4100 const struct net_device *dev = arg->dev;
4101 struct net *net = dev_net(dev);
4103 if (rt == net->ipv6.fib6_null_entry)
4106 switch (arg->event) {
4107 case NETDEV_UNREGISTER:
4108 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4110 if (rt->should_flush)
4112 if (!rt->fib6_nsiblings)
4113 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4114 if (rt6_multipath_uses_dev(rt, dev)) {
4117 count = rt6_multipath_dead_count(rt, dev);
4118 if (rt->fib6_nsiblings + 1 == count) {
4119 rt6_multipath_flush(rt);
4122 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4124 fib6_update_sernum(net, rt);
4125 rt6_multipath_rebalance(rt);
4129 if (rt->fib6_nh.fib_nh_dev != dev ||
4130 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4132 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4133 rt6_multipath_rebalance(rt);
4140 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4142 struct arg_netdev_event arg = {
4148 struct net *net = dev_net(dev);
4150 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4151 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4153 fib6_clean_all(net, fib6_ifdown, &arg);
4156 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4158 rt6_sync_down_dev(dev, event);
4159 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4160 neigh_ifdown(&nd_tbl, dev);
4163 struct rt6_mtu_change_arg {
4164 struct net_device *dev;
4168 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4170 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4171 struct inet6_dev *idev;
4173 /* In IPv6, PMTU discovery is not optional,
4174 so the RTAX_MTU lock cannot disable it.
4175 We still use this lock to block changes
4176 caused by addrconf/ndisc.
4179 idev = __in6_dev_get(arg->dev);
4183 /* For an administrative MTU increase there is no way to discover
4184 an IPv6 PMTU increase, so the PMTU must be updated here.
4185 Since RFC 1981 doesn't cover administrative MTU increases,
4186 updating the PMTU on increase is a MUST (e.g. jumbo frames).
4188 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4189 !fib6_metric_locked(rt, RTAX_MTU)) {
4190 u32 mtu = rt->fib6_pmtu;
4192 if (mtu >= arg->mtu ||
4193 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4194 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4196 spin_lock_bh(&rt6_exception_lock);
4197 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4198 spin_unlock_bh(&rt6_exception_lock);
4203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4205 struct rt6_mtu_change_arg arg = {
4210 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
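/* rt6_mtu_change() is driven from the addrconf netdev notifier on
 * NETDEV_CHANGEMTU, so every FIB entry and cached exception that uses
 * the device is revisited by rt6_mtu_change_route() above.
 */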
4213 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4214 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4215 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4216 [RTA_OIF] = { .type = NLA_U32 },
4217 [RTA_IIF] = { .type = NLA_U32 },
4218 [RTA_PRIORITY] = { .type = NLA_U32 },
4219 [RTA_METRICS] = { .type = NLA_NESTED },
4220 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4221 [RTA_PREF] = { .type = NLA_U8 },
4222 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4223 [RTA_ENCAP] = { .type = NLA_NESTED },
4224 [RTA_EXPIRES] = { .type = NLA_U32 },
4225 [RTA_UID] = { .type = NLA_U32 },
4226 [RTA_MARK] = { .type = NLA_U32 },
4227 [RTA_TABLE] = { .type = NLA_U32 },
4228 [RTA_IP_PROTO] = { .type = NLA_U8 },
4229 [RTA_SPORT] = { .type = NLA_U16 },
4230 [RTA_DPORT] = { .type = NLA_U16 },
4233 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4234 struct fib6_config *cfg,
4235 struct netlink_ext_ack *extack)
4238 struct nlattr *tb[RTA_MAX+1];
4242 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4243 rtm_ipv6_policy, extack);
4248 rtm = nlmsg_data(nlh);
4250 *cfg = (struct fib6_config){
4251 .fc_table = rtm->rtm_table,
4252 .fc_dst_len = rtm->rtm_dst_len,
4253 .fc_src_len = rtm->rtm_src_len,
4255 .fc_protocol = rtm->rtm_protocol,
4256 .fc_type = rtm->rtm_type,
4258 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4259 .fc_nlinfo.nlh = nlh,
4260 .fc_nlinfo.nl_net = sock_net(skb->sk),
4263 if (rtm->rtm_type == RTN_UNREACHABLE ||
4264 rtm->rtm_type == RTN_BLACKHOLE ||
4265 rtm->rtm_type == RTN_PROHIBIT ||
4266 rtm->rtm_type == RTN_THROW)
4267 cfg->fc_flags |= RTF_REJECT;
4269 if (rtm->rtm_type == RTN_LOCAL)
4270 cfg->fc_flags |= RTF_LOCAL;
4272 if (rtm->rtm_flags & RTM_F_CLONED)
4273 cfg->fc_flags |= RTF_CACHE;
4275 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4277 if (tb[RTA_GATEWAY]) {
4278 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4279 cfg->fc_flags |= RTF_GATEWAY;
4282 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4287 int plen = (rtm->rtm_dst_len + 7) >> 3;
4289 if (nla_len(tb[RTA_DST]) < plen)
4292 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4296 int plen = (rtm->rtm_src_len + 7) >> 3;
4298 if (nla_len(tb[RTA_SRC]) < plen)
4301 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4304 if (tb[RTA_PREFSRC])
4305 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4308 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4310 if (tb[RTA_PRIORITY])
4311 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4313 if (tb[RTA_METRICS]) {
4314 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4315 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4319 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4321 if (tb[RTA_MULTIPATH]) {
4322 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4323 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4325 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4326 cfg->fc_mp_len, extack);
4332 pref = nla_get_u8(tb[RTA_PREF]);
4333 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4334 pref != ICMPV6_ROUTER_PREF_HIGH)
4335 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4336 cfg->fc_flags |= RTF_PREF(pref);
4340 cfg->fc_encap = tb[RTA_ENCAP];
4342 if (tb[RTA_ENCAP_TYPE]) {
4343 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4345 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4350 if (tb[RTA_EXPIRES]) {
4351 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4353 if (addrconf_finite_timeout(timeout)) {
4354 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4355 cfg->fc_flags |= RTF_EXPIRES;
4365 struct fib6_info *fib6_info;
4366 struct fib6_config r_cfg;
4367 struct list_head next;
4370 static int ip6_route_info_append(struct net *net,
4371 struct list_head *rt6_nh_list,
4372 struct fib6_info *rt,
4373 struct fib6_config *r_cfg)
4378 list_for_each_entry(nh, rt6_nh_list, next) {
4379 /* check if fib6_info already exists */
4380 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4384 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4388 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4389 list_add_tail(&nh->next, rt6_nh_list);
4394 static void ip6_route_mpath_notify(struct fib6_info *rt,
4395 struct fib6_info *rt_last,
4396 struct nl_info *info,
4399 /* if this is an APPEND route, then rt points to the first route
4400 * inserted and rt_last points to last route inserted. Userspace
4401 * wants a consistent dump of the route which starts at the first
4402 * nexthop. Since sibling routes are always added at the end of
4403 * the list, find the first sibling of the last route appended
4405 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4406 rt = list_first_entry(&rt_last->fib6_siblings,
4412 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4415 static int ip6_route_multipath_add(struct fib6_config *cfg,
4416 struct netlink_ext_ack *extack)
4418 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4419 struct nl_info *info = &cfg->fc_nlinfo;
4420 struct fib6_config r_cfg;
4421 struct rtnexthop *rtnh;
4422 struct fib6_info *rt;
4423 struct rt6_nh *err_nh;
4424 struct rt6_nh *nh, *nh_safe;
4430 int replace = (cfg->fc_nlinfo.nlh &&
4431 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4432 LIST_HEAD(rt6_nh_list);
4434 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4435 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4436 nlflags |= NLM_F_APPEND;
4438 remaining = cfg->fc_mp_len;
4439 rtnh = (struct rtnexthop *)cfg->fc_mp;
4441 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4442 * fib6_info structs per nexthop
4444 while (rtnh_ok(rtnh, remaining)) {
4445 memcpy(&r_cfg, cfg, sizeof(*cfg));
4446 if (rtnh->rtnh_ifindex)
4447 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4449 attrlen = rtnh_attrlen(rtnh);
4451 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4453 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4455 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4456 r_cfg.fc_flags |= RTF_GATEWAY;
4458 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4459 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4461 r_cfg.fc_encap_type = nla_get_u16(nla);
4464 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4465 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4471 if (!rt6_qualify_for_ecmp(rt)) {
4473 NL_SET_ERR_MSG(extack,
4474 "Device only routes can not be added for IPv6 using the multipath API.");
4475 fib6_info_release(rt);
4479 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4481 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4484 fib6_info_release(rt);
4488 rtnh = rtnh_next(rtnh, &remaining);
4491 /* for add and replace send one notification with all nexthops.
4492 * Skip the notification in fib6_add_rt2node and send one with
4493 * the full route when done
4495 info->skip_notify = 1;
4498 list_for_each_entry(nh, &rt6_nh_list, next) {
4499 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4500 fib6_info_release(nh->fib6_info);
4503 /* save reference to last route successfully inserted */
4504 rt_last = nh->fib6_info;
4506 /* save reference to first route for notification */
4508 rt_notif = nh->fib6_info;
4511 /* nh->fib6_info is used or freed at this point, reset to NULL */
4512 nh->fib6_info = NULL;
4515 NL_SET_ERR_MSG_MOD(extack,
4516 "multipath route replace failed (check consistency of installed routes)");
4521 /* Because each route is added like a single route we remove
4522 * these flags after the first nexthop: if there is a collision,
4523 * we have already failed to add the first nexthop:
4524 * fib6_add_rt2node() has rejected it; when replacing, old
4525 * nexthops have been replaced by the first new one; the rest should be added to it.
4528 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4533 /* success ... tell user about new route */
4534 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4538 /* send notification for routes that were added so that
4539 * the delete notifications sent by ip6_route_del are coherent.
4543 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4545 /* Delete routes that were already added */
4546 list_for_each_entry(nh, &rt6_nh_list, next) {
4549 ip6_route_del(&nh->r_cfg, extack);
4553 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4555 fib6_info_release(nh->fib6_info);
4556 list_del(&nh->next);
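/* Userspace equivalent of the add path above (illustrative):
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via 2001:db8:1::1 dev eth0 weight 1 \
 *		nexthop via 2001:db8:2::1 dev eth1 weight 3
 *
 * Each "nexthop" stanza becomes one rtnexthop inside RTA_MULTIPATH and
 * hence one fib6_info on rt6_nh_list here (weight N maps to
 * rtnh_hops = N - 1, undone by the + 1 above).
 */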
4563 static int ip6_route_multipath_del(struct fib6_config *cfg,
4564 struct netlink_ext_ack *extack)
4566 struct fib6_config r_cfg;
4567 struct rtnexthop *rtnh;
4570 int err = 1, last_err = 0;
4572 remaining = cfg->fc_mp_len;
4573 rtnh = (struct rtnexthop *)cfg->fc_mp;
4575 /* Parse a Multipath Entry */
4576 while (rtnh_ok(rtnh, remaining)) {
4577 memcpy(&r_cfg, cfg, sizeof(*cfg));
4578 if (rtnh->rtnh_ifindex)
4579 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4581 attrlen = rtnh_attrlen(rtnh);
4583 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4585 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4587 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4588 r_cfg.fc_flags |= RTF_GATEWAY;
4591 err = ip6_route_del(&r_cfg, extack);
4595 rtnh = rtnh_next(rtnh, &remaining);
4601 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4602 struct netlink_ext_ack *extack)
4604 struct fib6_config cfg;
4607 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4612 return ip6_route_multipath_del(&cfg, extack);
4614 cfg.fc_delete_all_nh = 1;
4615 return ip6_route_del(&cfg, extack);
4619 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4620 struct netlink_ext_ack *extack)
4622 struct fib6_config cfg;
4625 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4629 if (cfg.fc_metric == 0)
4630 cfg.fc_metric = IP6_RT_PRIO_USER;
4633 return ip6_route_multipath_add(&cfg, extack);
4635 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4638 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4640 int nexthop_len = 0;
4642 if (rt->fib6_nsiblings) {
4643 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4644 + NLA_ALIGN(sizeof(struct rtnexthop))
4645 + nla_total_size(16) /* RTA_GATEWAY */
4646 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4648 nexthop_len *= rt->fib6_nsiblings;
4651 return NLMSG_ALIGN(sizeof(struct rtmsg))
4652 + nla_total_size(16) /* RTA_SRC */
4653 + nla_total_size(16) /* RTA_DST */
4654 + nla_total_size(16) /* RTA_GATEWAY */
4655 + nla_total_size(16) /* RTA_PREFSRC */
4656 + nla_total_size(4) /* RTA_TABLE */
4657 + nla_total_size(4) /* RTA_IIF */
4658 + nla_total_size(4) /* RTA_OIF */
4659 + nla_total_size(4) /* RTA_PRIORITY */
4660 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4661 + nla_total_size(sizeof(struct rta_cacheinfo))
4662 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4663 + nla_total_size(1) /* RTA_PREF */
4664 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4668 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4669 struct fib6_info *rt, struct dst_entry *dst,
4670 struct in6_addr *dest, struct in6_addr *src,
4671 int iif, int type, u32 portid, u32 seq,
4674 struct rt6_info *rt6 = (struct rt6_info *)dst;
4675 struct rt6key *rt6_dst, *rt6_src;
4676 u32 *pmetrics, table, rt6_flags;
4677 struct nlmsghdr *nlh;
4681 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4686 rt6_dst = &rt6->rt6i_dst;
4687 rt6_src = &rt6->rt6i_src;
4688 rt6_flags = rt6->rt6i_flags;
4690 rt6_dst = &rt->fib6_dst;
4691 rt6_src = &rt->fib6_src;
4692 rt6_flags = rt->fib6_flags;
4695 rtm = nlmsg_data(nlh);
4696 rtm->rtm_family = AF_INET6;
4697 rtm->rtm_dst_len = rt6_dst->plen;
4698 rtm->rtm_src_len = rt6_src->plen;
4701 table = rt->fib6_table->tb6_id;
4703 table = RT6_TABLE_UNSPEC;
4704 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4705 if (nla_put_u32(skb, RTA_TABLE, table))
4706 goto nla_put_failure;
4708 rtm->rtm_type = rt->fib6_type;
4710 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4711 rtm->rtm_protocol = rt->fib6_protocol;
4713 if (rt6_flags & RTF_CACHE)
4714 rtm->rtm_flags |= RTM_F_CLONED;
4717 if (nla_put_in6_addr(skb, RTA_DST, dest))
4718 goto nla_put_failure;
4719 rtm->rtm_dst_len = 128;
4720 } else if (rtm->rtm_dst_len)
4721 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4722 goto nla_put_failure;
4723 #ifdef CONFIG_IPV6_SUBTREES
4725 if (nla_put_in6_addr(skb, RTA_SRC, src))
4726 goto nla_put_failure;
4727 rtm->rtm_src_len = 128;
4728 } else if (rtm->rtm_src_len &&
4729 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4730 goto nla_put_failure;
4733 #ifdef CONFIG_IPV6_MROUTE
4734 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4735 int err = ip6mr_get_route(net, skb, rtm, portid);
4740 goto nla_put_failure;
4743 if (nla_put_u32(skb, RTA_IIF, iif))
4744 goto nla_put_failure;
4746 struct in6_addr saddr_buf;
4747 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4748 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4749 goto nla_put_failure;
4752 if (rt->fib6_prefsrc.plen) {
4753 struct in6_addr saddr_buf;
4754 saddr_buf = rt->fib6_prefsrc.addr;
4755 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4756 goto nla_put_failure;
4759 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4760 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4761 goto nla_put_failure;
4763 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4764 goto nla_put_failure;
4766 /* For multipath routes, walk the siblings list and add
4767 * each as a nexthop within RTA_MULTIPATH.
4770 if (rt6_flags & RTF_GATEWAY &&
4771 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4772 goto nla_put_failure;
4774 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4775 goto nla_put_failure;
4776 } else if (rt->fib6_nsiblings) {
4777 struct fib6_info *sibling, *next_sibling;
4780 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4782 goto nla_put_failure;
4784 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4785 rt->fib6_nh.fib_nh_weight) < 0)
4786 goto nla_put_failure;
4788 list_for_each_entry_safe(sibling, next_sibling,
4789 &rt->fib6_siblings, fib6_siblings) {
4790 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4791 sibling->fib6_nh.fib_nh_weight) < 0)
4792 goto nla_put_failure;
4795 nla_nest_end(skb, mp);
4797 unsigned char nh_flags = 0;
4799 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4800 &nh_flags, false) < 0)
4801 goto nla_put_failure;
4803 rtm->rtm_flags |= nh_flags;
4806 if (rt6_flags & RTF_EXPIRES) {
4807 expires = dst ? dst->expires : rt->expires;
4811 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4812 goto nla_put_failure;
4814 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4815 goto nla_put_failure;
4818 nlmsg_end(skb, nlh);
4822 nlmsg_cancel(skb, nlh);
4826 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4827 const struct net_device *dev)
4829 if (f6i->fib6_nh.fib_nh_dev == dev)
4832 if (f6i->fib6_nsiblings) {
4833 struct fib6_info *sibling, *next_sibling;
4835 list_for_each_entry_safe(sibling, next_sibling,
4836 &f6i->fib6_siblings, fib6_siblings) {
4837 if (sibling->fib6_nh.fib_nh_dev == dev)
4845 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4847 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4848 struct fib_dump_filter *filter = &arg->filter;
4849 unsigned int flags = NLM_F_MULTI;
4850 struct net *net = arg->net;
4852 if (rt == net->ipv6.fib6_null_entry)
4855 if ((filter->flags & RTM_F_PREFIX) &&
4856 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4857 /* success since this is not a prefix route */
4860 if (filter->filter_set) {
4861 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4862 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4863 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4866 flags |= NLM_F_DUMP_FILTERED;
4869 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4870 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4871 arg->cb->nlh->nlmsg_seq, flags);
4874 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4875 const struct nlmsghdr *nlh,
4877 struct netlink_ext_ack *extack)
4882 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4883 NL_SET_ERR_MSG_MOD(extack,
4884 "Invalid header for get route request");
4888 if (!netlink_strict_get_check(skb))
4889 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4890 rtm_ipv6_policy, extack);
4892 rtm = nlmsg_data(nlh);
4893 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4894 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4895 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4897 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4900 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4901 NL_SET_ERR_MSG_MOD(extack,
4902 "Invalid flags for get route request");
4906 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4907 rtm_ipv6_policy, extack);
4911 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4912 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4913 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4917 for (i = 0; i <= RTA_MAX; i++) {
4933 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4941 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4942 struct netlink_ext_ack *extack)
4944 struct net *net = sock_net(in_skb->sk);
4945 struct nlattr *tb[RTA_MAX+1];
4946 int err, iif = 0, oif = 0;
4947 struct fib6_info *from;
4948 struct dst_entry *dst;
4949 struct rt6_info *rt;
4950 struct sk_buff *skb;
4952 struct flowi6 fl6 = {};
4955 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4960 rtm = nlmsg_data(nlh);
4961 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4962 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4965 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4968 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4972 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4975 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4979 iif = nla_get_u32(tb[RTA_IIF]);
4982 oif = nla_get_u32(tb[RTA_OIF]);
4985 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4988 fl6.flowi6_uid = make_kuid(current_user_ns(),
4989 nla_get_u32(tb[RTA_UID]));
4991 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4994 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4997 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4999 if (tb[RTA_IP_PROTO]) {
5000 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5001 &fl6.flowi6_proto, AF_INET6,
5008 struct net_device *dev;
5013 dev = dev_get_by_index_rcu(net, iif);
5020 fl6.flowi6_iif = iif;
5022 if (!ipv6_addr_any(&fl6.saddr))
5023 flags |= RT6_LOOKUP_F_HAS_SADDR;
5025 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5029 fl6.flowi6_oif = oif;
5031 dst = ip6_route_output(net, NULL, &fl6);
5035 rt = container_of(dst, struct rt6_info, dst);
5036 if (rt->dst.error) {
5037 err = rt->dst.error;
5042 if (rt == net->ipv6.ip6_null_entry) {
5043 err = rt->dst.error;
5048 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5055 skb_dst_set(skb, &rt->dst);
5058 from = rcu_dereference(rt->from);
5061 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5063 NETLINK_CB(in_skb).portid,
5066 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5067 &fl6.saddr, iif, RTM_NEWROUTE,
5068 NETLINK_CB(in_skb).portid,
5080 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
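/* Triggered by RTM_GETROUTE requests, e.g. (illustrative):
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * The fibmatch form sets RTM_F_FIB_MATCH, so the matching FIB entry is
 * reported instead of the resolved dst (the fibmatch branch above).
 */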
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
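
/* Usage sketch (illustrative userspace code, excluded from the build):
 * inet6_rt_notify() multicasts RTM_NEWROUTE/RTM_DELROUTE to
 * RTNLGRP_IPV6_ROUTE, so a monitor only needs to join the corresponding
 * legacy group bit, RTMGRP_IPV6_ROUTE, and read messages. The function
 * name is hypothetical.
 */
#if 0
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>
#include <unistd.h>

static int ipv6_route_monitor(void)
{
	struct sockaddr_nl snl = {
		.nl_family = AF_NETLINK,
		.nl_groups = RTMGRP_IPV6_ROUTE,
	};
	char buf[8192];
	int fd, len;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
		return -1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh;

		for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
		     nlh = NLMSG_NEXT(nlh, len))
			;	/* nlmsg_type is RTM_NEWROUTE or RTM_DELROUTE */
	}
	close(fd);
	return 0;
}
#endif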
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only drop the
		 * references once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif /* CONFIG_PROC_FS */
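
/* A read of /proc/net/rt6_stats yields one line of seven hex fields in the
 * order printed above, e.g. (sample values illustrative only):
 *
 *	001a 0054 0000 0041 0000 0003 0000
 *
 * i.e. fib nodes, route nodes, rt allocs, rt entries, cached entries,
 * dst_ops entry count and discarded routes.
 */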
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay, ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
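
/* Usage note: the "flush" sysctl is write-only (reads return -EINVAL);
 * writing an integer to /proc/sys/net/ipv6/route/flush stores it in
 * flush_delay and kicks a garbage-collection pass over the FIB via
 * fib6_run_gc().
 */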
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif /* CONFIG_SYSCTL */
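
/* Note: the positional table[0..10].data fixups above must be kept in sync
 * with the entry order of ipv6_route_table_template. Because a ctl_table
 * array is terminated by its first empty entry, blanking table[0].procname
 * truncates the table at index 0, hiding the whole route sysctl directory
 * from non-init user namespaces.
 */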
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
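
/* The labels above unwind strictly in reverse order of the allocations in
 * ip6_route_net_init(): each label releases only what had already been set
 * up when the failing step was reached, and every error path funnels
 * through the single "goto out" return.
 */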
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback device is done before this portion of
	 * code, so the loopback reference in rt6_info has not been taken;
	 * take it manually for init_net here.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
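
/* Teardown mirrors bring-up: ip6_route_cleanup() undoes the steps of
 * ip6_route_init() in reverse registration order, so late-registered
 * consumers (netlink handlers, the netdev notifier) are removed before
 * the infrastructure they depend on is torn down.
 */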