 *	Linux INET6 implementation
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Fixed routing subtrees.
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>

#include <linux/sysctl.h>

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);

struct uncached_list {
	struct list_head	head;

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);

void rt6_uncached_list_del(struct rt6_info *rt)
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
	struct net_device *loopback_dev = net->loopback_dev;

	if (dev == loopback_dev)

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);

				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);

		spin_unlock_bh(&ul->lock);
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
	if (!ipv6_addr_any(p))
		return (const void *) p;
	return &ipv6_hdr(skb)->daddr;

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
	__ipv6_confirm_neigh(dev, daddr);

static struct dst_ops ip6_dst_ops_template = {
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
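	/* GNU "x ? : y" below: use the raw RTAX_MTU metric when it is set,
	 * otherwise fall back to the device MTU.
	 */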
	return mtu ? : dst->dev->mtu;

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,

static const struct rt6_info ip6_null_entry_template = {
	.__refcnt	= ATOMIC_INIT(1),
	.obsolete	= DST_OBSOLETE_FORCE_CHK,
	.error		= -ENETUNREACH,
	.input		= ip6_pkt_discard,
	.output		= ip6_pkt_discard_out,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.__refcnt	= ATOMIC_INIT(1),
	.obsolete	= DST_OBSOLETE_FORCE_CHK,
	.input		= ip6_pkt_prohibit,
	.output		= ip6_pkt_prohibit_out,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),

static const struct rt6_info ip6_blk_hole_entry_template = {
	.__refcnt	= ATOMIC_INIT(1),
	.obsolete	= DST_OBSOLETE_FORCE_CHK,
	.input		= dst_discard,
	.output		= dst_discard_out,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),

static void rt6_info_init(struct rt6_info *rt)
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
		rt->rt6i_idev = NULL;

	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
			rt->rt6i_idev = loopback_idev;

static bool __rt6_check_expired(const struct rt6_info *rt)
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);

static bool rt6_check_expired(const struct rt6_info *rt)
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
		if (rt6_score_route(sibling, oif, strict) < 0)
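/* Illustrative sketch (not part of the kernel flow above): the
 * hash-threshold next-hop selection that fib6_multipath_select() performs.
 * Each sibling route owns an upper bound on the 31-bit flow-hash space; the
 * first sibling whose bound is at or above the flow hash wins. The names
 * below (example_*) are hypothetical and exist only for this sketch.
 */
static inline int __maybe_unused
example_hash_threshold_pick(u32 flow_hash, const u32 *upper_bounds,
			    int num_paths)
{
	int i;

	for (i = 0; i < num_paths; i++) {
		/* bounds increase monotonically across the siblings */
		if (flow_hash <= upper_bounds[i])
			return i;
	}
	return num_paths - 1;	/* defensive fallback; should not be hit */
}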
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)

			if (dev->ifindex == oif)
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;

static void rt6_probe_deferred(struct work_struct *w)
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);

static void rt6_probe(struct fib6_info *rt)
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
		if (neigh->nud_state & NUD_VALID)

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
				__neigh_set_probe_once(neigh);
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
			      idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		schedule_work(&work->work);

	rcu_read_unlock_bh();
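/* Illustrative sketch of the rate limit enforced above: a probe is only
 * scheduled once at least rtr_probe_interval jiffies have elapsed since the
 * previous one. example_probe_due() is hypothetical and merely restates the
 * time_after() checks in rt6_probe().
 */
static inline bool __maybe_unused
example_probe_due(unsigned long last_probe, unsigned long interval)
{
	return time_after(jiffies, last_probe + interval);
}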
static inline void rt6_probe(struct fib6_info *rt)

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
			ret = RT6_NUD_FAIL_PROBE;
		read_unlock(&neigh->lock);
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	rcu_read_unlock_bh();

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
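/* Illustrative sketch of the score layout built by rt6_score_route() above:
 * bit 0 carries the interface match from rt6_check_dev(), and (under
 * CONFIG_IPV6_ROUTER_PREF) bits 2-3 carry the decoded RA router preference,
 * so router preference outweighs a bare interface match. example_score() is
 * hypothetical.
 */
static inline int __maybe_unused
example_score(int dev_match, int decoded_pref)
{
	return (dev_match ? 1 : 0) | (decoded_pref << 2);
}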
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
	const struct net_device *dev = fib6_info_nh_dev(f6i);

		const struct inet6_dev *idev = __in6_dev_get(dev);

			rc = !!idev->cnf.ignore_routes_with_linkdown;

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))

	if (fib6_check_expired(rt))

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {

	if (strict & RT6_LOOKUP_F_REACHABLE)

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
		*do_rr = match_do_rr;

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
	struct fib6_info *rt, *match, *cont;

	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {

		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {

		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)

		spin_lock_bh(&leaf->fib6_table->tb6_lock);
		/* make sure next is not being deleted from the tree */
		rcu_assign_pointer(fn->rr_ptr, next);
		spin_unlock_bh(&leaf->fib6_table->tb6_lock);

	return match ? match : net->ipv6.fib6_null_entry;
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
	} else if (rinfo->prefix_len > 128) {
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
		prefix = &prefix_buf;

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,

	if (rt && !lifetime) {

		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
			fib6_set_expires(rt, jiffies + HZ * lifetime);

	fib6_info_release(rt);
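/* Illustrative sketch restating the RFC 4191 Route Information Option sanity
 * rules applied above: the option length is in units of 8 octets, length 3
 * carries a full 128-bit prefix, length 2 suffices for prefixes up to /64,
 * and any non-zero prefix length needs at least length 1.
 * example_rio_len_ok() is hypothetical.
 */
static inline bool __maybe_unused
example_rio_len_ok(u8 length, u8 prefix_len)
{
	if (length > 3 || prefix_len > 128)
		return false;
	if (prefix_len > 64)
		return length >= 2;
	if (prefix_len > 0)
		return length >= 1;
	return true;
}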
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is netif_is_l3_master(dev) being true, in
		 * which case we want the returned dev to be dev itself
		 */

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_XRESOLVE]	= -EINVAL,

static int ip6_rt_type_to_error(u8 fib6_type)
	return fib6_prop[fib6_type];
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
	unsigned short flags = 0;

		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
	case RTN_UNREACHABLE:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);

	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
		rt->dst.input = ip6_forward;

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);

	rt->dst.lastuse = jiffies;

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
	struct fib6_node *pn, *sn;

		if (fn->fn_flags & RTN_TL_ROOT)
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
			fn = fib6_node_lookup(sn, NULL, saddr);
		if (fn->fn_flags & RTN_RTINFO)

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
		fib6_info_release(rt);

	ip6_rt_copy_init(nrt, rt);

	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     const struct sk_buff *skb,
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	f6i = rcu_dereference(fn->leaf);
		f6i = net->ipv6.fib6_null_entry;
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,

	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rt = ip6_create_rt_rcu(f6i);

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
	struct flowi6 fl6 = {
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
	struct net_device *dev;
	struct rt6_info *rt;

	if (!fib6_info_hold_safe(ort))

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
		fib6_info_release(ort);

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))

	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
		fib6_info_release(rt);
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
		ip6_hold_safe(NULL, &pcpu_rt, false);

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
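	/* cmpxchg() installs pcpu_rt only if the per-cpu slot is still NULL;
	 * prev holds the old slot value, which the (elided) lines below
	 * inspect to detect a concurrent install.
	 */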
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
	struct fib6_info *from;

	if (!bucket || !rt6_ex)

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
	struct rt6_exception *rt6_ex, *oldest = NULL;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
	rt6_remove_exception(bucket, oldest);

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
	static u32 seed __read_mostly;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
		val = jhash(src, sizeof(*src), val);
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
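/* Note on the hash above: jhash() mixes the 128-bit destination (and, with
 * subtrees, the source) with a boot-time random seed, and hash_32() folds
 * the result down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, so the returned
 * value can index the exception bucket array directly, e.g.:
 *
 *	bucket = base + rt6_exception_hash(daddr, saddr);
 */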
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
	struct rt6_exception *rt6_ex;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)

	hval = rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);

static unsigned int fib6_mtu(const struct fib6_info *rt)
	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;

	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
		rt6_remove_exception(bucket, rt6_ex);
	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);

	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);

	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);

void rt6_flush_exceptions(struct fib6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);

	spin_unlock_bh(&rt6_exception_lock);

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)

	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	    !(rt->rt6i_flags & RTF_CACHE))

	if (!rcu_access_pointer(from->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
		rt6_remove_exception(bucket, rt6_ex);
	spin_unlock_bh(&rt6_exception_lock);

/* Find rt6_ex which contains the passed in rt cache and
 * update its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
	rt6_ex = __rt6_find_exception_rcu(&bucket,
		rt6_ex->stamp = jiffies;
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */
	if (dst_mtu(&rt->dst) >= mtu)

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
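	/* Worked example of the rule above: with local mtu6 = 1500 and route
	 * PMTU = 1400, lowering to 1300 is allowed (1400 >= 1300) while
	 * raising to 1500 is refused, since the 1400 bottleneck is elsewhere.
	 * With route PMTU = 1500 == mtu6, raising to 9000 is allowed: this
	 * host's link was the bottleneck.
	 */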
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
			    RTF_CACHE_GATEWAY &&
			    ipv6_addr_equal(gateway,
					    &entry->rt6i_gateway)) {
				rt6_remove_exception(bucket, rt6_ex);

	spin_unlock_bh(&rt6_exception_lock);

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
	struct rt6_info *rt = rt6_ex->rt6i;
	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
			rt6_remove_exception(bucket, rt6_ex);

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))

	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp,
					  &bucket->chain, hlist) {
			rt6_age_examine_exception(bucket, rt6_ex,

	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			goto redo_rt6_select;

	trace_fib6_table_lookup(net, f6i, table, fl6);

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
	struct fib6_info *f6i;
	struct rt6_info *rt;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);

		/* Get a percpu copy */
		struct rt6_info *pcpu_rt;

		pcpu_rt = rt6_get_pcpu_route(f6i);
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    const struct sk_buff *skb,
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 const struct sk_buff *skb,
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);

	key_iph = inner_iph;

	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
	struct flow_keys hash_keys;

	switch (ip6_multipath_hash_policy(net)) {
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;

		unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
		struct flow_keys keys;

		/* short-circuit if we already have L4 hash present */
			return skb_get_hash_raw(skb) >> 1;

		memset(&hash_keys, 0, sizeof(hash_keys));

		skb_flow_dissect_flow_keys(skb, &keys, flag);

		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
		hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
		hash_keys.ports.src = flkeys->ports.src;
		hash_keys.ports.dst = flkeys->ports.dst;
		hash_keys.basic.ip_proto = flkeys->basic.ip_proto;

		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		hash_keys.addrs.v6addrs.src = fl6->saddr;
		hash_keys.addrs.v6addrs.dst = fl6->daddr;
		hash_keys.ports.src = fl6->fl6_sport;
		hash_keys.ports.dst = fl6->fl6_dport;
		hash_keys.basic.ip_proto = fl6->flowi6_proto;

	mhash = flow_hash_from_keys(&hash_keys);
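/* Note: policy 0 (the default) hashes L3 fields only, while policy 1 adds
 * the L4 ports; the choice comes from the net.ipv6.fib_multipath_hash_policy
 * sysctl. The (elided) return folds mhash to 31 bits, matching the ">> 1"
 * in the early exit above, so a stored mp_hash of 0 keeps meaning "not yet
 * computed" (see the comment in fib6_multipath_select()).
 */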
2046 void ip6_route_input(struct sk_buff *skb)
2048 const struct ipv6hdr *iph = ipv6_hdr(skb);
2049 struct net *net = dev_net(skb->dev);
2050 int flags = RT6_LOOKUP_F_HAS_SADDR;
2051 struct ip_tunnel_info *tun_info;
2052 struct flowi6 fl6 = {
2053 .flowi6_iif = skb->dev->ifindex,
2054 .daddr = iph->daddr,
2055 .saddr = iph->saddr,
2056 .flowlabel = ip6_flowinfo(iph),
2057 .flowi6_mark = skb->mark,
2058 .flowi6_proto = iph->nexthdr,
2060 struct flow_keys *flkeys = NULL, _flkeys;
2062 tun_info = skb_tunnel_info(skb);
2063 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2064 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2066 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2069 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2070 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2073 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2076 static struct rt6_info *ip6_pol_route_output(struct net *net,
2077 struct fib6_table *table,
2079 const struct sk_buff *skb,
2082 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2085 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2086 struct flowi6 *fl6, int flags)
2090 if (ipv6_addr_type(&fl6->daddr) &
2091 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2092 struct dst_entry *dst;
2094 dst = l3mdev_link_scope_lookup(net, fl6);
2099 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2101 any_src = ipv6_addr_any(&fl6->saddr);
2102 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2103 (fl6->flowi6_oif && any_src))
2104 flags |= RT6_LOOKUP_F_IFACE;
2107 flags |= RT6_LOOKUP_F_HAS_SADDR;
2109 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2111 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2113 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2115 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2117 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2118 struct net_device *loopback_dev = net->loopback_dev;
2119 struct dst_entry *new = NULL;
2121 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2122 DST_OBSOLETE_DEAD, 0);
2125 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2129 new->input = dst_discard;
2130 new->output = dst_discard_out;
2132 dst_copy_metrics(new, &ort->dst);
2134 rt->rt6i_idev = in6_dev_get(loopback_dev);
2135 rt->rt6i_gateway = ort->rt6i_gateway;
2136 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2138 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2139 #ifdef CONFIG_IPV6_SUBTREES
2140 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2144 dst_release(dst_orig);
2145 return new ? new : ERR_PTR(-ENOMEM);
2149 * Destination cache support functions
2152 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2156 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2159 if (fib6_check_expired(f6i))
2165 static struct dst_entry *rt6_check(struct rt6_info *rt,
2166 struct fib6_info *from,
2171 if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2172 rt_cookie != cookie)
2175 if (rt6_check_expired(rt))
2181 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2182 struct fib6_info *from,
2185 if (!__rt6_check_expired(rt) &&
2186 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2187 fib6_check(from, cookie))
2193 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2195 struct dst_entry *dst_ret;
2196 struct fib6_info *from;
2197 struct rt6_info *rt;
2199 rt = container_of(dst, struct rt6_info, dst);
2203 /* All IPV6 dsts are created with ->obsolete set to the value
2204 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2205 * into this function always.
2208 from = rcu_dereference(rt->from);
2210 if (from && (rt->rt6i_flags & RTF_PCPU ||
2211 unlikely(!list_empty(&rt->rt6i_uncached))))
2212 dst_ret = rt6_dst_from_check(rt, from, cookie);
2214 dst_ret = rt6_check(rt, from, cookie);
2221 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2223 struct rt6_info *rt = (struct rt6_info *) dst;
2226 if (rt->rt6i_flags & RTF_CACHE) {
2228 if (rt6_check_expired(rt)) {
2229 rt6_remove_exception_rt(rt);
2241 static void ip6_link_failure(struct sk_buff *skb)
2243 struct rt6_info *rt;
2245 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2247 rt = (struct rt6_info *) skb_dst(skb);
2250 if (rt->rt6i_flags & RTF_CACHE) {
2251 rt6_remove_exception_rt(rt);
2253 struct fib6_info *from;
2254 struct fib6_node *fn;
2256 from = rcu_dereference(rt->from);
2258 fn = rcu_dereference(from->fib6_node);
2259 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2267 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2269 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2270 struct fib6_info *from;
2273 from = rcu_dereference(rt0->from);
2275 rt0->dst.expires = from->expires;
2279 dst_set_expires(&rt0->dst, timeout);
2280 rt0->rt6i_flags |= RTF_EXPIRES;
2283 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2285 struct net *net = dev_net(rt->dst.dev);
2287 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2288 rt->rt6i_flags |= RTF_MODIFIED;
2289 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2292 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2294 return !(rt->rt6i_flags & RTF_CACHE) &&
2295 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2299 const struct ipv6hdr *iph, u32 mtu)
2301 const struct in6_addr *daddr, *saddr;
2302 struct rt6_info *rt6 = (struct rt6_info *)dst;
2304 if (dst_metric_locked(dst, RTAX_MTU))
2308 daddr = &iph->daddr;
2309 saddr = &iph->saddr;
2311 daddr = &sk->sk_v6_daddr;
2312 saddr = &inet6_sk(sk)->saddr;
2317 dst_confirm_neigh(dst, daddr);
2318 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2319 if (mtu >= dst_mtu(dst))
2322 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2323 rt6_do_update_pmtu(rt6, mtu);
2324 /* update rt6_ex->stamp for cache */
2325 if (rt6->rt6i_flags & RTF_CACHE)
2326 rt6_update_exception_stamp_rt(rt6);
2328 struct fib6_info *from;
2329 struct rt6_info *nrt6;
2332 from = rcu_dereference(rt6->from);
2333 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2335 rt6_do_update_pmtu(nrt6, mtu);
2336 if (rt6_insert_exception(nrt6, from))
2337 dst_release_immediate(&nrt6->dst);
2343 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2344 struct sk_buff *skb, u32 mtu)
2346 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2349 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2350 int oif, u32 mark, kuid_t uid)
2352 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2353 struct dst_entry *dst;
2354 struct flowi6 fl6 = {
2356 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2357 .daddr = iph->daddr,
2358 .saddr = iph->saddr,
2359 .flowlabel = ip6_flowinfo(iph),
2363 dst = ip6_route_output(net, NULL, &fl6);
2365 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2368 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2370 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2372 int oif = sk->sk_bound_dev_if;
2373 struct dst_entry *dst;
2375 if (!oif && skb->dev)
2376 oif = l3mdev_master_ifindex(skb->dev);
2378 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2380 dst = __sk_dst_get(sk);
2381 if (!dst || !dst->obsolete ||
2382 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2386 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2387 ip6_datagram_dst_update(sk, false);
2390 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2392 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2393 const struct flowi6 *fl6)
2395 #ifdef CONFIG_IPV6_SUBTREES
2396 struct ipv6_pinfo *np = inet6_sk(sk);
2399 ip6_dst_store(sk, dst,
2400 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2401 &sk->sk_v6_daddr : NULL,
2402 #ifdef CONFIG_IPV6_SUBTREES
2403 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2409 /* Handle redirects */
2410 struct ip6rd_flowi {
2412 struct in6_addr gateway;
2415 static struct rt6_info *__ip6_route_redirect(struct net *net,
2416 struct fib6_table *table,
2418 const struct sk_buff *skb,
2421 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2422 struct rt6_info *ret = NULL, *rt_cache;
2423 struct fib6_info *rt;
2424 struct fib6_node *fn;
2426 /* Get the "current" route for this destination and
2427 * check if the redirect has come from appropriate router.
2429 * RFC 4861 specifies that redirects should only be
2430 * accepted if they come from the nexthop to the target.
2431 * Due to the way the routes are chosen, this notion
2432 * is a bit fuzzy and one might need to check all possible
2437 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2439 for_each_fib6_node_rt_rcu(fn) {
2440 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2442 if (fib6_check_expired(rt))
2444 if (rt->fib6_flags & RTF_REJECT)
2446 if (!(rt->fib6_flags & RTF_GATEWAY))
2448 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2450 /* rt_cache's gateway might be different from its 'parent'
2451 * in the case of an ip redirect.
2452 * So we keep searching in the exception table if the gateway
2455 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2456 rt_cache = rt6_find_cached_rt(rt,
2460 ipv6_addr_equal(&rdfl->gateway,
2461 &rt_cache->rt6i_gateway)) {
2471 rt = net->ipv6.fib6_null_entry;
2472 else if (rt->fib6_flags & RTF_REJECT) {
2473 ret = net->ipv6.ip6_null_entry;
2477 if (rt == net->ipv6.fib6_null_entry) {
2478 fn = fib6_backtrack(fn, &fl6->saddr);
2485 ip6_hold_safe(net, &ret, true);
2487 ret = ip6_create_rt_rcu(rt);
2491 trace_fib6_table_lookup(net, rt, table, fl6);
2495 static struct dst_entry *ip6_route_redirect(struct net *net,
2496 const struct flowi6 *fl6,
2497 const struct sk_buff *skb,
2498 const struct in6_addr *gateway)
2500 int flags = RT6_LOOKUP_F_HAS_SADDR;
2501 struct ip6rd_flowi rdfl;
2504 rdfl.gateway = *gateway;
2506 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2507 flags, __ip6_route_redirect);
2510 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2513 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2514 struct dst_entry *dst;
2515 struct flowi6 fl6 = {
2516 .flowi6_iif = LOOPBACK_IFINDEX,
2518 .flowi6_mark = mark,
2519 .daddr = iph->daddr,
2520 .saddr = iph->saddr,
2521 .flowlabel = ip6_flowinfo(iph),
2525 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2526 rt6_do_redirect(dst, NULL, skb);
2529 EXPORT_SYMBOL_GPL(ip6_redirect);
2531 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2533 const struct ipv6hdr *iph = ipv6_hdr(skb);
2534 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2535 struct dst_entry *dst;
2536 struct flowi6 fl6 = {
2537 .flowi6_iif = LOOPBACK_IFINDEX,
2540 .saddr = iph->daddr,
2541 .flowi6_uid = sock_net_uid(net, NULL),
2544 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2545 rt6_do_redirect(dst, NULL, skb);
2549 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2551 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2554 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2556 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2558 struct net_device *dev = dst->dev;
2559 unsigned int mtu = dst_mtu(dst);
2560 struct net *net = dev_net(dev);
2562 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2564 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2565 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2567 /*
2568 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2569 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2570 * IPV6_MAXPLEN is also valid and means: "any MSS,
2571 * rely only on pmtu discovery"
2573 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2578 static unsigned int ip6_mtu(const struct dst_entry *dst)
2580 struct inet6_dev *idev;
2583 mtu = dst_metric_raw(dst, RTAX_MTU);
2590 idev = __in6_dev_get(dst->dev);
2592 mtu = idev->cnf.mtu6;
2596 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2598 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2601 /* MTU selection:
2602 * 1. mtu on route is locked - use it
2603 * 2. mtu from nexthop exception
2604 * 3. mtu from egress device
2606 * based on ip6_dst_mtu_forward and exception logic of
2607 * rt6_find_cached_rt; called with rcu_read_lock
2609 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2610 struct in6_addr *saddr)
2612 struct rt6_exception_bucket *bucket;
2613 struct rt6_exception *rt6_ex;
2614 struct in6_addr *src_key;
2615 struct inet6_dev *idev;
2618 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2619 mtu = f6i->fib6_pmtu;
2624 src_key = NULL;
2625 #ifdef CONFIG_IPV6_SUBTREES
2626 if (f6i->fib6_src.plen)
2627 src_key = saddr;
2628 #endif
2630 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2631 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2632 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2633 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2636 struct net_device *dev = fib6_info_nh_dev(f6i);
2639 idev = __in6_dev_get(dev);
2640 if (idev && idev->cnf.mtu6 > mtu)
2641 mtu = idev->cnf.mtu6;
2644 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2646 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2649 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2652 struct dst_entry *dst;
2653 struct rt6_info *rt;
2654 struct inet6_dev *idev = in6_dev_get(dev);
2655 struct net *net = dev_net(dev);
2657 if (unlikely(!idev))
2658 return ERR_PTR(-ENODEV);
2660 rt = ip6_dst_alloc(net, dev, 0);
2661 if (unlikely(!rt)) {
2663 dst = ERR_PTR(-ENOMEM);
2667 rt->dst.flags |= DST_HOST;
2668 rt->dst.input = ip6_input;
2669 rt->dst.output = ip6_output;
2670 rt->rt6i_gateway = fl6->daddr;
2671 rt->rt6i_dst.addr = fl6->daddr;
2672 rt->rt6i_dst.plen = 128;
2673 rt->rt6i_idev = idev;
2674 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2676 /* Add this dst into uncached_list so that rt6_disable_ip() can
2677 * properly release the net_device
2679 rt6_uncached_list_add(rt);
2680 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2682 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2688 static int ip6_dst_gc(struct dst_ops *ops)
2690 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2691 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2692 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2693 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2694 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2695 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2698 entries = dst_entries_get_fast(ops);
2699 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2700 entries <= rt_max_size)
2703 net->ipv6.ip6_rt_gc_expire++;
2704 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2705 entries = dst_entries_get_slow(ops);
2706 if (entries < ops->gc_thresh)
2707 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2709 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2710 return entries > rt_max_size;
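/* GC pacing note: ip6_rt_gc_expire is a moving timeout handed to
 * fib6_run_gc(). Every invocation above decays it by
 * expire >> gc_elasticity, so under sustained pressure cached entries
 * are aged with an ever shorter timeout; once a run brings the table
 * below gc_thresh it is reset to gc_timeout / 2. A non-zero return
 * (entries > max_size) tells the dst core to refuse new allocations.
 */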
2713 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2714 struct fib6_config *cfg,
2715 const struct in6_addr *gw_addr,
2716 u32 tbid, int flags)
2718 struct flowi6 fl6 = {
2719 .flowi6_oif = cfg->fc_ifindex,
2721 .saddr = cfg->fc_prefsrc,
2723 struct fib6_table *table;
2724 struct rt6_info *rt;
2726 table = fib6_get_table(net, tbid);
2730 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2731 flags |= RT6_LOOKUP_F_HAS_SADDR;
2733 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2734 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2736 /* if table lookup failed, fall back to full lookup */
2737 if (rt == net->ipv6.ip6_null_entry) {
2745 static int ip6_route_check_nh_onlink(struct net *net,
2746 struct fib6_config *cfg,
2747 const struct net_device *dev,
2748 struct netlink_ext_ack *extack)
2750 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2751 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2752 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2753 struct fib6_info *from;
2754 struct rt6_info *grt;
2758 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2761 from = rcu_dereference(grt->from);
2762 if (!grt->dst.error &&
2763 /* ignore match if it is the default route */
2764 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2765 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2766 NL_SET_ERR_MSG(extack,
2767 "Nexthop has invalid gateway or device mismatch");
2778 static int ip6_route_check_nh(struct net *net,
2779 struct fib6_config *cfg,
2780 struct net_device **_dev,
2781 struct inet6_dev **idev)
2783 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2784 struct net_device *dev = _dev ? *_dev : NULL;
2785 struct rt6_info *grt = NULL;
2786 int err = -EHOSTUNREACH;
2788 if (cfg->fc_table) {
2789 int flags = RT6_LOOKUP_F_IFACE;
2791 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2792 cfg->fc_table, flags);
2794 if (grt->rt6i_flags & RTF_GATEWAY ||
2795 (dev && dev != grt->dst.dev)) {
2803 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2809 if (dev != grt->dst.dev) {
2814 *_dev = dev = grt->dst.dev;
2815 *idev = grt->rt6i_idev;
2817 in6_dev_hold(grt->rt6i_idev);
2820 if (!(grt->rt6i_flags & RTF_GATEWAY))
2829 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2830 struct net_device **_dev, struct inet6_dev **idev,
2831 struct netlink_ext_ack *extack)
2833 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2834 int gwa_type = ipv6_addr_type(gw_addr);
2835 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2836 const struct net_device *dev = *_dev;
2837 bool need_addr_check = !dev;
2840 /* if gw_addr is local we will fail to detect this in case
2841 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2842 * will return already-added prefix route via interface that
2843 * prefix route was assigned to, which might be non-loopback.
2846 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2847 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2851 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2852 /* IPv6 strictly inhibits using non-link-local
2853 * addresses as nexthop addresses.
2854 * Otherwise, a router will not be able to send redirects.
2855 * It is very good, but in some (rare!) circumstances
2856 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2857 * some exceptions. --ANK
2858 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2861 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2862 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2866 if (cfg->fc_flags & RTNH_F_ONLINK)
2867 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2869 err = ip6_route_check_nh(net, cfg, _dev, idev);
2875 /* reload in case device was changed */
2880 NL_SET_ERR_MSG(extack, "Egress device not specified");
2882 } else if (dev->flags & IFF_LOOPBACK) {
2883 NL_SET_ERR_MSG(extack,
2884 "Egress device can not be loopback device for this route");
2888 /* if we did not check gw_addr above, do so now that the
2889 * egress device has been resolved.
2891 if (need_addr_check &&
2892 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2893 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
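/* To summarize ip6_validate_gw(): the gateway must not be one of our
 * own addresses, must be link-local unicast apart from narrow
 * exceptions (e.g. IPv4-mapped nexthops and RTNH_F_ONLINK routes),
 * must resolve to a usable non-loopback egress device, and is checked
 * against local addresses a second time once that device is known.
 */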
2902 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2904 struct netlink_ext_ack *extack)
2906 struct net *net = cfg->fc_nlinfo.nl_net;
2907 struct fib6_info *rt = NULL;
2908 struct net_device *dev = NULL;
2909 struct inet6_dev *idev = NULL;
2910 struct fib6_table *table;
2914 /* RTF_PCPU is an internal flag; can not be set by userspace */
2915 if (cfg->fc_flags & RTF_PCPU) {
2916 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2920 /* RTF_CACHE is an internal flag; can not be set by userspace */
2921 if (cfg->fc_flags & RTF_CACHE) {
2922 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2926 if (cfg->fc_type > RTN_MAX) {
2927 NL_SET_ERR_MSG(extack, "Invalid route type");
2931 if (cfg->fc_dst_len > 128) {
2932 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2935 if (cfg->fc_src_len > 128) {
2936 NL_SET_ERR_MSG(extack, "Invalid source address length");
2939 #ifndef CONFIG_IPV6_SUBTREES
2940 if (cfg->fc_src_len) {
2941 NL_SET_ERR_MSG(extack,
2942 "Specifying source address requires IPV6_SUBTREES to be enabled");
2946 if (cfg->fc_ifindex) {
2948 dev = dev_get_by_index(net, cfg->fc_ifindex);
2951 idev = in6_dev_get(dev);
2956 if (cfg->fc_metric == 0)
2957 cfg->fc_metric = IP6_RT_PRIO_USER;
2959 if (cfg->fc_flags & RTNH_F_ONLINK) {
2961 NL_SET_ERR_MSG(extack,
2962 "Nexthop device required for onlink");
2967 if (!(dev->flags & IFF_UP)) {
2968 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2975 if (cfg->fc_nlinfo.nlh &&
2976 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2977 table = fib6_get_table(net, cfg->fc_table);
2979 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2980 table = fib6_new_table(net, cfg->fc_table);
2983 table = fib6_new_table(net, cfg->fc_table);
2990 rt = fib6_info_alloc(gfp_flags);
2994 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2996 if (IS_ERR(rt->fib6_metrics)) {
2997 err = PTR_ERR(rt->fib6_metrics);
2998 /* Do not leave garbage there. */
2999 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3003 if (cfg->fc_flags & RTF_ADDRCONF)
3004 rt->dst_nocount = true;
3006 if (cfg->fc_flags & RTF_EXPIRES)
3007 fib6_set_expires(rt, jiffies +
3008 clock_t_to_jiffies(cfg->fc_expires));
3010 fib6_clean_expires(rt);
3012 if (cfg->fc_protocol == RTPROT_UNSPEC)
3013 cfg->fc_protocol = RTPROT_BOOT;
3014 rt->fib6_protocol = cfg->fc_protocol;
3016 addr_type = ipv6_addr_type(&cfg->fc_dst);
3018 if (cfg->fc_encap) {
3019 struct lwtunnel_state *lwtstate;
3021 err = lwtunnel_build_state(cfg->fc_encap_type,
3022 cfg->fc_encap, AF_INET6, cfg,
3026 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3029 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3030 rt->fib6_dst.plen = cfg->fc_dst_len;
3031 if (rt->fib6_dst.plen == 128)
3032 rt->dst_host = true;
3034 #ifdef CONFIG_IPV6_SUBTREES
3035 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3036 rt->fib6_src.plen = cfg->fc_src_len;
3039 rt->fib6_metric = cfg->fc_metric;
3040 rt->fib6_nh.nh_weight = 1;
3042 rt->fib6_type = cfg->fc_type;
3044 /* We cannot add true routes via loopback here,
3045 they would result in kernel looping; promote them to reject routes
3047 if ((cfg->fc_flags & RTF_REJECT) ||
3048 (dev && (dev->flags & IFF_LOOPBACK) &&
3049 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3050 !(cfg->fc_flags & RTF_LOCAL))) {
3051 /* hold loopback dev/idev if we haven't done so. */
3052 if (dev != net->loopback_dev) {
3057 dev = net->loopback_dev;
3059 idev = in6_dev_get(dev);
3065 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3069 if (cfg->fc_flags & RTF_GATEWAY) {
3070 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3074 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3081 if (idev->cnf.disable_ipv6) {
3082 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3087 if (!(dev->flags & IFF_UP)) {
3088 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3093 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3094 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3095 NL_SET_ERR_MSG(extack, "Invalid source address");
3099 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3100 rt->fib6_prefsrc.plen = 128;
3102 rt->fib6_prefsrc.plen = 0;
3104 rt->fib6_flags = cfg->fc_flags;
3107 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3108 !netif_carrier_ok(dev))
3109 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3110 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3111 rt->fib6_nh.nh_dev = dev;
3112 rt->fib6_table = table;
3124 fib6_info_release(rt);
3125 return ERR_PTR(err);
3128 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3129 struct netlink_ext_ack *extack)
3131 struct fib6_info *rt;
3134 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3138 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3139 fib6_info_release(rt);
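/* Reference lifecycle sketch: ip6_route_info_create() hands back a
 * fib6_info with one reference owned by the caller; on success
 * __ip6_ins_rt() takes the tree's own reference, so the caller's
 * reference is dropped here no matter how the insert went.
 */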
3144 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3146 struct net *net = info->nl_net;
3147 struct fib6_table *table;
3150 if (rt == net->ipv6.fib6_null_entry) {
3155 table = rt->fib6_table;
3156 spin_lock_bh(&table->tb6_lock);
3157 err = fib6_del(rt, info);
3158 spin_unlock_bh(&table->tb6_lock);
3161 fib6_info_release(rt);
3165 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3167 struct nl_info info = { .nl_net = net };
3169 return __ip6_del_rt(rt, &info);
3172 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3174 struct nl_info *info = &cfg->fc_nlinfo;
3175 struct net *net = info->nl_net;
3176 struct sk_buff *skb = NULL;
3177 struct fib6_table *table;
3180 if (rt == net->ipv6.fib6_null_entry)
3182 table = rt->fib6_table;
3183 spin_lock_bh(&table->tb6_lock);
3185 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3186 struct fib6_info *sibling, *next_sibling;
3188 /* prefer to send a single notification with all hops */
3189 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3191 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3193 if (rt6_fill_node(net, skb, rt, NULL,
3194 NULL, NULL, 0, RTM_DELROUTE,
3195 info->portid, seq, 0) < 0) {
3199 info->skip_notify = 1;
3202 list_for_each_entry_safe(sibling, next_sibling,
3205 err = fib6_del(sibling, info);
3211 err = fib6_del(rt, info);
3213 spin_unlock_bh(&table->tb6_lock);
3215 fib6_info_release(rt);
3218 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3219 info->nlh, gfp_any());
3224 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3228 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3231 if (cfg->fc_flags & RTF_GATEWAY &&
3232 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3235 rc = rt6_remove_exception_rt(rt);
3240 static int ip6_route_del(struct fib6_config *cfg,
3241 struct netlink_ext_ack *extack)
3243 struct rt6_info *rt_cache;
3244 struct fib6_table *table;
3245 struct fib6_info *rt;
3246 struct fib6_node *fn;
3249 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3251 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3257 fn = fib6_locate(&table->tb6_root,
3258 &cfg->fc_dst, cfg->fc_dst_len,
3259 &cfg->fc_src, cfg->fc_src_len,
3260 !(cfg->fc_flags & RTF_CACHE));
3263 for_each_fib6_node_rt_rcu(fn) {
3264 if (cfg->fc_flags & RTF_CACHE) {
3267 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3270 rc = ip6_del_cached_rt(rt_cache, cfg);
3278 if (cfg->fc_ifindex &&
3279 (!rt->fib6_nh.nh_dev ||
3280 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3282 if (cfg->fc_flags & RTF_GATEWAY &&
3283 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3285 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3287 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3289 if (!fib6_info_hold_safe(rt))
3293 /* if gateway was specified only delete the one hop */
3294 if (cfg->fc_flags & RTF_GATEWAY)
3295 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3297 return __ip6_del_rt_siblings(rt, cfg);
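/* rt6_do_redirect() below is the receive side of an ICMPv6 Redirect
 * (RFC 4861, section 8): validate the message and its ND options,
 * update the neighbour cache entry for the new first hop, then install
 * an RTF_CACHE clone carrying the new gateway into the exception table
 * of the route the redirect applies to.
 */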
3305 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3307 struct netevent_redirect netevent;
3308 struct rt6_info *rt, *nrt = NULL;
3309 struct ndisc_options ndopts;
3310 struct inet6_dev *in6_dev;
3311 struct neighbour *neigh;
3312 struct fib6_info *from;
3314 int optlen, on_link;
3317 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3318 optlen -= sizeof(*msg);
3321 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3325 msg = (struct rd_msg *)icmp6_hdr(skb);
3327 if (ipv6_addr_is_multicast(&msg->dest)) {
3328 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3333 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3335 } else if (ipv6_addr_type(&msg->target) !=
3336 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3337 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3341 in6_dev = __in6_dev_get(skb->dev);
3344 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3347 /* RFC 4861, 8.1:
3348 * The IP source address of the Redirect MUST be the same as the current
3349 * first-hop router for the specified ICMP Destination Address.
3352 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3353 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3358 if (ndopts.nd_opts_tgt_lladdr) {
3359 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3362 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3367 rt = (struct rt6_info *) dst;
3368 if (rt->rt6i_flags & RTF_REJECT) {
3369 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3373 /* Redirect received -> path was valid.
3374 * Look, redirects are sent only in response to data packets,
3375 * so that this nexthop apparently is reachable. --ANK
3377 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3379 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3384 * We have finally decided to accept it.
3387 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3388 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3389 NEIGH_UPDATE_F_OVERRIDE|
3390 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3391 NEIGH_UPDATE_F_ISROUTER)),
3392 NDISC_REDIRECT, &ndopts);
3395 from = rcu_dereference(rt->from);
3396 /* This fib6_info_hold() is safe here because we hold a reference to rt,
3397 * and rt already holds a reference to fib6_info.
3399 fib6_info_hold(from);
3402 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3406 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3408 nrt->rt6i_flags &= ~RTF_GATEWAY;
3410 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3412 /* No need to remove rt from the exception table if rt is
3413 * a cached route because rt6_insert_exception() will take care of it.
3416 if (rt6_insert_exception(nrt, from)) {
3417 dst_release_immediate(&nrt->dst);
3421 netevent.old = &rt->dst;
3422 netevent.new = &nrt->dst;
3423 netevent.daddr = &msg->dest;
3424 netevent.neigh = neigh;
3425 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3428 fib6_info_release(from);
3429 neigh_release(neigh);
3432 #ifdef CONFIG_IPV6_ROUTE_INFO
3433 static struct fib6_info *rt6_get_route_info(struct net *net,
3434 const struct in6_addr *prefix, int prefixlen,
3435 const struct in6_addr *gwaddr,
3436 struct net_device *dev)
3438 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3439 int ifindex = dev->ifindex;
3440 struct fib6_node *fn;
3441 struct fib6_info *rt = NULL;
3442 struct fib6_table *table;
3444 table = fib6_get_table(net, tb_id);
3449 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3453 for_each_fib6_node_rt_rcu(fn) {
3454 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3456 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3458 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3460 if (!fib6_info_hold_safe(rt))
3469 static struct fib6_info *rt6_add_route_info(struct net *net,
3470 const struct in6_addr *prefix, int prefixlen,
3471 const struct in6_addr *gwaddr,
3472 struct net_device *dev,
3475 struct fib6_config cfg = {
3476 .fc_metric = IP6_RT_PRIO_USER,
3477 .fc_ifindex = dev->ifindex,
3478 .fc_dst_len = prefixlen,
3479 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3480 RTF_UP | RTF_PREF(pref),
3481 .fc_protocol = RTPROT_RA,
3482 .fc_type = RTN_UNICAST,
3483 .fc_nlinfo.portid = 0,
3484 .fc_nlinfo.nlh = NULL,
3485 .fc_nlinfo.nl_net = net,
3488 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3489 cfg.fc_dst = *prefix;
3490 cfg.fc_gateway = *gwaddr;
3492 /* We should treat it as a default route if prefix length is 0. */
3494 cfg.fc_flags |= RTF_DEFAULT;
3496 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3498 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3502 struct fib6_info *rt6_get_dflt_router(struct net *net,
3503 const struct in6_addr *addr,
3504 struct net_device *dev)
3506 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3507 struct fib6_info *rt;
3508 struct fib6_table *table;
3510 table = fib6_get_table(net, tb_id);
3515 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3516 if (dev == rt->fib6_nh.nh_dev &&
3517 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3518 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3521 if (rt && !fib6_info_hold_safe(rt))
3527 struct fib6_info *rt6_add_dflt_router(struct net *net,
3528 const struct in6_addr *gwaddr,
3529 struct net_device *dev,
3532 struct fib6_config cfg = {
3533 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3534 .fc_metric = IP6_RT_PRIO_USER,
3535 .fc_ifindex = dev->ifindex,
3536 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3537 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3538 .fc_protocol = RTPROT_RA,
3539 .fc_type = RTN_UNICAST,
3540 .fc_nlinfo.portid = 0,
3541 .fc_nlinfo.nlh = NULL,
3542 .fc_nlinfo.nl_net = net,
3545 cfg.fc_gateway = *gwaddr;
3547 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3548 struct fib6_table *table;
3550 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3552 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3555 return rt6_get_dflt_router(net, gwaddr, dev);
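/* Default routers learned from Router Advertisements are thus added
 * with RTF_ADDRCONF | RTF_DEFAULT and a finite lifetime (RTF_EXPIRES);
 * the trailing rt6_get_dflt_router() re-lookup returns the inserted
 * entry with a reference held, or NULL if the add failed.
 */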
3558 static void __rt6_purge_dflt_routers(struct net *net,
3559 struct fib6_table *table)
3561 struct fib6_info *rt;
3565 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3566 struct net_device *dev = fib6_info_nh_dev(rt);
3567 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3569 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3570 (!idev || idev->cnf.accept_ra != 2) &&
3571 fib6_info_hold_safe(rt)) {
3573 ip6_del_rt(net, rt);
3579 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3582 void rt6_purge_dflt_routers(struct net *net)
3584 struct fib6_table *table;
3585 struct hlist_head *head;
3590 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3591 head = &net->ipv6.fib_table_hash[h];
3592 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3593 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3594 __rt6_purge_dflt_routers(net, table);
3601 static void rtmsg_to_fib6_config(struct net *net,
3602 struct in6_rtmsg *rtmsg,
3603 struct fib6_config *cfg)
3605 *cfg = (struct fib6_config){
3606 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3607 : RT_TABLE_MAIN,
3608 .fc_ifindex = rtmsg->rtmsg_ifindex,
3609 .fc_metric = rtmsg->rtmsg_metric,
3610 .fc_expires = rtmsg->rtmsg_info,
3611 .fc_dst_len = rtmsg->rtmsg_dst_len,
3612 .fc_src_len = rtmsg->rtmsg_src_len,
3613 .fc_flags = rtmsg->rtmsg_flags,
3614 .fc_type = rtmsg->rtmsg_type,
3616 .fc_nlinfo.nl_net = net,
3618 .fc_dst = rtmsg->rtmsg_dst,
3619 .fc_src = rtmsg->rtmsg_src,
3620 .fc_gateway = rtmsg->rtmsg_gateway,
3624 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3626 struct fib6_config cfg;
3627 struct in6_rtmsg rtmsg;
3631 case SIOCADDRT: /* Add a route */
3632 case SIOCDELRT: /* Delete a route */
3633 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3635 err = copy_from_user(&rtmsg, arg,
3636 sizeof(struct in6_rtmsg));
3640 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3645 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3648 err = ip6_route_del(&cfg, NULL);
3661 /*
3662 * Drop the packet on the floor
3665 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3668 struct dst_entry *dst = skb_dst(skb);
3669 switch (ipstats_mib_noroutes) {
3670 case IPSTATS_MIB_INNOROUTES:
3671 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3672 if (type == IPV6_ADDR_ANY) {
3673 IP6_INC_STATS(dev_net(dst->dev),
3674 __in6_dev_get_safely(skb->dev),
3675 IPSTATS_MIB_INADDRERRORS);
3679 case IPSTATS_MIB_OUTNOROUTES:
3680 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3681 ipstats_mib_noroutes);
3684 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
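/* The four stubs below wire ip6_pkt_drop() into the blackhole and
 * prohibit dst entries: "discard" answers with ICMPV6_NOROUTE and
 * "prohibit" with ICMPV6_ADM_PROHIBITED, on the input and output
 * paths respectively.
 */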
3689 static int ip6_pkt_discard(struct sk_buff *skb)
3691 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3694 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3696 skb->dev = skb_dst(skb)->dev;
3697 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3700 static int ip6_pkt_prohibit(struct sk_buff *skb)
3702 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3705 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3707 skb->dev = skb_dst(skb)->dev;
3708 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3711 /*
3712 * Allocate a dst for local (unicast / anycast) address.
3715 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3716 struct inet6_dev *idev,
3717 const struct in6_addr *addr,
3718 bool anycast, gfp_t gfp_flags)
3721 struct net_device *dev = idev->dev;
3722 struct fib6_info *f6i;
3724 f6i = fib6_info_alloc(gfp_flags);
3726 return ERR_PTR(-ENOMEM);
3728 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3729 f6i->dst_nocount = true;
3730 f6i->dst_host = true;
3731 f6i->fib6_protocol = RTPROT_KERNEL;
3732 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3734 f6i->fib6_type = RTN_ANYCAST;
3735 f6i->fib6_flags |= RTF_ANYCAST;
3737 f6i->fib6_type = RTN_LOCAL;
3738 f6i->fib6_flags |= RTF_LOCAL;
3741 f6i->fib6_nh.nh_gw = *addr;
3743 f6i->fib6_nh.nh_dev = dev;
3744 f6i->fib6_dst.addr = *addr;
3745 f6i->fib6_dst.plen = 128;
3746 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3747 f6i->fib6_table = fib6_get_table(net, tb_id);
3752 /* remove deleted ip from prefsrc entries */
3753 struct arg_dev_net_ip {
3754 struct net_device *dev;
3756 struct in6_addr *addr;
3759 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3761 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3762 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3763 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3765 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3766 rt != net->ipv6.fib6_null_entry &&
3767 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3768 spin_lock_bh(&rt6_exception_lock);
3769 /* remove prefsrc entry */
3770 rt->fib6_prefsrc.plen = 0;
3771 spin_unlock_bh(&rt6_exception_lock);
3776 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3778 struct net *net = dev_net(ifp->idev->dev);
3779 struct arg_dev_net_ip adni = {
3780 .dev = ifp->idev->dev,
3784 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3787 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3789 /* Remove routers and update dst entries when a gateway turns into a host. */
3790 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3792 struct in6_addr *gateway = (struct in6_addr *)arg;
3794 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3795 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3799 /* Further clean up cached routes in the exception table.
3800 * This is needed because a cached route may have a different
3801 * gateway than its 'parent' in the case of an ip redirect.
3803 rt6_exceptions_clean_tohost(rt, gateway);
3808 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3810 fib6_clean_all(net, fib6_clean_tohost, gateway);
3813 struct arg_netdev_event {
3814 const struct net_device *dev;
3816 unsigned int nh_flags;
3817 unsigned long event;
3821 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3823 struct fib6_info *iter;
3824 struct fib6_node *fn;
3826 fn = rcu_dereference_protected(rt->fib6_node,
3827 lockdep_is_held(&rt->fib6_table->tb6_lock));
3828 iter = rcu_dereference_protected(fn->leaf,
3829 lockdep_is_held(&rt->fib6_table->tb6_lock));
3831 if (iter->fib6_metric == rt->fib6_metric &&
3832 rt6_qualify_for_ecmp(iter))
3834 iter = rcu_dereference_protected(iter->fib6_next,
3835 lockdep_is_held(&rt->fib6_table->tb6_lock));
3841 static bool rt6_is_dead(const struct fib6_info *rt)
3843 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3844 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3845 fib6_ignore_linkdown(rt)))
3851 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3853 struct fib6_info *iter;
3856 if (!rt6_is_dead(rt))
3857 total += rt->fib6_nh.nh_weight;
3859 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3860 if (!rt6_is_dead(iter))
3861 total += iter->fib6_nh.nh_weight;
3867 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3869 int upper_bound = -1;
3871 if (!rt6_is_dead(rt)) {
3872 *weight += rt->fib6_nh.nh_weight;
3873 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3876 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3879 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3881 struct fib6_info *iter;
3884 rt6_upper_bound_set(rt, &weight, total);
3886 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3887 rt6_upper_bound_set(iter, &weight, total);
3890 void rt6_multipath_rebalance(struct fib6_info *rt)
3892 struct fib6_info *first;
3895 /* In case the entire multipath route was marked for flushing,
3896 * then there is no need to rebalance upon the removal of every sibling route.
3899 if (!rt->fib6_nsiblings || rt->should_flush)
3902 /* During lookup routes are evaluated in order, so we need to
3903 * make sure upper bounds are assigned from the first sibling onwards.
3906 first = rt6_multipath_first_sibling(rt);
3907 if (WARN_ON_ONCE(!first))
3910 total = rt6_multipath_total_weight(first);
3911 rt6_multipath_upper_bound_set(first, total);
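/* Worked example of the bound arithmetic in rt6_upper_bound_set(): two
 * live siblings with weights 1 and 3 yield running weights 1 and 4 and
 * upper bounds of about (1/4) << 31 and (4/4) << 31. The multipath hash
 * lies in [0, 1 << 31], and the first sibling whose bound covers the
 * hash is chosen, so nexthop one serves ~25% of flows and nexthop two
 * the remaining ~75%.
 */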
3914 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3916 const struct arg_netdev_event *arg = p_arg;
3917 struct net *net = dev_net(arg->dev);
3919 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3920 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3921 fib6_update_sernum_upto_root(net, rt);
3922 rt6_multipath_rebalance(rt);
3928 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3930 struct arg_netdev_event arg = {
3933 .nh_flags = nh_flags,
3937 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3938 arg.nh_flags |= RTNH_F_LINKDOWN;
3940 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3943 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3944 const struct net_device *dev)
3946 struct fib6_info *iter;
3948 if (rt->fib6_nh.nh_dev == dev)
3950 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3951 if (iter->fib6_nh.nh_dev == dev)
3957 static void rt6_multipath_flush(struct fib6_info *rt)
3959 struct fib6_info *iter;
3961 rt->should_flush = 1;
3962 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3963 iter->should_flush = 1;
3966 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3967 const struct net_device *down_dev)
3969 struct fib6_info *iter;
3970 unsigned int dead = 0;
3972 if (rt->fib6_nh.nh_dev == down_dev ||
3973 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976 if (iter->fib6_nh.nh_dev == down_dev ||
3977 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3983 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3984 const struct net_device *dev,
3985 unsigned int nh_flags)
3987 struct fib6_info *iter;
3989 if (rt->fib6_nh.nh_dev == dev)
3990 rt->fib6_nh.nh_flags |= nh_flags;
3991 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3992 if (iter->fib6_nh.nh_dev == dev)
3993 iter->fib6_nh.nh_flags |= nh_flags;
3996 /* called with write lock held for table with rt */
3997 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3999 const struct arg_netdev_event *arg = p_arg;
4000 const struct net_device *dev = arg->dev;
4001 struct net *net = dev_net(dev);
4003 if (rt == net->ipv6.fib6_null_entry)
4006 switch (arg->event) {
4007 case NETDEV_UNREGISTER:
4008 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4010 if (rt->should_flush)
4012 if (!rt->fib6_nsiblings)
4013 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4014 if (rt6_multipath_uses_dev(rt, dev)) {
4017 count = rt6_multipath_dead_count(rt, dev);
4018 if (rt->fib6_nsiblings + 1 == count) {
4019 rt6_multipath_flush(rt);
4022 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4024 fib6_update_sernum(net, rt);
4025 rt6_multipath_rebalance(rt);
4029 if (rt->fib6_nh.nh_dev != dev ||
4030 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4032 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4033 rt6_multipath_rebalance(rt);
4040 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4042 struct arg_netdev_event arg = {
4048 struct net *net = dev_net(dev);
4050 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4051 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4053 fib6_clean_all(net, fib6_ifdown, &arg);
4056 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4058 rt6_sync_down_dev(dev, event);
4059 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4060 neigh_ifdown(&nd_tbl, dev);
4063 struct rt6_mtu_change_arg {
4064 struct net_device *dev;
4068 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4070 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4071 struct inet6_dev *idev;
4073 /* In IPv6, PMTU discovery is not optional,
4074 so the RTAX_MTU lock cannot disable it.
4075 We still use this lock to block changes
4076 caused by addrconf/ndisc.
4079 idev = __in6_dev_get(arg->dev);
4083 /* For an administrative MTU increase, there is no way to discover
4084 an IPv6 PMTU increase, so the PMTU should be updated here.
4085 Since RFC 1981 doesn't cover administrative MTU increases,
4086 updating the PMTU on increase is a MUST (i.e. jumbo frames).
4088 if (rt->fib6_nh.nh_dev == arg->dev &&
4089 !fib6_metric_locked(rt, RTAX_MTU)) {
4090 u32 mtu = rt->fib6_pmtu;
4092 if (mtu >= arg->mtu ||
4093 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4094 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4096 spin_lock_bh(&rt6_exception_lock);
4097 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4098 spin_unlock_bh(&rt6_exception_lock);
4103 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4105 struct rt6_mtu_change_arg arg = {
4110 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4113 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4114 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4115 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4116 [RTA_OIF] = { .type = NLA_U32 },
4117 [RTA_IIF] = { .type = NLA_U32 },
4118 [RTA_PRIORITY] = { .type = NLA_U32 },
4119 [RTA_METRICS] = { .type = NLA_NESTED },
4120 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4121 [RTA_PREF] = { .type = NLA_U8 },
4122 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4123 [RTA_ENCAP] = { .type = NLA_NESTED },
4124 [RTA_EXPIRES] = { .type = NLA_U32 },
4125 [RTA_UID] = { .type = NLA_U32 },
4126 [RTA_MARK] = { .type = NLA_U32 },
4127 [RTA_TABLE] = { .type = NLA_U32 },
4128 [RTA_IP_PROTO] = { .type = NLA_U8 },
4129 [RTA_SPORT] = { .type = NLA_U16 },
4130 [RTA_DPORT] = { .type = NLA_U16 },
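/* The policy above is applied by nlmsg_parse() in rtm_to_fib6_config()
 * and, for RTM_GETROUTE, by the strict parse in
 * inet6_rtm_valid_getroute_req(), which additionally rejects attributes
 * and header fields that legacy parsing silently accepted.
 */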
4133 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4134 struct fib6_config *cfg,
4135 struct netlink_ext_ack *extack)
4138 struct nlattr *tb[RTA_MAX+1];
4142 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4148 rtm = nlmsg_data(nlh);
4150 *cfg = (struct fib6_config){
4151 .fc_table = rtm->rtm_table,
4152 .fc_dst_len = rtm->rtm_dst_len,
4153 .fc_src_len = rtm->rtm_src_len,
4155 .fc_protocol = rtm->rtm_protocol,
4156 .fc_type = rtm->rtm_type,
4158 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4159 .fc_nlinfo.nlh = nlh,
4160 .fc_nlinfo.nl_net = sock_net(skb->sk),
4163 if (rtm->rtm_type == RTN_UNREACHABLE ||
4164 rtm->rtm_type == RTN_BLACKHOLE ||
4165 rtm->rtm_type == RTN_PROHIBIT ||
4166 rtm->rtm_type == RTN_THROW)
4167 cfg->fc_flags |= RTF_REJECT;
4169 if (rtm->rtm_type == RTN_LOCAL)
4170 cfg->fc_flags |= RTF_LOCAL;
4172 if (rtm->rtm_flags & RTM_F_CLONED)
4173 cfg->fc_flags |= RTF_CACHE;
4175 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4177 if (tb[RTA_GATEWAY]) {
4178 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4179 cfg->fc_flags |= RTF_GATEWAY;
4182 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4187 int plen = (rtm->rtm_dst_len + 7) >> 3;
4189 if (nla_len(tb[RTA_DST]) < plen)
4192 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4196 int plen = (rtm->rtm_src_len + 7) >> 3;
4198 if (nla_len(tb[RTA_SRC]) < plen)
4201 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4204 if (tb[RTA_PREFSRC])
4205 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4207 if (tb[RTA_OIF])
4208 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4210 if (tb[RTA_PRIORITY])
4211 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4213 if (tb[RTA_METRICS]) {
4214 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4215 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4218 if (tb[RTA_TABLE])
4219 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4221 if (tb[RTA_MULTIPATH]) {
4222 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4223 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4225 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4226 cfg->fc_mp_len, extack);
4231 if (tb[RTA_PREF]) {
4232 pref = nla_get_u8(tb[RTA_PREF]);
4233 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4234 pref != ICMPV6_ROUTER_PREF_HIGH)
4235 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4236 cfg->fc_flags |= RTF_PREF(pref);
4239 if (tb[RTA_ENCAP])
4240 cfg->fc_encap = tb[RTA_ENCAP];
4242 if (tb[RTA_ENCAP_TYPE]) {
4243 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4245 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4250 if (tb[RTA_EXPIRES]) {
4251 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4253 if (addrconf_finite_timeout(timeout)) {
4254 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4255 cfg->fc_flags |= RTF_EXPIRES;
4264 struct rt6_nh {
4265 struct fib6_info *fib6_info;
4266 struct fib6_config r_cfg;
4267 struct list_head next;
4270 static int ip6_route_info_append(struct net *net,
4271 struct list_head *rt6_nh_list,
4272 struct fib6_info *rt,
4273 struct fib6_config *r_cfg)
4278 list_for_each_entry(nh, rt6_nh_list, next) {
4279 /* check if fib6_info already exists */
4280 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4284 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4288 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4289 list_add_tail(&nh->next, rt6_nh_list);
4294 static void ip6_route_mpath_notify(struct fib6_info *rt,
4295 struct fib6_info *rt_last,
4296 struct nl_info *info,
4299 /* if this is an APPEND route, then rt points to the first route
4300 * inserted and rt_last points to last route inserted. Userspace
4301 * wants a consistent dump of the route which starts at the first
4302 * nexthop. Since sibling routes are always added at the end of
4303 * the list, find the first sibling of the last route appended
4305 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4306 rt = list_first_entry(&rt_last->fib6_siblings,
4312 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4315 static int ip6_route_multipath_add(struct fib6_config *cfg,
4316 struct netlink_ext_ack *extack)
4318 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4319 struct nl_info *info = &cfg->fc_nlinfo;
4320 struct fib6_config r_cfg;
4321 struct rtnexthop *rtnh;
4322 struct fib6_info *rt;
4323 struct rt6_nh *err_nh;
4324 struct rt6_nh *nh, *nh_safe;
4330 int replace = (cfg->fc_nlinfo.nlh &&
4331 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4332 LIST_HEAD(rt6_nh_list);
4334 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4335 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4336 nlflags |= NLM_F_APPEND;
4338 remaining = cfg->fc_mp_len;
4339 rtnh = (struct rtnexthop *)cfg->fc_mp;
4341 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4342 * fib6_info structs per nexthop
4344 while (rtnh_ok(rtnh, remaining)) {
4345 memcpy(&r_cfg, cfg, sizeof(*cfg));
4346 if (rtnh->rtnh_ifindex)
4347 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4349 attrlen = rtnh_attrlen(rtnh);
4351 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4353 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4355 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4356 r_cfg.fc_flags |= RTF_GATEWAY;
4358 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4359 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4361 r_cfg.fc_encap_type = nla_get_u16(nla);
4364 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4365 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4371 if (!rt6_qualify_for_ecmp(rt)) {
4373 NL_SET_ERR_MSG(extack,
4374 "Device only routes can not be added for IPv6 using the multipath API.");
4375 fib6_info_release(rt);
4379 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4381 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4384 fib6_info_release(rt);
4388 rtnh = rtnh_next(rtnh, &remaining);
4391 /* for add and replace send one notification with all nexthops.
4392 * Skip the notification in fib6_add_rt2node and send one with
4393 * the full route when done
4395 info->skip_notify = 1;
4398 list_for_each_entry(nh, &rt6_nh_list, next) {
4399 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4400 fib6_info_release(nh->fib6_info);
4403 /* save reference to last route successfully inserted */
4404 rt_last = nh->fib6_info;
4406 /* save reference to first route for notification */
4408 rt_notif = nh->fib6_info;
4411 /* nh->fib6_info is used or freed at this point, reset to NULL */
4412 nh->fib6_info = NULL;
4415 NL_SET_ERR_MSG_MOD(extack,
4416 "multipath route replace failed (check consistency of installed routes)");
4421 /* Because each route is added like a single route we remove
4422 * these flags after the first nexthop: if there is a collision,
4423 * we have already failed to add the first nexthop:
4424 * fib6_add_rt2node() has rejected it; when replacing, old
4425 * nexthops have been replaced by the first new one, the rest should be added to it.
4428 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4433 /* success ... tell user about new route */
4434 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4438 /* send notification for routes that were added so that
4439 * the delete notifications sent by ip6_route_del are coherent.
4443 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4445 /* Delete routes that were already added */
4446 list_for_each_entry(nh, &rt6_nh_list, next) {
4449 ip6_route_del(&nh->r_cfg, extack);
4453 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4455 fib6_info_release(nh->fib6_info);
4456 list_del(&nh->next);
4463 static int ip6_route_multipath_del(struct fib6_config *cfg,
4464 struct netlink_ext_ack *extack)
4466 struct fib6_config r_cfg;
4467 struct rtnexthop *rtnh;
4470 int err = 1, last_err = 0;
4472 remaining = cfg->fc_mp_len;
4473 rtnh = (struct rtnexthop *)cfg->fc_mp;
4475 /* Parse a Multipath Entry */
4476 while (rtnh_ok(rtnh, remaining)) {
4477 memcpy(&r_cfg, cfg, sizeof(*cfg));
4478 if (rtnh->rtnh_ifindex)
4479 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4481 attrlen = rtnh_attrlen(rtnh);
4483 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4485 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4487 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4488 r_cfg.fc_flags |= RTF_GATEWAY;
4491 err = ip6_route_del(&r_cfg, extack);
4495 rtnh = rtnh_next(rtnh, &remaining);
4501 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4502 struct netlink_ext_ack *extack)
4504 struct fib6_config cfg;
4507 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4512 return ip6_route_multipath_del(&cfg, extack);
4514 cfg.fc_delete_all_nh = 1;
4515 return ip6_route_del(&cfg, extack);
4519 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4520 struct netlink_ext_ack *extack)
4522 struct fib6_config cfg;
4525 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4530 return ip6_route_multipath_add(&cfg, extack);
4532 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4535 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4537 int nexthop_len = 0;
4539 if (rt->fib6_nsiblings) {
4540 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4541 + NLA_ALIGN(sizeof(struct rtnexthop))
4542 + nla_total_size(16) /* RTA_GATEWAY */
4543 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4545 nexthop_len *= rt->fib6_nsiblings;
4548 return NLMSG_ALIGN(sizeof(struct rtmsg))
4549 + nla_total_size(16) /* RTA_SRC */
4550 + nla_total_size(16) /* RTA_DST */
4551 + nla_total_size(16) /* RTA_GATEWAY */
4552 + nla_total_size(16) /* RTA_PREFSRC */
4553 + nla_total_size(4) /* RTA_TABLE */
4554 + nla_total_size(4) /* RTA_IIF */
4555 + nla_total_size(4) /* RTA_OIF */
4556 + nla_total_size(4) /* RTA_PRIORITY */
4557 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4558 + nla_total_size(sizeof(struct rta_cacheinfo))
4559 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4560 + nla_total_size(1) /* RTA_PREF */
4561 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4562 + nexthop_len;
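/* The figure computed above is an upper bound used to size notification
 * skbs (see inet6_rt_notify() and __ip6_del_rt_siblings()); if
 * rt6_fill_node() ever fails with -EMSGSIZE against such a buffer, the
 * estimate itself is buggy, which is what the WARN_ON in
 * inet6_rt_notify() asserts.
 */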
4565 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4566 unsigned int *flags, bool skip_oif)
4568 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4569 *flags |= RTNH_F_DEAD;
4571 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4572 *flags |= RTNH_F_LINKDOWN;
4575 if (fib6_ignore_linkdown(rt))
4576 *flags |= RTNH_F_DEAD;
4580 if (rt->fib6_flags & RTF_GATEWAY) {
4581 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4582 goto nla_put_failure;
4585 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4586 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4587 *flags |= RTNH_F_OFFLOAD;
4589 /* not needed for multipath encoding because it has an rtnexthop struct */
4590 if (!skip_oif && rt->fib6_nh.nh_dev &&
4591 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4592 goto nla_put_failure;
4594 if (rt->fib6_nh.nh_lwtstate &&
4595 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4596 goto nla_put_failure;
4604 /* add multipath next hop */
4605 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4607 const struct net_device *dev = rt->fib6_nh.nh_dev;
4608 struct rtnexthop *rtnh;
4609 unsigned int flags = 0;
4611 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4613 goto nla_put_failure;
4615 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4616 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4618 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4619 goto nla_put_failure;
4621 rtnh->rtnh_flags = flags;
4623 /* length of rtnetlink header + attributes */
4624 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4632 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4633 struct fib6_info *rt, struct dst_entry *dst,
4634 struct in6_addr *dest, struct in6_addr *src,
4635 int iif, int type, u32 portid, u32 seq,
4638 struct rt6_info *rt6 = (struct rt6_info *)dst;
4639 struct rt6key *rt6_dst, *rt6_src;
4640 u32 *pmetrics, table, rt6_flags;
4641 struct nlmsghdr *nlh;
4645 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4650 rt6_dst = &rt6->rt6i_dst;
4651 rt6_src = &rt6->rt6i_src;
4652 rt6_flags = rt6->rt6i_flags;
4654 rt6_dst = &rt->fib6_dst;
4655 rt6_src = &rt->fib6_src;
4656 rt6_flags = rt->fib6_flags;
4659 rtm = nlmsg_data(nlh);
4660 rtm->rtm_family = AF_INET6;
4661 rtm->rtm_dst_len = rt6_dst->plen;
4662 rtm->rtm_src_len = rt6_src->plen;
4665 table = rt->fib6_table->tb6_id;
4667 table = RT6_TABLE_UNSPEC;
4668 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4669 if (nla_put_u32(skb, RTA_TABLE, table))
4670 goto nla_put_failure;
4672 rtm->rtm_type = rt->fib6_type;
4674 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4675 rtm->rtm_protocol = rt->fib6_protocol;
4677 if (rt6_flags & RTF_CACHE)
4678 rtm->rtm_flags |= RTM_F_CLONED;
4681 if (nla_put_in6_addr(skb, RTA_DST, dest))
4682 goto nla_put_failure;
4683 rtm->rtm_dst_len = 128;
4684 } else if (rtm->rtm_dst_len)
4685 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4686 goto nla_put_failure;
4687 #ifdef CONFIG_IPV6_SUBTREES
4689 if (nla_put_in6_addr(skb, RTA_SRC, src))
4690 goto nla_put_failure;
4691 rtm->rtm_src_len = 128;
4692 } else if (rtm->rtm_src_len &&
4693 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4694 goto nla_put_failure;
4697 #ifdef CONFIG_IPV6_MROUTE
4698 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4699 int err = ip6mr_get_route(net, skb, rtm, portid);
4704 goto nla_put_failure;
4707 if (nla_put_u32(skb, RTA_IIF, iif))
4708 goto nla_put_failure;
4710 struct in6_addr saddr_buf;
4711 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4712 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4713 goto nla_put_failure;
4716 if (rt->fib6_prefsrc.plen) {
4717 struct in6_addr saddr_buf;
4718 saddr_buf = rt->fib6_prefsrc.addr;
4719 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4720 goto nla_put_failure;
4723 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4724 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4725 goto nla_put_failure;
4727 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4728 goto nla_put_failure;
4730 /* For multipath routes, walk the siblings list and add
4731 * each as a nexthop within RTA_MULTIPATH.
4734 if (rt6_flags & RTF_GATEWAY &&
4735 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4736 goto nla_put_failure;
4738 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4739 goto nla_put_failure;
4740 } else if (rt->fib6_nsiblings) {
4741 struct fib6_info *sibling, *next_sibling;
4744 mp = nla_nest_start(skb, RTA_MULTIPATH);
4746 goto nla_put_failure;
4748 if (rt6_add_nexthop(skb, rt) < 0)
4749 goto nla_put_failure;
4751 list_for_each_entry_safe(sibling, next_sibling,
4752 &rt->fib6_siblings, fib6_siblings) {
4753 if (rt6_add_nexthop(skb, sibling) < 0)
4754 goto nla_put_failure;
4757 nla_nest_end(skb, mp);
4759 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4760 goto nla_put_failure;
4763 if (rt6_flags & RTF_EXPIRES) {
4764 expires = dst ? dst->expires : rt->expires;
4768 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4769 goto nla_put_failure;
4771 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4772 goto nla_put_failure;
4775 nlmsg_end(skb, nlh);
4779 nlmsg_cancel(skb, nlh);
4783 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4784 const struct net_device *dev)
4786 if (f6i->fib6_nh.nh_dev == dev)
4789 if (f6i->fib6_nsiblings) {
4790 struct fib6_info *sibling, *next_sibling;
4792 list_for_each_entry_safe(sibling, next_sibling,
4793 &f6i->fib6_siblings, fib6_siblings) {
4794 if (sibling->fib6_nh.nh_dev == dev)
4802 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4804 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4805 struct fib_dump_filter *filter = &arg->filter;
4806 unsigned int flags = NLM_F_MULTI;
4807 struct net *net = arg->net;
4809 if (rt == net->ipv6.fib6_null_entry)
4812 if ((filter->flags & RTM_F_PREFIX) &&
4813 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4814 /* success since this is not a prefix route */
4817 if (filter->filter_set) {
4818 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4819 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4820 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4823 flags |= NLM_F_DUMP_FILTERED;
4826 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4827 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4828 arg->cb->nlh->nlmsg_seq, flags);
4831 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4832 const struct nlmsghdr *nlh,
4834 struct netlink_ext_ack *extack)
4839 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4840 NL_SET_ERR_MSG_MOD(extack,
4841 "Invalid header for get route request");
4845 if (!netlink_strict_get_check(skb))
4846 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4847 rtm_ipv6_policy, extack);
4849 rtm = nlmsg_data(nlh);
4850 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4851 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4852 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4854 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4857 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4858 NL_SET_ERR_MSG_MOD(extack,
4859 "Invalid flags for get route request");
4863 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4864 rtm_ipv6_policy, extack);
4868 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4869 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4870 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4874 for (i = 0; i <= RTA_MAX; i++) {
4890 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4898 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4899 struct netlink_ext_ack *extack)
4901 struct net *net = sock_net(in_skb->sk);
4902 struct nlattr *tb[RTA_MAX+1];
4903 int err, iif = 0, oif = 0;
4904 struct fib6_info *from;
4905 struct dst_entry *dst;
4906 struct rt6_info *rt;
4907 struct sk_buff *skb;
4909 struct flowi6 fl6 = {};
4912 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4917 rtm = nlmsg_data(nlh);
4918 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4919 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4922 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4925 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4929 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4932 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4935 if (tb[RTA_IIF])
4936 iif = nla_get_u32(tb[RTA_IIF]);
4938 if (tb[RTA_OIF])
4939 oif = nla_get_u32(tb[RTA_OIF]);
4941 if (tb[RTA_MARK])
4942 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4945 fl6.flowi6_uid = make_kuid(current_user_ns(),
4946 nla_get_u32(tb[RTA_UID]));
4948 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4950 if (tb[RTA_SPORT])
4951 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4953 if (tb[RTA_DPORT])
4954 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4956 if (tb[RTA_IP_PROTO]) {
4957 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4958 &fl6.flowi6_proto, AF_INET6,
4965 struct net_device *dev;
4970 dev = dev_get_by_index_rcu(net, iif);
4977 fl6.flowi6_iif = iif;
4979 if (!ipv6_addr_any(&fl6.saddr))
4980 flags |= RT6_LOOKUP_F_HAS_SADDR;
4982 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4986 fl6.flowi6_oif = oif;
4988 dst = ip6_route_output(net, NULL, &fl6);
4992 rt = container_of(dst, struct rt6_info, dst);
4993 if (rt->dst.error) {
4994 err = rt->dst.error;
4999 if (rt == net->ipv6.ip6_null_entry) {
5000 err = rt->dst.error;
5005 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5012 skb_dst_set(skb, &rt->dst);
5015 from = rcu_dereference(rt->from);
5018 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5019 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5022 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5023 &fl6.saddr, iif, RTM_NEWROUTE,
5024 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5033 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5038 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5039 unsigned int nlm_flags)
5041 struct sk_buff *skb;
5042 struct net *net = info->nl_net;
5047 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5049 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5053 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5054 event, info->portid, seq, nlm_flags);
5056 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5057 WARN_ON(err == -EMSGSIZE);
5061 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5062 info->nlh, gfp_any());
5066 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5069 static int ip6_route_dev_notify(struct notifier_block *this,
5070 unsigned long event, void *ptr)
5072 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5073 struct net *net = dev_net(dev);
5075 if (!(dev->flags & IFF_LOOPBACK))
5078 if (event == NETDEV_REGISTER) {
5079 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5080 net->ipv6.ip6_null_entry->dst.dev = dev;
5081 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5082 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5083 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5084 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5085 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5086 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5088 } else if (event == NETDEV_UNREGISTER &&
5089 dev->reg_state != NETREG_UNREGISTERED) {
5090 /* NETDEV_UNREGISTER can be fired multiple times by
5091 * netdev_wait_allrefs(). Make sure we only call this once.
5093 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5094 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5095 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5096 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
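
/* /proc/net/rt6_stats shows seven hex fields: fib nodes, route nodes,
 * route allocations, route entries, cached routes, dst entries in use
 * and discarded routes.
 */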
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
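
/* Write-only handler behind net.ipv6.route.flush (e.g.
 * "echo 1 > /proc/sys/net/ipv6/route/flush"): any successful write
 * triggers a fib6 garbage-collection run for this namespace.
 */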
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{ }
};
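
/* Clone the template for a namespace and repoint every entry's ->data
 * at the per-netns value; the table[N] indices below must stay in sync
 * with the entry order of ipv6_route_table_template above.
 */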
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
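
/* Per-namespace init: clone the dst_ops template, allocate the null
 * (and, with multiple tables, prohibit/blackhole) route entries, then
 * seed the GC and PMTU sysctl defaults for this namespace.
 */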
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
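
/* Per-namespace teardown: free the special route entries allocated in
 * ip6_route_net_init() and release the dst entry counters.
 */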
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
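
/* Late per-namespace init/exit: create and remove the
 * /proc/net/ipv6_route and /proc/net/rt6_stats entries.
 */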
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
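
/* Called from addrconf_init() once init_net's loopback device has been
 * registered; the device notifier above is registered too late to have
 * seen that event for init_net.
 */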
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not be taken; do it
	 * manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
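
/* Subsystem init: set up the rt6_info slab, per-netns state, the fib6
 * and xfrm6 cores, policy rules, the netlink handlers for RTM_NEWROUTE /
 * RTM_DELROUTE / RTM_GETROUTE (the latter serves e.g. "ip -6 route get")
 * and the per-cpu uncached route lists. Each failure step unwinds
 * everything registered before it.
 */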
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
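
/* Subsystem teardown: strictly the reverse of ip6_route_init(). */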
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}