// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux INET6 implementation
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  Otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include <linux/uaccess.h>

#include <linux/sysctl.h>
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
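
/* Sketch (illustrative only, not built): a caller that creates an uncached
 * RTF_CACHE clone pairs the helpers above, mirroring the FLOWI_FLAG_KNOWN_NH
 * path in ip6_pol_route() further down; removal normally happens implicitly
 * via ip6_dst_destroy().
 */
#if 0
	struct rt6_info *rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

	if (rt) {
		rt6_uncached_list_add(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
	}
#endif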
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
				dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
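
/* Note: "mtu ? : dst->dev->mtu" uses the GNU "?:" extension - a raw
 * RTAX_MTU metric of 0 means "unset", so we fall back to the device MTU.
 */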
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	from = xchg((__force struct fib6_info **)&rt->from, NULL);
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
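
/* Worked example (invented weights): with two siblings of weight 1 and 3,
 * fib_nh_upper_bound splits the 31-bit hash space roughly 25%/75%. A flow
 * whose mp_hash falls at or below the first bound sticks to the first
 * nexthop; anything above it falls through to the sibling walk above.
 */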
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct fib6_nh *fib6_nh)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!fib6_nh->fib_nh_gw_family)
		return;

	nh_gw = &fib6_nh->fib_nh_gw6;
	dev = fib6_nh->fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, fib6_nh->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		fib6_nh->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);

		if (n < 0)
			return n;
	}
	return m;
}
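
/* Score layout (illustrative; see IPV6_DECODE_PREF in net/ipv6.h): bit 1
 * (value 2) means "oif matched"; with CONFIG_IPV6_ROUTER_PREF the decoded
 * RA preference (low=1, medium=2, high=3) occupies bits 2-3, so an oif
 * match on a high-preference router scores 2 | (3 << 2) = 14.
 */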
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
	       res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
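
/* Context for the sanity checks above: per RFC 4191 the option "length"
 * is in units of 8 octets, so a length of 1 carries no prefix octets,
 * 2 carries 8 (enough for prefix_len <= 64) and 3 carries all 16.
 */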
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to f6i in result */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	if (res->f6i->fib6_destroying) {
		struct fib6_info *from;

		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
		fib6_info_release(from);
	}

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
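
/* Illustrative numbers: with FIB6_EXCEPTION_BUCKET_SIZE_SHIFT currently 10
 * in ip6_fib.h, hash_32() folds the jhash value into one of 1024 buckets;
 * rt6_insert_exception() then caps each bucket's chain at FIB6_MAX_DEPTH
 * entries via rt6_exception_remove_oldest().
 */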
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	unsigned int mtu;

	if (res->f6i->fib6_pmtu) {
		mtu = res->f6i->fib6_pmtu;
	} else {
		struct net_device *dev = nh->fib_nh_dev;
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
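
/* Sketch (not built): the PMTU path is the typical producer of exceptions.
 * It clones the route, applies the new MTU, and must release the clone
 * immediately if insertion is refused (see __ip6_rt_update_pmtu() below).
 */
#if 0
	nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
	if (nrt6) {
		rt6_do_update_pmtu(nrt6, mtu);
		if (rt6_insert_exception(nrt6, &res))
			dst_release_immediate(&nrt6->dst);
	}
#endif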
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	const struct in6_addr *src_key = NULL;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * However, the src addr used to create the hash
	 * might not be exactly the passed in saddr which
	 * is a /128 addr from the flow.
	 * So we need to use f6i->fib6_src to redo lookup
	 * if the passed in saddr does not find anything.
	 * (See the logic in ip6_rt_cache_alloc() on how
	 * rt->rt6i_src is updated.)
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
find_ex:
#endif
	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
	/* Use fib6_src as src_key and redo lookup */
	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
		src_key = &res->f6i->fib6_src.addr;
		goto find_ex;
	}
#endif

	return ret;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
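
/* Worked example: an exception carries PMTU 1400 on a link whose mtu6 is
 * 1500. Lowering to 1280 is allowed (1400 >= 1280). Raising to 1500 is
 * refused, since 1400 != 1500 means some remote hop imposed the limit; had
 * the exception matched the old link MTU exactly, raising it would be
 * allowed and PMTU discovery would rediscover any lower remote limit.
 */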
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree. It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
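
/* Example: with net.ipv6.fib_multipath_hash_policy=0 only addresses and the
 * flow label feed the hash, so all flows between two hosts share one path;
 * policy 1 also mixes in ports and the protocol. The final ">> 1" keeps the
 * result in [0, 2^31) so it is directly comparable with the
 * fib_nh_upper_bound thresholds used by fib6_select_path().
 */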
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* l3mdev_update_flow overrides oif if the device is enslaved; in
	 * this case we must match on the real ingress device, so reset it
	 */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		fl6->flowi6_oif = skb->dev->ifindex;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
}
2573 static struct dst_entry *ip6_route_redirect(struct net *net,
2574 const struct flowi6 *fl6,
2575 const struct sk_buff *skb,
2576 const struct in6_addr *gateway)
2578 int flags = RT6_LOOKUP_F_HAS_SADDR;
2579 struct ip6rd_flowi rdfl;
2582 rdfl.gateway = *gateway;
2584 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2585 flags, __ip6_route_redirect);
2588 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2589 kuid_t uid)
2591 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2592 struct dst_entry *dst;
2593 struct flowi6 fl6 = {
2594 .flowi6_iif = LOOPBACK_IFINDEX,
2595 .flowi6_oif = oif,
2596 .flowi6_mark = mark,
2597 .daddr = iph->daddr,
2598 .saddr = iph->saddr,
2599 .flowlabel = ip6_flowinfo(iph),
2600 .flowi6_uid = uid,
2601 };
2603 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2604 rt6_do_redirect(dst, NULL, skb);
2605 dst_release(dst);
2606 }
2607 EXPORT_SYMBOL_GPL(ip6_redirect);
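/* Illustrative caller (sketch): protocols and tunnels invoke ip6_redirect()
 * from their ICMPv6 error handlers when an NDISC redirect arrives for a
 * flow they own. The hook shape and "example_" name below are hypothetical;
 * skb->data is assumed to carry the redirected packet's IPv6 header.
 */
static void example_icmpv6_err_hook(struct sk_buff *skb, struct net *net,
				    int oif, u32 mark)
{
	if (icmp6_hdr(skb)->icmp6_type == NDISC_REDIRECT)
		ip6_redirect(skb, net, oif, mark, sock_net_uid(net, NULL));
}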
2609 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2611 const struct ipv6hdr *iph = ipv6_hdr(skb);
2612 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2613 struct dst_entry *dst;
2614 struct flowi6 fl6 = {
2615 .flowi6_iif = LOOPBACK_IFINDEX,
2616 .flowi6_oif = oif,
2617 .daddr = msg->dest,
2618 .saddr = iph->daddr,
2619 .flowi6_uid = sock_net_uid(net, NULL),
2622 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2623 rt6_do_redirect(dst, NULL, skb);
2624 dst_release(dst);
2625 }
2627 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2629 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2630 sk->sk_uid);
2631 }
2632 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2634 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2636 struct net_device *dev = dst->dev;
2637 unsigned int mtu = dst_mtu(dst);
2638 struct net *net = dev_net(dev);
2640 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2642 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2643 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2646 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2647 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2648 * IPV6_MAXPLEN is also valid and means: "any MSS,
2649 * rely only on pmtu discovery"
2651 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2652 mtu = IPV6_MAXPLEN;
2653 return mtu;
2654 }
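/* Worked example: on a standard 1500-byte Ethernet MTU, the advertised MSS
 * is 1500 - sizeof(struct ipv6hdr) (40) - sizeof(struct tcphdr) (20) = 1440,
 * unless the ip6_rt_min_advmss sysctl imposes a higher floor or the
 * IPV6_MAXPLEN clamp above kicks in for jumbo-sized device MTUs.
 */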
2656 static unsigned int ip6_mtu(const struct dst_entry *dst)
2658 struct inet6_dev *idev;
2661 mtu = dst_metric_raw(dst, RTAX_MTU);
2668 idev = __in6_dev_get(dst->dev);
2670 mtu = idev->cnf.mtu6;
2674 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2676 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2677 }
2679 /* MTU selection:
2680 * 1. mtu on route is locked - use it
2681 * 2. mtu from nexthop exception
2682 * 3. mtu from egress device
2684 * based on ip6_dst_mtu_forward and exception logic of
2685 * rt6_find_cached_rt; called with rcu_read_lock
2687 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2688 const struct in6_addr *daddr,
2689 const struct in6_addr *saddr)
2691 const struct fib6_nh *nh = res->nh;
2692 struct fib6_info *f6i = res->f6i;
2693 struct inet6_dev *idev;
2694 struct rt6_info *rt;
2697 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2698 mtu = f6i->fib6_pmtu;
2703 rt = rt6_find_cached_rt(res, daddr, saddr);
2705 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2707 struct net_device *dev = nh->fib_nh_dev;
2710 idev = __in6_dev_get(dev);
2711 if (idev && idev->cnf.mtu6 > mtu)
2712 mtu = idev->cnf.mtu6;
2715 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2717 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2718 }
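/* Illustrative use (sketch): a forwarding path that already holds
 * rcu_read_lock() and a filled-in fib6_result can derive the effective
 * path MTU in the lock/exception/device order documented above; the
 * "example_" name is hypothetical.
 */
static u32 example_forwarding_mtu(const struct fib6_result *res,
				  const struct in6_addr *daddr,
				  const struct in6_addr *saddr)
{
	/* caller must hold rcu_read_lock(), as ip6_mtu_from_fib6 requires */
	return ip6_mtu_from_fib6(res, daddr, saddr);
}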
2720 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2723 struct dst_entry *dst;
2724 struct rt6_info *rt;
2725 struct inet6_dev *idev = in6_dev_get(dev);
2726 struct net *net = dev_net(dev);
2728 if (unlikely(!idev))
2729 return ERR_PTR(-ENODEV);
2731 rt = ip6_dst_alloc(net, dev, 0);
2732 if (unlikely(!rt)) {
2734 dst = ERR_PTR(-ENOMEM);
2738 rt->dst.flags |= DST_HOST;
2739 rt->dst.input = ip6_input;
2740 rt->dst.output = ip6_output;
2741 rt->rt6i_gateway = fl6->daddr;
2742 rt->rt6i_dst.addr = fl6->daddr;
2743 rt->rt6i_dst.plen = 128;
2744 rt->rt6i_idev = idev;
2745 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2747 /* Add this dst into uncached_list so that rt6_disable_ip() can
2748 * do proper release of the net_device
2750 rt6_uncached_list_add(rt);
2751 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2753 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2759 static int ip6_dst_gc(struct dst_ops *ops)
2761 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2762 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2763 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2764 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2765 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2766 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2769 entries = dst_entries_get_fast(ops);
2770 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2771 entries <= rt_max_size)
2772 goto out;
2774 net->ipv6.ip6_rt_gc_expire++;
2775 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2776 entries = dst_entries_get_slow(ops);
2777 if (entries < ops->gc_thresh)
2778 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2779 out:
2780 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2781 return entries > rt_max_size;
2782 }
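/* Worked example of the backoff above: ip6_rt_gc_expire grows by one on
 * every forced GC pass; once a pass brings the entry count under
 * gc_thresh it is reset to gc_timeout/2, and otherwise it only decays by
 * expire >> elasticity per call. Sustained pressure therefore makes GC
 * progressively more aggressive, while quiet periods relax it again.
 */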
2784 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2785 struct fib6_config *cfg,
2786 const struct in6_addr *gw_addr,
2787 u32 tbid, int flags)
2789 struct flowi6 fl6 = {
2790 .flowi6_oif = cfg->fc_ifindex,
2792 .saddr = cfg->fc_prefsrc,
2794 struct fib6_table *table;
2795 struct rt6_info *rt;
2797 table = fib6_get_table(net, tbid);
2801 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2802 flags |= RT6_LOOKUP_F_HAS_SADDR;
2804 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2805 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2807 /* if table lookup failed, fall back to full lookup */
2808 if (rt == net->ipv6.ip6_null_entry) {
2816 static int ip6_route_check_nh_onlink(struct net *net,
2817 struct fib6_config *cfg,
2818 const struct net_device *dev,
2819 struct netlink_ext_ack *extack)
2821 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2822 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2823 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2824 struct fib6_info *from;
2825 struct rt6_info *grt;
2829 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2832 from = rcu_dereference(grt->from);
2833 if (!grt->dst.error &&
2834 /* ignore match if it is the default route */
2835 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2836 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2837 NL_SET_ERR_MSG(extack,
2838 "Nexthop has invalid gateway or device mismatch");
2849 static int ip6_route_check_nh(struct net *net,
2850 struct fib6_config *cfg,
2851 struct net_device **_dev,
2852 struct inet6_dev **idev)
2854 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2855 struct net_device *dev = _dev ? *_dev : NULL;
2856 struct rt6_info *grt = NULL;
2857 int err = -EHOSTUNREACH;
2859 if (cfg->fc_table) {
2860 int flags = RT6_LOOKUP_F_IFACE;
2862 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2863 cfg->fc_table, flags);
2865 if (grt->rt6i_flags & RTF_GATEWAY ||
2866 (dev && dev != grt->dst.dev)) {
2874 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2880 if (dev != grt->dst.dev) {
2885 *_dev = dev = grt->dst.dev;
2886 *idev = grt->rt6i_idev;
2888 in6_dev_hold(grt->rt6i_idev);
2891 if (!(grt->rt6i_flags & RTF_GATEWAY))
2900 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2901 struct net_device **_dev, struct inet6_dev **idev,
2902 struct netlink_ext_ack *extack)
2904 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2905 int gwa_type = ipv6_addr_type(gw_addr);
2906 bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
2907 const struct net_device *dev = *_dev;
2908 bool need_addr_check = !dev;
2911 /* if gw_addr is local we will fail to detect this in case
2912 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2913 * will return already-added prefix route via interface that
2914 * prefix route was assigned to, which might be non-loopback.
2917 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2918 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2922 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2923 /* IPv6 strictly inhibits using not link-local
2924 * addresses as nexthop address.
2925 * Otherwise, the router will not be able to send redirects.
2926 * It is very good, but in some (rare!) circumstances
2927 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2928 * some exceptions. --ANK
2929 * We allow IPv4-mapped nexthops to support RFC4798-type
2930 * addressing.
2931 */
2932 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2933 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2937 if (cfg->fc_flags & RTNH_F_ONLINK)
2938 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2940 err = ip6_route_check_nh(net, cfg, _dev, idev);
2946 /* reload in case device was changed */
2951 NL_SET_ERR_MSG(extack, "Egress device not specified");
2953 } else if (dev->flags & IFF_LOOPBACK) {
2954 NL_SET_ERR_MSG(extack,
2955 "Egress device can not be loopback device for this route");
2959 /* if we did not check gw_addr above, do so now that the
2960 * egress device has been resolved.
2962 if (need_addr_check &&
2963 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2964 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2973 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2975 if ((flags & RTF_REJECT) ||
2976 (dev && (dev->flags & IFF_LOOPBACK) &&
2977 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2978 !(flags & RTF_LOCAL)))
2984 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2985 struct fib6_config *cfg, gfp_t gfp_flags,
2986 struct netlink_ext_ack *extack)
2988 struct net_device *dev = NULL;
2989 struct inet6_dev *idev = NULL;
2993 fib6_nh->fib_nh_family = AF_INET6;
2996 if (cfg->fc_ifindex) {
2997 dev = dev_get_by_index(net, cfg->fc_ifindex);
3000 idev = in6_dev_get(dev);
3005 if (cfg->fc_flags & RTNH_F_ONLINK) {
3007 NL_SET_ERR_MSG(extack,
3008 "Nexthop device required for onlink");
3012 if (!(dev->flags & IFF_UP)) {
3013 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3018 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3021 fib6_nh->fib_nh_weight = 1;
3023 /* We cannot add true routes via loopback here,
3024 * they would result in kernel looping; promote them to reject routes
3026 addr_type = ipv6_addr_type(&cfg->fc_dst);
3027 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3028 /* hold loopback dev/idev if we haven't done so. */
3029 if (dev != net->loopback_dev) {
3034 dev = net->loopback_dev;
3036 idev = in6_dev_get(dev);
3045 if (cfg->fc_flags & RTF_GATEWAY) {
3046 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3050 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3051 fib6_nh->fib_nh_gw_family = AF_INET6;
3058 if (idev->cnf.disable_ipv6) {
3059 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3064 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3065 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3070 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3071 !netif_carrier_ok(dev))
3072 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3074 err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3075 cfg->fc_encap_type, cfg, gfp_flags, extack);
3079 fib6_nh->fib_nh_dev = dev;
3080 fib6_nh->fib_nh_oif = dev->ifindex;
3087 lwtstate_put(fib6_nh->fib_nh_lws);
3088 fib6_nh->fib_nh_lws = NULL;
3096 void fib6_nh_release(struct fib6_nh *fib6_nh)
3098 fib_nh_common_release(&fib6_nh->nh_common);
3099 }
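/* Usage note (illustrative sketch): fib6_nh_init() and fib6_nh_release()
 * must be paired; a caller embedding a struct fib6_nh typically does:
 *
 *	err = fib6_nh_init(net, &rt->fib6_nh, cfg, GFP_KERNEL, extack);
 *	if (err)
 *		goto out;
 *	...
 *	fib6_nh_release(&rt->fib6_nh);	// on teardown or error unwind
 */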
3101 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3102 gfp_t gfp_flags,
3103 struct netlink_ext_ack *extack)
3105 struct net *net = cfg->fc_nlinfo.nl_net;
3106 struct fib6_info *rt = NULL;
3107 struct fib6_table *table;
3111 /* RTF_PCPU is an internal flag; can not be set by userspace */
3112 if (cfg->fc_flags & RTF_PCPU) {
3113 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3117 /* RTF_CACHE is an internal flag; can not be set by userspace */
3118 if (cfg->fc_flags & RTF_CACHE) {
3119 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3123 if (cfg->fc_type > RTN_MAX) {
3124 NL_SET_ERR_MSG(extack, "Invalid route type");
3128 if (cfg->fc_dst_len > 128) {
3129 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3132 if (cfg->fc_src_len > 128) {
3133 NL_SET_ERR_MSG(extack, "Invalid source address length");
3136 #ifndef CONFIG_IPV6_SUBTREES
3137 if (cfg->fc_src_len) {
3138 NL_SET_ERR_MSG(extack,
3139 "Specifying source address requires IPV6_SUBTREES to be enabled");
3145 if (cfg->fc_nlinfo.nlh &&
3146 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3147 table = fib6_get_table(net, cfg->fc_table);
3149 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3150 table = fib6_new_table(net, cfg->fc_table);
3153 table = fib6_new_table(net, cfg->fc_table);
3160 rt = fib6_info_alloc(gfp_flags);
3164 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3166 if (IS_ERR(rt->fib6_metrics)) {
3167 err = PTR_ERR(rt->fib6_metrics);
3168 /* Do not leave garbage there. */
3169 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3173 if (cfg->fc_flags & RTF_ADDRCONF)
3174 rt->dst_nocount = true;
3176 if (cfg->fc_flags & RTF_EXPIRES)
3177 fib6_set_expires(rt, jiffies +
3178 clock_t_to_jiffies(cfg->fc_expires));
3180 fib6_clean_expires(rt);
3182 if (cfg->fc_protocol == RTPROT_UNSPEC)
3183 cfg->fc_protocol = RTPROT_BOOT;
3184 rt->fib6_protocol = cfg->fc_protocol;
3186 rt->fib6_table = table;
3187 rt->fib6_metric = cfg->fc_metric;
3188 rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3189 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3191 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3192 rt->fib6_dst.plen = cfg->fc_dst_len;
3193 if (rt->fib6_dst.plen == 128)
3194 rt->dst_host = true;
3196 #ifdef CONFIG_IPV6_SUBTREES
3197 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3198 rt->fib6_src.plen = cfg->fc_src_len;
3200 err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3204 /* We cannot add true routes via loopback here,
3205 * they would result in kernel looping; promote them to reject routes
3207 addr_type = ipv6_addr_type(&cfg->fc_dst);
3208 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3209 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3211 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3212 struct net_device *dev = fib6_info_nh_dev(rt);
3214 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3215 NL_SET_ERR_MSG(extack, "Invalid source address");
3219 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3220 rt->fib6_prefsrc.plen = 128;
3222 rt->fib6_prefsrc.plen = 0;
3226 fib6_info_release(rt);
3227 return ERR_PTR(err);
3230 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3231 struct netlink_ext_ack *extack)
3233 struct fib6_info *rt;
3236 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3240 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3241 fib6_info_release(rt);
3246 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3248 struct net *net = info->nl_net;
3249 struct fib6_table *table;
3252 if (rt == net->ipv6.fib6_null_entry) {
3257 table = rt->fib6_table;
3258 spin_lock_bh(&table->tb6_lock);
3259 err = fib6_del(rt, info);
3260 spin_unlock_bh(&table->tb6_lock);
3263 fib6_info_release(rt);
3267 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3269 struct nl_info info = { .nl_net = net };
3271 return __ip6_del_rt(rt, &info);
3274 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3276 struct nl_info *info = &cfg->fc_nlinfo;
3277 struct net *net = info->nl_net;
3278 struct sk_buff *skb = NULL;
3279 struct fib6_table *table;
3282 if (rt == net->ipv6.fib6_null_entry)
3284 table = rt->fib6_table;
3285 spin_lock_bh(&table->tb6_lock);
3287 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3288 struct fib6_info *sibling, *next_sibling;
3290 /* prefer to send a single notification with all hops */
3291 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3293 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3295 if (rt6_fill_node(net, skb, rt, NULL,
3296 NULL, NULL, 0, RTM_DELROUTE,
3297 info->portid, seq, 0) < 0) {
3301 info->skip_notify = 1;
3304 list_for_each_entry_safe(sibling, next_sibling,
3307 err = fib6_del(sibling, info);
3313 err = fib6_del(rt, info);
3315 spin_unlock_bh(&table->tb6_lock);
3317 fib6_info_release(rt);
3320 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3321 info->nlh, gfp_any());
3326 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3330 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3333 if (cfg->fc_flags & RTF_GATEWAY &&
3334 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3337 rc = rt6_remove_exception_rt(rt);
3342 static int ip6_route_del(struct fib6_config *cfg,
3343 struct netlink_ext_ack *extack)
3345 struct rt6_info *rt_cache;
3346 struct fib6_table *table;
3347 struct fib6_info *rt;
3348 struct fib6_node *fn;
3351 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3353 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3359 fn = fib6_locate(&table->tb6_root,
3360 &cfg->fc_dst, cfg->fc_dst_len,
3361 &cfg->fc_src, cfg->fc_src_len,
3362 !(cfg->fc_flags & RTF_CACHE));
3365 for_each_fib6_node_rt_rcu(fn) {
3368 if (cfg->fc_flags & RTF_CACHE) {
3369 struct fib6_result res = {
3374 rt_cache = rt6_find_cached_rt(&res,
3378 rc = ip6_del_cached_rt(rt_cache, cfg);
3388 if (cfg->fc_ifindex &&
3390 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3392 if (cfg->fc_flags & RTF_GATEWAY &&
3393 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3395 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3397 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3399 if (!fib6_info_hold_safe(rt))
3403 /* if gateway was specified only delete the one hop */
3404 if (cfg->fc_flags & RTF_GATEWAY)
3405 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3407 return __ip6_del_rt_siblings(rt, cfg);
3415 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3417 struct netevent_redirect netevent;
3418 struct rt6_info *rt, *nrt = NULL;
3419 struct fib6_result res = {};
3420 struct ndisc_options ndopts;
3421 struct inet6_dev *in6_dev;
3422 struct neighbour *neigh;
3424 int optlen, on_link;
3425 u8 *lladdr;
3427 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3428 optlen -= sizeof(*msg);
3431 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3435 msg = (struct rd_msg *)icmp6_hdr(skb);
3437 if (ipv6_addr_is_multicast(&msg->dest)) {
3438 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3443 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3445 } else if (ipv6_addr_type(&msg->target) !=
3446 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3447 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3451 in6_dev = __in6_dev_get(skb->dev);
3452 if (!in6_dev)
3453 return;
3454 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3455 return;
3458 * The IP source address of the Redirect MUST be the same as the current
3459 * first-hop router for the specified ICMP Destination Address.
3462 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3463 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3468 if (ndopts.nd_opts_tgt_lladdr) {
3469 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3472 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3477 rt = (struct rt6_info *) dst;
3478 if (rt->rt6i_flags & RTF_REJECT) {
3479 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3483 /* Redirect received -> path was valid.
3484 * Look, redirects are sent only in response to data packets,
3485 * so that this nexthop apparently is reachable. --ANK
3487 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3489 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3494 * We have finally decided to accept it.
3497 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3498 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3499 NEIGH_UPDATE_F_OVERRIDE|
3500 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3501 NEIGH_UPDATE_F_ISROUTER)),
3502 NDISC_REDIRECT, &ndopts);
3505 res.f6i = rcu_dereference(rt->from);
3509 res.nh = &res.f6i->fib6_nh;
3510 res.fib6_flags = res.f6i->fib6_flags;
3511 res.fib6_type = res.f6i->fib6_type;
3512 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3516 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3518 nrt->rt6i_flags &= ~RTF_GATEWAY;
3520 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3522 /* rt6_insert_exception() will take care of duplicated exceptions */
3523 if (rt6_insert_exception(nrt, &res)) {
3524 dst_release_immediate(&nrt->dst);
3528 netevent.old = &rt->dst;
3529 netevent.new = &nrt->dst;
3530 netevent.daddr = &msg->dest;
3531 netevent.neigh = neigh;
3532 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3536 neigh_release(neigh);
3539 #ifdef CONFIG_IPV6_ROUTE_INFO
3540 static struct fib6_info *rt6_get_route_info(struct net *net,
3541 const struct in6_addr *prefix, int prefixlen,
3542 const struct in6_addr *gwaddr,
3543 struct net_device *dev)
3545 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3546 int ifindex = dev->ifindex;
3547 struct fib6_node *fn;
3548 struct fib6_info *rt = NULL;
3549 struct fib6_table *table;
3551 table = fib6_get_table(net, tb_id);
3556 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3560 for_each_fib6_node_rt_rcu(fn) {
3561 if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3563 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3564 !rt->fib6_nh.fib_nh_gw_family)
3566 if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3568 if (!fib6_info_hold_safe(rt))
3577 static struct fib6_info *rt6_add_route_info(struct net *net,
3578 const struct in6_addr *prefix, int prefixlen,
3579 const struct in6_addr *gwaddr,
3580 struct net_device *dev,
3583 struct fib6_config cfg = {
3584 .fc_metric = IP6_RT_PRIO_USER,
3585 .fc_ifindex = dev->ifindex,
3586 .fc_dst_len = prefixlen,
3587 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3588 RTF_UP | RTF_PREF(pref),
3589 .fc_protocol = RTPROT_RA,
3590 .fc_type = RTN_UNICAST,
3591 .fc_nlinfo.portid = 0,
3592 .fc_nlinfo.nlh = NULL,
3593 .fc_nlinfo.nl_net = net,
3596 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3597 cfg.fc_dst = *prefix;
3598 cfg.fc_gateway = *gwaddr;
3600 /* We should treat it as a default route if prefix length is 0. */
3601 if (!prefixlen)
3602 cfg.fc_flags |= RTF_DEFAULT;
3604 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3606 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3610 struct fib6_info *rt6_get_dflt_router(struct net *net,
3611 const struct in6_addr *addr,
3612 struct net_device *dev)
3614 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3615 struct fib6_info *rt;
3616 struct fib6_table *table;
3618 table = fib6_get_table(net, tb_id);
3623 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3624 struct fib6_nh *nh = &rt->fib6_nh;
3626 if (dev == nh->fib_nh_dev &&
3627 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3628 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3631 if (rt && !fib6_info_hold_safe(rt))
3637 struct fib6_info *rt6_add_dflt_router(struct net *net,
3638 const struct in6_addr *gwaddr,
3639 struct net_device *dev,
3642 struct fib6_config cfg = {
3643 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3644 .fc_metric = IP6_RT_PRIO_USER,
3645 .fc_ifindex = dev->ifindex,
3646 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3647 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3648 .fc_protocol = RTPROT_RA,
3649 .fc_type = RTN_UNICAST,
3650 .fc_nlinfo.portid = 0,
3651 .fc_nlinfo.nlh = NULL,
3652 .fc_nlinfo.nl_net = net,
3655 cfg.fc_gateway = *gwaddr;
3657 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3658 struct fib6_table *table;
3660 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3662 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3665 return rt6_get_dflt_router(net, gwaddr, dev);
3668 static void __rt6_purge_dflt_routers(struct net *net,
3669 struct fib6_table *table)
3671 struct fib6_info *rt;
3675 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3676 struct net_device *dev = fib6_info_nh_dev(rt);
3677 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3679 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3680 (!idev || idev->cnf.accept_ra != 2) &&
3681 fib6_info_hold_safe(rt)) {
3683 ip6_del_rt(net, rt);
3689 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3692 void rt6_purge_dflt_routers(struct net *net)
3694 struct fib6_table *table;
3695 struct hlist_head *head;
3700 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3701 head = &net->ipv6.fib_table_hash[h];
3702 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3703 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3704 __rt6_purge_dflt_routers(net, table);
3711 static void rtmsg_to_fib6_config(struct net *net,
3712 struct in6_rtmsg *rtmsg,
3713 struct fib6_config *cfg)
3715 *cfg = (struct fib6_config){
3716 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3718 .fc_ifindex = rtmsg->rtmsg_ifindex,
3719 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3720 .fc_expires = rtmsg->rtmsg_info,
3721 .fc_dst_len = rtmsg->rtmsg_dst_len,
3722 .fc_src_len = rtmsg->rtmsg_src_len,
3723 .fc_flags = rtmsg->rtmsg_flags,
3724 .fc_type = rtmsg->rtmsg_type,
3726 .fc_nlinfo.nl_net = net,
3728 .fc_dst = rtmsg->rtmsg_dst,
3729 .fc_src = rtmsg->rtmsg_src,
3730 .fc_gateway = rtmsg->rtmsg_gateway,
3734 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3736 struct fib6_config cfg;
3737 struct in6_rtmsg rtmsg;
3741 case SIOCADDRT: /* Add a route */
3742 case SIOCDELRT: /* Delete a route */
3743 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3745 err = copy_from_user(&rtmsg, arg,
3746 sizeof(struct in6_rtmsg));
3750 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3755 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3758 err = ip6_route_del(&cfg, NULL);
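/* Userspace view (illustrative sketch): the legacy SIOCADDRT/SIOCDELRT
 * path above consumes a struct in6_rtmsg, e.g.:
 *
 *	struct in6_rtmsg rt = {};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	rt.rtmsg_dst_len = 64;
 *	rt.rtmsg_metric = 1;
 *	rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
 *	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);
 *	inet_pton(AF_INET6, "2001:db8::1", &rt.rtmsg_gateway);
 *	ioctl(fd, SIOCADDRT, &rt);	// requires CAP_NET_ADMIN
 */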
3772 * Drop the packet on the floor
3775 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3777 struct dst_entry *dst = skb_dst(skb);
3778 struct net *net = dev_net(dst->dev);
3779 struct inet6_dev *idev;
3782 if (netif_is_l3_master(skb->dev) &&
3783 dst->dev == net->loopback_dev)
3784 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3786 idev = ip6_dst_idev(dst);
3788 switch (ipstats_mib_noroutes) {
3789 case IPSTATS_MIB_INNOROUTES:
3790 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3791 if (type == IPV6_ADDR_ANY) {
3792 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3796 case IPSTATS_MIB_OUTNOROUTES:
3797 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3801 /* Start over by dropping the dst for l3mdev case */
3802 if (netif_is_l3_master(skb->dev))
3805 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3810 static int ip6_pkt_discard(struct sk_buff *skb)
3812 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3815 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3817 skb->dev = skb_dst(skb)->dev;
3818 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3821 static int ip6_pkt_prohibit(struct sk_buff *skb)
3823 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3826 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3828 skb->dev = skb_dst(skb)->dev;
3829 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3833 * Allocate a dst for local (unicast / anycast) address.
3836 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3837 struct inet6_dev *idev,
3838 const struct in6_addr *addr,
3839 bool anycast, gfp_t gfp_flags)
3841 struct fib6_config cfg = {
3842 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3843 .fc_ifindex = idev->dev->ifindex,
3844 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3847 .fc_protocol = RTPROT_KERNEL,
3848 .fc_nlinfo.nl_net = net,
3849 .fc_ignore_dev_down = true,
3853 cfg.fc_type = RTN_ANYCAST;
3854 cfg.fc_flags |= RTF_ANYCAST;
3856 cfg.fc_type = RTN_LOCAL;
3857 cfg.fc_flags |= RTF_LOCAL;
3860 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3863 /* remove deleted ip from prefsrc entries */
3864 struct arg_dev_net_ip {
3865 struct net_device *dev;
3867 struct in6_addr *addr;
3870 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3872 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3873 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3874 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3876 if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3877 rt != net->ipv6.fib6_null_entry &&
3878 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3879 spin_lock_bh(&rt6_exception_lock);
3880 /* remove prefsrc entry */
3881 rt->fib6_prefsrc.plen = 0;
3882 spin_unlock_bh(&rt6_exception_lock);
3887 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3889 struct net *net = dev_net(ifp->idev->dev);
3890 struct arg_dev_net_ip adni = {
3891 .dev = ifp->idev->dev,
3895 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3898 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
3900 /* Remove routers and update dst entries when gateway turn into host. */
3901 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3903 struct in6_addr *gateway = (struct in6_addr *)arg;
3905 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3906 rt->fib6_nh.fib_nh_gw_family &&
3907 ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3911 /* Further clean up cached routes in exception table.
3912 * This is needed because cached route may have a different
3913 * gateway than its 'parent' in the case of an ip redirect.
3915 rt6_exceptions_clean_tohost(rt, gateway);
3920 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3922 fib6_clean_all(net, fib6_clean_tohost, gateway);
3925 struct arg_netdev_event {
3926 const struct net_device *dev;
3928 unsigned char nh_flags;
3929 unsigned long event;
3933 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3935 struct fib6_info *iter;
3936 struct fib6_node *fn;
3938 fn = rcu_dereference_protected(rt->fib6_node,
3939 lockdep_is_held(&rt->fib6_table->tb6_lock));
3940 iter = rcu_dereference_protected(fn->leaf,
3941 lockdep_is_held(&rt->fib6_table->tb6_lock));
3943 if (iter->fib6_metric == rt->fib6_metric &&
3944 rt6_qualify_for_ecmp(iter))
3946 iter = rcu_dereference_protected(iter->fib6_next,
3947 lockdep_is_held(&rt->fib6_table->tb6_lock));
3953 static bool rt6_is_dead(const struct fib6_info *rt)
3955 if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3956 (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3957 ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3963 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3965 struct fib6_info *iter;
3968 if (!rt6_is_dead(rt))
3969 total += rt->fib6_nh.fib_nh_weight;
3971 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3972 if (!rt6_is_dead(iter))
3973 total += iter->fib6_nh.fib_nh_weight;
3979 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3981 int upper_bound = -1;
3983 if (!rt6_is_dead(rt)) {
3984 *weight += rt->fib6_nh.fib_nh_weight;
3985 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3986 total) - 1;
3987 }
3988 atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3991 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3993 struct fib6_info *iter;
3996 rt6_upper_bound_set(rt, &weight, total);
3998 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3999 rt6_upper_bound_set(iter, &weight, total);
4000 }
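/* Worked example: two live siblings with weights 1 and 3 give cumulative
 * weights 1 and 4 out of a total of 4, so the stored upper bounds are
 * round(1 * 2^31 / 4) - 1 = 0x1FFFFFFF and round(4 * 2^31 / 4) - 1 =
 * 0x7FFFFFFF; a 31-bit flow hash selects the first nexthop whose bound
 * it does not exceed, i.e. roughly a 25%/75% split.
 */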
4002 void rt6_multipath_rebalance(struct fib6_info *rt)
4004 struct fib6_info *first;
4007 /* In case the entire multipath route was marked for flushing,
4008 * then there is no need to rebalance upon the removal of every
4011 if (!rt->fib6_nsiblings || rt->should_flush)
4014 /* During lookup routes are evaluated in order, so we need to
4015 * make sure upper bounds are assigned from the first sibling
4018 first = rt6_multipath_first_sibling(rt);
4019 if (WARN_ON_ONCE(!first))
4022 total = rt6_multipath_total_weight(first);
4023 rt6_multipath_upper_bound_set(first, total);
4026 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4028 const struct arg_netdev_event *arg = p_arg;
4029 struct net *net = dev_net(arg->dev);
4031 if (rt != net->ipv6.fib6_null_entry &&
4032 rt->fib6_nh.fib_nh_dev == arg->dev) {
4033 rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4034 fib6_update_sernum_upto_root(net, rt);
4035 rt6_multipath_rebalance(rt);
4041 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4043 struct arg_netdev_event arg = {
4046 .nh_flags = nh_flags,
4050 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4051 arg.nh_flags |= RTNH_F_LINKDOWN;
4053 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4056 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4057 const struct net_device *dev)
4059 struct fib6_info *iter;
4061 if (rt->fib6_nh.fib_nh_dev == dev)
4063 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4064 if (iter->fib6_nh.fib_nh_dev == dev)
4070 static void rt6_multipath_flush(struct fib6_info *rt)
4072 struct fib6_info *iter;
4074 rt->should_flush = 1;
4075 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076 iter->should_flush = 1;
4079 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4080 const struct net_device *down_dev)
4082 struct fib6_info *iter;
4083 unsigned int dead = 0;
4085 if (rt->fib6_nh.fib_nh_dev == down_dev ||
4086 rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4088 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4089 if (iter->fib6_nh.fib_nh_dev == down_dev ||
4090 iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4096 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4097 const struct net_device *dev,
4098 unsigned char nh_flags)
4100 struct fib6_info *iter;
4102 if (rt->fib6_nh.fib_nh_dev == dev)
4103 rt->fib6_nh.fib_nh_flags |= nh_flags;
4104 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4105 if (iter->fib6_nh.fib_nh_dev == dev)
4106 iter->fib6_nh.fib_nh_flags |= nh_flags;
4109 /* called with write lock held for table with rt */
4110 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4112 const struct arg_netdev_event *arg = p_arg;
4113 const struct net_device *dev = arg->dev;
4114 struct net *net = dev_net(dev);
4116 if (rt == net->ipv6.fib6_null_entry)
4119 switch (arg->event) {
4120 case NETDEV_UNREGISTER:
4121 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4122 case NETDEV_DOWN:
4123 if (rt->should_flush)
4124 return -1;
4125 if (!rt->fib6_nsiblings)
4126 return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4127 if (rt6_multipath_uses_dev(rt, dev)) {
4130 count = rt6_multipath_dead_count(rt, dev);
4131 if (rt->fib6_nsiblings + 1 == count) {
4132 rt6_multipath_flush(rt);
4135 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4137 fib6_update_sernum(net, rt);
4138 rt6_multipath_rebalance(rt);
4139 }
4140 return -2;
4141 case NETDEV_CHANGE:
4142 if (rt->fib6_nh.fib_nh_dev != dev ||
4143 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4144 break;
4145 rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4146 rt6_multipath_rebalance(rt);
4153 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4155 struct arg_netdev_event arg = {
4161 struct net *net = dev_net(dev);
4163 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4164 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4166 fib6_clean_all(net, fib6_ifdown, &arg);
4169 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4171 rt6_sync_down_dev(dev, event);
4172 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4173 neigh_ifdown(&nd_tbl, dev);
4176 struct rt6_mtu_change_arg {
4177 struct net_device *dev;
4181 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4183 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4184 struct inet6_dev *idev;
4186 /* In IPv6 pmtu discovery is not optional,
4187 so the RTAX_MTU lock cannot disable it.
4188 We still use this lock to block changes
4189 caused by addrconf/ndisc.
4190 */
4192 idev = __in6_dev_get(arg->dev);
4196 /* For administrative MTU increase, there is no way to discover
4197 IPv6 PMTU increase, so PMTU increase should be updated here.
4198 Since RFC 1981 doesn't include administrative MTU increase,
4199 updating the PMTU on increase is a MUST (i.e. jumbo frame).
4200 */
4201 if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4202 !fib6_metric_locked(rt, RTAX_MTU)) {
4203 u32 mtu = rt->fib6_pmtu;
4205 if (mtu >= arg->mtu ||
4206 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4207 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4209 spin_lock_bh(&rt6_exception_lock);
4210 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4211 spin_unlock_bh(&rt6_exception_lock);
4216 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4218 struct rt6_mtu_change_arg arg = {
4223 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4224 }
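/* Illustrative caller (sketch): addrconf's netdev notifier forwards
 * device MTU changes here so route and exception PMTUs track the
 * device, roughly:
 *
 *	case NETDEV_CHANGEMTU:
 *		rt6_mtu_change(dev, dev->mtu);
 *		break;
 */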
4226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4227 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4228 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4229 [RTA_OIF] = { .type = NLA_U32 },
4230 [RTA_IIF] = { .type = NLA_U32 },
4231 [RTA_PRIORITY] = { .type = NLA_U32 },
4232 [RTA_METRICS] = { .type = NLA_NESTED },
4233 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4234 [RTA_PREF] = { .type = NLA_U8 },
4235 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4236 [RTA_ENCAP] = { .type = NLA_NESTED },
4237 [RTA_EXPIRES] = { .type = NLA_U32 },
4238 [RTA_UID] = { .type = NLA_U32 },
4239 [RTA_MARK] = { .type = NLA_U32 },
4240 [RTA_TABLE] = { .type = NLA_U32 },
4241 [RTA_IP_PROTO] = { .type = NLA_U8 },
4242 [RTA_SPORT] = { .type = NLA_U16 },
4243 [RTA_DPORT] = { .type = NLA_U16 },
4246 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4247 struct fib6_config *cfg,
4248 struct netlink_ext_ack *extack)
4251 struct nlattr *tb[RTA_MAX+1];
4255 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4256 rtm_ipv6_policy, extack);
4261 rtm = nlmsg_data(nlh);
4263 *cfg = (struct fib6_config){
4264 .fc_table = rtm->rtm_table,
4265 .fc_dst_len = rtm->rtm_dst_len,
4266 .fc_src_len = rtm->rtm_src_len,
4268 .fc_protocol = rtm->rtm_protocol,
4269 .fc_type = rtm->rtm_type,
4271 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4272 .fc_nlinfo.nlh = nlh,
4273 .fc_nlinfo.nl_net = sock_net(skb->sk),
4276 if (rtm->rtm_type == RTN_UNREACHABLE ||
4277 rtm->rtm_type == RTN_BLACKHOLE ||
4278 rtm->rtm_type == RTN_PROHIBIT ||
4279 rtm->rtm_type == RTN_THROW)
4280 cfg->fc_flags |= RTF_REJECT;
4282 if (rtm->rtm_type == RTN_LOCAL)
4283 cfg->fc_flags |= RTF_LOCAL;
4285 if (rtm->rtm_flags & RTM_F_CLONED)
4286 cfg->fc_flags |= RTF_CACHE;
4288 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4290 if (tb[RTA_GATEWAY]) {
4291 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4292 cfg->fc_flags |= RTF_GATEWAY;
4295 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4300 int plen = (rtm->rtm_dst_len + 7) >> 3;
4302 if (nla_len(tb[RTA_DST]) < plen)
4303 goto errout;
4305 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4306 }
4308 if (tb[RTA_SRC]) {
4309 int plen = (rtm->rtm_src_len + 7) >> 3;
4311 if (nla_len(tb[RTA_SRC]) < plen)
4312 goto errout;
4314 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4315 }
4317 if (tb[RTA_PREFSRC])
4318 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4320 if (tb[RTA_OIF])
4321 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4323 if (tb[RTA_PRIORITY])
4324 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4326 if (tb[RTA_METRICS]) {
4327 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4328 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4331 if (tb[RTA_TABLE])
4332 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4334 if (tb[RTA_MULTIPATH]) {
4335 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4336 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4338 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4339 cfg->fc_mp_len, extack);
4344 if (tb[RTA_PREF]) {
4345 pref = nla_get_u8(tb[RTA_PREF]);
4346 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4347 pref != ICMPV6_ROUTER_PREF_HIGH)
4348 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4349 cfg->fc_flags |= RTF_PREF(pref);
4350 }
4352 if (tb[RTA_ENCAP])
4353 cfg->fc_encap = tb[RTA_ENCAP];
4355 if (tb[RTA_ENCAP_TYPE]) {
4356 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4358 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4363 if (tb[RTA_EXPIRES]) {
4364 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4366 if (addrconf_finite_timeout(timeout)) {
4367 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4368 cfg->fc_flags |= RTF_EXPIRES;
4378 struct fib6_info *fib6_info;
4379 struct fib6_config r_cfg;
4380 struct list_head next;
4383 static int ip6_route_info_append(struct net *net,
4384 struct list_head *rt6_nh_list,
4385 struct fib6_info *rt,
4386 struct fib6_config *r_cfg)
4391 list_for_each_entry(nh, rt6_nh_list, next) {
4392 /* check if fib6_info already exists */
4393 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4397 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4401 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4402 list_add_tail(&nh->next, rt6_nh_list);
4407 static void ip6_route_mpath_notify(struct fib6_info *rt,
4408 struct fib6_info *rt_last,
4409 struct nl_info *info,
4412 /* if this is an APPEND route, then rt points to the first route
4413 * inserted and rt_last points to last route inserted. Userspace
4414 * wants a consistent dump of the route which starts at the first
4415 * nexthop. Since sibling routes are always added at the end of
4416 * the list, find the first sibling of the last route appended
4418 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4419 rt = list_first_entry(&rt_last->fib6_siblings,
4425 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4428 static int ip6_route_multipath_add(struct fib6_config *cfg,
4429 struct netlink_ext_ack *extack)
4431 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4432 struct nl_info *info = &cfg->fc_nlinfo;
4433 struct fib6_config r_cfg;
4434 struct rtnexthop *rtnh;
4435 struct fib6_info *rt;
4436 struct rt6_nh *err_nh;
4437 struct rt6_nh *nh, *nh_safe;
4443 int replace = (cfg->fc_nlinfo.nlh &&
4444 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4445 LIST_HEAD(rt6_nh_list);
4447 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4448 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4449 nlflags |= NLM_F_APPEND;
4451 remaining = cfg->fc_mp_len;
4452 rtnh = (struct rtnexthop *)cfg->fc_mp;
4454 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4455 * fib6_info structs per nexthop
4457 while (rtnh_ok(rtnh, remaining)) {
4458 memcpy(&r_cfg, cfg, sizeof(*cfg));
4459 if (rtnh->rtnh_ifindex)
4460 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4462 attrlen = rtnh_attrlen(rtnh);
4464 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4466 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4468 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4469 r_cfg.fc_flags |= RTF_GATEWAY;
4471 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4472 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4474 r_cfg.fc_encap_type = nla_get_u16(nla);
4477 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4478 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4484 if (!rt6_qualify_for_ecmp(rt)) {
4486 NL_SET_ERR_MSG(extack,
4487 "Device only routes can not be added for IPv6 using the multipath API.");
4488 fib6_info_release(rt);
4492 rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4494 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4497 fib6_info_release(rt);
4501 rtnh = rtnh_next(rtnh, &remaining);
4504 /* for add and replace send one notification with all nexthops.
4505 * Skip the notification in fib6_add_rt2node and send one with
4506 * the full route when done
4508 info->skip_notify = 1;
4511 list_for_each_entry(nh, &rt6_nh_list, next) {
4512 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4513 fib6_info_release(nh->fib6_info);
4516 /* save reference to last route successfully inserted */
4517 rt_last = nh->fib6_info;
4519 /* save reference to first route for notification */
4521 rt_notif = nh->fib6_info;
4524 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4525 nh->fib6_info = NULL;
4528 NL_SET_ERR_MSG_MOD(extack,
4529 "multipath route replace failed (check consistency of installed routes)");
4534 /* Because each route is added like a single route we remove
4535 * these flags after the first nexthop: if there is a collision,
4536 * we have already failed to add the first nexthop:
4537 * fib6_add_rt2node() has rejected it; when replacing, old
4538 * nexthops have been replaced by first new, the rest should
4539 * be added to it.
4540 */
4541 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4542 NLM_F_REPLACE);
4546 /* success ... tell user about new route */
4547 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4551 /* send notification for routes that were added so that
4552 * the delete notifications sent by ip6_route_del are
4556 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4558 /* Delete routes that were already added */
4559 list_for_each_entry(nh, &rt6_nh_list, next) {
4562 ip6_route_del(&nh->r_cfg, extack);
4566 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4567 if (nh->fib6_info)
4568 fib6_info_release(nh->fib6_info);
4569 list_del(&nh->next);
4570 kfree(nh);
4571 }
4573 return err;
4574 }
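/* Userspace view (illustrative): the multipath add path above is what
 * services a request such as:
 *
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 3
 *
 * where each "nexthop" stanza becomes one rtnexthop entry inside
 * RTA_MULTIPATH and one fib6_info on rt6_nh_list.
 */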
4576 static int ip6_route_multipath_del(struct fib6_config *cfg,
4577 struct netlink_ext_ack *extack)
4579 struct fib6_config r_cfg;
4580 struct rtnexthop *rtnh;
4583 int err = 1, last_err = 0;
4585 remaining = cfg->fc_mp_len;
4586 rtnh = (struct rtnexthop *)cfg->fc_mp;
4588 /* Parse a Multipath Entry */
4589 while (rtnh_ok(rtnh, remaining)) {
4590 memcpy(&r_cfg, cfg, sizeof(*cfg));
4591 if (rtnh->rtnh_ifindex)
4592 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4594 attrlen = rtnh_attrlen(rtnh);
4596 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4598 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4600 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4601 r_cfg.fc_flags |= RTF_GATEWAY;
4604 err = ip6_route_del(&r_cfg, extack);
4608 rtnh = rtnh_next(rtnh, &remaining);
4614 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4615 struct netlink_ext_ack *extack)
4617 struct fib6_config cfg;
4620 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4625 return ip6_route_multipath_del(&cfg, extack);
4627 cfg.fc_delete_all_nh = 1;
4628 return ip6_route_del(&cfg, extack);
4632 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4633 struct netlink_ext_ack *extack)
4635 struct fib6_config cfg;
4638 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4642 if (cfg.fc_metric == 0)
4643 cfg.fc_metric = IP6_RT_PRIO_USER;
4646 return ip6_route_multipath_add(&cfg, extack);
4648 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4651 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4653 int nexthop_len = 0;
4655 if (rt->fib6_nsiblings) {
4656 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4657 + NLA_ALIGN(sizeof(struct rtnexthop))
4658 + nla_total_size(16) /* RTA_GATEWAY */
4659 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4661 nexthop_len *= rt->fib6_nsiblings;
4664 return NLMSG_ALIGN(sizeof(struct rtmsg))
4665 + nla_total_size(16) /* RTA_SRC */
4666 + nla_total_size(16) /* RTA_DST */
4667 + nla_total_size(16) /* RTA_GATEWAY */
4668 + nla_total_size(16) /* RTA_PREFSRC */
4669 + nla_total_size(4) /* RTA_TABLE */
4670 + nla_total_size(4) /* RTA_IIF */
4671 + nla_total_size(4) /* RTA_OIF */
4672 + nla_total_size(4) /* RTA_PRIORITY */
4673 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4674 + nla_total_size(sizeof(struct rta_cacheinfo))
4675 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4676 + nla_total_size(1) /* RTA_PREF */
4677 + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4681 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4682 struct fib6_info *rt, struct dst_entry *dst,
4683 struct in6_addr *dest, struct in6_addr *src,
4684 int iif, int type, u32 portid, u32 seq,
4687 struct rt6_info *rt6 = (struct rt6_info *)dst;
4688 struct rt6key *rt6_dst, *rt6_src;
4689 u32 *pmetrics, table, rt6_flags;
4690 struct nlmsghdr *nlh;
4694 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4699 rt6_dst = &rt6->rt6i_dst;
4700 rt6_src = &rt6->rt6i_src;
4701 rt6_flags = rt6->rt6i_flags;
4703 rt6_dst = &rt->fib6_dst;
4704 rt6_src = &rt->fib6_src;
4705 rt6_flags = rt->fib6_flags;
4708 rtm = nlmsg_data(nlh);
4709 rtm->rtm_family = AF_INET6;
4710 rtm->rtm_dst_len = rt6_dst->plen;
4711 rtm->rtm_src_len = rt6_src->plen;
4714 table = rt->fib6_table->tb6_id;
4716 table = RT6_TABLE_UNSPEC;
4717 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4718 if (nla_put_u32(skb, RTA_TABLE, table))
4719 goto nla_put_failure;
4721 rtm->rtm_type = rt->fib6_type;
4723 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4724 rtm->rtm_protocol = rt->fib6_protocol;
4726 if (rt6_flags & RTF_CACHE)
4727 rtm->rtm_flags |= RTM_F_CLONED;
4730 if (nla_put_in6_addr(skb, RTA_DST, dest))
4731 goto nla_put_failure;
4732 rtm->rtm_dst_len = 128;
4733 } else if (rtm->rtm_dst_len)
4734 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4735 goto nla_put_failure;
4736 #ifdef CONFIG_IPV6_SUBTREES
4738 if (nla_put_in6_addr(skb, RTA_SRC, src))
4739 goto nla_put_failure;
4740 rtm->rtm_src_len = 128;
4741 } else if (rtm->rtm_src_len &&
4742 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4743 goto nla_put_failure;
4746 #ifdef CONFIG_IPV6_MROUTE
4747 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4748 int err = ip6mr_get_route(net, skb, rtm, portid);
4753 goto nla_put_failure;
4756 if (nla_put_u32(skb, RTA_IIF, iif))
4757 goto nla_put_failure;
4759 struct in6_addr saddr_buf;
4760 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4761 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4762 goto nla_put_failure;
4765 if (rt->fib6_prefsrc.plen) {
4766 struct in6_addr saddr_buf;
4767 saddr_buf = rt->fib6_prefsrc.addr;
4768 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4769 goto nla_put_failure;
4772 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4773 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4774 goto nla_put_failure;
4776 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4777 goto nla_put_failure;
4779 /* For multipath routes, walk the siblings list and add
4780 * each as a nexthop within RTA_MULTIPATH.
4783 if (rt6_flags & RTF_GATEWAY &&
4784 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4785 goto nla_put_failure;
4787 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4788 goto nla_put_failure;
4789 } else if (rt->fib6_nsiblings) {
4790 struct fib6_info *sibling, *next_sibling;
4793 mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4795 goto nla_put_failure;
4797 if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4798 rt->fib6_nh.fib_nh_weight) < 0)
4799 goto nla_put_failure;
4801 list_for_each_entry_safe(sibling, next_sibling,
4802 &rt->fib6_siblings, fib6_siblings) {
4803 if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4804 sibling->fib6_nh.fib_nh_weight) < 0)
4805 goto nla_put_failure;
4808 nla_nest_end(skb, mp);
4810 unsigned char nh_flags = 0;
4812 if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4813 &nh_flags, false) < 0)
4814 goto nla_put_failure;
4816 rtm->rtm_flags |= nh_flags;
4819 if (rt6_flags & RTF_EXPIRES) {
4820 expires = dst ? dst->expires : rt->expires;
4824 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4825 goto nla_put_failure;
4827 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4828 goto nla_put_failure;
4831 nlmsg_end(skb, nlh);
4835 nlmsg_cancel(skb, nlh);
4839 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4840 const struct net_device *dev)
4842 if (f6i->fib6_nh.fib_nh_dev == dev)
4845 if (f6i->fib6_nsiblings) {
4846 struct fib6_info *sibling, *next_sibling;
4848 list_for_each_entry_safe(sibling, next_sibling,
4849 &f6i->fib6_siblings, fib6_siblings) {
4850 if (sibling->fib6_nh.fib_nh_dev == dev)
4858 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4860 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4861 struct fib_dump_filter *filter = &arg->filter;
4862 unsigned int flags = NLM_F_MULTI;
4863 struct net *net = arg->net;
4865 if (rt == net->ipv6.fib6_null_entry)
4868 if ((filter->flags & RTM_F_PREFIX) &&
4869 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4870 /* success since this is not a prefix route */
4873 if (filter->filter_set) {
4874 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4875 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4876 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4879 flags |= NLM_F_DUMP_FILTERED;
4882 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4883 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4884 arg->cb->nlh->nlmsg_seq, flags);
4887 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4888 const struct nlmsghdr *nlh,
4890 struct netlink_ext_ack *extack)
4895 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4896 NL_SET_ERR_MSG_MOD(extack,
4897 "Invalid header for get route request");
4901 if (!netlink_strict_get_check(skb))
4902 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4903 rtm_ipv6_policy, extack);
4905 rtm = nlmsg_data(nlh);
4906 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4907 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4908 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4910 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4913 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4914 NL_SET_ERR_MSG_MOD(extack,
4915 "Invalid flags for get route request");
4919 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4920 rtm_ipv6_policy, extack);
4924 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4925 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4926 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4930 for (i = 0; i <= RTA_MAX; i++) {
4946 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

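/* Broadcast a route change to RTNLGRP_IPV6_ROUTE listeners; allocation or
 * fill failures are reported through rtnl_set_sk_err().
 */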
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

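/* The per-netns special routes (null, and with multiple tables prohibit
 * and blackhole) are bound to the loopback device: attach them on
 * NETDEV_REGISTER and drop their idev references on the first
 * NETDEV_UNREGISTER.
 */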
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can fire multiple times via
		 * netdev_wait_allrefs(); make sure we only do this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

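/* /proc/net/rt6_stats: seven hex words: fib nodes, route nodes, rt6_info
 * allocations, route entries, cached entries, dst entries in use and
 * discarded routes.
 */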
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

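/* Routing sysctls. "flush" is special: it is write-only, and writing it
 * triggers a fib6 garbage-collection run; the other entries tune dst and
 * GC behaviour per netns.
 */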
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_minmax,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};

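/* Clone the template for a netns and repoint each entry at the per-netns
 * data; the table[N] indices must match the entry order in
 * ipv6_route_table_template above.
 */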
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif /* CONFIG_SYSCTL */

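/* Per-netns init: clone the dst_ops and the special route templates
 * (null, prohibit, blackhole), then seed the routing sysctl defaults.
 */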
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

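/* Per-netns teardown: free what ip6_route_net_init() allocated. */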
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

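/* Registration glue: the pernet init/exit hooks and the loopback device
 * notifier, all wired up from ip6_route_init() below.
 */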
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * so the loopback reference in rt6_info will not be taken; do it
	 * manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

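/* Subsystem bring-up. Ordering matters: the dst caches and pernet state
 * come before fib6/xfrm6/rules init, and the netlink handlers plus the
 * device notifier are registered last, once everything they rely on
 * exists.
 */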
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

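/* Module teardown, strictly the reverse of ip6_route_init(). */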
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}