2 * Linux INET6 implementation
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
56 #include <linux/rtnetlink.h>
58 #include <net/dst_metadata.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
75 RT6_NUD_FAIL_HARD = -3,
76 RT6_NUD_FAIL_PROBE = -2,
77 RT6_NUD_FAIL_DO_RR = -1,
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void ip6_dst_destroy(struct dst_entry *);
87 static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
89 static int ip6_dst_gc(struct dst_ops *ops);
91 static int ip6_pkt_discard(struct sk_buff *skb);
92 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int ip6_pkt_prohibit(struct sk_buff *skb);
94 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void ip6_link_failure(struct sk_buff *skb);
96 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 const struct in6_addr *prefix, int prefixlen,
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 const struct in6_addr *prefix, int prefixlen,
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
/* Per-cpu list of "uncached" rt6_info entries (routes not owned by the
 * fib6 tree), so they can be found and rehomed on device teardown.
 * NOTE(review): this excerpt is line-sampled; the spinlock member and
 * closing braces are elided here — confirm against the full source.
 */
124 struct uncached_list {
126 struct list_head head;
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Link @rt onto the current CPU's uncached list under the list's
 * BH-disabling spinlock, remembering which list it was put on. */
131 static void rt6_uncached_list_add(struct rt6_info *rt)
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 rt->rt6i_uncached_list = ul;
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the per-cpu uncached list it was added to (no-op if it
 * was never linked), decrementing the per-netns uncached-route counter.
 * Uses the list pointer stashed by rt6_uncached_list_add(). */
142 static void rt6_uncached_list_del(struct rt6_info *rt)
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
146 struct net *net = dev_net(rt->dst.dev);
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 spin_unlock_bh(&ul->lock);
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 struct net_device *loopback_dev = net->loopback_dev;
160 if (dev == loopback_dev)
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
172 if (rt_idev->dev == dev) {
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
183 spin_unlock_bh(&ul->lock);
/* For a per-cpu route copy, metrics live on the parent ("from") route:
 * return a writable pointer to the parent dst's metrics. */
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
189 return dst_metrics_write_ptr(&rt->from->dst);
/* dst_ops->cow_metrics hook: choose where copy-on-write metrics come
 * from depending on the route flavor (PCPU copy vs cache vs plain).
 * NOTE(review): the RTF_CACHE branch body is elided in this excerpt. */
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
194 struct rt6_info *rt = (struct rt6_info *)dst;
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
201 return dst_cow_metrics_generic(dst, old);
/* Pick the neighbour-lookup key for @rt: the gateway address if the
 * route has one, otherwise fall back to the packet's destination
 * address (or the supplied daddr — the fallback path is partly elided
 * in this excerpt). */
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
208 struct in6_addr *p = &rt->rt6i_gateway;
210 if (!ipv6_addr_any(p))
211 return (const void *) p;
213 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup hook: resolve (or create) the ND neighbour
 * entry for this dst using the key chosen above. */
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
221 struct rt6_info *rt = (struct rt6_info *) dst;
224 daddr = choose_neigh_daddr(rt, skb, daddr);
225 n = __ipv6_neigh_lookup(dst->dev, daddr);
228 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops->confirm_neigh hook: mark the neighbour reachable, skipping
 * devices with no neighbour resolution and multicast destinations. */
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
243 __ipv6_confirm_neigh(dev, daddr);
/* dst_ops vtable for ordinary IPv6 routes; cloned per netns.
 * NOTE(review): .family/.gc/.mtu initializers appear elided here. */
246 static struct dst_ops ip6_dst_ops_template = {
250 .check = ip6_dst_check,
251 .default_advmss = ip6_default_advmss,
253 .cow_metrics = ipv6_cow_metrics,
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
259 .redirect = rt6_do_redirect,
260 .local_out = __ip6_local_out,
261 .neigh_lookup = ip6_neigh_lookup,
262 .confirm_neigh = ip6_confirm_neigh,
/* MTU for blackhole dsts: raw RTAX_MTU metric if set, else device MTU. */
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
269 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates and redirects:
 * both hooks are empty stubs. */
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops vtable for blackhole clones (e.g. of xfrm bundles). */
282 static struct dst_ops ip6_dst_blackhole_ops = {
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
286 .mtu = ip6_blackhole_mtu,
287 .default_advmss = ip6_default_advmss,
288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
289 .redirect = ip6_rt_blackhole_redirect,
290 .cow_metrics = dst_cow_metrics_generic,
291 .neigh_lookup = ip6_neigh_lookup,
/* Shared metric template for the sentinel routes below. */
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 [RTAX_HOPLIMIT - 1] = 0,
/* Sentinel "null" route: returned when no route matches; discards
 * packets with -ENETUNREACH. Copied into each netns at init. */
298 static const struct rt6_info ip6_null_entry_template = {
300 .__refcnt = ATOMIC_INIT(1),
302 .obsolete = DST_OBSOLETE_FORCE_CHK,
303 .error = -ENETUNREACH,
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
308 .rt6i_protocol = RTPROT_KERNEL,
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Sentinel "prohibit" route (policy routing): rejects packets
 * administratively. NOTE(review): .error initializer elided here. */
315 static const struct rt6_info ip6_prohibit_entry_template = {
317 .__refcnt = ATOMIC_INIT(1),
319 .obsolete = DST_OBSOLETE_FORCE_CHK,
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
325 .rt6i_protocol = RTPROT_KERNEL,
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
/* Sentinel "blackhole" route: silently drops via dst_discard. */
330 static const struct rt6_info ip6_blk_hole_entry_template = {
332 .__refcnt = ATOMIC_INIT(1),
334 .obsolete = DST_OBSOLETE_FORCE_CHK,
336 .input = dst_discard,
337 .output = dst_discard_out,
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
340 .rt6i_protocol = RTPROT_KERNEL,
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info-specific tail (everything after the embedded
 * dst_entry) and initialize its list heads. */
347 static void rt6_info_init(struct rt6_info *rt)
349 struct dst_entry *dst = &rt->dst;
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 1, DST_OBSOLETE_FORCE_CHK, flags);
/* account the allocation in per-netns fib stats */
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
/* Public allocator: like __ip6_dst_alloc() but also allocates the
 * per-cpu route-copy array; frees the dst and fails if that allocation
 * fails (error-return path elided in this excerpt). */
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 if (!rt->rt6i_pcpu) {
381 dst_release_immediate(&rt->dst);
388 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy hook: tear down an rt6_info — generic metrics, the
 * per-cpu copies, uncached-list membership, the inet6_dev reference,
 * the exception bucket, and finally the reference on the parent
 * ("from") route. NOTE(review): in6_dev_put()/kfree(bucket) calls are
 * elided in this sampled excerpt. */
390 static void ip6_dst_destroy(struct dst_entry *dst)
392 struct rt6_info *rt = (struct rt6_info *)dst;
393 struct rt6_exception_bucket *bucket;
394 struct rt6_info *from = rt->from;
395 struct inet6_dev *idev;
397 dst_destroy_metrics_generic(dst);
398 free_percpu(rt->rt6i_pcpu);
399 rt6_uncached_list_del(rt);
401 idev = rt->rt6i_idev;
403 rt->rt6i_idev = NULL;
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
408 rt->rt6i_exception_bucket = NULL;
413 dst_release(&from->dst);
/* dst_ops->ifdown hook: when the route's device goes away, move its
 * inet6_dev reference over to the netns loopback device (the old
 * in6_dev_put() appears elided in this excerpt). */
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
419 struct rt6_info *rt = (struct rt6_info *)dst;
420 struct inet6_dev *idev = rt->rt6i_idev;
421 struct net_device *loopback_dev =
422 dev_net(dev)->loopback_dev;
424 if (idev && idev->dev != loopback_dev) {
425 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
427 rt->rt6i_idev = loopback_idev;
/* True if @rt itself carries RTF_EXPIRES and its expiry time passed. */
433 static bool __rt6_check_expired(const struct rt6_info *rt)
435 if (rt->rt6i_flags & RTF_EXPIRES)
436 return time_after(jiffies, rt->dst.expires);
/* Like above, but for routes without their own expiry, recurse into the
 * parent ("from") route and also treat a stale dst->obsolete value as
 * expired. */
441 static bool rt6_check_expired(const struct rt6_info *rt)
443 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires))
446 } else if (rt->from) {
447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 rt6_check_expired(rt->from);
/* ECMP sibling selection: hash the flow (unless already hashed for an
 * ICMPv6 error) and walk @match's sibling list, picking the first
 * sibling whose upper hash bound covers the flow hash and whose score
 * is non-negative. Falls through to @match when its own bound covers
 * the hash. */
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454 struct rt6_info *match,
455 struct flowi6 *fl6, int oif,
456 const struct sk_buff *skb,
459 struct rt6_info *sibling, *next_sibling;
461 /* We might have already computed the hash for ICMPv6 errors. In such
462 * case it will always be non-zero. Otherwise now is the time to do it.
465 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
467 if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
470 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
472 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
474 if (rt6_score_route(sibling, oif, strict) < 0)
484 * Route lookup. rcu_read_lock() should be held.
/* Walk the fib6 node's route list and pick the first live (non-DEAD)
 * route matching the requested output interface @oif and/or source
 * address; with RT6_LOOKUP_F_IFACE set, a mismatch is fatal and the
 * null entry is returned. Loopback entries are tracked separately as a
 * fallback (parts of that logic are elided in this excerpt). */
487 static inline struct rt6_info *rt6_device_match(struct net *net,
489 const struct in6_addr *saddr,
493 struct rt6_info *local = NULL;
494 struct rt6_info *sprt;
/* fast path: no constraints at all and head route is alive */
496 if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
499 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
500 struct net_device *dev = sprt->dst.dev;
502 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
506 if (dev->ifindex == oif)
508 if (dev->flags & IFF_LOOPBACK) {
509 if (!sprt->rt6i_idev ||
510 sprt->rt6i_idev->dev->ifindex != oif) {
511 if (flags & RT6_LOOKUP_F_IFACE)
514 local->rt6i_idev->dev->ifindex == oif)
520 if (ipv6_chk_addr(net, saddr, dev,
521 flags & RT6_LOOKUP_F_IFACE))
530 if (flags & RT6_LOOKUP_F_IFACE)
531 return net->ipv6.ip6_null_entry;
534 return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
537 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for a router reachability probe: target
 * address plus the (held) device to probe on. */
538 struct __rt6_probe_work {
539 struct work_struct work;
540 struct in6_addr target;
541 struct net_device *dev;
/* Workqueue callback: send a unicast-solicit NS to the target router's
 * solicited-node multicast address (dev_put/kfree elided here). */
544 static void rt6_probe_deferred(struct work_struct *w)
546 struct in6_addr mcaddr;
547 struct __rt6_probe_work *work =
548 container_of(w, struct __rt6_probe_work, work);
550 addrconf_addr_solict_mult(&work->target, &mcaddr);
551 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Rate-limited router reachability probing (RFC 4861): if the gateway
 * neighbour is not NUD_VALID and the per-idev probe interval elapsed,
 * schedule an NS probe via the work item above. Runs under
 * rcu_read_lock_bh (the lock acquisition is elided in this excerpt). */
556 static void rt6_probe(struct rt6_info *rt)
558 struct __rt6_probe_work *work;
559 struct neighbour *neigh;
561 * Okay, this does not seem to be appropriate
562 * for now, however, we need to check if it
563 * is really so; aka Router Reachability Probing.
565 * Router Reachability Probe MUST be rate-limited
566 * to no more than one per minute.
568 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
571 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
573 if (neigh->nud_state & NUD_VALID)
577 write_lock(&neigh->lock);
578 if (!(neigh->nud_state & NUD_VALID) &&
581 rt->rt6i_idev->cnf.rtr_probe_interval)) {
582 work = kmalloc(sizeof(*work), GFP_ATOMIC);
584 __neigh_set_probe_once(neigh);
586 write_unlock(&neigh->lock);
/* no neighbour entry yet: probe unconditionally */
588 work = kmalloc(sizeof(*work), GFP_ATOMIC);
592 INIT_WORK(&work->work, rt6_probe_deferred);
593 work->target = rt->rt6i_gateway;
594 dev_hold(rt->dst.dev);
595 work->dev = rt->dst.dev;
596 schedule_work(&work->work);
600 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF stub: probing compiled out. */
603 static inline void rt6_probe(struct rt6_info *rt)
609 * Default Router Selection (RFC 2461 6.3.6)
/* Score the route's device against @oif: non-zero when it matches the
 * requested interface (directly, or via the loopback/idev pairing), or
 * when no interface was requested. */
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
613 struct net_device *dev = rt->dst.dev;
614 if (!oif || dev->ifindex == oif)
616 if ((dev->flags & IFF_LOOPBACK) &&
617 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Classify the gateway neighbour's NUD state for router selection:
 * SUCCEED for no-gateway routes or valid/unfailed neighbours,
 * FAIL_PROBE to trigger probing, FAIL_DO_RR to round-robin when there
 * is no neighbour entry (without router-pref support). Runs under
 * rcu_read_lock_bh (acquisition elided in this excerpt). */
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
624 struct neighbour *neigh;
625 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
627 if (rt->rt6i_flags & RTF_NONEXTHOP ||
628 !(rt->rt6i_flags & RTF_GATEWAY))
629 return RT6_NUD_SUCCEED;
632 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
634 read_lock(&neigh->lock);
635 if (neigh->nud_state & NUD_VALID)
636 ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638 else if (!(neigh->nud_state & NUD_FAILED))
639 ret = RT6_NUD_SUCCEED;
641 ret = RT6_NUD_FAIL_PROBE;
643 read_unlock(&neigh->lock);
/* no neighbour entry at all */
645 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
648 rcu_read_unlock_bh();
/* Combine device match, router preference (when configured) and — under
 * RT6_LOOKUP_F_REACHABLE — neighbour reachability into a single route
 * score; negative values are the RT6_NUD_FAIL_* codes. */
653 static int rt6_score_route(struct rt6_info *rt, int oif,
658 m = rt6_check_dev(rt, oif);
659 if (!m && (strict & RT6_LOOKUP_F_IFACE))
660 return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
/* fold the RFC 4191 route preference into bits above the dev match */
662 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
664 if (strict & RT6_LOOKUP_F_REACHABLE) {
665 int n = rt6_check_neigh(rt);
/* Compare @rt against the current best @match (score in *mpri):
 * skip dead/linkdown/expired routes, remember when a RT6_NUD_FAIL_DO_RR
 * candidate wins (so the caller round-robins), and return the better of
 * the two. Parts of the score-comparison tail are elided here. */
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673 int *mpri, struct rt6_info *match,
677 bool match_do_rr = false;
678 struct inet6_dev *idev = rt->rt6i_idev;
680 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
683 if (idev->cnf.ignore_routes_with_linkdown &&
684 rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
688 if (rt6_check_expired(rt))
691 m = rt6_score_route(rt, oif, strict);
692 if (m == RT6_NUD_FAIL_DO_RR) {
694 m = 0; /* lowest valid score */
695 } else if (m == RT6_NUD_FAIL_HARD) {
699 if (strict & RT6_LOOKUP_F_REACHABLE)
702 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
704 *do_rr = match_do_rr;
/* Round-robin scan of the routes at @metric: first from the current
 * rr head to the end of the list, then wrap from the leaf back to the
 * head, and finally a continuation pass (@cont) for routes of a
 * different metric found during the scan; each candidate is fed through
 * find_match(). */
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713 struct rt6_info *leaf,
714 struct rt6_info *rr_head,
715 u32 metric, int oif, int strict,
718 struct rt6_info *rt, *match, *cont;
723 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724 if (rt->rt6i_metric != metric) {
729 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* wrap-around pass: leaf up to (but excluding) rr_head */
732 for (rt = leaf; rt && rt != rr_head;
733 rt = rcu_dereference(rt->rt6_next)) {
734 if (rt->rt6i_metric != metric) {
739 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* continuation pass over deferred routes, if any */
745 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the best route from fib6 node @fn: validates that @fn is a
 * real leaf node (not an intermediate left over after deletions),
 * delegates scoring to find_rr_leaf(), and advances the node's
 * round-robin pointer under tb6_lock when the scan requested it.
 * Caller holds rcu_read_lock(). */
751 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
754 struct rt6_info *leaf = rcu_dereference(fn->leaf);
755 struct rt6_info *match, *rt0;
759 if (!leaf || leaf == net->ipv6.ip6_null_entry)
760 return net->ipv6.ip6_null_entry;
762 rt0 = rcu_dereference(fn->rr_ptr);
766 /* Double check to make sure fn is not an intermediate node
767 * and fn->leaf does not points to its child's leaf
768 * (This might happen if all routes under fn are deleted from
769 * the tree and fib6_repair_tree() is called on the node.)
771 key_plen = rt0->rt6i_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773 if (rt0->rt6i_src.plen)
774 key_plen = rt0->rt6i_src.plen;
776 if (fn->fn_bit != key_plen)
777 return net->ipv6.ip6_null_entry;
779 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
783 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
785 /* no entries matched; do round-robin */
786 if (!next || next->rt6i_metric != rt0->rt6i_metric)
790 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791 /* make sure next is not being deleted from the tree */
793 rcu_assign_pointer(fn->rr_ptr, next);
794 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
798 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has a gateway or is flagged no-next-hop. */
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
803 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
806 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option (RFC 4191) received in an RA on
 * @dev from router @gwaddr: validate option length/prefix length,
 * decode preference and lifetime, then add/update/remove the
 * corresponding RTF_ROUTEINFO route. Several early-return error paths
 * are elided in this sampled excerpt. */
807 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
808 const struct in6_addr *gwaddr)
810 struct net *net = dev_net(dev);
811 struct route_info *rinfo = (struct route_info *) opt;
812 struct in6_addr prefix_buf, *prefix;
814 unsigned long lifetime;
817 if (len < sizeof(struct route_info)) {
821 /* Sanity check for prefix_len and length */
822 if (rinfo->length > 3) {
824 } else if (rinfo->prefix_len > 128) {
826 } else if (rinfo->prefix_len > 64) {
827 if (rinfo->length < 2) {
830 } else if (rinfo->prefix_len > 0) {
831 if (rinfo->length < 1) {
836 pref = rinfo->route_pref;
837 if (pref == ICMPV6_ROUTER_PREF_INVALID)
840 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
842 if (rinfo->length == 3)
843 prefix = (struct in6_addr *)rinfo->prefix;
845 /* this function is safe */
846 ipv6_addr_prefix(&prefix_buf,
847 (struct in6_addr *)rinfo->prefix,
849 prefix = &prefix_buf;
/* prefix length 0 means the router's default route */
852 if (rinfo->prefix_len == 0)
853 rt = rt6_get_dflt_router(gwaddr, dev);
855 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
/* zero lifetime: withdraw the existing route */
858 if (rt && !lifetime) {
864 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
867 rt->rt6i_flags = RTF_ROUTEINFO |
868 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
871 if (!addrconf_finite_timeout(lifetime))
872 rt6_clean_expires(rt);
874 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the fib6 tree from @fn, descending into source-routed
 * subtrees along the way, until a node carrying real route info
 * (RTN_RTINFO) or the tree root is reached. Caller holds
 * rcu_read_lock(). */
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883 struct in6_addr *saddr)
885 struct fib6_node *pn, *sn;
887 if (fn->fn_flags & RTN_TL_ROOT)
889 pn = rcu_dereference(fn->parent)
890 sn = FIB6_SUBTREE(pn);
892 fn = fib6_lookup(sn, NULL, saddr);
895 if (fn->fn_flags & RTN_RTINFO)
/* Try to take a reference on *prt; on failure substitute the netns
 * null entry (further fallback handling elided in this excerpt). */
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
903 struct rt6_info *rt = *prt;
905 if (dst_hold_safe(&rt->dst))
908 rt = net->ipv6.ip6_null_entry;
/* Policy-rule lookup worker: fib6 longest-prefix lookup, device/source
 * match, optional ECMP selection, backtracking on null-entry results,
 * then an exception-table (cached clone) lookup before taking a
 * reference and returning. Runs under rcu_read_lock (acquisition
 * elided in this excerpt). */
917 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918 struct fib6_table *table,
920 const struct sk_buff *skb,
923 struct rt6_info *rt, *rt_cache;
924 struct fib6_node *fn;
927 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
929 rt = rcu_dereference(fn->leaf);
931 rt = net->ipv6.ip6_null_entry;
933 rt = rt6_device_match(net, rt, &fl6->saddr,
934 fl6->flowi6_oif, flags);
935 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
936 rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
939 if (rt == net->ipv6.ip6_null_entry) {
940 fn = fib6_backtrack(fn, &fl6->saddr);
944 /* Search through exception table */
945 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949 if (ip6_hold_safe(net, &rt, true))
950 dst_use_noref(&rt->dst, jiffies);
954 trace_fib6_table_lookup(net, rt, table, fl6);
/* Public wrapper: dispatch the lookup through policy routing rules. */
960 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
961 const struct sk_buff *skb, int flags)
963 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
965 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by (daddr, saddr, oif): builds the flowi6 key,
 * setting RT6_LOOKUP_F_IFACE under @strict and F_HAS_SADDR when a
 * source address is supplied. */
967 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
968 const struct in6_addr *saddr, int oif,
969 const struct sk_buff *skb, int strict)
971 struct flowi6 fl6 = {
975 struct dst_entry *dst;
976 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
979 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
980 flags |= RT6_LOOKUP_F_HAS_SADDR;
983 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
985 return (struct rt6_info *) dst;
991 EXPORT_SYMBOL(rt6_lookup);
993 /* ip6_ins_rt is called with FREE table->tb6_lock.
994 * It takes new route entry, the addition fails by any reason the
996 * Caller must hold dst before calling it.
/* Insert @rt into its fib6 table under tb6_lock. */
999 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1000 struct mx6_config *mxc,
1001 struct netlink_ext_ack *extack)
1004 struct fib6_table *table;
1006 table = rt->rt6i_table;
1007 spin_lock_bh(&table->tb6_lock);
1008 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1009 spin_unlock_bh(&table->tb6_lock);
/* Public insert helper with default netlink info and empty metrics. */
1014 int ip6_ins_rt(struct rt6_info *rt)
1016 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1017 struct mx6_config mxc = { .mx = NULL, };
1019 /* Hold dst to account for the reference from the fib6 tree */
1021 return __ip6_ins_rt(rt, &info, &mxc, NULL);
1024 /* called with rcu_lock held */
/* For copies of local/anycast routes, pick the device the clone should
 * reference: the l3mdev master for enslaved devices (unless strict
 * routing applies), the loopback device otherwise, or the device
 * itself when it is an l3 master. */
1025 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1027 struct net_device *dev = rt->dst.dev;
1029 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1030 /* for copies of local routes, dst->dev needs to be the
1031 * device if it is a master device, the master device if
1032 * device is enslaved, and the loopback as the default
1034 if (netif_is_l3_slave(dev) &&
1035 !rt6_need_strict(&rt->rt6i_dst.addr))
1036 dev = l3mdev_master_dev_rcu(dev);
1037 else if (!netif_is_l3_master(dev))
1038 dev = dev_net(dev)->loopback_dev;
1039 /* last case is netif_is_l3_master(dev) is true in which
1040 * case we want dev returned to be dev
1047 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
/* Allocate an RTF_CACHE clone of @ort keyed to the exact (daddr,
 * saddr) pair (plen forced to 128); @ort must not itself be a cache or
 * per-cpu copy. Flags anycast destinations on non-gateway routes.
 * NOTE(review): this excerpt is line-sampled; the rcu locking and
 * return statements are elided. */
1047 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1048 const struct in6_addr *daddr,
1049 const struct in6_addr *saddr)
1051 struct net_device *dev;
1052 struct rt6_info *rt;
1058 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062 dev = ip6_rt_get_dev_rcu(ort);
1063 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1068 ip6_rt_copy_init(rt, ort);
1069 rt->rt6i_flags |= RTF_CACHE;
1070 rt->rt6i_metric = 0;
1071 rt->dst.flags |= DST_HOST;
1072 rt->rt6i_dst.addr = *daddr;
1073 rt->rt6i_dst.plen = 128;
1075 if (!rt6_is_gw_or_nonexthop(ort)) {
1076 if (ort->rt6i_dst.plen != 128 &&
1077 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1078 rt->rt6i_flags |= RTF_ANYCAST;
1079 #ifdef CONFIG_IPV6_SUBTREES
1080 if (rt->rt6i_src.plen && saddr) {
1081 rt->rt6i_src.addr = *saddr;
1082 rt->rt6i_src.plen = 128;
/* Allocate an RTF_PCPU per-cpu copy of @rt (same protocol, flags
 * inherited from @rt plus RTF_PCPU). */
1090 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1092 struct net_device *dev;
1093 struct rt6_info *pcpu_rt;
1096 dev = ip6_rt_get_dev_rcu(rt);
1097 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101 ip6_rt_copy_init(pcpu_rt, rt);
1102 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1103 pcpu_rt->rt6i_flags |= RTF_PCPU;
1107 /* It should be called with rcu_read_lock() acquired */
/* Fetch this CPU's cached copy of @rt, taking a reference and
 * refreshing its inherited metrics when present. */
1108 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1110 struct rt6_info *pcpu_rt, **p;
1112 p = this_cpu_ptr(rt->rt6i_pcpu);
1115 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1116 rt6_dst_from_metrics_check(pcpu_rt);
/* Allocate and publish this CPU's copy of @rt; on allocation failure
 * fall back to the (held) netns null entry. cmpxchg publishes the new
 * copy only if the slot is still empty (loser-handling elided here). */
1121 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1123 struct rt6_info *pcpu_rt, *prev, **p;
1125 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1127 struct net *net = dev_net(rt->dst.dev);
1129 dst_hold(&net->ipv6.ip6_null_entry->dst);
1130 return net->ipv6.ip6_null_entry;
1133 dst_hold(&pcpu_rt->dst);
1134 p = this_cpu_ptr(rt->rt6i_pcpu);
1135 prev = cmpxchg(p, NULL, pcpu_rt);
1138 rt6_dst_from_metrics_check(pcpu_rt);
1142 /* exception hash table implementation
1144 static DEFINE_SPINLOCK(rt6_exception_lock);
1146 /* Remove rt6_ex from hash table and free the memory
1147 * Caller must hold rt6_exception_lock
/* Unhash the exception, drop its route's reference (RCU-deferred
 * free), and update bucket depth / per-netns cache stats. */
1149 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1150 struct rt6_exception *rt6_ex)
1154 if (!bucket || !rt6_ex)
1157 net = dev_net(rt6_ex->rt6i->dst.dev);
1158 rt6_ex->rt6i->rt6i_node = NULL;
1159 hlist_del_rcu(&rt6_ex->hlist);
1160 rt6_release(rt6_ex->rt6i);
1161 kfree_rcu(rt6_ex, rcu);
1162 WARN_ON_ONCE(!bucket->depth);
1164 net->ipv6.rt6_stats->fib_rt_cache--;
1167 /* Remove oldest rt6_ex in bucket and free the memory
1168 * Caller must hold rt6_exception_lock
/* Evict the entry with the oldest timestamp when a bucket overflows. */
1170 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1172 struct rt6_exception *rt6_ex, *oldest = NULL;
1177 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1178 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1181 rt6_remove_exception(bucket, oldest);
/* Hash (dst[, src under CONFIG_IPV6_SUBTREES]) into a bucket index
 * using jhash with a lazily-initialized random seed. */
1184 static u32 rt6_exception_hash(const struct in6_addr *dst,
1185 const struct in6_addr *src)
1187 static u32 seed __read_mostly;
1190 net_get_random_once(&seed, sizeof(seed));
1191 val = jhash(dst, sizeof(*dst), seed);
1193 #ifdef CONFIG_IPV6_SUBTREES
1195 val = jhash(src, sizeof(*src), val);
1197 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1200 /* Helper function to find the cached rt in the hash table
1201 * and update bucket pointer to point to the bucket for this
1202 * (daddr, saddr) pair
1203 * Caller must hold rt6_exception_lock
/* Spinlock-protected variant: plain hlist walk; *bucket is advanced to
 * the hashed bucket (advance and return statements elided here). */
1205 static struct rt6_exception *
1206 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1207 const struct in6_addr *daddr,
1208 const struct in6_addr *saddr)
1210 struct rt6_exception *rt6_ex;
1213 if (!(*bucket) || !daddr)
1216 hval = rt6_exception_hash(daddr, saddr);
1219 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1220 struct rt6_info *rt6 = rt6_ex->rt6i;
1221 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1223 #ifdef CONFIG_IPV6_SUBTREES
1224 if (matched && saddr)
1225 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1233 /* Helper function to find the cached rt in the hash table
1234 * and update bucket pointer to point to the bucket for this
1235 * (daddr, saddr) pair
1236 * Caller must hold rcu_read_lock()
/* RCU variant of the lookup above, using the _rcu hlist iterator. */
1238 static struct rt6_exception *
1239 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1240 const struct in6_addr *daddr,
1241 const struct in6_addr *saddr)
1243 struct rt6_exception *rt6_ex;
1246 WARN_ON_ONCE(!rcu_read_lock_held());
1248 if (!(*bucket) || !daddr)
1251 hval = rt6_exception_hash(daddr, saddr);
1254 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1255 struct rt6_info *rt6 = rt6_ex->rt6i;
1256 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1258 #ifdef CONFIG_IPV6_SUBTREES
1259 if (matched && saddr)
1260 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
/* Insert cached clone @nrt into @ort's exception hash table: allocate
 * the bucket array on first use, replace any existing entry for the
 * same (daddr[, saddr]) key, enforce the per-bucket depth limit, then
 * bump fn->fn_sernum to invalidate stale dsts and kick the fib6 GC.
 * Fails when @ort's buckets were already flushed or when @nrt's pmtu
 * would not be smaller than @ort's. NOTE(review): several error-path
 * and bookkeeping lines are elided in this sampled excerpt. */
1268 static int rt6_insert_exception(struct rt6_info *nrt,
1269 struct rt6_info *ort)
1271 struct net *net = dev_net(ort->dst.dev);
1272 struct rt6_exception_bucket *bucket;
1273 struct in6_addr *src_key = NULL;
1274 struct rt6_exception *rt6_ex;
1277 /* ort can't be a cache or pcpu route */
1278 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1280 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1282 spin_lock_bh(&rt6_exception_lock);
1284 if (ort->exception_bucket_flushed) {
1289 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1290 lockdep_is_held(&rt6_exception_lock));
1292 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1298 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1301 #ifdef CONFIG_IPV6_SUBTREES
1302 /* rt6i_src.plen != 0 indicates ort is in subtree
1303 * and exception table is indexed by a hash of
1304 * both rt6i_dst and rt6i_src.
1305 * Otherwise, the exception table is indexed by
1306 * a hash of only rt6i_dst.
1308 if (ort->rt6i_src.plen)
1309 src_key = &nrt->rt6i_src.addr;
1312 /* Update rt6i_prefsrc as it could be changed
1313 * in rt6_remove_prefsrc()
1315 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1316 /* rt6_mtu_change() might lower mtu on ort.
1317 * Only insert this exception route if its mtu
1318 * is less than ort's mtu value.
1320 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1325 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1328 rt6_remove_exception(bucket, rt6_ex);
1330 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1336 rt6_ex->stamp = jiffies;
1337 atomic_inc(&nrt->rt6i_ref);
1338 nrt->rt6i_node = ort->rt6i_node;
1339 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1341 net->ipv6.rt6_stats->fib_rt_cache++;
1343 if (bucket->depth > FIB6_MAX_DEPTH)
1344 rt6_exception_remove_oldest(bucket);
1347 spin_unlock_bh(&rt6_exception_lock);
1349 /* Update fn->fn_sernum to invalidate all cached dst */
1351 spin_lock_bh(&ort->rt6i_table->tb6_lock);
1352 fib6_update_sernum(ort);
1353 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1354 fib6_force_start_gc(net);
/* Remove every cached exception route under @rt and mark the table
 * flushed so rt6_insert_exception() cannot recreate it afterwards. */
1360 void rt6_flush_exceptions(struct rt6_info *rt)
1362 struct rt6_exception_bucket *bucket;
1363 struct rt6_exception *rt6_ex;
1364 struct hlist_node *tmp;
1367 spin_lock_bh(&rt6_exception_lock);
1368 /* Prevent rt6_insert_exception() to recreate the bucket list */
1369 rt->exception_bucket_flushed = 1;
1371 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1372 lockdep_is_held(&rt6_exception_lock));
1376 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1377 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1378 rt6_remove_exception(bucket, rt6_ex);
1379 WARN_ON_ONCE(bucket->depth);
1384 spin_unlock_bh(&rt6_exception_lock);
1387 /* Find cached rt in the hash table inside passed in rt
1388 * Caller has to hold rcu_read_lock()
/* RCU lookup of the cached clone for (daddr[, saddr]) under @rt;
 * expired entries are not returned. */
1390 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1391 struct in6_addr *daddr,
1392 struct in6_addr *saddr)
1394 struct rt6_exception_bucket *bucket;
1395 struct in6_addr *src_key = NULL;
1396 struct rt6_exception *rt6_ex;
1397 struct rt6_info *res = NULL;
1399 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1401 #ifdef CONFIG_IPV6_SUBTREES
1402 /* rt6i_src.plen != 0 indicates rt is in subtree
1403 * and exception table is indexed by a hash of
1404 * both rt6i_dst and rt6i_src.
1405 * Otherwise, the exception table is indexed by
1406 * a hash of only rt6i_dst.
1408 if (rt->rt6i_src.plen)
1411 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1413 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1419 /* Remove the passed in cached rt from the hash table that contains it */
/* Given an RTF_CACHE route @rt, find its entry in the parent ("from")
 * route's exception table under rt6_exception_lock and remove it. */
1420 int rt6_remove_exception_rt(struct rt6_info *rt)
1422 struct rt6_exception_bucket *bucket;
1423 struct rt6_info *from = rt->from;
1424 struct in6_addr *src_key = NULL;
1425 struct rt6_exception *rt6_ex;
1429 !(rt->rt6i_flags & RTF_CACHE))
1432 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1435 spin_lock_bh(&rt6_exception_lock);
1436 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1437 lockdep_is_held(&rt6_exception_lock));
1438 #ifdef CONFIG_IPV6_SUBTREES
1439 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1440 * and exception table is indexed by a hash of
1441 * both rt6i_dst and rt6i_src.
1442 * Otherwise, the exception table is indexed by
1443 * a hash of only rt6i_dst.
1445 if (from->rt6i_src.plen)
1446 src_key = &rt->rt6i_src.addr;
1448 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1452 rt6_remove_exception(bucket, rt6_ex);
1458 spin_unlock_bh(&rt6_exception_lock);
/* Refresh the last-use stamp of the exception entry holding cached @rt,
 * so aging GC (rt6_age_examine_exception) does not prune it.
 * Lookup is RCU-based; caller context must permit rcu_dereference here.
 * NOTE(review): lines are elided from this extract.
 */
1462 /* Find rt6_ex which contains the passed in rt cache and
1465 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1467 struct rt6_exception_bucket *bucket;
1468 struct rt6_info *from = rt->from;
1469 struct in6_addr *src_key = NULL;
1470 struct rt6_exception *rt6_ex;
1473 !(rt->rt6i_flags & RTF_CACHE))
1477 bucket = rcu_dereference(from->rt6i_exception_bucket);
1479 #ifdef CONFIG_IPV6_SUBTREES
1480 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1481 * and exception table is indexed by a hash of
1482 * both rt6i_dst and rt6i_src.
1483 * Otherwise, the exception table is indexed by
1484 * a hash of only rt6i_dst.
1486 if (from->rt6i_src.plen)
1487 src_key = &rt->rt6i_src.addr;
1489 rt6_ex = __rt6_find_exception_rcu(&bucket,
/* Touch the stamp so this entry counts as recently used. */
1493 rt6_ex->stamp = jiffies;
/* Clear the preferred-source address on every cached clone of @rt.
 * Caller must hold rt6_exception_lock (asserted via lockdep below).
 */
1498 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1500 struct rt6_exception_bucket *bucket;
1501 struct rt6_exception *rt6_ex;
1504 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1505 lockdep_is_held(&rt6_exception_lock));
1508 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1509 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
/* plen == 0 marks "no prefsrc". */
1510 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
/* Lower the cached PMTU of every exception entry of @rt that currently
 * exceeds @mtu.  Caller must hold rt6_exception_lock.
 */
1517 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1519 struct rt6_exception_bucket *bucket;
1520 struct rt6_exception *rt6_ex;
1523 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1524 lockdep_is_held(&rt6_exception_lock));
1527 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1528 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1529 struct rt6_info *entry = rt6_ex->rt6i;
1530 /* For RTF_CACHE with rt6i_pmtu == 0
1531 * (i.e. a redirected route),
1532 * the metrics of its rt->dst.from has already
/* Only shrink a non-zero cached PMTU; never grow it here. */
1535 if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1536 entry->rt6i_pmtu = mtu;
/* A cached clone that also routes via a gateway. */
1543 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
/* Drop every cached-gateway exception of @rt whose gateway equals
 * @gateway — used when a router is demoted to a plain host.
 * Serialized by rt6_exception_lock.
 */
1545 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1546 struct in6_addr *gateway)
1548 struct rt6_exception_bucket *bucket;
1549 struct rt6_exception *rt6_ex;
1550 struct hlist_node *tmp;
/* Fast path: no exception table was ever allocated. */
1553 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1556 spin_lock_bh(&rt6_exception_lock);
1557 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1558 lockdep_is_held(&rt6_exception_lock));
1561 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1562 hlist_for_each_entry_safe(rt6_ex, tmp,
1563 &bucket->chain, hlist) {
1564 struct rt6_info *entry = rt6_ex->rt6i;
1566 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1567 RTF_CACHE_GATEWAY &&
1568 ipv6_addr_equal(gateway,
1569 &entry->rt6i_gateway)) {
1570 rt6_remove_exception(bucket, rt6_ex);
1577 spin_unlock_bh(&rt6_exception_lock);
/* GC worker for a single exception entry: prune if aged out (non-EXPIRES),
 * if its explicit expiry passed (RFC 8201 sec. 4 for PMTU entries), or if
 * its gateway is no longer advertised as a router (NTF_ROUTER cleared).
 * NOTE(review): lines are elided from this extract.
 */
1580 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1581 struct rt6_exception *rt6_ex,
1582 struct fib6_gc_args *gc_args,
1585 struct rt6_info *rt = rt6_ex->rt6i;
1587 /* we are pruning and obsoleting aged-out and non gateway exceptions
1588 * even if others have still references to them, so that on next
1589 * dst_check() such references can be dropped.
1590 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1591 * expired, independently from their aging, as per RFC 8201 section 4
1593 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1594 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1595 RT6_TRACE("aging clone %p\n", rt);
1596 rt6_remove_exception(bucket, rt6_ex);
1599 } else if (time_after(jiffies, rt->dst.expires)) {
1600 RT6_TRACE("purging expired route %p\n", rt);
1601 rt6_remove_exception(bucket, rt6_ex);
1605 if (rt->rt6i_flags & RTF_GATEWAY) {
1606 struct neighbour *neigh;
1607 __u8 neigh_flags = 0;
1609 neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1611 neigh_flags = neigh->flags;
1612 neigh_release(neigh);
/* Gateway neighbour lost its router bit — this cached route is stale. */
1614 if (!(neigh_flags & NTF_ROUTER)) {
1615 RT6_TRACE("purging route %p via non-router but gateway\n",
1617 rt6_remove_exception(bucket, rt6_ex);
/* Walk all exception buckets of @rt under rt6_exception_lock and let
 * rt6_age_examine_exception() decide each entry's fate.  Called from
 * the fib6 garbage collector.
 */
1625 void rt6_age_exceptions(struct rt6_info *rt,
1626 struct fib6_gc_args *gc_args,
1629 struct rt6_exception_bucket *bucket;
1630 struct rt6_exception *rt6_ex;
1631 struct hlist_node *tmp;
1634 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1637 spin_lock_bh(&rt6_exception_lock);
1638 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1639 lockdep_is_held(&rt6_exception_lock));
1642 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1643 hlist_for_each_entry_safe(rt6_ex, tmp,
1644 &bucket->chain, hlist) {
1645 rt6_age_examine_exception(bucket, rt6_ex,
1651 spin_unlock_bh(&rt6_exception_lock);
/* Core policy-routing lookup for one fib6 table.  Resolution order:
 *  1. tree lookup + rt6_select (multipath-aware), with backtrack and a
 *     second pass without RT6_LOOKUP_F_REACHABLE;
 *  2. the selected route's exception (RTF_CACHE) table;
 *  3. otherwise hand back a cached clone, an uncached clone (KNOWN_NH
 *     case), or a per-cpu copy of the fib entry.
 * Returns a held rt6_info (ip6_null_entry on failure).
 * NOTE(review): lines are elided from this extract; label targets such
 * as redo_rt6_select/uncached_rt_out are referenced but not visible.
 */
1654 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1655 int oif, struct flowi6 *fl6,
1656 const struct sk_buff *skb, int flags)
1658 struct fib6_node *fn, *saved_fn;
1659 struct rt6_info *rt, *rt_cache;
1662 strict |= flags & RT6_LOOKUP_F_IFACE;
1663 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* When forwarding is globally off, prefer routers we believe reachable. */
1664 if (net->ipv6.devconf_all->forwarding == 0)
1665 strict |= RT6_LOOKUP_F_REACHABLE;
1669 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1672 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1676 rt = rt6_select(net, fn, oif, strict);
1677 if (rt->rt6i_nsiblings)
1678 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1679 if (rt == net->ipv6.ip6_null_entry) {
/* Miss: climb the tree and retry; then retry once more without the
 * reachability requirement before giving up. */
1680 fn = fib6_backtrack(fn, &fl6->saddr);
1682 goto redo_rt6_select;
1683 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1684 /* also consider unreachable route */
1685 strict &= ~RT6_LOOKUP_F_REACHABLE;
1687 goto redo_rt6_select;
1691 /*Search through exception table */
1692 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1696 if (rt == net->ipv6.ip6_null_entry) {
1699 trace_fib6_table_lookup(net, rt, table, fl6);
1701 } else if (rt->rt6i_flags & RTF_CACHE) {
1702 if (ip6_hold_safe(net, &rt, true)) {
1703 dst_use_noref(&rt->dst, jiffies);
1704 rt6_dst_from_metrics_check(rt);
1707 trace_fib6_table_lookup(net, rt, table, fl6);
1709 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1710 !(rt->rt6i_flags & RTF_GATEWAY))) {
1711 /* Create a RTF_CACHE clone which will not be
1712 * owned by the fib6 tree. It is for the special case where
1713 * the daddr in the skb during the neighbor look-up is different
1714 * from the fl6->daddr used to look-up route here.
1717 struct rt6_info *uncached_rt;
1719 if (ip6_hold_safe(net, &rt, true)) {
1720 dst_use_noref(&rt->dst, jiffies);
1724 goto uncached_rt_out;
1728 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1729 dst_release(&rt->dst);
1732 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1733 * No need for another dst_hold()
1735 rt6_uncached_list_add(uncached_rt);
1736 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1738 uncached_rt = net->ipv6.ip6_null_entry;
1739 dst_hold(&uncached_rt->dst);
1743 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1747 /* Get a percpu copy */
1749 struct rt6_info *pcpu_rt;
1751 dst_use_noref(&rt->dst, jiffies);
1753 pcpu_rt = rt6_get_pcpu_route(rt);
1756 /* atomic_inc_not_zero() is needed when using rcu */
1757 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1758 /* No dst_hold() on rt is needed because grabbing
1759 * rt->rt6i_ref makes sure rt can't be released.
1761 pcpu_rt = rt6_make_pcpu_route(rt);
1764 /* rt is already removed from tree */
1765 pcpu_rt = net->ipv6.ip6_null_entry;
1766 dst_hold(&pcpu_rt->dst);
1771 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1775 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path adapter for fib6_rule_lookup(): resolves using the
 * incoming interface (flowi6_iif) as the oif argument.
 */
1777 static struct rt6_info *ip6_pol_route_input(struct net *net,
1778 struct fib6_table *table,
1780 const struct sk_buff *skb,
1783 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
/* Public input-route lookup: forces strict interface matching for
 * link-local/multicast destinations (except PIM register devices),
 * then dispatches through the fib rules engine.
 */
1786 struct dst_entry *ip6_route_input_lookup(struct net *net,
1787 struct net_device *dev,
1789 const struct sk_buff *skb,
1792 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1793 flags |= RT6_LOOKUP_F_IFACE;
1795 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1797 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Extract L3 multipath-hash keys from @skb into @keys.  For ICMPv6
 * errors, hash on the embedded (inner) offending header so error
 * packets follow the same path as the flow they refer to.  When
 * pre-dissected @flkeys are available they are used directly.
 * NOTE(review): lines are elided from this extract (early-return jumps
 * around the ICMP handling are not visible).
 */
1799 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1800 struct flow_keys *keys,
1801 struct flow_keys *flkeys)
1803 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1804 const struct ipv6hdr *key_iph = outer_iph;
1805 struct flow_keys *_flkeys = flkeys;
1806 const struct ipv6hdr *inner_iph;
1807 const struct icmp6hdr *icmph;
1808 struct ipv6hdr _inner_iph;
1810 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1813 icmph = icmp6_hdr(skb);
/* Only error-class ICMPv6 messages embed the original header. */
1814 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1815 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1816 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1817 icmph->icmp6_type != ICMPV6_PARAMPROB)
1820 inner_iph = skb_header_pointer(skb,
1821 skb_transport_offset(skb) + sizeof(*icmph),
1822 sizeof(_inner_iph), &_inner_iph);
1826 key_iph = inner_iph;
1830 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1831 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1832 keys->tags.flow_label = _flkeys->tags.flow_label;
1833 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1835 keys->addrs.v6addrs.src = key_iph->saddr;
1836 keys->addrs.v6addrs.dst = key_iph->daddr;
1837 keys->tags.flow_label = ip6_flowinfo(key_iph);
1838 keys->basic.ip_proto = key_iph->nexthdr;
/* Compute the ECMP hash for a flow, honoring the per-netns
 * multipath_hash_policy sysctl: L3 (addresses + flow label + proto)
 * or L4 (5-tuple).  Works from either @skb or @fl6.
 * NOTE(review): switch case labels and the final return are elided
 * from this extract.
 */
1842 /* if skb is set it will be used and fl6 can be NULL */
1843 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1844 const struct sk_buff *skb, struct flow_keys *flkeys)
1846 struct flow_keys hash_keys;
1849 switch (net->ipv6.sysctl.multipath_hash_policy) {
1851 memset(&hash_keys, 0, sizeof(hash_keys));
1852 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1854 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1856 hash_keys.addrs.v6addrs.src = fl6->saddr;
1857 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1858 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1859 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1864 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1865 struct flow_keys keys;
1867 /* short-circuit if we already have L4 hash present */
/* >> 1 keeps the result in the same value space as the tail of this
 * function (presumably "mhash >> 1" — elided here; TODO confirm). */
1869 return skb_get_hash_raw(skb) >> 1;
1871 memset(&hash_keys, 0, sizeof(hash_keys));
1874 skb_flow_dissect_flow_keys(skb, &keys, flag);
1877 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1878 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1879 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1880 hash_keys.ports.src = flkeys->ports.src;
1881 hash_keys.ports.dst = flkeys->ports.dst;
1882 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1884 memset(&hash_keys, 0, sizeof(hash_keys));
1885 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1886 hash_keys.addrs.v6addrs.src = fl6->saddr;
1887 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1888 hash_keys.ports.src = fl6->fl6_sport;
1889 hash_keys.ports.dst = fl6->fl6_dport;
1890 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1894 mhash = flow_hash_from_keys(&hash_keys);
/* Entry point for routing an incoming packet: builds a flowi6 from the
 * IPv6 header (plus tunnel id and early-dissected flow keys when
 * available), computes the multipath hash for ICMPv6, and attaches the
 * resulting dst to the skb via ip6_route_input_lookup().
 */
1899 void ip6_route_input(struct sk_buff *skb)
1901 const struct ipv6hdr *iph = ipv6_hdr(skb);
1902 struct net *net = dev_net(skb->dev);
1903 int flags = RT6_LOOKUP_F_HAS_SADDR;
1904 struct ip_tunnel_info *tun_info;
1905 struct flowi6 fl6 = {
1906 .flowi6_iif = skb->dev->ifindex,
1907 .daddr = iph->daddr,
1908 .saddr = iph->saddr,
1909 .flowlabel = ip6_flowinfo(iph),
1910 .flowi6_mark = skb->mark,
1911 .flowi6_proto = iph->nexthdr,
1913 struct flow_keys *flkeys = NULL, _flkeys;
/* Collect metadata-dst tunnel id for RX-side tunnel routing. */
1915 tun_info = skb_tunnel_info(skb);
1916 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1917 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1919 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1922 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1923 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1926 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
/* Output-path adapter for fib6_rule_lookup(): resolves using the
 * outgoing interface (flowi6_oif).
 */
1929 static struct rt6_info *ip6_pol_route_output(struct net *net,
1930 struct fib6_table *table,
1932 const struct sk_buff *skb,
1935 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
/* Output-route lookup for locally generated traffic.  Link-local/
 * multicast destinations may be short-circuited through an L3 master
 * device; otherwise the lookup flags are derived from the socket
 * binding, source address presence, and source-address preferences.
 */
1938 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1939 struct flowi6 *fl6, int flags)
1943 if (rt6_need_strict(&fl6->daddr)) {
1944 struct dst_entry *dst;
1946 dst = l3mdev_link_scope_lookup(net, fl6);
1951 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1953 any_src = ipv6_addr_any(&fl6->saddr);
/* Device-bound socket, strict-scope daddr, or oif without a chosen
 * source address all require interface-strict matching. */
1954 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1955 (fl6->flowi6_oif && any_src))
1956 flags |= RT6_LOOKUP_F_IFACE;
1959 flags |= RT6_LOOKUP_F_HAS_SADDR;
1961 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1963 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1965 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a loopback-bound blackhole dst (input/output
 * discard) preserving metrics and route keys — used e.g. by xfrm when a
 * real route must be hidden.  Consumes a reference on @dst_orig and
 * returns the new dst or ERR_PTR(-ENOMEM).
 */
1967 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1969 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1970 struct net_device *loopback_dev = net->loopback_dev;
1971 struct dst_entry *new = NULL;
1973 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1974 DST_OBSOLETE_DEAD, 0);
1977 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1981 new->input = dst_discard;
1982 new->output = dst_discard_out;
1984 dst_copy_metrics(new, &ort->dst);
1986 rt->rt6i_idev = in6_dev_get(loopback_dev);
1987 rt->rt6i_gateway = ort->rt6i_gateway;
/* The clone is standalone; RTF_PCPU must not leak into it. */
1988 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1989 rt->rt6i_metric = 0;
1991 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1992 #ifdef CONFIG_IPV6_SUBTREES
1993 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1997 dst_release(dst_orig);
1998 return new ? new : ERR_PTR(-ENOMEM);
2002 * Destination cache support functions
/* Re-point a clone's metrics at its parent's (rt->from) if the parent's
 * metrics block was replaced since the clone was made.
 */
2005 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2008 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2009 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
/* Validate @rt against the caller's fib6 @cookie: stale cookie or an
 * expired route invalidates it (returns NULL — tail elided here).
 */
2012 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2016 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2019 if (rt6_check_expired(rt))
/* Validate a clone by checking itself for expiry and its parent
 * (rt->from) against the cookie.
 */
2025 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2027 if (!__rt6_check_expired(rt) &&
2028 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2029 rt6_check(rt->from, cookie))
/* dst_ops->check hook: refresh metrics from the parent and validate
 * either via the parent (per-cpu/uncached clones) or directly.
 */
2035 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2037 struct rt6_info *rt;
2039 rt = (struct rt6_info *) dst;
2041 /* All IPV6 dsts are created with ->obsolete set to the value
2042 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2043 * into this function always.
2046 rt6_dst_from_metrics_check(rt);
/* Clones with a live parent are judged through that parent. */
2048 if (rt->rt6i_flags & RTF_PCPU ||
2049 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2050 return rt6_dst_from_check(rt, cookie);
2052 return rt6_check(rt, cookie);
/* dst_ops->negative_advice hook — reacts to upper-layer hints that the
 * route is bad.  Body largely elided in this extract; only the
 * RTF_CACHE + expired test is visible.
 */
2055 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2057 struct rt6_info *rt = (struct rt6_info *) dst;
2060 if (rt->rt6i_flags & RTF_CACHE) {
2061 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure hook: report destination unreachable to the
 * sender, then invalidate the offending route (cached clone or the
 * default-route node's cookie — elided tail presumably bumps
 * fn_sernum; TODO confirm against full source).
 */
2073 static void ip6_link_failure(struct sk_buff *skb)
2075 struct rt6_info *rt;
2077 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2079 rt = (struct rt6_info *) skb_dst(skb);
2081 if (rt->rt6i_flags & RTF_CACHE) {
2082 if (dst_hold_safe(&rt->dst))
2085 struct fib6_node *fn;
2088 fn = rcu_dereference(rt->rt6i_node);
2089 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a new path MTU on @rt and arm its expiry per the
 * ip6_rt_mtu_expires sysctl; RTF_MODIFIED marks it as kernel-altered.
 */
2096 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2098 struct net *net = dev_net(rt->dst.dev);
2100 rt->rt6i_flags |= RTF_MODIFIED;
2101 rt->rt6i_pmtu = mtu;
2102 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* True when a PMTU update on @rt must be stored in a new RTF_CACHE
 * clone: @rt is not itself a cache entry but is a per-cpu copy or is
 * still linked in the fib tree.
 */
2105 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2107 return !(rt->rt6i_flags & RTF_CACHE) &&
2108 (rt->rt6i_flags & RTF_PCPU ||
2109 rcu_access_pointer(rt->rt6i_node));
/* Apply a PMTU report to @dst.  Ignores local routes, locked MTU
 * metrics, and MTUs not smaller than the current one; confirms the
 * neighbour, then either updates @dst in place (refreshing its
 * exception stamp if cached) or allocates an RTF_CACHE clone and
 * inserts it into the parent's exception table.
 * NOTE(review): lines are elided from this extract.
 */
2112 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2113 const struct ipv6hdr *iph, u32 mtu)
2115 const struct in6_addr *daddr, *saddr;
2116 struct rt6_info *rt6 = (struct rt6_info *)dst;
2118 if (rt6->rt6i_flags & RTF_LOCAL)
2121 if (dst_metric_locked(dst, RTAX_MTU))
/* Address selection: prefer the packet header, else the socket. */
2125 daddr = &iph->daddr;
2126 saddr = &iph->saddr;
2128 daddr = &sk->sk_v6_daddr;
2129 saddr = &inet6_sk(sk)->saddr;
2134 dst_confirm_neigh(dst, daddr);
/* Never go below the IPv6 minimum MTU (1280). */
2135 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2136 if (mtu >= dst_mtu(dst))
2139 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2140 rt6_do_update_pmtu(rt6, mtu);
2141 /* update rt6_ex->stamp for cache */
2142 if (rt6->rt6i_flags & RTF_CACHE)
2143 rt6_update_exception_stamp_rt(rt6);
2145 struct rt6_info *nrt6;
2147 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2149 rt6_do_update_pmtu(nrt6, mtu);
/* Insertion failure means a racing entry exists; drop ours. */
2150 if (rt6_insert_exception(nrt6, rt6))
2151 dst_release_immediate(&nrt6->dst);
/* dst_ops->update_pmtu hook — thin wrapper passing the skb's IPv6
 * header (or NULL) to __ip6_rt_update_pmtu().
 */
2156 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2157 struct sk_buff *skb, u32 mtu)
2159 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
/* Resolve the route for the packet that triggered a Packet-Too-Big
 * (headers at skb->data) and record the reported @mtu (network order)
 * against it.  @mark falls back to the netns reply-mark policy.
 */
2162 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2163 int oif, u32 mark, kuid_t uid)
2165 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2166 struct dst_entry *dst;
2169 memset(&fl6, 0, sizeof(fl6));
2170 fl6.flowi6_oif = oif;
2171 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2172 fl6.daddr = iph->daddr;
2173 fl6.saddr = iph->saddr;
2174 fl6.flowlabel = ip6_flowinfo(iph);
2175 fl6.flowi6_uid = uid;
2177 dst = ip6_route_output(net, NULL, &fl6);
2179 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2182 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-scoped PMTU update: applies the report using the socket's
 * binding/mark/uid, then — if the socket's cached dst is now invalid —
 * refreshes the datagram socket's dst (skipped for v4-mapped peers and
 * when the socket is owned by user context).
 */
2184 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2186 struct dst_entry *dst;
2188 ip6_update_pmtu(skb, sock_net(sk), mtu,
2189 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2191 dst = __sk_dst_get(sk);
2192 if (!dst || !dst->obsolete ||
2193 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2197 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2198 ip6_datagram_dst_update(sk, false);
2201 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2203 /* Handle redirects */
/* flowi6 extended with the redirecting router's address, so the
 * gateway survives the fib6_rule_lookup() callback signature.
 */
2204 struct ip6rd_flowi {
2206 struct in6_addr gateway;
/* Lookup used when processing an ICMPv6 Redirect: find the route whose
 * current nexthop matches the redirecting router (RFC 4861 §8), also
 * consulting each candidate's exception table since a cached clone may
 * carry a different gateway than its parent.
 * NOTE(review): lines are elided from this extract (loop exits and the
 * "out" label are not visible).
 */
2209 static struct rt6_info *__ip6_route_redirect(struct net *net,
2210 struct fib6_table *table,
2212 const struct sk_buff *skb,
2215 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2216 struct rt6_info *rt, *rt_cache;
2217 struct fib6_node *fn;
2219 /* Get the "current" route for this destination and
2220 * check if the redirect has come from appropriate router.
2222 * RFC 4861 specifies that redirects should only be
2223 * accepted if they come from the nexthop to the target.
2224 * Due to the way the routes are chosen, this notion
2225 * is a bit fuzzy and one might need to check all possible
2230 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2232 for_each_fib6_node_rt_rcu(fn) {
2233 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2235 if (rt6_check_expired(rt))
2239 if (!(rt->rt6i_flags & RTF_GATEWAY))
2241 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2243 /* rt_cache's gateway might be different from its 'parent'
2244 * in the case of an ip redirect.
2245 * So we keep searching in the exception table if the gateway
2248 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2249 rt_cache = rt6_find_cached_rt(rt,
2253 ipv6_addr_equal(&rdfl->gateway,
2254 &rt_cache->rt6i_gateway)) {
2264 rt = net->ipv6.ip6_null_entry;
2265 else if (rt->dst.error) {
2266 rt = net->ipv6.ip6_null_entry;
2270 if (rt == net->ipv6.ip6_null_entry) {
/* Miss: backtrack up the tree and retry (jump target elided). */
2271 fn = fib6_backtrack(fn, &fl6->saddr);
2277 ip6_hold_safe(net, &rt, true);
2281 trace_fib6_table_lookup(net, rt, table, fl6);
/* Wrap @fl6 and @gateway into an ip6rd_flowi and run the redirect
 * lookup through the fib rules engine.
 */
2285 static struct dst_entry *ip6_route_redirect(struct net *net,
2286 const struct flowi6 *fl6,
2287 const struct sk_buff *skb,
2288 const struct in6_addr *gateway)
2290 int flags = RT6_LOOKUP_F_HAS_SADDR;
2291 struct ip6rd_flowi rdfl;
2294 rdfl.gateway = *gateway;
2296 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2297 flags, __ip6_route_redirect);
/* Process an ICMPv6 Redirect for the packet at skb->data: the
 * redirecting router is the outer IPv6 source address.
 */
2300 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2303 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2304 struct dst_entry *dst;
2307 memset(&fl6, 0, sizeof(fl6));
2308 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2309 fl6.flowi6_oif = oif;
2310 fl6.flowi6_mark = mark;
2311 fl6.daddr = iph->daddr;
2312 fl6.saddr = iph->saddr;
2313 fl6.flowlabel = ip6_flowinfo(iph);
2314 fl6.flowi6_uid = uid;
2316 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2317 rt6_do_redirect(dst, NULL, skb);
2320 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant of ip6_redirect() for redirects whose payload carries no
 * copied header: the target comes from the rd_msg and the flow is
 * keyed on the ICMP packet's own addresses.
 */
2322 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2325 const struct ipv6hdr *iph = ipv6_hdr(skb);
2326 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2327 struct dst_entry *dst;
2330 memset(&fl6, 0, sizeof(fl6));
2331 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2332 fl6.flowi6_oif = oif;
2333 fl6.flowi6_mark = mark;
2334 fl6.daddr = msg->dest;
2335 fl6.saddr = iph->daddr;
2336 fl6.flowi6_uid = sock_net_uid(net, NULL);
2338 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2339 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper: redirect using the socket's device
 * binding and mark.
 */
2343 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2345 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2348 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss hook: derive the advertised TCP MSS from the
 * dst MTU, clamped below by the ip6_rt_min_advmss sysctl and above by
 * the maximal non-jumbo payload.
 */
2350 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2352 struct net_device *dev = dst->dev;
2353 unsigned int mtu = dst_mtu(dst);
2354 struct net *net = dev_net(dev);
2356 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2358 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2359 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2362 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2363 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2364 * IPV6_MAXPLEN is also valid and means: "any MSS,
2365 * rely only on pmtu discovery"
2367 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu hook: cached PMTU first, then the raw MTU metric, then
 * the device's inet6 mtu6; capped at IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */
2372 static unsigned int ip6_mtu(const struct dst_entry *dst)
2374 const struct rt6_info *rt = (const struct rt6_info *)dst;
2375 unsigned int mtu = rt->rt6i_pmtu;
2376 struct inet6_dev *idev;
2381 mtu = dst_metric_raw(dst, RTAX_MTU);
2388 idev = __in6_dev_get(dst->dev);
2390 mtu = idev->cnf.mtu6;
2394 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2396 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a standalone host route for sending an ICMPv6/NDISC packet
 * (hop limit metric 0 = use default).  The dst joins the uncached list
 * so rt6_disable_ip() can release the device, then goes through xfrm
 * policy lookup.  Returns the dst or an ERR_PTR.
 * NOTE(review): error-path labels are elided from this extract.
 */
2399 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2402 struct dst_entry *dst;
2403 struct rt6_info *rt;
2404 struct inet6_dev *idev = in6_dev_get(dev);
2405 struct net *net = dev_net(dev);
2407 if (unlikely(!idev))
2408 return ERR_PTR(-ENODEV);
2410 rt = ip6_dst_alloc(net, dev, 0);
2411 if (unlikely(!rt)) {
2413 dst = ERR_PTR(-ENOMEM);
2417 rt->dst.flags |= DST_HOST;
2418 rt->dst.input = ip6_input;
2419 rt->dst.output = ip6_output;
/* Host route: gateway is the destination itself, /128 prefix. */
2420 rt->rt6i_gateway = fl6->daddr;
2421 rt->rt6i_dst.addr = fl6->daddr;
2422 rt->rt6i_dst.plen = 128;
2423 rt->rt6i_idev = idev;
2424 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2426 /* Add this dst into uncached_list so that rt6_disable_ip() can
2427 * do proper release of the net_device
2429 rt6_uncached_list_add(rt);
2430 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2432 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc hook: rate-limited trigger for fib6_run_gc().  Skips GC
 * when the last run was recent and the entry count is within
 * ip6_rt_max_size; adapts ip6_rt_gc_expire up on pressure and decays
 * it by the elasticity factor otherwise.  Returns non-zero to signal
 * continued pressure (entries still above max size).
 */
2438 static int ip6_dst_gc(struct dst_ops *ops)
2440 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2441 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2442 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2443 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2444 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2445 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2448 entries = dst_entries_get_fast(ops);
2449 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2450 entries <= rt_max_size)
2453 net->ipv6.ip6_rt_gc_expire++;
2454 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2455 entries = dst_entries_get_slow(ops);
2456 if (entries < ops->gc_thresh)
2457 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2459 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2460 return entries > rt_max_size;
/* Convert the netlink RTAX_* attributes in @cfg->fc_mx into a freshly
 * allocated metrics array inside @mxc.  RTAX_CC_ALGO names are mapped
 * to keys (possibly flagging ECN-capable CAs via DST_FEATURE_ECN_CA);
 * HOPLIMIT and FEATURES values are range-checked.
 * NOTE(review): return paths and error labels are elided here.
 */
2463 static int ip6_convert_metrics(struct mx6_config *mxc,
2464 const struct fib6_config *cfg)
2466 struct net *net = cfg->fc_nlinfo.nl_net;
2467 bool ecn_ca = false;
2475 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2479 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2480 int type = nla_type(nla);
2485 if (unlikely(type > RTAX_MAX))
2488 if (type == RTAX_CC_ALGO) {
2489 char tmp[TCP_CA_NAME_MAX];
2491 nla_strlcpy(tmp, nla, sizeof(tmp));
2492 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2493 if (val == TCP_CA_UNSPEC)
2496 val = nla_get_u32(nla);
2498 if (type == RTAX_HOPLIMIT && val > 255)
2500 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
/* mx_valid is a bitmap of which metric slots carry a value. */
2504 __set_bit(type - 1, mxc->mx_valid);
2508 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2509 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a nexthop gateway within one specific table @tbid (used when
 * validating a new route's nexthop).  Falls back to a full lookup when
 * the table lookup yields ip6_null_entry (fallback tail elided).
 */
2519 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2520 struct fib6_config *cfg,
2521 const struct in6_addr *gw_addr,
2522 u32 tbid, int flags)
2524 struct flowi6 fl6 = {
2525 .flowi6_oif = cfg->fc_ifindex,
2527 .saddr = cfg->fc_prefsrc,
2529 struct fib6_table *table;
2530 struct rt6_info *rt;
2532 table = fib6_get_table(net, tbid);
2536 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2537 flags |= RT6_LOOKUP_F_HAS_SADDR;
2539 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2540 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2542 /* if table lookup failed, fall back to full lookup */
2543 if (rt == net->ipv6.ip6_null_entry) {
/* Validate an RTNH_F_ONLINK nexthop: the gateway must resolve within
 * the device's table to a non-local, non-anycast, non-reject route on
 * the same device; otherwise reject with an extack message.
 */
2551 static int ip6_route_check_nh_onlink(struct net *net,
2552 struct fib6_config *cfg,
2553 struct net_device *dev,
2554 struct netlink_ext_ack *extack)
2556 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2557 const struct in6_addr *gw_addr = &cfg->fc_gateway;
/* Route flags that disqualify a gateway for onlink use. */
2558 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2559 struct rt6_info *grt;
2563 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2565 if (!grt->dst.error &&
2566 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2567 NL_SET_ERR_MSG(extack,
2568 "Nexthop has invalid gateway or device mismatch");
/* Resolve and validate a (non-onlink) nexthop gateway, deriving the
 * egress device and inet6_dev when the caller did not supply one.  The
 * resolved route must not itself be via a gateway.  Returns 0 on
 * success, -EHOSTUNREACH style errors otherwise (tails elided).
 */
2578 static int ip6_route_check_nh(struct net *net,
2579 struct fib6_config *cfg,
2580 struct net_device **_dev,
2581 struct inet6_dev **idev)
2583 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2584 struct net_device *dev = _dev ? *_dev : NULL;
2585 struct rt6_info *grt = NULL;
2586 int err = -EHOSTUNREACH;
/* With an explicit table, restrict the gateway lookup to it. */
2588 if (cfg->fc_table) {
2589 int flags = RT6_LOOKUP_F_IFACE;
2591 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2592 cfg->fc_table, flags);
2594 if (grt->rt6i_flags & RTF_GATEWAY ||
2595 (dev && dev != grt->dst.dev)) {
2603 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2609 if (dev != grt->dst.dev) {
/* Adopt the resolved route's device and idev for the new route. */
2614 *_dev = dev = grt->dst.dev;
2615 *idev = grt->rt6i_idev;
2617 in6_dev_hold(grt->rt6i_idev);
2620 if (!(grt->rt6i_flags & RTF_GATEWAY))
/* Build (but do not insert) an rt6_info from a netlink/ioctl
 * fib6_config.  Stages: validate flags and prefix lengths; resolve
 * device/idev and fib table; allocate the dst; set expiry, protocol,
 * input/output handlers and lwtunnel state; fill dst/src keys; promote
 * loopback routes to reject routes; validate and record the gateway;
 * set prefsrc and nexthop flags.  Returns the new route or an
 * ERR_PTR with an extack message set.
 * NOTE(review): many error-path lines are elided from this extract.
 */
2629 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2630 struct netlink_ext_ack *extack)
2632 struct net *net = cfg->fc_nlinfo.nl_net;
2633 struct rt6_info *rt = NULL;
2634 struct net_device *dev = NULL;
2635 struct inet6_dev *idev = NULL;
2636 struct fib6_table *table;
2640 /* RTF_PCPU is an internal flag; can not be set by userspace */
2641 if (cfg->fc_flags & RTF_PCPU) {
2642 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2646 /* RTF_CACHE is an internal flag; can not be set by userspace */
2647 if (cfg->fc_flags & RTF_CACHE) {
2648 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2652 if (cfg->fc_dst_len > 128) {
2653 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2656 if (cfg->fc_src_len > 128) {
2657 NL_SET_ERR_MSG(extack, "Invalid source address length");
2660 #ifndef CONFIG_IPV6_SUBTREES
2661 if (cfg->fc_src_len) {
2662 NL_SET_ERR_MSG(extack,
2663 "Specifying source address requires IPV6_SUBTREES to be enabled");
2667 if (cfg->fc_ifindex) {
2669 dev = dev_get_by_index(net, cfg->fc_ifindex);
2672 idev = in6_dev_get(dev);
/* Metric 0 means "unspecified": default to the user priority. */
2677 if (cfg->fc_metric == 0)
2678 cfg->fc_metric = IP6_RT_PRIO_USER;
2680 if (cfg->fc_flags & RTNH_F_ONLINK) {
2682 NL_SET_ERR_MSG(extack,
2683 "Nexthop device required for onlink");
2688 if (!(dev->flags & IFF_UP)) {
2689 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2696 if (cfg->fc_nlinfo.nlh &&
2697 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2698 table = fib6_get_table(net, cfg->fc_table);
2700 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2701 table = fib6_new_table(net, cfg->fc_table);
2704 table = fib6_new_table(net, cfg->fc_table);
2710 rt = ip6_dst_alloc(net, NULL,
2711 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2718 if (cfg->fc_flags & RTF_EXPIRES)
2719 rt6_set_expires(rt, jiffies +
2720 clock_t_to_jiffies(cfg->fc_expires));
2722 rt6_clean_expires(rt);
2724 if (cfg->fc_protocol == RTPROT_UNSPEC)
2725 cfg->fc_protocol = RTPROT_BOOT;
2726 rt->rt6i_protocol = cfg->fc_protocol;
2728 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Choose the input handler by destination class. */
2730 if (addr_type & IPV6_ADDR_MULTICAST)
2731 rt->dst.input = ip6_mc_input;
2732 else if (cfg->fc_flags & RTF_LOCAL)
2733 rt->dst.input = ip6_input;
2735 rt->dst.input = ip6_forward;
2737 rt->dst.output = ip6_output;
2739 if (cfg->fc_encap) {
2740 struct lwtunnel_state *lwtstate;
2742 err = lwtunnel_build_state(cfg->fc_encap_type,
2743 cfg->fc_encap, AF_INET6, cfg,
2747 rt->dst.lwtstate = lwtstate_get(lwtstate);
2748 lwtunnel_set_redirect(&rt->dst);
2751 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2752 rt->rt6i_dst.plen = cfg->fc_dst_len;
2753 if (rt->rt6i_dst.plen == 128)
2754 rt->dst.flags |= DST_HOST;
2756 #ifdef CONFIG_IPV6_SUBTREES
2757 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2758 rt->rt6i_src.plen = cfg->fc_src_len;
2761 rt->rt6i_metric = cfg->fc_metric;
2762 rt->rt6i_nh_weight = 1;
2764 /* We cannot add true routes via loopback here,
2765 they would result in kernel looping; promote them to reject routes
2767 if ((cfg->fc_flags & RTF_REJECT) ||
2768 (dev && (dev->flags & IFF_LOOPBACK) &&
2769 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2770 !(cfg->fc_flags & RTF_LOCAL))) {
2771 /* hold loopback dev/idev if we haven't done so. */
2772 if (dev != net->loopback_dev) {
2777 dev = net->loopback_dev;
2779 idev = in6_dev_get(dev);
2785 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Map the reject subtype to the error/discard behavior. */
2786 switch (cfg->fc_type) {
2788 rt->dst.error = -EINVAL;
2789 rt->dst.output = dst_discard_out;
2790 rt->dst.input = dst_discard;
2793 rt->dst.error = -EACCES;
2794 rt->dst.output = ip6_pkt_prohibit_out;
2795 rt->dst.input = ip6_pkt_prohibit;
2798 case RTN_UNREACHABLE:
2800 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2801 : (cfg->fc_type == RTN_UNREACHABLE)
2802 ? -EHOSTUNREACH : -ENETUNREACH;
2803 rt->dst.output = ip6_pkt_discard_out;
2804 rt->dst.input = ip6_pkt_discard;
2810 if (cfg->fc_flags & RTF_GATEWAY) {
2811 const struct in6_addr *gw_addr;
2814 gw_addr = &cfg->fc_gateway;
2815 gwa_type = ipv6_addr_type(gw_addr);
2817 /* if gw_addr is local we will fail to detect this in case
2818 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2819 * will return already-added prefix route via interface that
2820 * prefix route was assigned to, which might be non-loopback.
2823 if (ipv6_chk_addr_and_flags(net, gw_addr,
2824 gwa_type & IPV6_ADDR_LINKLOCAL ?
2825 dev : NULL, 0, 0)) {
2826 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2829 rt->rt6i_gateway = *gw_addr;
2831 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2832 /* IPv6 strictly inhibits using not link-local
2833 addresses as nexthop address.
2834 Otherwise, router will not able to send redirects.
2835 It is very good, but in some (rare!) circumstances
2836 (SIT, PtP, NBMA NOARP links) it is handy to allow
2837 some exceptions. --ANK
2838 We allow IPv4-mapped nexthops to support RFC4798-type
2841 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2842 IPV6_ADDR_MAPPED))) {
2843 NL_SET_ERR_MSG(extack,
2844 "Invalid gateway address");
2848 if (cfg->fc_flags & RTNH_F_ONLINK) {
2849 err = ip6_route_check_nh_onlink(net, cfg, dev,
2852 err = ip6_route_check_nh(net, cfg, &dev, &idev);
2859 NL_SET_ERR_MSG(extack, "Egress device not specified");
2861 } else if (dev->flags & IFF_LOOPBACK) {
2862 NL_SET_ERR_MSG(extack,
2863 "Egress device can not be loopback device for this route");
2872 if (!(dev->flags & IFF_UP)) {
2873 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2878 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2879 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2880 NL_SET_ERR_MSG(extack, "Invalid source address");
2884 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2885 rt->rt6i_prefsrc.plen = 128;
2887 rt->rt6i_prefsrc.plen = 0;
2889 rt->rt6i_flags = cfg->fc_flags;
/* Carrier-down devices mark the nexthop LINKDOWN unless local/anycast. */
2892 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2893 !netif_carrier_ok(dev))
2894 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2895 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2897 rt->rt6i_idev = idev;
2898 rt->rt6i_table = table;
2900 cfg->fc_nlinfo.nl_net = dev_net(dev);
2909 dst_release_immediate(&rt->dst);
2911 return ERR_PTR(err);
/* ip6_route_add - build an rt6_info from @cfg and insert it into the FIB.
 * Converts netlink metrics (RTA_METRICS) into an mx6_config, then inserts
 * via __ip6_ins_rt(); on failure the route's dst is released immediately.
 * NOTE(review): this excerpt has interior lines elided (error-path labels,
 * return statements) — confirm exact control flow against the full source.
 */
2914 int ip6_route_add(struct fib6_config *cfg,
2915 struct netlink_ext_ack *extack)
2917 struct mx6_config mxc = { .mx = NULL, };
2918 struct rt6_info *rt;
2921 rt = ip6_route_info_create(cfg, extack);
2928 err = ip6_convert_metrics(&mxc, cfg);
2932 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
/* error path: drop the reference created by ip6_route_info_create() */
2939 dst_release_immediate(&rt->dst);
/* __ip6_del_rt - remove a single route from its FIB table under the
 * table spinlock. Refuses to delete the per-netns null entry sentinel.
 * NOTE(review): lines elided in this excerpt (err decl, rejection
 * return, final release/return) — verify against full source.
 */
2944 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2947 struct fib6_table *table;
2948 struct net *net = dev_net(rt->dst.dev);
2950 if (rt == net->ipv6.ip6_null_entry) {
2955 table = rt->rt6i_table;
2956 spin_lock_bh(&table->tb6_lock);
2957 err = fib6_del(rt, info);
2958 spin_unlock_bh(&table->tb6_lock);
/* ip6_del_rt - public wrapper: delete @rt with a default nl_info
 * carrying only the owning network namespace.
 */
2965 int ip6_del_rt(struct rt6_info *rt)
2967 struct nl_info info = {
2968 .nl_net = dev_net(rt->dst.dev),
2970 return __ip6_del_rt(rt, &info);
/* __ip6_del_rt_siblings - delete a multipath route and all of its
 * sibling nexthops as one operation, emitting a single RTM_DELROUTE
 * notification covering every hop instead of one per nexthop.
 * NOTE(review): interior lines elided (error handling, sibling list
 * head, skb free paths) — confirm against the full source.
 */
2973 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2975 struct nl_info *info = &cfg->fc_nlinfo;
2976 struct net *net = info->nl_net;
2977 struct sk_buff *skb = NULL;
2978 struct fib6_table *table;
2981 if (rt == net->ipv6.ip6_null_entry)
2983 table = rt->rt6i_table;
2984 spin_lock_bh(&table->tb6_lock);
2986 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2987 struct rt6_info *sibling, *next_sibling;
2989 /* prefer to send a single notification with all hops */
2990 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2992 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2994 if (rt6_fill_node(net, skb, rt,
2995 NULL, NULL, 0, RTM_DELROUTE,
2996 info->portid, seq, 0) < 0) {
/* suppress per-hop notifications; one combined skb is sent below */
3000 info->skip_notify = 1;
3003 list_for_each_entry_safe(sibling, next_sibling,
3006 err = fib6_del(sibling, info);
3012 err = fib6_del(rt, info);
3014 spin_unlock_bh(&table->tb6_lock);
3019 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3020 info->nlh, gfp_any());
/* ip6_route_del - locate and delete the route matching @cfg.
 * Walks the fib6 node found by fib6_locate() and filters candidates by
 * ifindex, gateway, metric and protocol. With RTF_CACHE set, deletes a
 * cached exception entry instead of the FIB entry.
 * NOTE(review): RCU lock/unlock and several error returns are elided in
 * this excerpt — confirm locking against the full source.
 */
3025 static int ip6_route_del(struct fib6_config *cfg,
3026 struct netlink_ext_ack *extack)
3028 struct rt6_info *rt, *rt_cache;
3029 struct fib6_table *table;
3030 struct fib6_node *fn;
3033 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3035 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3041 fn = fib6_locate(&table->tb6_root,
3042 &cfg->fc_dst, cfg->fc_dst_len,
3043 &cfg->fc_src, cfg->fc_src_len,
3044 !(cfg->fc_flags & RTF_CACHE));
3047 for_each_fib6_node_rt_rcu(fn) {
3048 if (cfg->fc_flags & RTF_CACHE) {
3049 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
/* skip candidates that do not match the requested selectors */
3055 if (cfg->fc_ifindex &&
3057 rt->dst.dev->ifindex != cfg->fc_ifindex))
3059 if (cfg->fc_flags & RTF_GATEWAY &&
3060 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3062 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3064 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3066 if (!dst_hold_safe(&rt->dst))
3070 /* if gateway was specified only delete the one hop */
3071 if (cfg->fc_flags & RTF_GATEWAY)
3072 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3074 return __ip6_del_rt_siblings(rt, cfg);
/* rt6_do_redirect - process a received ICMPv6 Redirect (RFC 4861 §8).
 * Validates the message (length, non-multicast destination, link-local
 * unicast target unless on-link), updates the neighbour cache, clones a
 * host route toward the new nexthop and installs it as an exception on
 * the parent route, then notifies interested subsystems.
 * NOTE(review): several lines are elided here (msg/lladdr decls, goto
 * labels, on_link assignment) — verify flow against the full source.
 */
3082 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3084 struct netevent_redirect netevent;
3085 struct rt6_info *rt, *nrt = NULL;
3086 struct ndisc_options ndopts;
3087 struct inet6_dev *in6_dev;
3088 struct neighbour *neigh;
3090 int optlen, on_link;
/* length of the ND options trailing the fixed redirect header */
3093 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3094 optlen -= sizeof(*msg);
3097 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3101 msg = (struct rd_msg *)icmp6_hdr(skb);
3103 if (ipv6_addr_is_multicast(&msg->dest)) {
3104 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination itself is on-link */
3109 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3111 } else if (ipv6_addr_type(&msg->target) !=
3112 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3113 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3117 in6_dev = __in6_dev_get(skb->dev);
/* routers and interfaces with redirects disabled ignore the message */
3120 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3124 * The IP source address of the Redirect MUST be the same as the current
3125 * first-hop router for the specified ICMP Destination Address.
3128 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3129 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3134 if (ndopts.nd_opts_tgt_lladdr) {
3135 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3138 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3143 rt = (struct rt6_info *) dst;
3144 if (rt->rt6i_flags & RTF_REJECT) {
3145 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3149 /* Redirect received -> path was valid.
3150 * Look, redirects are sent only in response to data packets,
3151 * so that this nexthop apparently is reachable. --ANK
3153 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3155 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3160 * We have finally decided to accept it.
3163 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3164 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3165 NEIGH_UPDATE_F_OVERRIDE|
3166 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3167 NEIGH_UPDATE_F_ISROUTER)),
3168 NDISC_REDIRECT, &ndopts);
/* clone a cached host route to msg->dest via the new nexthop */
3170 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3174 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3176 nrt->rt6i_flags &= ~RTF_GATEWAY;
3178 nrt->rt6i_protocol = RTPROT_REDIRECT;
3179 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3181 /* No need to remove rt from the exception table if rt is
3182 * a cached route because rt6_insert_exception() will
3185 if (rt6_insert_exception(nrt, rt)) {
3186 dst_release_immediate(&nrt->dst);
3190 netevent.old = &rt->dst;
3191 netevent.new = &nrt->dst;
3192 netevent.daddr = &msg->dest;
3193 netevent.neigh = neigh;
3194 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3197 neigh_release(neigh);
3201 * Misc support functions
/* rt6_set_from - record @from as the origin of @rt, take a reference on
 * it, and share its metrics block (read-only) with @rt.
 * NOTE(review): the rcu assignment of rt->from is elided in this
 * excerpt — confirm against the full source.
 */
3204 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3208 rt->rt6i_flags &= ~RTF_EXPIRES;
3209 dst_hold(&from->dst);
3211 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* ip6_rt_copy_init - initialize a freshly allocated rt6_info as a copy
 * of @ort: handlers, addresses, flags, idev (with a new reference),
 * table, lwtunnel state, and the subtree source when enabled.
 */
3214 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3216 rt->dst.input = ort->dst.input;
3217 rt->dst.output = ort->dst.output;
3218 rt->rt6i_dst = ort->rt6i_dst;
3219 rt->dst.error = ort->dst.error;
3220 rt->rt6i_idev = ort->rt6i_idev;
3222 in6_dev_hold(rt->rt6i_idev);
3223 rt->dst.lastuse = jiffies;
3224 rt->rt6i_gateway = ort->rt6i_gateway;
3225 rt->rt6i_flags = ort->rt6i_flags;
3226 rt6_set_from(rt, ort);
3227 rt->rt6i_metric = ort->rt6i_metric;
3228 #ifdef CONFIG_IPV6_SUBTREES
3229 rt->rt6i_src = ort->rt6i_src;
3231 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3232 rt->rt6i_table = ort->rt6i_table;
3233 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3236 #ifdef CONFIG_IPV6_ROUTE_INFO
/* rt6_get_route_info - look up an RA Route Information route for
 * @prefix/@prefixlen via @gwaddr on @dev in the l3mdev table (or
 * RT6_TABLE_INFO). Returns the matching route with a reference held,
 * or NULL. NOTE(review): rcu lock/unlock lines are elided here.
 */
3237 static struct rt6_info *rt6_get_route_info(struct net *net,
3238 const struct in6_addr *prefix, int prefixlen,
3239 const struct in6_addr *gwaddr,
3240 struct net_device *dev)
3242 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3243 int ifindex = dev->ifindex;
3244 struct fib6_node *fn;
3245 struct rt6_info *rt = NULL;
3246 struct fib6_table *table;
3248 table = fib6_get_table(net, tb_id);
3253 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3257 for_each_fib6_node_rt_rcu(fn) {
3258 if (rt->dst.dev->ifindex != ifindex)
3260 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3262 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3264 ip6_hold_safe(NULL, &rt, false);
/* rt6_add_route_info - install a route learned from an RA Route
 * Information option, then return it via rt6_get_route_info().
 */
3272 static struct rt6_info *rt6_add_route_info(struct net *net,
3273 const struct in6_addr *prefix, int prefixlen,
3274 const struct in6_addr *gwaddr,
3275 struct net_device *dev,
3278 struct fib6_config cfg = {
3279 .fc_metric = IP6_RT_PRIO_USER,
3280 .fc_ifindex = dev->ifindex,
3281 .fc_dst_len = prefixlen,
3282 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3283 RTF_UP | RTF_PREF(pref),
3284 .fc_protocol = RTPROT_RA,
3285 .fc_nlinfo.portid = 0,
3286 .fc_nlinfo.nlh = NULL,
3287 .fc_nlinfo.nl_net = net,
3290 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3291 cfg.fc_dst = *prefix;
3292 cfg.fc_gateway = *gwaddr;
3294 /* We should treat it as a default route if prefix length is 0. */
3296 cfg.fc_flags |= RTF_DEFAULT;
/* add failure is tolerated; the lookup below reports the outcome */
3298 ip6_route_add(&cfg, NULL);
3300 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* rt6_get_dflt_router - find the RA-learned default route through
 * gateway @addr on @dev and return it with a reference held, or NULL.
 * NOTE(review): rcu lock/unlock lines are elided in this excerpt.
 */
3304 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3306 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3307 struct rt6_info *rt;
3308 struct fib6_table *table;
3310 table = fib6_get_table(dev_net(dev), tb_id);
3315 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3316 if (dev == rt->dst.dev &&
3317 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3318 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3322 ip6_hold_safe(NULL, &rt, false);
/* rt6_add_dflt_router - install a default route learned from a Router
 * Advertisement and mark its table as holding a default router so
 * rt6_purge_dflt_routers() can find it later.
 */
3327 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3328 struct net_device *dev,
3331 struct fib6_config cfg = {
3332 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3333 .fc_metric = IP6_RT_PRIO_USER,
3334 .fc_ifindex = dev->ifindex,
3335 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3336 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3337 .fc_protocol = RTPROT_RA,
3338 .fc_nlinfo.portid = 0,
3339 .fc_nlinfo.nlh = NULL,
3340 .fc_nlinfo.nl_net = dev_net(dev),
3343 cfg.fc_gateway = *gwaddr;
3345 if (!ip6_route_add(&cfg, NULL)) {
3346 struct fib6_table *table;
3348 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3350 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3353 return rt6_get_dflt_router(gwaddr, dev);
/* __rt6_purge_dflt_routers - delete all RA-learned default routes from
 * one table, except on interfaces where accept_ra == 2 (which keep RA
 * routes even while forwarding). Clears the table's has-default flag.
 * NOTE(review): the restart/delete body inside dst_hold_safe() is
 * elided in this excerpt — confirm against the full source.
 */
3356 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3358 struct rt6_info *rt;
3362 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3363 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3364 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3365 if (dst_hold_safe(&rt->dst)) {
3376 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* rt6_purge_dflt_routers - walk every FIB table hash bucket in @net and
 * purge default routers from tables flagged as containing one.
 */
3379 void rt6_purge_dflt_routers(struct net *net)
3381 struct fib6_table *table;
3382 struct hlist_head *head;
3387 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3388 head = &net->ipv6.fib_table_hash[h];
3389 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3390 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3391 __rt6_purge_dflt_routers(table);
/* rtmsg_to_fib6_config - translate a legacy ioctl in6_rtmsg into the
 * common fib6_config used by the route add/del paths. The table is the
 * l3mdev table of the egress ifindex when one exists.
 */
3398 static void rtmsg_to_fib6_config(struct net *net,
3399 struct in6_rtmsg *rtmsg,
3400 struct fib6_config *cfg)
3402 memset(cfg, 0, sizeof(*cfg));
3404 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3406 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3407 cfg->fc_metric = rtmsg->rtmsg_metric;
3408 cfg->fc_expires = rtmsg->rtmsg_info;
3409 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3410 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3411 cfg->fc_flags = rtmsg->rtmsg_flags;
3413 cfg->fc_nlinfo.nl_net = net;
3415 cfg->fc_dst = rtmsg->rtmsg_dst;
3416 cfg->fc_src = rtmsg->rtmsg_src;
3417 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* ipv6_route_ioctl - legacy SIOCADDRT/SIOCDELRT handler. Requires
 * CAP_NET_ADMIN in the net namespace, copies the in6_rtmsg from
 * userspace, converts it, and dispatches to ip6_route_add/del.
 * NOTE(review): rtnl locking and the copy_from_user error return are
 * elided in this excerpt — confirm against the full source.
 */
3420 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3422 struct fib6_config cfg;
3423 struct in6_rtmsg rtmsg;
3427 case SIOCADDRT: /* Add a route */
3428 case SIOCDELRT: /* Delete a route */
3429 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3431 err = copy_from_user(&rtmsg, arg,
3432 sizeof(struct in6_rtmsg));
3436 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3441 err = ip6_route_add(&cfg, NULL);
3444 err = ip6_route_del(&cfg, NULL);
3458 * Drop the packet on the floor
/* ip6_pkt_drop - common sink for unroutable packets: bump the relevant
 * SNMP counter (INADDRERRORS for an unspecified destination on input),
 * send a destination-unreachable ICMPv6 with @code, and free the skb.
 */
3461 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3464 struct dst_entry *dst = skb_dst(skb);
3465 switch (ipstats_mib_noroutes) {
3466 case IPSTATS_MIB_INNOROUTES:
3467 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3468 if (type == IPV6_ADDR_ANY) {
3469 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3470 IPSTATS_MIB_INADDRERRORS);
/* fallthrough to the generic no-route counter for other addr types */
3474 case IPSTATS_MIB_OUTNOROUTES:
3475 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3476 ipstats_mib_noroutes);
3479 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for blackhole/unreachable routes (input path) */
3484 static int ip6_pkt_discard(struct sk_buff *skb)
3486 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for the same route types (output path) */
3489 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3491 skb->dev = skb_dst(skb)->dev;
3492 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input handler for RTN_PROHIBIT routes */
3495 static int ip6_pkt_prohibit(struct sk_buff *skb)
3497 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for RTN_PROHIBIT routes */
3500 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3502 skb->dev = skb_dst(skb)->dev;
3503 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3507 * Allocate a dst for local (unicast / anycast) address.
/* addrconf_dst_alloc - build the /128 local (RTF_LOCAL) or anycast
 * (RTF_ANYCAST) route for @addr on @idev's device, destined for the
 * local/l3mdev table. Returns ERR_PTR(-ENOMEM) on allocation failure.
 * NOTE(review): the anycast condition and tb_id declaration lines are
 * elided in this excerpt.
 */
3510 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3511 const struct in6_addr *addr,
3515 struct net *net = dev_net(idev->dev);
3516 struct net_device *dev = idev->dev;
3517 struct rt6_info *rt;
3519 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3521 return ERR_PTR(-ENOMEM);
3525 rt->dst.flags |= DST_HOST;
3526 rt->dst.input = ip6_input;
3527 rt->dst.output = ip6_output;
3528 rt->rt6i_idev = idev;
3530 rt->rt6i_protocol = RTPROT_KERNEL;
3531 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3533 rt->rt6i_flags |= RTF_ANYCAST;
3535 rt->rt6i_flags |= RTF_LOCAL;
3537 rt->rt6i_gateway = *addr;
3538 rt->rt6i_dst.addr = *addr;
3539 rt->rt6i_dst.plen = 128;
3540 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3541 rt->rt6i_table = fib6_get_table(net, tb_id);
3546 /* remove deleted ip from prefsrc entries */
/* Walker argument: restrict by device (NULL matches any) and address. */
3547 struct arg_dev_net_ip {
3548 struct net_device *dev;
3550 struct in6_addr *addr;
/* fib6_remove_prefsrc - per-route callback for fib6_clean_all(): clear
 * the preferred-source address on routes that reference the deleted
 * address, and scrub matching cached exception routes as well.
 */
3553 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3555 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3556 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3557 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3559 if (((void *)rt->dst.dev == dev || !dev) &&
3560 rt != net->ipv6.ip6_null_entry &&
3561 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3562 spin_lock_bh(&rt6_exception_lock);
3563 /* remove prefsrc entry */
3564 rt->rt6i_prefsrc.plen = 0;
3565 /* need to update cache as well */
3566 rt6_exceptions_remove_prefsrc(rt);
3567 spin_unlock_bh(&rt6_exception_lock);
/* rt6_remove_prefsrc - entry point: purge @ifp's address as prefsrc
 * from every route in its namespace.
 */
3572 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3574 struct net *net = dev_net(ifp->idev->dev);
3575 struct arg_dev_net_ip adni = {
3576 .dev = ifp->idev->dev,
3580 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3583 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3585 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_tohost - per-route callback: flag RA-learned default
 * routes via @gateway for removal and clean matching cached exception
 * routes (whose gateway may differ from the parent after a redirect).
 */
3586 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3588 struct in6_addr *gateway = (struct in6_addr *)arg;
3590 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3591 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3595 /* Further clean up cached routes in exception table.
3596 * This is needed because cached route may have a different
3597 * gateway than its 'parent' in the case of an ip redirect.
3599 rt6_exceptions_clean_tohost(rt, gateway);
/* rt6_clean_tohost - walk all tables applying fib6_clean_tohost. */
3604 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3606 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument passed to the netdev-event FIB walkers below. */
3609 struct arg_netdev_event {
3610 const struct net_device *dev;
3612 unsigned int nh_flags;
3613 unsigned long event;
/* rt6_multipath_first_sibling - from the fib6 node's leaf list, find
 * the first route with the same metric that qualifies for ECMP, i.e.
 * the head of @rt's multipath group. Caller holds the table lock.
 */
3617 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3619 struct rt6_info *iter;
3620 struct fib6_node *fn;
3622 fn = rcu_dereference_protected(rt->rt6i_node,
3623 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3624 iter = rcu_dereference_protected(fn->leaf,
3625 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3627 if (iter->rt6i_metric == rt->rt6i_metric &&
3628 rt6_qualify_for_ecmp(iter))
3630 iter = rcu_dereference_protected(iter->rt6_next,
3631 lockdep_is_held(&rt->rt6i_table->tb6_lock));
/* rt6_is_dead - a nexthop is unusable if marked DEAD, or LINKDOWN
 * while the interface is configured to ignore linkdown routes.
 */
3637 static bool rt6_is_dead(const struct rt6_info *rt)
3639 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3640 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3641 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* rt6_multipath_total_weight - sum the weights of all live nexthops in
 * @rt's sibling group (including @rt itself).
 */
3647 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3649 struct rt6_info *iter;
3652 if (!rt6_is_dead(rt))
3653 total += rt->rt6i_nh_weight;
3655 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3656 if (!rt6_is_dead(iter))
3657 total += iter->rt6i_nh_weight;
/* rt6_upper_bound_set - assign this nexthop's hash upper bound as the
 * cumulative weight fraction scaled to 2^31; -1 marks a dead hop.
 */
3663 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3665 int upper_bound = -1;
3667 if (!rt6_is_dead(rt)) {
3668 *weight += rt->rt6i_nh_weight;
3669 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3672 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
/* rt6_multipath_upper_bound_set - recompute upper bounds for the whole
 * sibling group against the group's @total weight.
 */
3675 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3677 struct rt6_info *iter;
3680 rt6_upper_bound_set(rt, &weight, total);
3682 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3683 rt6_upper_bound_set(iter, &weight, total);
/* rt6_multipath_rebalance - recompute hash bounds after a nexthop's
 * liveness changed, starting from the first sibling so lookup order
 * matches assignment order. Skipped for non-multipath routes and for
 * groups already marked for flushing.
 */
3686 void rt6_multipath_rebalance(struct rt6_info *rt)
3688 struct rt6_info *first;
3691 /* In case the entire multipath route was marked for flushing,
3692 * then there is no need to rebalance upon the removal of every
3695 if (!rt->rt6i_nsiblings || rt->should_flush)
3698 /* During lookup routes are evaluated in order, so we need to
3699 * make sure upper bounds are assigned from the first sibling
3702 first = rt6_multipath_first_sibling(rt);
3703 if (WARN_ON_ONCE(!first))
3706 total = rt6_multipath_total_weight(first);
3707 rt6_multipath_upper_bound_set(first, total);
/* fib6_ifup - per-route callback when a device comes up: clear the
 * requested nexthop flags on routes using that device, bump the fib
 * serial number so cached lookups revalidate, and rebalance multipath.
 */
3710 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3712 const struct arg_netdev_event *arg = p_arg;
3713 const struct net *net = dev_net(arg->dev);
3715 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3716 rt->rt6i_nh_flags &= ~arg->nh_flags;
3717 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3718 rt6_multipath_rebalance(rt);
/* rt6_sync_up - clear @nh_flags on all routes over @dev. When clearing
 * DEAD on a device whose carrier is up, also clear LINKDOWN.
 */
3724 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3726 struct arg_netdev_event arg = {
3729 .nh_flags = nh_flags,
3733 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3734 arg.nh_flags |= RTNH_F_LINKDOWN;
3736 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
/* rt6_multipath_uses_dev - true if @rt or any of its siblings egresses
 * through @dev.
 */
3739 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3740 const struct net_device *dev)
3742 struct rt6_info *iter;
3744 if (rt->dst.dev == dev)
3746 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3747 if (iter->dst.dev == dev)
/* rt6_multipath_flush - mark @rt and every sibling for deletion by the
 * fib walker (see fib6_ifdown).
 */
3753 static void rt6_multipath_flush(struct rt6_info *rt)
3755 struct rt6_info *iter;
3757 rt->should_flush = 1;
3758 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3759 iter->should_flush = 1;
/* rt6_multipath_dead_count - count nexthops in @rt's group that use
 * @down_dev or are already flagged DEAD.
 */
3762 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3763 const struct net_device *down_dev)
3765 struct rt6_info *iter;
3766 unsigned int dead = 0;
3768 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3770 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3771 if (iter->dst.dev == down_dev ||
3772 iter->rt6i_nh_flags & RTNH_F_DEAD)
/* rt6_multipath_nh_flags_set - OR @nh_flags into every group member
 * whose egress device is @dev.
 */
3778 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3779 const struct net_device *dev,
3780 unsigned int nh_flags)
3782 struct rt6_info *iter;
3784 if (rt->dst.dev == dev)
3785 rt->rt6i_nh_flags |= nh_flags;
3786 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3787 if (iter->dst.dev == dev)
3788 iter->rt6i_nh_flags |= nh_flags;
3791 /* called with write lock held for table with rt */
/* fib6_ifdown - per-route callback on device down/unregister. Returns
 * -1 to have the walker delete the route, 0 to keep it. On DOWN, a
 * multipath group is flushed only when every member would be dead;
 * otherwise the affected members are flagged and the group rebalanced.
 * NOTE(review): the NETDEV_DOWN/NETDEV_CHANGE case labels and some
 * returns are elided in this excerpt — confirm against full source.
 */
3792 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3794 const struct arg_netdev_event *arg = p_arg;
3795 const struct net_device *dev = arg->dev;
3796 const struct net *net = dev_net(dev);
3798 if (rt == net->ipv6.ip6_null_entry)
3801 switch (arg->event) {
3802 case NETDEV_UNREGISTER:
3803 return rt->dst.dev == dev ? -1 : 0;
3805 if (rt->should_flush)
3807 if (!rt->rt6i_nsiblings)
3808 return rt->dst.dev == dev ? -1 : 0;
3809 if (rt6_multipath_uses_dev(rt, dev)) {
3812 count = rt6_multipath_dead_count(rt, dev);
3813 if (rt->rt6i_nsiblings + 1 == count) {
3814 rt6_multipath_flush(rt);
3817 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3819 fib6_update_sernum(rt);
3820 rt6_multipath_rebalance(rt);
3824 if (rt->dst.dev != dev ||
3825 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3827 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3828 rt6_multipath_rebalance(rt);
/* rt6_sync_down_dev - apply fib6_ifdown for @event to all routes. */
3835 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3837 struct arg_netdev_event arg = {
3844 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
/* rt6_disable_ip - full teardown for a device losing IP: sync routes
 * down, flush uncached dsts referencing it, and purge its neighbours.
 */
3847 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3849 rt6_sync_down_dev(dev, event);
3850 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3851 neigh_ifdown(&nd_tbl, dev);
/* Walker argument for device-MTU changes (mtu field elided here). */
3854 struct rt6_mtu_change_arg {
3855 struct net_device *dev;
/* rt6_mtu_change_route - per-route callback when @arg->dev's MTU
 * changes: update RTAX_MTU on routes over that device (unless locked),
 * lowering it when the new MTU is smaller than the path MTU and
 * raising it when the old device MTU was the path bottleneck. Cached
 * exception routes get the same treatment.
 */
3859 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3861 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3862 struct inet6_dev *idev;
3864 /* In IPv6 pmtu discovery is not optional,
3865 so that RTAX_MTU lock cannot disable it.
3866 We still use this lock to block changes
3867 caused by addrconf/ndisc.
3870 idev = __in6_dev_get(arg->dev);
3874 /* For administrative MTU increase, there is no way to discover
3875 IPv6 PMTU increase, so PMTU increase should be updated here.
3876 Since RFC 1981 doesn't include administrative MTU increase
3877 update PMTU increase is a MUST. (i.e. jumbo frame)
3880 If new MTU is less than route PMTU, this new MTU will be the
3881 lowest MTU in the path, update the route PMTU to reflect PMTU
3882 decreases; if new MTU is greater than route PMTU, and the
3883 old MTU is the lowest MTU in the path, update the route PMTU
3884 to reflect the increase. In this case if the other nodes' MTU
3885 also have the lowest MTU, TOO BIG MESSAGE will be lead to
3888 if (rt->dst.dev == arg->dev &&
3889 dst_metric_raw(&rt->dst, RTAX_MTU) &&
3890 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3891 spin_lock_bh(&rt6_exception_lock);
3892 if (dst_mtu(&rt->dst) >= arg->mtu ||
3893 (dst_mtu(&rt->dst) < arg->mtu &&
3894 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3895 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3897 rt6_exceptions_update_pmtu(rt, arg->mtu);
3898 spin_unlock_bh(&rt6_exception_lock);
/* rt6_mtu_change - entry point: walk all routes for @dev's new @mtu. */
3903 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3905 struct rt6_mtu_change_arg arg = {
3910 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE.
 * Attributes absent here are accepted without type/length checking.
 */
3913 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3914 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
3915 [RTA_OIF] = { .type = NLA_U32 },
3916 [RTA_IIF] = { .type = NLA_U32 },
3917 [RTA_PRIORITY] = { .type = NLA_U32 },
3918 [RTA_METRICS] = { .type = NLA_NESTED },
3919 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
3920 [RTA_PREF] = { .type = NLA_U8 },
3921 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3922 [RTA_ENCAP] = { .type = NLA_NESTED },
3923 [RTA_EXPIRES] = { .type = NLA_U32 },
3924 [RTA_UID] = { .type = NLA_U32 },
3925 [RTA_MARK] = { .type = NLA_U32 },
/* rtm_to_fib6_config - parse an RTM_NEWROUTE/RTM_DELROUTE netlink
 * message into a fib6_config. Validates attributes against
 * rtm_ipv6_policy, maps rtm_type to RTF_* flags, and copies optional
 * attributes (gateway, dst/src prefixes, metrics, multipath, encap,
 * expiry, preference). Returns 0 or a negative errno.
 * NOTE(review): several error-path lines are elided in this excerpt —
 * confirm exact returns against the full source.
 */
3928 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3929 struct fib6_config *cfg,
3930 struct netlink_ext_ack *extack)
3933 struct nlattr *tb[RTA_MAX+1];
3937 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3943 rtm = nlmsg_data(nlh);
3944 memset(cfg, 0, sizeof(*cfg));
3946 cfg->fc_table = rtm->rtm_table;
3947 cfg->fc_dst_len = rtm->rtm_dst_len;
3948 cfg->fc_src_len = rtm->rtm_src_len;
3949 cfg->fc_flags = RTF_UP;
3950 cfg->fc_protocol = rtm->rtm_protocol;
3951 cfg->fc_type = rtm->rtm_type;
/* reject-style route types all carry RTF_REJECT; the fc_type field
 * retained above selects the specific error code later */
3953 if (rtm->rtm_type == RTN_UNREACHABLE ||
3954 rtm->rtm_type == RTN_BLACKHOLE ||
3955 rtm->rtm_type == RTN_PROHIBIT ||
3956 rtm->rtm_type == RTN_THROW)
3957 cfg->fc_flags |= RTF_REJECT;
3959 if (rtm->rtm_type == RTN_LOCAL)
3960 cfg->fc_flags |= RTF_LOCAL;
3962 if (rtm->rtm_flags & RTM_F_CLONED)
3963 cfg->fc_flags |= RTF_CACHE;
3965 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3967 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3968 cfg->fc_nlinfo.nlh = nlh;
3969 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3971 if (tb[RTA_GATEWAY]) {
3972 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3973 cfg->fc_flags |= RTF_GATEWAY;
/* prefixes may be shorter than 16 bytes: copy only plen bytes */
3977 int plen = (rtm->rtm_dst_len + 7) >> 3;
3979 if (nla_len(tb[RTA_DST]) < plen)
3982 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3986 int plen = (rtm->rtm_src_len + 7) >> 3;
3988 if (nla_len(tb[RTA_SRC]) < plen)
3991 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3994 if (tb[RTA_PREFSRC])
3995 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3998 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4000 if (tb[RTA_PRIORITY])
4001 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4003 if (tb[RTA_METRICS]) {
4004 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4005 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* RTA_TABLE overrides the legacy rtm_table byte */
4009 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4011 if (tb[RTA_MULTIPATH]) {
4012 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4013 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4015 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4016 cfg->fc_mp_len, extack);
/* unknown router preference values fall back to medium (RFC 4191) */
4022 pref = nla_get_u8(tb[RTA_PREF]);
4023 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4024 pref != ICMPV6_ROUTER_PREF_HIGH)
4025 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4026 cfg->fc_flags |= RTF_PREF(pref);
4030 cfg->fc_encap = tb[RTA_ENCAP];
4032 if (tb[RTA_ENCAP_TYPE]) {
4033 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4035 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4040 if (tb[RTA_EXPIRES]) {
4041 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4043 if (addrconf_finite_timeout(timeout)) {
4044 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4045 cfg->fc_flags |= RTF_EXPIRES;
/* Per-nexthop bookkeeping entry for multipath add/replace: the built
 * route, the config it came from, converted metrics, and list linkage.
 * NOTE(review): the struct's opening line is elided in this excerpt.
 */
4055 struct rt6_info *rt6_info;
4056 struct fib6_config r_cfg;
4057 struct mx6_config mxc;
4058 struct list_head next;
/* ip6_print_replace_route_err - warn once per nexthop when a multipath
 * replace partially failed, so the operator can audit FIB consistency.
 */
4061 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4065 list_for_each_entry(nh, rt6_nh_list, next) {
4066 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4067 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4068 nh->r_cfg.fc_ifindex);
/* ip6_route_info_append - append (rt, r_cfg, metrics) to the pending
 * nexthop list, rejecting duplicate nexthops. Allocates the list entry
 * and converts metrics up front so insertion later cannot fail on OOM.
 */
4072 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4073 struct rt6_info *rt, struct fib6_config *r_cfg)
4078 list_for_each_entry(nh, rt6_nh_list, next) {
4079 /* check if rt6_info already exists */
4080 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4084 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4088 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4093 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4094 list_add_tail(&nh->next, rt6_nh_list);
/* ip6_route_mpath_notify - emit the RTM_NEWROUTE notification for a
 * multipath operation, rewinding to the first sibling on APPEND so
 * userspace sees the route from its first nexthop.
 */
4099 static void ip6_route_mpath_notify(struct rt6_info *rt,
4100 struct rt6_info *rt_last,
4101 struct nl_info *info,
4104 /* if this is an APPEND route, then rt points to the first route
4105 * inserted and rt_last points to last route inserted. Userspace
4106 * wants a consistent dump of the route which starts at the first
4107 * nexthop. Since sibling routes are always added at the end of
4108 * the list, find the first sibling of the last route appended
4110 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4111 rt = list_first_entry(&rt_last->rt6i_siblings,
4117 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* ip6_route_multipath_add - add/replace a multipath route. Two phases:
 * (1) parse each rtnexthop in RTA_MULTIPATH into an rt6_info and queue
 * it on rt6_nh_list; (2) insert the queued routes, suppressing per-hop
 * notifications and sending one combined notification at the end. On a
 * mid-insert failure, already-added routes are notified then deleted.
 * NOTE(review): loop/cleanup labels and some declarations are elided
 * in this excerpt — confirm control flow against the full source.
 */
4120 static int ip6_route_multipath_add(struct fib6_config *cfg,
4121 struct netlink_ext_ack *extack)
4123 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4124 struct nl_info *info = &cfg->fc_nlinfo;
4125 struct fib6_config r_cfg;
4126 struct rtnexthop *rtnh;
4127 struct rt6_info *rt;
4128 struct rt6_nh *err_nh;
4129 struct rt6_nh *nh, *nh_safe;
4135 int replace = (cfg->fc_nlinfo.nlh &&
4136 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4137 LIST_HEAD(rt6_nh_list);
4139 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4140 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4141 nlflags |= NLM_F_APPEND;
4143 remaining = cfg->fc_mp_len;
4144 rtnh = (struct rtnexthop *)cfg->fc_mp;
4146 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4147 * rt6_info structs per nexthop
4149 while (rtnh_ok(rtnh, remaining)) {
4150 memcpy(&r_cfg, cfg, sizeof(*cfg));
4151 if (rtnh->rtnh_ifindex)
4152 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4154 attrlen = rtnh_attrlen(rtnh);
4156 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4158 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4160 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4161 r_cfg.fc_flags |= RTF_GATEWAY;
4163 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4164 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4166 r_cfg.fc_encap_type = nla_get_u16(nla);
4169 rt = ip6_route_info_create(&r_cfg, extack);
/* rtnh_hops is weight-1 on the wire */
4176 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4178 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4180 dst_release_immediate(&rt->dst);
4184 rtnh = rtnh_next(rtnh, &remaining);
4187 /* for add and replace send one notification with all nexthops.
4188 * Skip the notification in fib6_add_rt2node and send one with
4189 * the full route when done
4191 info->skip_notify = 1;
4194 list_for_each_entry(nh, &rt6_nh_list, next) {
4195 rt_last = nh->rt6_info;
4196 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4197 /* save reference to first route for notification */
4198 if (!rt_notif && !err)
4199 rt_notif = nh->rt6_info;
4201 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4202 nh->rt6_info = NULL;
4205 ip6_print_replace_route_err(&rt6_nh_list);
4210 /* Because each route is added like a single route we remove
4211 * these flags after the first nexthop: if there is a collision,
4212 * we have already failed to add the first nexthop:
4213 * fib6_add_rt2node() has rejected it; when replacing, old
4214 * nexthops have been replaced by first new, the rest should
4217 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4222 /* success ... tell user about new route */
4223 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4227 /* send notification for routes that were added so that
4228 * the delete notifications sent by ip6_route_del are
4232 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4234 /* Delete routes that were already added */
4235 list_for_each_entry(nh, &rt6_nh_list, next) {
4238 ip6_route_del(&nh->r_cfg, extack);
/* free any remaining pending entries and their unconsumed routes */
4242 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4244 dst_release_immediate(&nh->rt6_info->dst);
4246 list_del(&nh->next);
/* ip6_route_multipath_del - delete each nexthop listed in the
 * RTA_MULTIPATH attribute individually via ip6_route_del(), overriding
 * ifindex/gateway per rtnexthop entry.
 * NOTE(review): the error-accumulation into last_err and the final
 * return are elided in this excerpt.
 */
4253 static int ip6_route_multipath_del(struct fib6_config *cfg,
4254 struct netlink_ext_ack *extack)
4256 struct fib6_config r_cfg;
4257 struct rtnexthop *rtnh;
4260 int err = 1, last_err = 0;
4262 remaining = cfg->fc_mp_len;
4263 rtnh = (struct rtnexthop *)cfg->fc_mp;
4265 /* Parse a Multipath Entry */
4266 while (rtnh_ok(rtnh, remaining)) {
4267 memcpy(&r_cfg, cfg, sizeof(*cfg));
4268 if (rtnh->rtnh_ifindex)
4269 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4271 attrlen = rtnh_attrlen(rtnh);
4273 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4275 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4277 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4278 r_cfg.fc_flags |= RTF_GATEWAY;
4281 err = ip6_route_del(&r_cfg, extack);
4285 rtnh = rtnh_next(rtnh, &remaining);
/* inet6_rtm_delroute - RTM_DELROUTE netlink doit handler.
 * Converts the netlink message into a fib6_config and dispatches to the
 * multipath or single-route delete path.
 * NOTE(review): sampled extract; the err check after rtm_to_fib6_config()
 * and the fc_mp test preceding the multipath branch are not visible here.
 */
4291 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4292 struct netlink_ext_ack *extack)
4294 struct fib6_config cfg;
4297 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4302 return ip6_route_multipath_del(&cfg, extack);
/* Single-route delete: remove all matching nexthops. */
4304 cfg.fc_delete_all_nh = 1;
4305 return ip6_route_del(&cfg, extack);
/* inet6_rtm_newroute - RTM_NEWROUTE netlink doit handler.
 * Parses the message into a fib6_config, then adds either a multipath
 * route or a single route.
 * NOTE(review): sampled extract; the error check and the fc_mp test are
 * not visible here.
 */
4309 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4310 struct netlink_ext_ack *extack)
4312 struct fib6_config cfg;
4315 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4320 return ip6_route_multipath_add(&cfg, extack);
4322 return ip6_route_add(&cfg, extack);
/* rt6_nlmsg_size - upper bound on the netlink message size needed to dump
 * @rt via rt6_fill_node().  Must stay in sync with the attributes that
 * rt6_fill_node() emits; undersizing triggers the -EMSGSIZE WARN_ON in
 * inet6_rt_notify().
 */
4325 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4327 int nexthop_len = 0;
/* Multipath routes add one rtnexthop (plus gateway/encap) per sibling. */
4329 if (rt->rt6i_nsiblings) {
4330 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4331 + NLA_ALIGN(sizeof(struct rtnexthop))
4332 + nla_total_size(16) /* RTA_GATEWAY */
4333 + lwtunnel_get_encap_size(rt->dst.lwtstate);
/* NOTE(review): sibling encap sizes are assumed equal to rt's — the
 * per-sibling lwtstate is not consulted here; confirm against callers. */
4335 nexthop_len *= rt->rt6i_nsiblings;
4338 return NLMSG_ALIGN(sizeof(struct rtmsg))
4339 + nla_total_size(16) /* RTA_SRC */
4340 + nla_total_size(16) /* RTA_DST */
4341 + nla_total_size(16) /* RTA_GATEWAY */
4342 + nla_total_size(16) /* RTA_PREFSRC */
4343 + nla_total_size(4) /* RTA_TABLE */
4344 + nla_total_size(4) /* RTA_IIF */
4345 + nla_total_size(4) /* RTA_OIF */
4346 + nla_total_size(4) /* RTA_PRIORITY */
4347 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4348 + nla_total_size(sizeof(struct rta_cacheinfo))
4349 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4350 + nla_total_size(1) /* RTA_PREF */
4351 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* rt6_nexthop_info - emit the per-nexthop attributes (gateway, oif, encap)
 * for @rt and accumulate RTNH_F_* state into *@flags.
 * @skip_oif: true when called for a multipath entry, where the ifindex is
 * carried in the rtnexthop header instead of RTA_OIF.
 * Returns 0 on success; jumps to nla_put_failure (not visible in this
 * sampled extract) when the skb runs out of room.
 */
4355 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4356 unsigned int *flags, bool skip_oif)
4358 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4359 *flags |= RTNH_F_DEAD;
4361 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4362 *flags |= RTNH_F_LINKDOWN;
/* A linkdown nexthop counts as dead when the device is configured to
 * ignore routes with link down. */
4363 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4364 *flags |= RTNH_F_DEAD;
4367 if (rt->rt6i_flags & RTF_GATEWAY) {
4368 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4369 goto nla_put_failure;
4372 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4373 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4374 *flags |= RTNH_F_OFFLOAD;
4376 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4377 if (!skip_oif && rt->dst.dev &&
4378 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4379 goto nla_put_failure;
4381 if (rt->dst.lwtstate &&
4382 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4383 goto nla_put_failure;
4391 /* add multipath next hop */
/* rt6_add_nexthop - append one rtnexthop entry (plus its nested
 * attributes via rt6_nexthop_info()) to an open RTA_MULTIPATH nest.
 * NOTE(review): sampled extract; the NULL check after nla_reserve_nohdr()
 * and the function's return statements are not visible here.
 */
4392 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4394 struct rtnexthop *rtnh;
4395 unsigned int flags = 0;
4397 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4399 goto nla_put_failure;
/* rtnh_hops is weight-1 per the rtnetlink multipath encoding */
4401 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4402 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
/* skip_oif=true: the ifindex above replaces RTA_OIF for multipath */
4404 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4405 goto nla_put_failure;
4407 rtnh->rtnh_flags = flags;
4409 /* length of rtnetlink header + attributes */
4410 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* rt6_fill_node - serialize one IPv6 route into an RTM_* netlink message.
 * Fills the rtmsg header and all route attributes (table, dst/src, prefsrc,
 * metrics, multipath nexthops or single nexthop info, cacheinfo, pref).
 * @dst/@src: when non-NULL (getroute path) the queried addresses are
 * reported with /128 prefix lengths instead of the route's own prefix.
 * NOTE(review): this listing is a sampled extract; case labels, some
 * conditionals, #endif lines and the return statements are not visible.
 */
4418 static int rt6_fill_node(struct net *net,
4419 struct sk_buff *skb, struct rt6_info *rt,
4420 struct in6_addr *dst, struct in6_addr *src,
4421 int iif, int type, u32 portid, u32 seq,
4424 u32 metrics[RTAX_MAX];
4426 struct nlmsghdr *nlh;
4430 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4434 rtm = nlmsg_data(nlh);
4435 rtm->rtm_family = AF_INET6;
4436 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4437 rtm->rtm_src_len = rt->rt6i_src.plen;
4440 table = rt->rt6i_table->tb6_id;
4442 table = RT6_TABLE_UNSPEC;
4443 rtm->rtm_table = table;
4444 if (nla_put_u32(skb, RTA_TABLE, table))
4445 goto nla_put_failure;
/* Map a rejecting route's dst.error onto the matching RTN_* type
 * (the switch-case labels are elided in this extract). */
4446 if (rt->rt6i_flags & RTF_REJECT) {
4447 switch (rt->dst.error) {
4449 rtm->rtm_type = RTN_BLACKHOLE;
4452 rtm->rtm_type = RTN_PROHIBIT;
4455 rtm->rtm_type = RTN_THROW;
4458 rtm->rtm_type = RTN_UNREACHABLE;
4462 else if (rt->rt6i_flags & RTF_LOCAL)
4463 rtm->rtm_type = RTN_LOCAL;
4464 else if (rt->rt6i_flags & RTF_ANYCAST)
4465 rtm->rtm_type = RTN_ANYCAST;
4466 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4467 rtm->rtm_type = RTN_LOCAL;
4469 rtm->rtm_type = RTN_UNICAST;
4471 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4472 rtm->rtm_protocol = rt->rt6i_protocol;
4474 if (rt->rt6i_flags & RTF_CACHE)
4475 rtm->rtm_flags |= RTM_F_CLONED;
/* getroute: report the queried destination as a /128 host entry */
4478 if (nla_put_in6_addr(skb, RTA_DST, dst))
4479 goto nla_put_failure;
4480 rtm->rtm_dst_len = 128;
4481 } else if (rtm->rtm_dst_len)
4482 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4483 goto nla_put_failure;
4484 #ifdef CONFIG_IPV6_SUBTREES
4486 if (nla_put_in6_addr(skb, RTA_SRC, src))
4487 goto nla_put_failure;
4488 rtm->rtm_src_len = 128;
4489 } else if (rtm->rtm_src_len &&
4490 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4491 goto nla_put_failure;
4494 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the IPv6 mroute table */
4495 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4496 int err = ip6mr_get_route(net, skb, rtm, portid);
4501 goto nla_put_failure;
4504 if (nla_put_u32(skb, RTA_IIF, iif))
4505 goto nla_put_failure;
4507 struct in6_addr saddr_buf;
4508 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4509 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4510 goto nla_put_failure;
4513 if (rt->rt6i_prefsrc.plen) {
4514 struct in6_addr saddr_buf;
4515 saddr_buf = rt->rt6i_prefsrc.addr;
4516 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4517 goto nla_put_failure;
/* Copy the metrics so a cached per-route PMTU can override RTAX_MTU */
4520 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4522 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4523 if (rtnetlink_put_metrics(skb, metrics) < 0)
4524 goto nla_put_failure;
4526 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4527 goto nla_put_failure;
4529 /* For multipath routes, walk the siblings list and add
4530 * each as a nexthop within RTA_MULTIPATH.
4532 if (rt->rt6i_nsiblings) {
4533 struct rt6_info *sibling, *next_sibling;
4536 mp = nla_nest_start(skb, RTA_MULTIPATH);
4538 goto nla_put_failure;
4540 if (rt6_add_nexthop(skb, rt) < 0)
4541 goto nla_put_failure;
4543 list_for_each_entry_safe(sibling, next_sibling,
4544 &rt->rt6i_siblings, rt6i_siblings) {
4545 if (rt6_add_nexthop(skb, sibling) < 0)
4546 goto nla_put_failure;
4549 nla_nest_end(skb, mp);
/* Single-path route: nexthop attributes go directly into the message */
4551 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4552 goto nla_put_failure;
4555 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4557 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4558 goto nla_put_failure;
4560 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4561 goto nla_put_failure;
4564 nlmsg_end(skb, nlh);
/* nla_put_failure path: roll back the partially-built message */
4568 nlmsg_cancel(skb, nlh);
/* rt6_dump_route - fib6 walker callback used by RTM_GETROUTE dumps.
 * Skips the null entry, honors the RTM_F_PREFIX filter, and serializes
 * each remaining route with rt6_fill_node().
 * NOTE(review): sampled extract; the early "return 0" statements and the
 * final rt6_fill_node() flags argument are not visible here.
 */
4572 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4574 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4575 struct net *net = arg->net;
4577 if (rt == net->ipv6.ip6_null_entry)
4580 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4581 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4583 /* user wants prefix routes only */
4584 if (rtm->rtm_flags & RTM_F_PREFIX &&
4585 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4586 /* success since this is not a prefix route */
4591 return rt6_fill_node(net,
4592 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4593 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* inet6_rtm_getroute - RTM_GETROUTE doit handler: resolve one route for the
 * given flow (src/dst/iif/oif/mark/uid), optionally substituting the FIB
 * entry the lookup matched (RTM_F_FIB_MATCH), and unicast the result back
 * to the requester.
 * NOTE(review): sampled extract; error checks, RCU lock/unlock, several
 * attribute-presence tests and cleanup labels are not visible here.
 */
4597 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4598 struct netlink_ext_ack *extack)
4600 struct net *net = sock_net(in_skb->sk);
4601 struct nlattr *tb[RTA_MAX+1];
4602 int err, iif = 0, oif = 0;
4603 struct dst_entry *dst;
4604 struct rt6_info *rt;
4605 struct sk_buff *skb;
4610 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4616 memset(&fl6, 0, sizeof(fl6));
4617 rtm = nlmsg_data(nlh);
4618 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4619 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
/* Validate attribute lengths before copying 16-byte addresses */
4622 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4625 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4629 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4632 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4636 iif = nla_get_u32(tb[RTA_IIF]);
4639 oif = nla_get_u32(tb[RTA_OIF]);
4642 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4645 fl6.flowi6_uid = make_kuid(current_user_ns(),
4646 nla_get_u32(tb[RTA_UID]));
4648 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* An input interface selects the input-path lookup... */
4651 struct net_device *dev;
4656 dev = dev_get_by_index_rcu(net, iif);
4663 fl6.flowi6_iif = iif;
4665 if (!ipv6_addr_any(&fl6.saddr))
4666 flags |= RT6_LOOKUP_F_HAS_SADDR;
4668 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
/* ...otherwise resolve as locally-originated output */
4672 fl6.flowi6_oif = oif;
4674 dst = ip6_route_output(net, NULL, &fl6);
4678 rt = container_of(dst, struct rt6_info, dst);
4679 if (rt->dst.error) {
4680 err = rt->dst.error;
4685 if (rt == net->ipv6.ip6_null_entry) {
4686 err = rt->dst.error;
/* RTM_F_FIB_MATCH: report the FIB entry rather than the (cloned) dst */
4691 if (fibmatch && rt->from) {
4692 struct rt6_info *ort = rt->from;
4694 dst_hold(&ort->dst);
4699 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4706 skb_dst_set(skb, &rt->dst);
4708 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4709 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4712 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4713 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4720 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* inet6_rt_notify - broadcast a route change to RTNLGRP_IPV6_ROUTE
 * listeners.  Allocates a message sized by rt6_nlmsg_size(), fills it via
 * rt6_fill_node() and multicasts it; on failure reports the error to the
 * group via rtnl_set_sk_err().
 * NOTE(review): sampled extract; allocation-failure checks and the errout
 * label are not visible here.
 */
4725 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4726 unsigned int nlm_flags)
4728 struct sk_buff *skb;
4729 struct net *net = info->nl_net;
4734 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any(): GFP_ATOMIC in atomic context, GFP_KERNEL otherwise */
4736 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4740 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4741 event, info->portid, seq, nlm_flags);
4743 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4744 WARN_ON(err == -EMSGSIZE);
4748 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4749 info->nlh, gfp_any());
4753 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* ip6_route_dev_notify - netdevice notifier for the loopback device.
 * On NETDEV_REGISTER, binds the per-netns special route entries
 * (null / prohibit / blackhole) to loopback; on NETDEV_UNREGISTER,
 * drops their inet6_dev references exactly once.
 */
4756 static int ip6_route_dev_notify(struct notifier_block *this,
4757 unsigned long event, void *ptr)
4759 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4760 struct net *net = dev_net(dev);
/* Only the loopback device is of interest here */
4762 if (!(dev->flags & IFF_LOOPBACK))
4765 if (event == NETDEV_REGISTER) {
4766 net->ipv6.ip6_null_entry->dst.dev = dev;
4767 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4768 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4769 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4770 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4771 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4772 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4774 } else if (event == NETDEV_UNREGISTER &&
4775 dev->reg_state != NETREG_UNREGISTERED) {
4776 /* NETDEV_UNREGISTER could be fired for multiple times by
4777 * netdev_wait_allrefs(). Make sure we only call this once.
4779 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4780 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4781 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4782 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4793 #ifdef CONFIG_PROC_FS
/* file_operations for /proc/net/ipv6_route (seq_file based).
 * NOTE(review): sampled extract; the .read handler line is not visible. */
4795 static const struct file_operations ipv6_route_proc_fops = {
4796 .open = ipv6_route_open,
4798 .llseek = seq_lseek,
4799 .release = seq_release_net,
/* rt6_stats_seq_show - print per-netns FIB6 statistics as seven hex
 * fields for /proc/net/rt6_stats: node counts, allocated/active/cached
 * route entries, dst-entry count and discarded routes.
 */
4802 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4804 struct net *net = (struct net *)seq->private;
4805 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4806 net->ipv6.rt6_stats->fib_nodes,
4807 net->ipv6.rt6_stats->fib_route_nodes,
4808 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4809 net->ipv6.rt6_stats->fib_rt_entries,
4810 net->ipv6.rt6_stats->fib_rt_cache,
4811 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4812 net->ipv6.rt6_stats->fib_discarded_routes);
/* rt6_stats_seq_open - open handler wiring rt6_stats_seq_show into a
 * netns-aware single-record seq_file. */
4817 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4819 return single_open_net(inode, file, rt6_stats_seq_show);
/* file_operations for /proc/net/rt6_stats.
 * NOTE(review): sampled extract; the .read handler line is not visible. */
4822 static const struct file_operations rt6_stats_seq_fops = {
4823 .open = rt6_stats_seq_open,
4825 .llseek = seq_lseek,
4826 .release = single_release_net,
4828 #endif /* CONFIG_PROC_FS */
4830 #ifdef CONFIG_SYSCTL
/* ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush: writing a
 * delay value triggers garbage collection of the routing cache via
 * fib6_run_gc().  The netns is recovered from ctl->extra1.
 * NOTE(review): sampled extract; the write-only guard and return are not
 * visible here.
 */
4833 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4834 void __user *buffer, size_t *lenp, loff_t *ppos)
4841 net = (struct net *)ctl->extra1;
4842 delay = net->ipv6.sysctl.flush_delay;
4843 proc_dointvec(ctl, write, buffer, lenp, ppos);
/* delay <= 0 means flush immediately; positive delays defer GC */
4844 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route sysctl table.  The entry
 * ORDER is significant: ipv6_route_sysctl_init() rewires .data by fixed
 * index (table[0]..table[9]); keep both in sync when adding entries.
 * NOTE(review): sampled extract; .mode fields and struct delimiters are
 * not visible here.
 */
4848 struct ctl_table ipv6_route_table_template[] = {
4850 .procname = "flush",
4851 .data = &init_net.ipv6.sysctl.flush_delay,
4852 .maxlen = sizeof(int),
4854 .proc_handler = ipv6_sysctl_rtcache_flush
4857 .procname = "gc_thresh",
4858 .data = &ip6_dst_ops_template.gc_thresh,
4859 .maxlen = sizeof(int),
4861 .proc_handler = proc_dointvec,
4864 .procname = "max_size",
4865 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
4866 .maxlen = sizeof(int),
4868 .proc_handler = proc_dointvec,
4871 .procname = "gc_min_interval",
4872 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4873 .maxlen = sizeof(int),
4875 .proc_handler = proc_dointvec_jiffies,
4878 .procname = "gc_timeout",
4879 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4880 .maxlen = sizeof(int),
4882 .proc_handler = proc_dointvec_jiffies,
4885 .procname = "gc_interval",
4886 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
4887 .maxlen = sizeof(int),
4889 .proc_handler = proc_dointvec_jiffies,
4892 .procname = "gc_elasticity",
4893 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4894 .maxlen = sizeof(int),
4896 .proc_handler = proc_dointvec,
4899 .procname = "mtu_expires",
4900 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4901 .maxlen = sizeof(int),
4903 .proc_handler = proc_dointvec_jiffies,
4906 .procname = "min_adv_mss",
4907 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4908 .maxlen = sizeof(int),
4910 .proc_handler = proc_dointvec,
/* Same underlying variable as gc_min_interval, exposed in milliseconds */
4913 .procname = "gc_min_interval_ms",
4914 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4915 .maxlen = sizeof(int),
4917 .proc_handler = proc_dointvec_ms_jiffies,
/* ipv6_route_sysctl_init - duplicate ipv6_route_table_template for a netns
 * and point each entry's .data at that netns's own sysctl storage.  The
 * numeric indices below must match the template's entry order exactly.
 * NOTE(review): sampled extract; the kmemdup NULL check and the return
 * statement are not visible here.
 */
4922 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4924 struct ctl_table *table;
4926 table = kmemdup(ipv6_route_table_template,
4927 sizeof(ipv6_route_table_template),
4931 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the netns for ipv6_sysctl_rtcache_flush() */
4932 table[0].extra1 = net;
4933 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4934 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4935 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4936 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4937 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4938 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4939 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4940 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4941 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4943 /* Don't export sysctls to unprivileged users */
4944 if (net->user_ns != &init_user_ns)
4945 table[0].procname = NULL;
/* ip6_route_net_init - per-netns setup for IPv6 routing: clones the dst_ops
 * template, allocates the special null/prohibit/blackhole route entries,
 * and seeds the netns sysctl defaults.  Errors unwind in reverse order via
 * the out_* labels at the bottom.
 * NOTE(review): sampled extract; GFP flags to kmemdup, some NULL checks
 * and the "return ret" lines are not visible here.
 */
4952 static int __net_init ip6_route_net_init(struct net *net)
4956 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4957 sizeof(net->ipv6.ip6_dst_ops));
4959 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4960 goto out_ip6_dst_ops;
4962 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4963 sizeof(*net->ipv6.ip6_null_entry),
4965 if (!net->ipv6.ip6_null_entry)
4966 goto out_ip6_dst_entries;
4967 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4968 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4969 ip6_template_metrics, true);
4971 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4972 net->ipv6.fib6_has_custom_rules = false;
4973 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4974 sizeof(*net->ipv6.ip6_prohibit_entry),
4976 if (!net->ipv6.ip6_prohibit_entry)
4977 goto out_ip6_null_entry;
4978 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4979 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4980 ip6_template_metrics, true);
4982 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4983 sizeof(*net->ipv6.ip6_blk_hole_entry),
4985 if (!net->ipv6.ip6_blk_hole_entry)
4986 goto out_ip6_prohibit_entry;
4987 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4988 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4989 ip6_template_metrics, true);
/* Default sysctl values for this netns (tunable via net.ipv6.route.*) */
4992 net->ipv6.sysctl.flush_delay = 0;
4993 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4994 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4995 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4996 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4997 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4998 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4999 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5001 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding, in reverse order of allocation */
5007 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5008 out_ip6_prohibit_entry:
5009 kfree(net->ipv6.ip6_prohibit_entry);
5011 kfree(net->ipv6.ip6_null_entry);
5013 out_ip6_dst_entries:
5014 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* ip6_route_net_exit - per-netns teardown: free the special route entries
 * allocated by ip6_route_net_init() and destroy the dst-entry counter. */
5019 static void __net_exit ip6_route_net_exit(struct net *net)
5021 kfree(net->ipv6.ip6_null_entry);
5022 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5023 kfree(net->ipv6.ip6_prohibit_entry);
5024 kfree(net->ipv6.ip6_blk_hole_entry);
5026 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* ip6_route_net_init_late - late per-netns init: create the procfs entries
 * /proc/net/ipv6_route and /proc/net/rt6_stats.
 * NOTE(review): sampled extract; proc_create() return values are not
 * checked on the visible lines.
 */
5029 static int __net_init ip6_route_net_init_late(struct net *net)
5031 #ifdef CONFIG_PROC_FS
5032 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5033 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* ip6_route_net_exit_late - remove the procfs entries created by
 * ip6_route_net_init_late(). */
5038 static void __net_exit ip6_route_net_exit_late(struct net *net)
5040 #ifdef CONFIG_PROC_FS
5041 remove_proc_entry("ipv6_route", net->proc_net);
5042 remove_proc_entry("rt6_stats", net->proc_net);
/* pernet_operations for the core per-netns routing state. */
5046 static struct pernet_operations ip6_route_net_ops = {
5047 .init = ip6_route_net_init,
5048 .exit = ip6_route_net_exit,
/* ipv6_inetpeer_init - allocate and initialize the per-netns inet_peer
 * base used for IPv6 peer tracking.
 * NOTE(review): sampled extract; the NULL check and return are not
 * visible here. */
5052 static int __net_init ipv6_inetpeer_init(struct net *net)
5054 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5058 inet_peer_base_init(bp);
5059 net->ipv6.peers = bp;
/* ipv6_inetpeer_exit - tear down the per-netns inet_peer base; clear the
 * pointer before invalidating so no new lookups can race with teardown.
 * NOTE(review): sampled extract; the kfree(bp) line is not visible here. */
5063 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5065 struct inet_peer_base *bp = net->ipv6.peers;
5067 net->ipv6.peers = NULL;
5068 inetpeer_invalidate_tree(bp);
/* pernet_operations for the IPv6 inetpeer base. */
5072 static struct pernet_operations ipv6_inetpeer_ops = {
5073 .init = ipv6_inetpeer_init,
5074 .exit = ipv6_inetpeer_exit,
/* pernet_operations for the late (procfs) per-netns setup. */
5078 static struct pernet_operations ip6_route_net_late_ops = {
5079 .init = ip6_route_net_init_late,
5080 .exit = ip6_route_net_exit_late,
/* Netdevice notifier; priority places it relative to addrconf's notifier
 * so route special-entry setup orders correctly against addrconf. */
5084 static struct notifier_block ip6_route_dev_notifier = {
5085 .notifier_call = ip6_route_dev_notify,
5086 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* ip6_route_init_special_entries - bind init_net's special route entries
 * to the loopback device by hand, since loopback registered before the
 * ip6_route_dev_notify() notifier was installed. */
5089 void __init ip6_route_init_special_entries(void)
5091 /* Registering of the loopback is done before this portion of code,
5092 * the loopback reference in rt6_info will not be taken, do it
5093 * manually for init_net */
5094 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5095 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5096 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5097 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5098 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5099 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5100 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* ip6_route_init - module init for IPv6 routing: create the rt6_info slab,
 * register pernet subsystems, FIB rules, the RTM_{NEW,DEL,GET}ROUTE
 * rtnetlink handlers, the device notifier, and the per-CPU uncached-route
 * lists.  Failures unwind registrations in reverse order via the labels
 * at the bottom.
 * NOTE(review): sampled extract; several "if (ret)" checks, fib6_init()
 * and some labels/returns are not visible here.
 */
5104 int __init ip6_route_init(void)
5110 ip6_dst_ops_template.kmem_cachep =
5111 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5112 SLAB_HWCACHE_ALIGN, NULL);
5113 if (!ip6_dst_ops_template.kmem_cachep)
5116 ret = dst_entries_init(&ip6_dst_blackhole_ops);
5118 goto out_kmem_cache;
5120 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5122 goto out_dst_entries;
5124 ret = register_pernet_subsys(&ip6_route_net_ops);
5126 goto out_register_inetpeer;
/* Blackhole dst_ops shares the rt6_info slab with the regular ops */
5128 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5132 goto out_register_subsys;
5138 ret = fib6_rules_init();
5142 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5144 goto fib6_rules_init;
5146 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5147 inet6_rtm_newroute, NULL, 0);
5149 goto out_register_late_subsys;
5151 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5152 inet6_rtm_delroute, NULL, 0);
5154 goto out_register_late_subsys;
5156 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5157 inet6_rtm_getroute, NULL,
5158 RTNL_FLAG_DOIT_UNLOCKED);
5160 goto out_register_late_subsys;
5162 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5164 goto out_register_late_subsys;
/* Per-CPU lists of uncached routes (dsts not owned by the FIB tree) */
5166 for_each_possible_cpu(cpu) {
5167 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5169 INIT_LIST_HEAD(&ul->head);
5170 spin_lock_init(&ul->lock);
/* Error unwinding, reverse order of the registrations above */
5176 out_register_late_subsys:
5177 rtnl_unregister_all(PF_INET6);
5178 unregister_pernet_subsys(&ip6_route_net_late_ops);
5180 fib6_rules_cleanup();
5185 out_register_subsys:
5186 unregister_pernet_subsys(&ip6_route_net_ops);
5187 out_register_inetpeer:
5188 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5190 dst_entries_destroy(&ip6_dst_blackhole_ops);
5192 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5196 void ip6_route_cleanup(void)
5198 unregister_netdevice_notifier(&ip6_route_dev_notifier);
5199 unregister_pernet_subsys(&ip6_route_net_late_ops);
5200 fib6_rules_cleanup();
5203 unregister_pernet_subsys(&ipv6_inetpeer_ops);
5204 unregister_pernet_subsys(&ip6_route_net_ops);
5205 dst_entries_destroy(&ip6_dst_blackhole_ops);
5206 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);