/*
 *	Linux INET6 implementation
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  Otherwise, round-robin the list.
 *	Fixed routing subtrees.
 */
#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <linux/rtnetlink.h>
#include <net/dst_metadata.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <linux/uaccess.h>
#include <linux/sysctl.h>
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
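
/* Usage note (added, illustrative): the dsts on this per-CPU list are the
 * ones not owned by a fib6 tree (e.g. RTF_CACHE clones created for
 * FLOWI_FLAG_KNOWN_NH lookups and icmp6 dsts), so that
 * rt6_uncached_list_flush_dev() below can retarget them at the loopback
 * device when their egress device disappears.
 */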
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
static struct dst_ops ip6_dst_ops_template = {
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}
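
/* Hash-threshold multipath in isolation (an added, illustrative sketch, not
 * a kernel API): each nexthop owns the region of hash space up to its upper
 * bound, so a flow hash selects the first sibling whose bound covers it.
 */
static inline int example_hash_threshold_pick(u32 mp_hash,
					      const u32 *upper_bounds,
					      int num_paths)
{
	int i;

	for (i = 0; i < num_paths; i++)
		if (mp_hash <= upper_bounds[i])
			return i;
	return num_paths - 1;	/* defensive fallback */
}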
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
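
/* The rate limit above, in isolation (an added, illustrative sketch, not a
 * kernel API): a new neighbour-solicitation probe is allowed only once per
 * rtr_probe_interval, measured against the last probe (or neighbour update)
 * time.
 */
static inline bool example_probe_allowed(unsigned long last_probe,
					 unsigned long interval)
{
	return time_after(jiffies, last_probe + interval);
}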
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
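
/* Score layout (added summary): bit 1 is set on an outgoing-interface match
 * (rt6_check_dev() returns 2), bits 2-3 carry the decoded RFC 4191 router
 * preference, and any negative value is one of the RT6_NUD_FAIL_* states.
 */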
/* called with rcu_read_lock() held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
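
/* Round-robin note (added): when the current rr_ptr yields no usable match
 * (do_rr), rr_ptr is advanced to the next route of the same metric, so
 * equally-preferred routers take turns serving subsequent lookups.
 */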
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif
/*
 *	Misc support functions
 */

/* called with rcu_read_lock() held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock() held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
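
/* Example (added, illustrative): for a destination daddr with no source key,
 * the chain index is
 *
 *	hash_32(jhash(&daddr, sizeof(daddr), seed),
 *		FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
 *
 * which always lands in [0, FIB6_EXCEPTION_BUCKET_SIZE), i.e. one of the
 * 1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT chains of the bucket array.
 */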
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
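
/* Sizing note (added): each (daddr[, saddr]) pair hashes to one of
 * FIB6_EXCEPTION_BUCKET_SIZE chains, and once a chain grows past
 * FIB6_MAX_DEPTH the entry with the oldest stamp is evicted, so the
 * exception table stays bounded even under a PMTU/redirect flood.
 */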
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}
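
/* Worked example (added): with a route PMTU of 1400 and a local MTU of 1500,
 * lowering the link MTU to 1300 is allowed (1400 >= 1300), but raising it to
 * 9000 is not (1400 != 1500): some other hop is the bottleneck, so the
 * cached PMTU must stand until PMTU discovery says otherwise.
 */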
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
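
/* Policy summary (added): policy 0 hashes the L3 tuple (saddr, daddr, flow
 * label, nexthdr), policy 1 the L4 five-tuple.  Because policy 0 keys off
 * the inner header of ICMPv6 errors, an error hashes onto the same nexthop
 * as the flow that triggered it.
 */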
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
}
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2553 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2555 struct net_device *dev = dst->dev;
2556 unsigned int mtu = dst_mtu(dst);
2557 struct net *net = dev_net(dev);
2559 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2561 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2562 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2565 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2566 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2567 * IPV6_MAXPLEN is also valid and means: "any MSS,
2568 * rely only on pmtu discovery"
2570 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;

	return mtu;
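/* Worked example (editor's note): on a standard 1500-byte Ethernet MTU
 * the advertised MSS is 1500 - 40 (IPv6 header) - 20 (TCP header) = 1440.
 * The clamp above only matters for jumbo-capable links whose MTU
 * exceeds IPV6_MAXPLEN (65535).
 */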
2575 static unsigned int ip6_mtu(const struct dst_entry *dst)
2577 struct inet6_dev *idev;
2580 mtu = dst_metric_raw(dst, RTAX_MTU);
2587 idev = __in6_dev_get(dst->dev);
2589 mtu = idev->cnf.mtu6;
2593 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2595 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* MTU selection:
2599 * 1. mtu on route is locked - use it
2600 * 2. mtu from nexthop exception
2601 * 3. mtu from egress device
2603 * based on ip6_dst_mtu_forward and exception logic of
2604 * rt6_find_cached_rt; called with rcu_read_lock
2606 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2607 struct in6_addr *saddr)
2609 struct rt6_exception_bucket *bucket;
2610 struct rt6_exception *rt6_ex;
2611 struct in6_addr *src_key;
2612 struct inet6_dev *idev;
2615 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2616 mtu = f6i->fib6_pmtu;
2622 #ifdef CONFIG_IPV6_SUBTREES
2623 if (f6i->fib6_src.plen)
2627 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2628 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2629 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2630 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2633 struct net_device *dev = fib6_info_nh_dev(f6i);
2636 idev = __in6_dev_get(dev);
2637 if (idev && idev->cnf.mtu6 > mtu)
2638 mtu = idev->cnf.mtu6;
2641 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2643 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2646 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2649 struct dst_entry *dst;
2650 struct rt6_info *rt;
2651 struct inet6_dev *idev = in6_dev_get(dev);
2652 struct net *net = dev_net(dev);
2654 if (unlikely(!idev))
2655 return ERR_PTR(-ENODEV);
2657 rt = ip6_dst_alloc(net, dev, 0);
2658 if (unlikely(!rt)) {
2660 dst = ERR_PTR(-ENOMEM);
2664 rt->dst.flags |= DST_HOST;
2665 rt->dst.input = ip6_input;
2666 rt->dst.output = ip6_output;
2667 rt->rt6i_gateway = fl6->daddr;
2668 rt->rt6i_dst.addr = fl6->daddr;
2669 rt->rt6i_dst.plen = 128;
2670 rt->rt6i_idev = idev;
2671 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2673 /* Add this dst into uncached_list so that rt6_disable_ip() can
2674 * do proper release of the net_device
2676 rt6_uncached_list_add(rt);
2677 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2679 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2685 static int ip6_dst_gc(struct dst_ops *ops)
2687 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2688 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2689 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2690 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2691 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2692 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2695 entries = dst_entries_get_fast(ops);
2696 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2697 entries <= rt_max_size)
2700 net->ipv6.ip6_rt_gc_expire++;
2701 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2702 entries = dst_entries_get_slow(ops);
2703 if (entries < ops->gc_thresh)
2704 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2706 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2707 return entries > rt_max_size;
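/* Editor's note on the arithmetic above: the final statement removes
 * 1/2^elasticity of the accumulated expire value on every pass, e.g.
 *
 *     expire -= expire >> 9;   // elasticity 9: decay ~0.2% per pass
 *
 * while a successful shrink below gc_thresh resets it to half of
 * gc_timeout, so the expire value passed to fib6_run_gc() tracks
 * recent allocation pressure instead of growing without bound.
 */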
2710 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2711 struct fib6_config *cfg,
2712 const struct in6_addr *gw_addr,
2713 u32 tbid, int flags)
2715 struct flowi6 fl6 = {
2716 .flowi6_oif = cfg->fc_ifindex,
2718 .saddr = cfg->fc_prefsrc,
2720 struct fib6_table *table;
2721 struct rt6_info *rt;
2723 table = fib6_get_table(net, tbid);
2727 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2728 flags |= RT6_LOOKUP_F_HAS_SADDR;
2730 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2731 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2733 /* if table lookup failed, fall back to full lookup */
2734 if (rt == net->ipv6.ip6_null_entry) {
2742 static int ip6_route_check_nh_onlink(struct net *net,
2743 struct fib6_config *cfg,
2744 const struct net_device *dev,
2745 struct netlink_ext_ack *extack)
2747 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2748 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2749 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2750 struct fib6_info *from;
2751 struct rt6_info *grt;
2755 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2758 from = rcu_dereference(grt->from);
2759 if (!grt->dst.error &&
2760 /* ignore match if it is the default route */
2761 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2762 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2763 NL_SET_ERR_MSG(extack,
2764 "Nexthop has invalid gateway or device mismatch");
2775 static int ip6_route_check_nh(struct net *net,
2776 struct fib6_config *cfg,
2777 struct net_device **_dev,
2778 struct inet6_dev **idev)
2780 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2781 struct net_device *dev = _dev ? *_dev : NULL;
2782 struct rt6_info *grt = NULL;
2783 int err = -EHOSTUNREACH;
2785 if (cfg->fc_table) {
2786 int flags = RT6_LOOKUP_F_IFACE;
2788 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2789 cfg->fc_table, flags);
2791 if (grt->rt6i_flags & RTF_GATEWAY ||
2792 (dev && dev != grt->dst.dev)) {
2800 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2806 if (dev != grt->dst.dev) {
2811 *_dev = dev = grt->dst.dev;
2812 *idev = grt->rt6i_idev;
2814 in6_dev_hold(grt->rt6i_idev);
2817 if (!(grt->rt6i_flags & RTF_GATEWAY))
2826 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2827 struct net_device **_dev, struct inet6_dev **idev,
2828 struct netlink_ext_ack *extack)
2830 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2831 int gwa_type = ipv6_addr_type(gw_addr);
2832 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2833 const struct net_device *dev = *_dev;
2834 bool need_addr_check = !dev;
2837 /* if gw_addr is local we will fail to detect this in case the
2838 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2839 * will return the already-added prefix route via the interface
2840 * that the prefix route was assigned to, which might be non-loopback.
2843 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2844 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2848 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2849 /* IPv6 strictly inhibits using non-link-local
2850 * addresses as the nexthop address.
2851 * Otherwise, a router will not be able to send redirects.
2852 * It is very good, but in some (rare!) circumstances
2853 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2854 * some exceptions. --ANK
2855 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2858 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2859 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2863 if (cfg->fc_flags & RTNH_F_ONLINK)
2864 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2866 err = ip6_route_check_nh(net, cfg, _dev, idev);
2872 /* reload in case device was changed */
2877 NL_SET_ERR_MSG(extack, "Egress device not specified");
2879 } else if (dev->flags & IFF_LOOPBACK) {
2880 NL_SET_ERR_MSG(extack,
2881 "Egress device can not be loopback device for this route");
2885 /* if we did not check gw_addr above, do so now that the
2886 * egress device has been resolved.
2888 if (need_addr_check &&
2889 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2890 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
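/* Editor's summary of the checks above: a gateway is accepted only if
 * (1) it is not one of our own addresses (re-checked after the egress
 *     device is resolved, since an address still in DAD can hide this
 *     the first time),
 * (2) when it is not link-local unicast, it is at least unicast or
 *     IPv4-mapped (the documented exceptions), and its nexthop
 *     resolves via ip6_route_check_nh_onlink() or ip6_route_check_nh(),
 * (3) the resulting egress device exists and is not loopback.
 */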
2899 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2901 struct netlink_ext_ack *extack)
2903 struct net *net = cfg->fc_nlinfo.nl_net;
2904 struct fib6_info *rt = NULL;
2905 struct net_device *dev = NULL;
2906 struct inet6_dev *idev = NULL;
2907 struct fib6_table *table;
2911 /* RTF_PCPU is an internal flag; it cannot be set by userspace */
2912 if (cfg->fc_flags & RTF_PCPU) {
2913 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2917 /* RTF_CACHE is an internal flag; it cannot be set by userspace */
2918 if (cfg->fc_flags & RTF_CACHE) {
2919 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2923 if (cfg->fc_type > RTN_MAX) {
2924 NL_SET_ERR_MSG(extack, "Invalid route type");
2928 if (cfg->fc_dst_len > 128) {
2929 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2932 if (cfg->fc_src_len > 128) {
2933 NL_SET_ERR_MSG(extack, "Invalid source address length");
2936 #ifndef CONFIG_IPV6_SUBTREES
2937 if (cfg->fc_src_len) {
2938 NL_SET_ERR_MSG(extack,
2939 "Specifying source address requires IPV6_SUBTREES to be enabled");
2943 if (cfg->fc_ifindex) {
2945 dev = dev_get_by_index(net, cfg->fc_ifindex);
2948 idev = in6_dev_get(dev);
2953 if (cfg->fc_flags & RTNH_F_ONLINK) {
2955 NL_SET_ERR_MSG(extack,
2956 "Nexthop device required for onlink");
2961 if (!(dev->flags & IFF_UP)) {
2962 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2969 if (cfg->fc_nlinfo.nlh &&
2970 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2971 table = fib6_get_table(net, cfg->fc_table);
2973 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2974 table = fib6_new_table(net, cfg->fc_table);
2977 table = fib6_new_table(net, cfg->fc_table);
2984 rt = fib6_info_alloc(gfp_flags);
2988 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2990 if (IS_ERR(rt->fib6_metrics)) {
2991 err = PTR_ERR(rt->fib6_metrics);
2992 /* Do not leave garbage there. */
2993 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2997 if (cfg->fc_flags & RTF_ADDRCONF)
2998 rt->dst_nocount = true;
3000 if (cfg->fc_flags & RTF_EXPIRES)
3001 fib6_set_expires(rt, jiffies +
3002 clock_t_to_jiffies(cfg->fc_expires));
3004 fib6_clean_expires(rt);
3006 if (cfg->fc_protocol == RTPROT_UNSPEC)
3007 cfg->fc_protocol = RTPROT_BOOT;
3008 rt->fib6_protocol = cfg->fc_protocol;
3010 addr_type = ipv6_addr_type(&cfg->fc_dst);
3012 if (cfg->fc_encap) {
3013 struct lwtunnel_state *lwtstate;
3015 err = lwtunnel_build_state(cfg->fc_encap_type,
3016 cfg->fc_encap, AF_INET6, cfg,
3020 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3023 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3024 rt->fib6_dst.plen = cfg->fc_dst_len;
3025 if (rt->fib6_dst.plen == 128)
3026 rt->dst_host = true;
3028 #ifdef CONFIG_IPV6_SUBTREES
3029 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3030 rt->fib6_src.plen = cfg->fc_src_len;
3033 rt->fib6_metric = cfg->fc_metric;
3034 rt->fib6_nh.nh_weight = 1;
3036 rt->fib6_type = cfg->fc_type;
3038 /* We cannot add true routes via loopback here,
3039 they would result in kernel looping; promote them to reject routes
3041 if ((cfg->fc_flags & RTF_REJECT) ||
3042 (dev && (dev->flags & IFF_LOOPBACK) &&
3043 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3044 !(cfg->fc_flags & RTF_LOCAL))) {
3045 /* hold loopback dev/idev if we haven't done so. */
3046 if (dev != net->loopback_dev) {
3051 dev = net->loopback_dev;
3053 idev = in6_dev_get(dev);
3059 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3063 if (cfg->fc_flags & RTF_GATEWAY) {
3064 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3068 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3075 if (idev->cnf.disable_ipv6) {
3076 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3081 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3082 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3087 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3088 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3089 NL_SET_ERR_MSG(extack, "Invalid source address");
3093 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3094 rt->fib6_prefsrc.plen = 128;
3096 rt->fib6_prefsrc.plen = 0;
3098 rt->fib6_flags = cfg->fc_flags;
3101 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3102 !netif_carrier_ok(dev))
3103 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3104 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3105 rt->fib6_nh.nh_dev = dev;
3106 rt->fib6_table = table;
3118 fib6_info_release(rt);
3119 return ERR_PTR(err);
3122 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3123 struct netlink_ext_ack *extack)
3125 struct fib6_info *rt;
3128 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3132 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3133 fib6_info_release(rt);
3138 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3140 struct net *net = info->nl_net;
3141 struct fib6_table *table;
3144 if (rt == net->ipv6.fib6_null_entry) {
3149 table = rt->fib6_table;
3150 spin_lock_bh(&table->tb6_lock);
3151 err = fib6_del(rt, info);
3152 spin_unlock_bh(&table->tb6_lock);
3155 fib6_info_release(rt);
3159 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3161 struct nl_info info = { .nl_net = net };
3163 return __ip6_del_rt(rt, &info);
3166 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3168 struct nl_info *info = &cfg->fc_nlinfo;
3169 struct net *net = info->nl_net;
3170 struct sk_buff *skb = NULL;
3171 struct fib6_table *table;
3174 if (rt == net->ipv6.fib6_null_entry)
3176 table = rt->fib6_table;
3177 spin_lock_bh(&table->tb6_lock);
3179 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3180 struct fib6_info *sibling, *next_sibling;
3182 /* prefer to send a single notification with all hops */
3183 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3185 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3187 if (rt6_fill_node(net, skb, rt, NULL,
3188 NULL, NULL, 0, RTM_DELROUTE,
3189 info->portid, seq, 0) < 0) {
3193 info->skip_notify = 1;
3196 list_for_each_entry_safe(sibling, next_sibling,
3199 err = fib6_del(sibling, info);
3205 err = fib6_del(rt, info);
3207 spin_unlock_bh(&table->tb6_lock);
3209 fib6_info_release(rt);
3212 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3213 info->nlh, gfp_any());
3218 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3222 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3225 if (cfg->fc_flags & RTF_GATEWAY &&
3226 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3229 rc = rt6_remove_exception_rt(rt);
3234 static int ip6_route_del(struct fib6_config *cfg,
3235 struct netlink_ext_ack *extack)
3237 struct rt6_info *rt_cache;
3238 struct fib6_table *table;
3239 struct fib6_info *rt;
3240 struct fib6_node *fn;
3243 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3245 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3251 fn = fib6_locate(&table->tb6_root,
3252 &cfg->fc_dst, cfg->fc_dst_len,
3253 &cfg->fc_src, cfg->fc_src_len,
3254 !(cfg->fc_flags & RTF_CACHE));
3257 for_each_fib6_node_rt_rcu(fn) {
3258 if (cfg->fc_flags & RTF_CACHE) {
3261 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3264 rc = ip6_del_cached_rt(rt_cache, cfg);
3272 if (cfg->fc_ifindex &&
3273 (!rt->fib6_nh.nh_dev ||
3274 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3276 if (cfg->fc_flags & RTF_GATEWAY &&
3277 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3279 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3281 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3283 if (!fib6_info_hold_safe(rt))
3287 /* if a gateway was specified, only delete that one hop */
3288 if (cfg->fc_flags & RTF_GATEWAY)
3289 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3291 return __ip6_del_rt_siblings(rt, cfg);
3299 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3301 struct netevent_redirect netevent;
3302 struct rt6_info *rt, *nrt = NULL;
3303 struct ndisc_options ndopts;
3304 struct inet6_dev *in6_dev;
3305 struct neighbour *neigh;
3306 struct fib6_info *from;
3308 int optlen, on_link;
3311 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3312 optlen -= sizeof(*msg);
3315 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3319 msg = (struct rd_msg *)icmp6_hdr(skb);
3321 if (ipv6_addr_is_multicast(&msg->dest)) {
3322 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3327 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3329 } else if (ipv6_addr_type(&msg->target) !=
3330 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3331 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3335 in6_dev = __in6_dev_get(skb->dev);
3338 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3342 * The IP source address of the Redirect MUST be the same as the current
3343 * first-hop router for the specified ICMP Destination Address.
3346 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3347 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3352 if (ndopts.nd_opts_tgt_lladdr) {
3353 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3356 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3361 rt = (struct rt6_info *) dst;
3362 if (rt->rt6i_flags & RTF_REJECT) {
3363 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3367 /* Redirect received -> path was valid.
3368 * Look, redirects are sent only in response to data packets,
3369 * so this nexthop is apparently reachable. --ANK
3371 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3373 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3378 * We have finally decided to accept it.
3381 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3382 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3383 NEIGH_UPDATE_F_OVERRIDE|
3384 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3385 NEIGH_UPDATE_F_ISROUTER)),
3386 NDISC_REDIRECT, &ndopts);
3389 from = rcu_dereference(rt->from);
3390 /* This fib6_info_hold() is safe here because we hold a reference to rt
3391 * and rt already holds a reference to its fib6_info.
3393 fib6_info_hold(from);
3396 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3400 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3402 nrt->rt6i_flags &= ~RTF_GATEWAY;
3404 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3406 /* No need to remove rt from the exception table if rt is
3407 * a cached route because rt6_insert_exception() will take care of it.
3410 if (rt6_insert_exception(nrt, from)) {
3411 dst_release_immediate(&nrt->dst);
3415 netevent.old = &rt->dst;
3416 netevent.new = &nrt->dst;
3417 netevent.daddr = &msg->dest;
3418 netevent.neigh = neigh;
3419 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3422 fib6_info_release(from);
3423 neigh_release(neigh);
3426 #ifdef CONFIG_IPV6_ROUTE_INFO
3427 static struct fib6_info *rt6_get_route_info(struct net *net,
3428 const struct in6_addr *prefix, int prefixlen,
3429 const struct in6_addr *gwaddr,
3430 struct net_device *dev)
3432 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3433 int ifindex = dev->ifindex;
3434 struct fib6_node *fn;
3435 struct fib6_info *rt = NULL;
3436 struct fib6_table *table;
3438 table = fib6_get_table(net, tb_id);
3443 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3447 for_each_fib6_node_rt_rcu(fn) {
3448 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3450 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3452 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3454 if (!fib6_info_hold_safe(rt))
3463 static struct fib6_info *rt6_add_route_info(struct net *net,
3464 const struct in6_addr *prefix, int prefixlen,
3465 const struct in6_addr *gwaddr,
3466 struct net_device *dev,
3469 struct fib6_config cfg = {
3470 .fc_metric = IP6_RT_PRIO_USER,
3471 .fc_ifindex = dev->ifindex,
3472 .fc_dst_len = prefixlen,
3473 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3474 RTF_UP | RTF_PREF(pref),
3475 .fc_protocol = RTPROT_RA,
3476 .fc_type = RTN_UNICAST,
3477 .fc_nlinfo.portid = 0,
3478 .fc_nlinfo.nlh = NULL,
3479 .fc_nlinfo.nl_net = net,
3482 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3483 cfg.fc_dst = *prefix;
3484 cfg.fc_gateway = *gwaddr;
3486 /* We should treat it as a default route if prefix length is 0. */
3488 cfg.fc_flags |= RTF_DEFAULT;
3490 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3492 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3496 struct fib6_info *rt6_get_dflt_router(struct net *net,
3497 const struct in6_addr *addr,
3498 struct net_device *dev)
3500 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3501 struct fib6_info *rt;
3502 struct fib6_table *table;
3504 table = fib6_get_table(net, tb_id);
3509 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3510 if (dev == rt->fib6_nh.nh_dev &&
3511 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3512 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3515 if (rt && !fib6_info_hold_safe(rt))
3521 struct fib6_info *rt6_add_dflt_router(struct net *net,
3522 const struct in6_addr *gwaddr,
3523 struct net_device *dev,
3526 struct fib6_config cfg = {
3527 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3528 .fc_metric = IP6_RT_PRIO_USER,
3529 .fc_ifindex = dev->ifindex,
3530 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3531 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3532 .fc_protocol = RTPROT_RA,
3533 .fc_type = RTN_UNICAST,
3534 .fc_nlinfo.portid = 0,
3535 .fc_nlinfo.nlh = NULL,
3536 .fc_nlinfo.nl_net = net,
3539 cfg.fc_gateway = *gwaddr;
3541 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3542 struct fib6_table *table;
3544 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3546 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3549 return rt6_get_dflt_router(net, gwaddr, dev);
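/* Editor's note: rt6_add_dflt_router() is driven by Router
 * Advertisement processing in ndisc; the routes it creates carry
 * RTF_ADDRCONF | RTF_DEFAULT, which is exactly the signature
 * __rt6_purge_dflt_routers() below matches once RAs are no longer
 * accepted on the interface.
 */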
3552 static void __rt6_purge_dflt_routers(struct net *net,
3553 struct fib6_table *table)
3555 struct fib6_info *rt;
3559 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3560 struct net_device *dev = fib6_info_nh_dev(rt);
3561 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3563 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3564 (!idev || idev->cnf.accept_ra != 2) &&
3565 fib6_info_hold_safe(rt)) {
3567 ip6_del_rt(net, rt);
3573 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3576 void rt6_purge_dflt_routers(struct net *net)
3578 struct fib6_table *table;
3579 struct hlist_head *head;
3584 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3585 head = &net->ipv6.fib_table_hash[h];
3586 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3587 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3588 __rt6_purge_dflt_routers(net, table);
3595 static void rtmsg_to_fib6_config(struct net *net,
3596 struct in6_rtmsg *rtmsg,
3597 struct fib6_config *cfg)
3599 *cfg = (struct fib6_config){
3600 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3602 .fc_ifindex = rtmsg->rtmsg_ifindex,
3603 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3604 .fc_expires = rtmsg->rtmsg_info,
3605 .fc_dst_len = rtmsg->rtmsg_dst_len,
3606 .fc_src_len = rtmsg->rtmsg_src_len,
3607 .fc_flags = rtmsg->rtmsg_flags,
3608 .fc_type = rtmsg->rtmsg_type,
3610 .fc_nlinfo.nl_net = net,
3612 .fc_dst = rtmsg->rtmsg_dst,
3613 .fc_src = rtmsg->rtmsg_src,
3614 .fc_gateway = rtmsg->rtmsg_gateway,
3618 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3620 struct fib6_config cfg;
3621 struct in6_rtmsg rtmsg;
3625 case SIOCADDRT: /* Add a route */
3626 case SIOCDELRT: /* Delete a route */
3627 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3629 err = copy_from_user(&rtmsg, arg,
3630 sizeof(struct in6_rtmsg));
3634 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3639 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3642 err = ip6_route_del(&cfg, NULL);
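/* Editor's sketch (hypothetical userspace helper, not part of this
 * file): the legacy ioctl path above is what `route -A inet6 add`
 * style tools use. Assumes glibc's <net/route.h> for struct in6_rtmsg
 * and the RTF_* flags:
 *
 *     #include <string.h>
 *     #include <unistd.h>
 *     #include <sys/ioctl.h>
 *     #include <sys/socket.h>
 *     #include <arpa/inet.h>
 *     #include <net/route.h>
 *     #include <net/if.h>
 *
 *     static int add_v6_route(const char *dst, int plen, const char *ifname)
 *     {
 *         struct in6_rtmsg rt;
 *         int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *         int ret;
 *
 *         if (fd < 0)
 *             return -1;
 *         memset(&rt, 0, sizeof(rt));
 *         inet_pton(AF_INET6, dst, &rt.rtmsg_dst); // destination prefix
 *         rt.rtmsg_dst_len = plen;
 *         rt.rtmsg_metric = 1;                     // 0 would default to IP6_RT_PRIO_USER
 *         rt.rtmsg_flags = RTF_UP;
 *         rt.rtmsg_ifindex = if_nametoindex(ifname);
 *         ret = ioctl(fd, SIOCADDRT, &rt);         // lands in ipv6_route_ioctl(); needs CAP_NET_ADMIN
 *         close(fd);
 *         return ret;
 *     }
 */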
3656 * Drop the packet on the floor
3659 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3662 struct dst_entry *dst = skb_dst(skb);
3663 switch (ipstats_mib_noroutes) {
3664 case IPSTATS_MIB_INNOROUTES:
3665 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3666 if (type == IPV6_ADDR_ANY) {
3667 IP6_INC_STATS(dev_net(dst->dev),
3668 __in6_dev_get_safely(skb->dev),
3669 IPSTATS_MIB_INADDRERRORS);
3673 case IPSTATS_MIB_OUTNOROUTES:
3674 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3675 ipstats_mib_noroutes);
3678 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3683 static int ip6_pkt_discard(struct sk_buff *skb)
3685 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3688 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3690 skb->dev = skb_dst(skb)->dev;
3691 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3694 static int ip6_pkt_prohibit(struct sk_buff *skb)
3696 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3699 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3701 skb->dev = skb_dst(skb)->dev;
3702 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3706 * Allocate a dst for local (unicast / anycast) address.
3709 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3710 struct inet6_dev *idev,
3711 const struct in6_addr *addr,
3712 bool anycast, gfp_t gfp_flags)
3714 struct fib6_config cfg = {
3715 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3716 .fc_ifindex = idev->dev->ifindex,
3717 .fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3720 .fc_protocol = RTPROT_KERNEL,
3721 .fc_nlinfo.nl_net = net,
3722 .fc_ignore_dev_down = true,
3726 cfg.fc_type = RTN_ANYCAST;
3727 cfg.fc_flags |= RTF_ANYCAST;
3729 cfg.fc_type = RTN_LOCAL;
3730 cfg.fc_flags |= RTF_LOCAL;
3733 return ip6_route_info_create(&cfg, gfp_flags, NULL);
3736 /* remove a deleted IP from prefsrc entries */
3737 struct arg_dev_net_ip {
3738 struct net_device *dev;
3740 struct in6_addr *addr;
3743 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3745 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3746 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3747 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3749 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3750 rt != net->ipv6.fib6_null_entry &&
3751 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3752 spin_lock_bh(&rt6_exception_lock);
3753 /* remove prefsrc entry */
3754 rt->fib6_prefsrc.plen = 0;
3755 spin_unlock_bh(&rt6_exception_lock);
3760 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3762 struct net *net = dev_net(ifp->idev->dev);
3763 struct arg_dev_net_ip adni = {
3764 .dev = ifp->idev->dev,
3768 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3771 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3773 /* Remove routers and update dst entries when a gateway turns into a host. */
3774 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3776 struct in6_addr *gateway = (struct in6_addr *)arg;
3778 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3779 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3783 /* Further clean up cached routes in the exception table.
3784 * This is needed because a cached route may have a different
3785 * gateway than its 'parent' in the case of an ip redirect.
3787 rt6_exceptions_clean_tohost(rt, gateway);
3792 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3794 fib6_clean_all(net, fib6_clean_tohost, gateway);
3797 struct arg_netdev_event {
3798 const struct net_device *dev;
3800 unsigned int nh_flags;
3801 unsigned long event;
3805 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3807 struct fib6_info *iter;
3808 struct fib6_node *fn;
3810 fn = rcu_dereference_protected(rt->fib6_node,
3811 lockdep_is_held(&rt->fib6_table->tb6_lock));
3812 iter = rcu_dereference_protected(fn->leaf,
3813 lockdep_is_held(&rt->fib6_table->tb6_lock));
3815 if (iter->fib6_metric == rt->fib6_metric &&
3816 rt6_qualify_for_ecmp(iter))
3818 iter = rcu_dereference_protected(iter->fib6_next,
3819 lockdep_is_held(&rt->fib6_table->tb6_lock));
3825 static bool rt6_is_dead(const struct fib6_info *rt)
3827 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3828 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3829 fib6_ignore_linkdown(rt)))
3835 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3837 struct fib6_info *iter;
3840 if (!rt6_is_dead(rt))
3841 total += rt->fib6_nh.nh_weight;
3843 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3844 if (!rt6_is_dead(iter))
3845 total += iter->fib6_nh.nh_weight;
3851 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3853 int upper_bound = -1;
3855 if (!rt6_is_dead(rt)) {
3856 *weight += rt->fib6_nh.nh_weight;
3857 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1;
3860 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3863 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3865 struct fib6_info *iter;
3868 rt6_upper_bound_set(rt, &weight, total);
3870 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3871 rt6_upper_bound_set(iter, &weight, total);
3874 void rt6_multipath_rebalance(struct fib6_info *rt)
3876 struct fib6_info *first;
3879 /* In case the entire multipath route was marked for flushing,
3880 * then there is no need to rebalance upon the removal of every sibling route.
3883 if (!rt->fib6_nsiblings || rt->should_flush)
3886 /* During lookup routes are evaluated in order, so we need to
3887 * make sure upper bounds are assigned from the first sibling onwards.
3890 first = rt6_multipath_first_sibling(rt);
3891 if (WARN_ON_ONCE(!first))
3894 total = rt6_multipath_total_weight(first);
3895 rt6_multipath_upper_bound_set(first, total);
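/* Editor's example of the rebalance math: for two live siblings with
 * weights {1, 3}, total = 4 and the cumulative bounds computed in
 * rt6_upper_bound_set() become
 *
 *     nh0: (1 << 31) * 1/4 - 1 =  536870911
 *     nh1: (1 << 31) * 4/4 - 1 = 2147483647
 *
 * so a 31-bit flow hash h selects nh0 when h <= 536870911 (about 1/4
 * of flows) and nh1 otherwise (about 3/4), matching the configured
 * weights.
 */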
3898 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3900 const struct arg_netdev_event *arg = p_arg;
3901 struct net *net = dev_net(arg->dev);
3903 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3904 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3905 fib6_update_sernum_upto_root(net, rt);
3906 rt6_multipath_rebalance(rt);
3912 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3914 struct arg_netdev_event arg = {
3917 .nh_flags = nh_flags,
3921 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3922 arg.nh_flags |= RTNH_F_LINKDOWN;
3924 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3927 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3928 const struct net_device *dev)
3930 struct fib6_info *iter;
3932 if (rt->fib6_nh.nh_dev == dev)
3934 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3935 if (iter->fib6_nh.nh_dev == dev)
3941 static void rt6_multipath_flush(struct fib6_info *rt)
3943 struct fib6_info *iter;
3945 rt->should_flush = 1;
3946 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3947 iter->should_flush = 1;
3950 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3951 const struct net_device *down_dev)
3953 struct fib6_info *iter;
3954 unsigned int dead = 0;
3956 if (rt->fib6_nh.nh_dev == down_dev ||
3957 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3959 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3960 if (iter->fib6_nh.nh_dev == down_dev ||
3961 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3967 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3968 const struct net_device *dev,
3969 unsigned int nh_flags)
3971 struct fib6_info *iter;
3973 if (rt->fib6_nh.nh_dev == dev)
3974 rt->fib6_nh.nh_flags |= nh_flags;
3975 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3976 if (iter->fib6_nh.nh_dev == dev)
3977 iter->fib6_nh.nh_flags |= nh_flags;
3980 /* called with write lock held for table with rt */
3981 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3983 const struct arg_netdev_event *arg = p_arg;
3984 const struct net_device *dev = arg->dev;
3985 struct net *net = dev_net(dev);
3987 if (rt == net->ipv6.fib6_null_entry)
3990 switch (arg->event) {
3991 case NETDEV_UNREGISTER:
3992 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3994 if (rt->should_flush)
3996 if (!rt->fib6_nsiblings)
3997 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3998 if (rt6_multipath_uses_dev(rt, dev)) {
4001 count = rt6_multipath_dead_count(rt, dev);
4002 if (rt->fib6_nsiblings + 1 == count) {
4003 rt6_multipath_flush(rt);
4006 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4008 fib6_update_sernum(net, rt);
4009 rt6_multipath_rebalance(rt);
4013 if (rt->fib6_nh.nh_dev != dev ||
4014 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4016 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4017 rt6_multipath_rebalance(rt);
4024 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4026 struct arg_netdev_event arg = {
4032 struct net *net = dev_net(dev);
4034 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4035 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4037 fib6_clean_all(net, fib6_ifdown, &arg);
4040 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4042 rt6_sync_down_dev(dev, event);
4043 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4044 neigh_ifdown(&nd_tbl, dev);
4047 struct rt6_mtu_change_arg {
4048 struct net_device *dev;
4052 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4054 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4055 struct inet6_dev *idev;
4057 /* In IPv6, PMTU discovery is not optional,
4058 so the RTAX_MTU lock cannot disable it.
4059 We still use this lock to block changes
4060 caused by addrconf/ndisc.
4063 idev = __in6_dev_get(arg->dev);
4067 /* For an administrative MTU increase there is no way to discover
4068 the IPv6 PMTU increase, so the PMTU must be updated here.
4069 Since RFC 1981 doesn't cover administrative MTU increases,
4070 updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
4072 if (rt->fib6_nh.nh_dev == arg->dev &&
4073 !fib6_metric_locked(rt, RTAX_MTU)) {
4074 u32 mtu = rt->fib6_pmtu;
4076 if (mtu >= arg->mtu ||
4077 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4078 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4080 spin_lock_bh(&rt6_exception_lock);
4081 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4082 spin_unlock_bh(&rt6_exception_lock);
4087 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4089 struct rt6_mtu_change_arg arg = {
4094 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
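/* Editor's note: rt6_mtu_change() is invoked from the addrconf netdev
 * notifier when a device MTU is changed administratively, which is the
 * "no way to discover an increase" case that the comment in
 * rt6_mtu_change_route() describes.
 */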
4097 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4098 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4099 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4100 [RTA_OIF] = { .type = NLA_U32 },
4101 [RTA_IIF] = { .type = NLA_U32 },
4102 [RTA_PRIORITY] = { .type = NLA_U32 },
4103 [RTA_METRICS] = { .type = NLA_NESTED },
4104 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4105 [RTA_PREF] = { .type = NLA_U8 },
4106 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4107 [RTA_ENCAP] = { .type = NLA_NESTED },
4108 [RTA_EXPIRES] = { .type = NLA_U32 },
4109 [RTA_UID] = { .type = NLA_U32 },
4110 [RTA_MARK] = { .type = NLA_U32 },
4111 [RTA_TABLE] = { .type = NLA_U32 },
4112 [RTA_IP_PROTO] = { .type = NLA_U8 },
4113 [RTA_SPORT] = { .type = NLA_U16 },
4114 [RTA_DPORT] = { .type = NLA_U16 },
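/* Editor's note: this policy is enforced when rtm_to_fib6_config() and
 * inet6_rtm_valid_getroute_req() call nlmsg_parse(): .len on binary
 * attributes such as RTA_GATEWAY is a minimum-length check (a full
 * struct in6_addr), typed attributes (NLA_U32, NLA_U8, ...) are
 * size-checked, and NLA_NESTED blobs like RTA_METRICS are validated
 * later by their consumers.
 */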
4117 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4118 struct fib6_config *cfg,
4119 struct netlink_ext_ack *extack)
4122 struct nlattr *tb[RTA_MAX+1];
4126 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4132 rtm = nlmsg_data(nlh);
4134 *cfg = (struct fib6_config){
4135 .fc_table = rtm->rtm_table,
4136 .fc_dst_len = rtm->rtm_dst_len,
4137 .fc_src_len = rtm->rtm_src_len,
4139 .fc_protocol = rtm->rtm_protocol,
4140 .fc_type = rtm->rtm_type,
4142 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4143 .fc_nlinfo.nlh = nlh,
4144 .fc_nlinfo.nl_net = sock_net(skb->sk),
4147 if (rtm->rtm_type == RTN_UNREACHABLE ||
4148 rtm->rtm_type == RTN_BLACKHOLE ||
4149 rtm->rtm_type == RTN_PROHIBIT ||
4150 rtm->rtm_type == RTN_THROW)
4151 cfg->fc_flags |= RTF_REJECT;
4153 if (rtm->rtm_type == RTN_LOCAL)
4154 cfg->fc_flags |= RTF_LOCAL;
4156 if (rtm->rtm_flags & RTM_F_CLONED)
4157 cfg->fc_flags |= RTF_CACHE;
4159 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4161 if (tb[RTA_GATEWAY]) {
4162 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4163 cfg->fc_flags |= RTF_GATEWAY;
4166 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4171 int plen = (rtm->rtm_dst_len + 7) >> 3;
4173 if (nla_len(tb[RTA_DST]) < plen)
4176 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4180 int plen = (rtm->rtm_src_len + 7) >> 3;
4182 if (nla_len(tb[RTA_SRC]) < plen)
4185 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4188 if (tb[RTA_PREFSRC])
4189 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4192 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4194 if (tb[RTA_PRIORITY])
4195 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4197 if (tb[RTA_METRICS]) {
4198 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4199 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4203 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4205 if (tb[RTA_MULTIPATH]) {
4206 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4207 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4209 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4210 cfg->fc_mp_len, extack);
4216 pref = nla_get_u8(tb[RTA_PREF]);
4217 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4218 pref != ICMPV6_ROUTER_PREF_HIGH)
4219 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4220 cfg->fc_flags |= RTF_PREF(pref);
4224 cfg->fc_encap = tb[RTA_ENCAP];
4226 if (tb[RTA_ENCAP_TYPE]) {
4227 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4229 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4234 if (tb[RTA_EXPIRES]) {
4235 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4237 if (addrconf_finite_timeout(timeout)) {
4238 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4239 cfg->fc_flags |= RTF_EXPIRES;
4249 struct fib6_info *fib6_info;
4250 struct fib6_config r_cfg;
4251 struct list_head next;
4254 static int ip6_route_info_append(struct net *net,
4255 struct list_head *rt6_nh_list,
4256 struct fib6_info *rt,
4257 struct fib6_config *r_cfg)
4262 list_for_each_entry(nh, rt6_nh_list, next) {
4263 /* check if fib6_info already exists */
4264 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4268 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4272 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4273 list_add_tail(&nh->next, rt6_nh_list);
4278 static void ip6_route_mpath_notify(struct fib6_info *rt,
4279 struct fib6_info *rt_last,
4280 struct nl_info *info,
4283 /* if this is an APPEND route, then rt points to the first route
4284 * inserted and rt_last points to last route inserted. Userspace
4285 * wants a consistent dump of the route which starts at the first
4286 * nexthop. Since sibling routes are always added at the end of
4287 * the list, find the first sibling of the last route appended
4289 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4290 rt = list_first_entry(&rt_last->fib6_siblings,
4296 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4299 static int ip6_route_multipath_add(struct fib6_config *cfg,
4300 struct netlink_ext_ack *extack)
4302 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4303 struct nl_info *info = &cfg->fc_nlinfo;
4304 struct fib6_config r_cfg;
4305 struct rtnexthop *rtnh;
4306 struct fib6_info *rt;
4307 struct rt6_nh *err_nh;
4308 struct rt6_nh *nh, *nh_safe;
4314 int replace = (cfg->fc_nlinfo.nlh &&
4315 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4316 LIST_HEAD(rt6_nh_list);
4318 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4319 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4320 nlflags |= NLM_F_APPEND;
4322 remaining = cfg->fc_mp_len;
4323 rtnh = (struct rtnexthop *)cfg->fc_mp;
4325 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4326 * fib6_info structs per nexthop
4328 while (rtnh_ok(rtnh, remaining)) {
4329 memcpy(&r_cfg, cfg, sizeof(*cfg));
4330 if (rtnh->rtnh_ifindex)
4331 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4333 attrlen = rtnh_attrlen(rtnh);
4335 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4337 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4339 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4340 r_cfg.fc_flags |= RTF_GATEWAY;
4342 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4343 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4345 r_cfg.fc_encap_type = nla_get_u16(nla);
4348 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4349 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4355 if (!rt6_qualify_for_ecmp(rt)) {
4357 NL_SET_ERR_MSG(extack,
4358 "Device only routes can not be added for IPv6 using the multipath API.");
4359 fib6_info_release(rt);
4363 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4365 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4368 fib6_info_release(rt);
4372 rtnh = rtnh_next(rtnh, &remaining);
4375 /* for add and replace send one notification with all nexthops.
4376 * Skip the notification in fib6_add_rt2node and send one with
4377 * the full route when done
4379 info->skip_notify = 1;
4382 list_for_each_entry(nh, &rt6_nh_list, next) {
4383 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4384 fib6_info_release(nh->fib6_info);
4387 /* save reference to last route successfully inserted */
4388 rt_last = nh->fib6_info;
4390 /* save reference to first route for notification */
4392 rt_notif = nh->fib6_info;
4395 /* nh->fib6_info is used or freed at this point, reset to NULL */
4396 nh->fib6_info = NULL;
4399 NL_SET_ERR_MSG_MOD(extack,
4400 "multipath route replace failed (check consistency of installed routes)");
4405 /* Because each route is added like a single route we remove
4406 * these flags after the first nexthop: if there is a collision,
4407 * we have already failed to add the first nexthop:
4408 * fib6_add_rt2node() has rejected it; when replacing, old
4409 * nexthops have been replaced by the first new one, and the rest should be added to it.
4412 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4417 /* success ... tell user about new route */
4418 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4422 /* send notification for routes that were added so that
4423 * the delete notifications sent by ip6_route_del are coherent.
4427 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4429 /* Delete routes that were already added */
4430 list_for_each_entry(nh, &rt6_nh_list, next) {
4433 ip6_route_del(&nh->r_cfg, extack);
4437 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4439 fib6_info_release(nh->fib6_info);
4440 list_del(&nh->next);
4447 static int ip6_route_multipath_del(struct fib6_config *cfg,
4448 struct netlink_ext_ack *extack)
4450 struct fib6_config r_cfg;
4451 struct rtnexthop *rtnh;
4454 int err = 1, last_err = 0;
4456 remaining = cfg->fc_mp_len;
4457 rtnh = (struct rtnexthop *)cfg->fc_mp;
4459 /* Parse a Multipath Entry */
4460 while (rtnh_ok(rtnh, remaining)) {
4461 memcpy(&r_cfg, cfg, sizeof(*cfg));
4462 if (rtnh->rtnh_ifindex)
4463 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4465 attrlen = rtnh_attrlen(rtnh);
4467 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4469 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4471 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4472 r_cfg.fc_flags |= RTF_GATEWAY;
4475 err = ip6_route_del(&r_cfg, extack);
4479 rtnh = rtnh_next(rtnh, &remaining);
4485 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4486 struct netlink_ext_ack *extack)
4488 struct fib6_config cfg;
4491 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4496 return ip6_route_multipath_del(&cfg, extack);
4498 cfg.fc_delete_all_nh = 1;
4499 return ip6_route_del(&cfg, extack);
4503 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4504 struct netlink_ext_ack *extack)
4506 struct fib6_config cfg;
4509 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4513 if (cfg.fc_metric == 0)
4514 cfg.fc_metric = IP6_RT_PRIO_USER;
4517 return ip6_route_multipath_add(&cfg, extack);
4519 return ip6_route_add(&cfg, GFP_KERNEL, extack);
4522 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4524 int nexthop_len = 0;
4526 if (rt->fib6_nsiblings) {
4527 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4528 + NLA_ALIGN(sizeof(struct rtnexthop))
4529 + nla_total_size(16) /* RTA_GATEWAY */
4530 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4532 nexthop_len *= rt->fib6_nsiblings;
4535 return NLMSG_ALIGN(sizeof(struct rtmsg))
4536 + nla_total_size(16) /* RTA_SRC */
4537 + nla_total_size(16) /* RTA_DST */
4538 + nla_total_size(16) /* RTA_GATEWAY */
4539 + nla_total_size(16) /* RTA_PREFSRC */
4540 + nla_total_size(4) /* RTA_TABLE */
4541 + nla_total_size(4) /* RTA_IIF */
4542 + nla_total_size(4) /* RTA_OIF */
4543 + nla_total_size(4) /* RTA_PRIORITY */
4544 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4545 + nla_total_size(sizeof(struct rta_cacheinfo))
4546 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4547 + nla_total_size(1) /* RTA_PREF */
4548 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4552 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4553 unsigned int *flags, bool skip_oif)
4555 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4556 *flags |= RTNH_F_DEAD;
4558 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4559 *flags |= RTNH_F_LINKDOWN;
4562 if (fib6_ignore_linkdown(rt))
4563 *flags |= RTNH_F_DEAD;
4567 if (rt->fib6_flags & RTF_GATEWAY) {
4568 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4569 goto nla_put_failure;
4572 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4573 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4574 *flags |= RTNH_F_OFFLOAD;
4576 /* not needed for multipath encoding because it has a rtnexthop struct */
4577 if (!skip_oif && rt->fib6_nh.nh_dev &&
4578 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4579 goto nla_put_failure;
4581 if (rt->fib6_nh.nh_lwtstate &&
4582 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4583 goto nla_put_failure;
4591 /* add multipath next hop */
4592 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4594 const struct net_device *dev = rt->fib6_nh.nh_dev;
4595 struct rtnexthop *rtnh;
4596 unsigned int flags = 0;
4598 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4600 goto nla_put_failure;
4602 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4603 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4605 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4606 goto nla_put_failure;
4608 rtnh->rtnh_flags = flags;
4610 /* length of rtnetlink header + attributes */
4611 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
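/* Editor's note on the wire format built above: RTA_MULTIPATH is a
 * nest of variable-length entries, each a struct rtnexthop header
 * (rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex) followed by that
 * hop's own attributes (RTA_GATEWAY, RTA_ENCAP, ...). rtnh_len is
 * patched last because the attribute payload size is only known after
 * rt6_nexthop_info() has emitted it.
 */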
4619 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4620 struct fib6_info *rt, struct dst_entry *dst,
4621 struct in6_addr *dest, struct in6_addr *src,
4622 int iif, int type, u32 portid, u32 seq,
4625 struct rt6_info *rt6 = (struct rt6_info *)dst;
4626 struct rt6key *rt6_dst, *rt6_src;
4627 u32 *pmetrics, table, rt6_flags;
4628 struct nlmsghdr *nlh;
4632 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4637 rt6_dst = &rt6->rt6i_dst;
4638 rt6_src = &rt6->rt6i_src;
4639 rt6_flags = rt6->rt6i_flags;
4641 rt6_dst = &rt->fib6_dst;
4642 rt6_src = &rt->fib6_src;
4643 rt6_flags = rt->fib6_flags;
4646 rtm = nlmsg_data(nlh);
4647 rtm->rtm_family = AF_INET6;
4648 rtm->rtm_dst_len = rt6_dst->plen;
4649 rtm->rtm_src_len = rt6_src->plen;
4652 table = rt->fib6_table->tb6_id;
4654 table = RT6_TABLE_UNSPEC;
4655 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4656 if (nla_put_u32(skb, RTA_TABLE, table))
4657 goto nla_put_failure;
4659 rtm->rtm_type = rt->fib6_type;
4661 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4662 rtm->rtm_protocol = rt->fib6_protocol;
4664 if (rt6_flags & RTF_CACHE)
4665 rtm->rtm_flags |= RTM_F_CLONED;
4668 if (nla_put_in6_addr(skb, RTA_DST, dest))
4669 goto nla_put_failure;
4670 rtm->rtm_dst_len = 128;
4671 } else if (rtm->rtm_dst_len)
4672 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4673 goto nla_put_failure;
4674 #ifdef CONFIG_IPV6_SUBTREES
4676 if (nla_put_in6_addr(skb, RTA_SRC, src))
4677 goto nla_put_failure;
4678 rtm->rtm_src_len = 128;
4679 } else if (rtm->rtm_src_len &&
4680 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4681 goto nla_put_failure;
4684 #ifdef CONFIG_IPV6_MROUTE
4685 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4686 int err = ip6mr_get_route(net, skb, rtm, portid);
4691 goto nla_put_failure;
4694 if (nla_put_u32(skb, RTA_IIF, iif))
4695 goto nla_put_failure;
4697 struct in6_addr saddr_buf;
4698 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4699 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4700 goto nla_put_failure;
4703 if (rt->fib6_prefsrc.plen) {
4704 struct in6_addr saddr_buf;
4705 saddr_buf = rt->fib6_prefsrc.addr;
4706 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4707 goto nla_put_failure;
4710 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4711 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4712 goto nla_put_failure;
4714 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4715 goto nla_put_failure;
4717 /* For multipath routes, walk the siblings list and add
4718 * each as a nexthop within RTA_MULTIPATH.
4721 if (rt6_flags & RTF_GATEWAY &&
4722 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4723 goto nla_put_failure;
4725 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4726 goto nla_put_failure;
4727 } else if (rt->fib6_nsiblings) {
4728 struct fib6_info *sibling, *next_sibling;
4731 mp = nla_nest_start(skb, RTA_MULTIPATH);
4733 goto nla_put_failure;
4735 if (rt6_add_nexthop(skb, rt) < 0)
4736 goto nla_put_failure;
4738 list_for_each_entry_safe(sibling, next_sibling,
4739 &rt->fib6_siblings, fib6_siblings) {
4740 if (rt6_add_nexthop(skb, sibling) < 0)
4741 goto nla_put_failure;
4744 nla_nest_end(skb, mp);
4746 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4747 goto nla_put_failure;
4750 if (rt6_flags & RTF_EXPIRES) {
4751 expires = dst ? dst->expires : rt->expires;
4755 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4756 goto nla_put_failure;
4758 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4759 goto nla_put_failure;
4762 nlmsg_end(skb, nlh);
4766 nlmsg_cancel(skb, nlh);
4770 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4771 const struct net_device *dev)
4773 if (f6i->fib6_nh.nh_dev == dev)
4776 if (f6i->fib6_nsiblings) {
4777 struct fib6_info *sibling, *next_sibling;
4779 list_for_each_entry_safe(sibling, next_sibling,
4780 &f6i->fib6_siblings, fib6_siblings) {
4781 if (sibling->fib6_nh.nh_dev == dev)
4789 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4791 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4792 struct fib_dump_filter *filter = &arg->filter;
4793 unsigned int flags = NLM_F_MULTI;
4794 struct net *net = arg->net;
4796 if (rt == net->ipv6.fib6_null_entry)
4799 if ((filter->flags & RTM_F_PREFIX) &&
4800 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4801 /* success since this is not a prefix route */
4804 if (filter->filter_set) {
4805 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4806 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4807 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4810 flags |= NLM_F_DUMP_FILTERED;
4813 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4814 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4815 arg->cb->nlh->nlmsg_seq, flags);
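/* Editor's note: this is the per-route callback for RTM_GETROUTE dumps
 * (e.g. `ip -6 route show`). When the request carried a filter (device,
 * protocol, route type), non-matching routes are simply skipped and the
 * replies are flagged NLM_F_DUMP_FILTERED so userspace knows the kernel
 * pre-filtered the dump.
 */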
4818 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4819 const struct nlmsghdr *nlh,
4821 struct netlink_ext_ack *extack)
4826 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4827 NL_SET_ERR_MSG_MOD(extack,
4828 "Invalid header for get route request");
4832 if (!netlink_strict_get_check(skb))
4833 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4834 rtm_ipv6_policy, extack);
4836 rtm = nlmsg_data(nlh);
4837 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4838 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4839 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4841 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4844 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4845 NL_SET_ERR_MSG_MOD(extack,
4846 "Invalid flags for get route request");
4850 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4851 rtm_ipv6_policy, extack);
4855 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4856 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4857 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4861 for (i = 0; i <= RTA_MAX; i++) {
4877 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4885 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4886 struct netlink_ext_ack *extack)
4888 struct net *net = sock_net(in_skb->sk);
4889 struct nlattr *tb[RTA_MAX+1];
4890 int err, iif = 0, oif = 0;
4891 struct fib6_info *from;
4892 struct dst_entry *dst;
4893 struct rt6_info *rt;
4894 struct sk_buff *skb;
4896 struct flowi6 fl6 = {};
4899 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4904 rtm = nlmsg_data(nlh);
4905 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4906 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4909 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4912 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4916 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4919 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4923 iif = nla_get_u32(tb[RTA_IIF]);
4926 oif = nla_get_u32(tb[RTA_OIF]);
4929 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4932 fl6.flowi6_uid = make_kuid(current_user_ns(),
4933 nla_get_u32(tb[RTA_UID]));
4935 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4938 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4941 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4943 if (tb[RTA_IP_PROTO]) {
4944 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4945 &fl6.flowi6_proto, AF_INET6,
4952 struct net_device *dev;
4957 dev = dev_get_by_index_rcu(net, iif);
4964 fl6.flowi6_iif = iif;
4966 if (!ipv6_addr_any(&fl6.saddr))
4967 flags |= RT6_LOOKUP_F_HAS_SADDR;
4969 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4973 fl6.flowi6_oif = oif;
4975 dst = ip6_route_output(net, NULL, &fl6);
4979 rt = container_of(dst, struct rt6_info, dst);
4980 if (rt->dst.error) {
4981 err = rt->dst.error;
4986 if (rt == net->ipv6.ip6_null_entry) {
4987 err = rt->dst.error;
4992 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4999 skb_dst_set(skb, &rt->dst);
5002 from = rcu_dereference(rt->from);
5005 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5006 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5009 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5010 &fl6.saddr, iif, RTM_NEWROUTE,
5011 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5020 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
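/* Editor's note: this handler backs `ip -6 route get ADDR`; with
 * RTM_F_FIB_MATCH set (`ip -6 route get fibmatch ADDR`) the first
 * rt6_fill_node() call above reports the matching FIB entry instead of
 * the fully resolved dst.
 */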
5025 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5026 unsigned int nlm_flags)
5028 struct sk_buff *skb;
5029 struct net *net = info->nl_net;
5034 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5036 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5040 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5041 event, info->portid, seq, nlm_flags);
5043 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5044 WARN_ON(err == -EMSGSIZE);
5048 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5049 info->nlh, gfp_any());
5053 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5056 static int ip6_route_dev_notify(struct notifier_block *this,
5057 unsigned long event, void *ptr)
5059 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5060 struct net *net = dev_net(dev);
5062 if (!(dev->flags & IFF_LOOPBACK))
5065 if (event == NETDEV_REGISTER) {
5066 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5067 net->ipv6.ip6_null_entry->dst.dev = dev;
5068 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5069 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5070 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5071 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5072 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5073 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5075 } else if (event == NETDEV_UNREGISTER &&
5076 dev->reg_state != NETREG_UNREGISTERED) {
5077 /* NETDEV_UNREGISTER could be fired multiple times by
5078 * netdev_wait_allrefs(). Make sure we only call this once.
5080 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5081 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5082 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5083 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#ifdef CONFIG_PROC_FS
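/* /proc/net/rt6_stats: seven hex fields, in order: fib nodes, route
 * nodes, route allocations, route entries, cached routes, in-use dst
 * entries and discarded routes.  Sample output (values are illustrative
 * only):
 *
 *	$ cat /proc/net/rt6_stats
 *	0007 0004 0002 0004 0000 0001 0000
 */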
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL
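/* net.ipv6.route.flush is write-only: a write triggers a fib6_run_gc()
 * pass over the routing tables, with flush_delay controlling whether the
 * collection runs immediately or is deferred.
 */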
static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
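/* The per-netns table is a positional copy of the template above: the
 * table[N].data fixups in ipv6_route_sysctl_init() index entries by
 * position, so any entry added to ipv6_route_table_template[] must keep
 * the two in sync.
 */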
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
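/* Per-netns init: clone the dst_ops template, allocate this netns' copies
 * of the special route templates (null and, if enabled, prohibit and
 * blackhole), and seed the route sysctl defaults.
 */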
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
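/* Per-netns teardown: mirror of ip6_route_net_init() in reverse. */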
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10, /* run after addrconf */
};
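/* Called once the loopback device has registered (from addrconf
 * initialization): the special entries are allocated before loopback
 * exists, so their device and inet6_dev references are taken by hand for
 * init_net here.
 */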
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback device is done before this portion
	 * of code, so the loopback reference in rt6_info has not been
	 * taken; do it manually for init_net.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
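/* Module init.  Ordering matters: the dst kmem cache and blackhole dst
 * accounting must exist before the pernet subsystems register, and the
 * rtnetlink doit handlers are wired up only after fib6, xfrm6 and the
 * policy-rule layer are ready.  Each error label unwinds exactly the
 * steps taken before it.
 */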
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
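/* Module exit: undo ip6_route_init() in reverse order. */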
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}