/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev,
					    unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					    const struct in6_addr *prefix, int prefixlen,
					    const struct in6_addr *gwaddr,
					    struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= dst_cow_metrics_generic,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_dst_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol	= RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

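/* The selection above is hash-threshold multipath (RFC 2992): each
 * sibling owns a slice of the hash space bounded by nh_upper_bound
 * (roughly weight/total of 2^31), and the first sibling whose bound
 * covers fl6->mp_hash is chosen, so packets of one flow consistently
 * take the same nexthop.
 */
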
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}

#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

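/* The neighbour solicitation itself is sent from rt6_probe_deferred()
 * on a workqueue: rt6_probe() runs in the packet path under
 * rcu_read_lock_bh() and (possibly) the neighbour lock, so building and
 * transmitting the NS is deferred to process context outside of those
 * locks.
 */
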
/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

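/* The score packs independent criteria into one integer: value 2 from
 * rt6_check_dev() when the route matches the requested oif (or no oif
 * was given), and, with CONFIG_IPV6_ROUTER_PREF, the decoded RA router
 * preference in the bits above that. Negative RT6_NUD_* values report
 * reachability failures rather than a score.
 */
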
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (fib6_ignore_linkdown(rt) &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

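/* The rotation of fn->rr_ptr above is what implements the "otherwise,
 * round-robin the list" policy from the changelog: when no router in
 * the equal-metric group is known to be reachable, the starting point
 * for the next lookup is advanced to the following sibling, spreading
 * traffic across the candidate routers.
 */
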
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

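/* The sanity checks above follow the Route Information Option format of
 * RFC 4191: the option length is in units of 8 octets, so longer
 * prefixes need a longer option, and only a length of 3 carries the
 * full 128-bit prefix field that can be used directly; shorter options
 * are expanded into prefix_buf via ipv6_addr_prefix().
 */
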
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

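/* Backtracking climbs towards the root after a dead-end lookup: each
 * step moves to the parent node and, if the parent roots a source
 * subtree (CONFIG_IPV6_SUBTREES), re-descends by source address. The
 * walk stops at the first ancestor that actually carries routes
 * (RTN_RTINFO) or gives up at the table root (RTN_TL_ROOT).
 */
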
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
	else
		fib6_info_release(rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				       fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}

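/* Result priority in the lookup above: a cached clone from the
 * exception table wins over the fib6 entry itself; otherwise a rt6_info
 * is cloned from the matched fib6_info under RCU, and the null entry
 * (carrying -ENETUNREACH) is returned as the last resort.
 */
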
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

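/* The bucket index is a jhash of the 16-byte destination, seeded once
 * at boot, optionally folded with the source address when subtrees are
 * enabled, then reduced to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits; i.e.
 * roughly hash_32(jhash(dst, 16, seed), shift) in the dst-only case.
 */
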
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

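/* Invariants kept by rt6_insert_exception(): at most one exception per
 * (daddr[, saddr]) key, since an existing entry is removed first; a
 * per-bucket depth cap of FIB6_MAX_DEPTH, enforced by evicting the
 * least recently stamped entry; and a bumped fn->fn_sernum so that
 * stale cached dsts fail their next ip6_dst_check().
 */
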
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

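/* Example: an exception with PMTU 1400 may always be lowered to 1280
 * (first test), and one whose PMTU equals the current link MTU may be
 * raised along with the link MTU (second test); an exception already
 * below the link MTU keeps its lower, path-derived value.
 */
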
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others still have references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

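/* ip6_pol_route() hands back one of three dst flavours: a cached
 * exception route, an uncached RTF_CACHE clone for the KNOWN_NH case
 * (tracked on rt6_uncached_list so device unregister can reparent it to
 * the loopback device), or the per-cpu copy of the fib6 entry, created
 * on first use via rt6_make_pcpu_route().
 */
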
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

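/* Hashing ICMPv6 errors on the embedded (inner) header means an error
 * hashes like the flow that triggered it, e.g. a Packet Too Big about a
 * TCP session takes the same multipath branch as that session's own
 * packets instead of being spread by the outer addresses.
 */
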
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

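/* Policy 0 (the default) hashes L3 fields only: addresses, flow label
 * and next header; policy 1 hashes the L4 five-tuple when available.
 * The final right shift by one keeps the result within 31 bits so it
 * can be compared against the nexthop upper bounds consumed by
 * fib6_multipath_select().
 */
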
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

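/* A freshly learned PMTU is stored in a per-destination RTF_CACHE
 * exception (per RFC 8201) rather than in the shared fib6 entry; when
 * the dst is already such a cache entry it is updated in place and its
 * exception stamp refreshed so garbage collection keeps it alive.
 */
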
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!(rt->fib6_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret, true);
	else
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

2547 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2549 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2552 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
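/* Advertised MSS for this dst: the device MTU minus the fixed IPv6
 * and TCP header sizes, clamped from below by the ip6_rt_min_advmss
 * sysctl. As a worked example, for an Ethernet MTU of 1500:
 *
 *	1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	     = 1500 - 40 - 20 = 1440 bytes
 *
 * Anything above IPV6_MAXPLEN - sizeof(struct tcphdr) is reported as
 * IPV6_MAXPLEN: "any MSS, rely only on pmtu discovery".
 */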
2554 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2556 struct net_device *dev = dst->dev;
2557 unsigned int mtu = dst_mtu(dst);
2558 struct net *net = dev_net(dev);
2560 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2562 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2563 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2566 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2567 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2568 * IPV6_MAXPLEN is also valid and means: "any MSS,
2569 * rely only on pmtu discovery"
2571 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2576 static unsigned int ip6_mtu(const struct dst_entry *dst)
2578 struct inet6_dev *idev;
2581 mtu = dst_metric_raw(dst, RTAX_MTU);
2588 idev = __in6_dev_get(dst->dev);
2590 mtu = idev->cnf.mtu6;
2594 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2596 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2600 * 1. mtu on route is locked - use it
2601 * 2. mtu from nexthop exception
2602 * 3. mtu from egress device
2604 * based on ip6_dst_mtu_forward and exception logic of
2605 * rt6_find_cached_rt; called with rcu_read_lock
2607 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2608 struct in6_addr *saddr)
2610 struct rt6_exception_bucket *bucket;
2611 struct rt6_exception *rt6_ex;
2612 struct in6_addr *src_key;
2613 struct inet6_dev *idev;
2616 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2617 mtu = f6i->fib6_pmtu;
2623 #ifdef CONFIG_IPV6_SUBTREES
2624 if (f6i->fib6_src.plen)
2628 bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2629 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2630 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2631 mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2634 struct net_device *dev = fib6_info_nh_dev(f6i);
2637 idev = __in6_dev_get(dev);
2638 if (idev && idev->cnf.mtu6 > mtu)
2639 mtu = idev->cnf.mtu6;
2642 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2644 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
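/* Allocate a standalone dst for an outgoing ICMPv6 packet. The dst is
 * never inserted into the FIB; it is kept on the uncached_list so the
 * device reference can still be dropped on ifdown. A minimal usage
 * sketch, assuming an ndisc-style sender with skb and fl6 already set
 * up:
 *
 *	dst = icmp6_dst_alloc(skb->dev, &fl6);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	skb_dst_set(skb, dst);
 */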
2647 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2650 struct dst_entry *dst;
2651 struct rt6_info *rt;
2652 struct inet6_dev *idev = in6_dev_get(dev);
2653 struct net *net = dev_net(dev);
2655 if (unlikely(!idev))
2656 return ERR_PTR(-ENODEV);
2658 rt = ip6_dst_alloc(net, dev, 0);
2659 if (unlikely(!rt)) {
2661 dst = ERR_PTR(-ENOMEM);
2665 rt->dst.flags |= DST_HOST;
2666 rt->dst.input = ip6_input;
2667 rt->dst.output = ip6_output;
2668 rt->rt6i_gateway = fl6->daddr;
2669 rt->rt6i_dst.addr = fl6->daddr;
2670 rt->rt6i_dst.plen = 128;
2671 rt->rt6i_idev = idev;
2672 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2674 /* Add this dst into uncached_list so that rt6_disable_ip() can
2675 * properly release the net_device on ifdown
2677 rt6_uncached_list_add(rt);
2678 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2680 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
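/* dst garbage collection: skipped entirely while gc_min_interval has
 * not yet elapsed and the entry count is within max_size. Each forced
 * pass grows the ip6_rt_gc_expire horizon by one, hands it to
 * fib6_run_gc(), resets it to gc_timeout/2 once the count falls under
 * gc_thresh, and always decays it by expire >> gc_elasticity; with
 * the default elasticity of 9 that sheds roughly 1/512 per pass.
 */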
2686 static int ip6_dst_gc(struct dst_ops *ops)
2688 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2689 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2690 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2691 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2692 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2693 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2696 entries = dst_entries_get_fast(ops);
2697 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2698 entries <= rt_max_size)
2701 net->ipv6.ip6_rt_gc_expire++;
2702 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2703 entries = dst_entries_get_slow(ops);
2704 if (entries < ops->gc_thresh)
2705 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2707 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2708 return entries > rt_max_size;
2711 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2712 struct fib6_config *cfg,
2713 const struct in6_addr *gw_addr,
2714 u32 tbid, int flags)
2716 struct flowi6 fl6 = {
2717 .flowi6_oif = cfg->fc_ifindex,
2719 .saddr = cfg->fc_prefsrc,
2721 struct fib6_table *table;
2722 struct rt6_info *rt;
2724 table = fib6_get_table(net, tbid);
2728 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2729 flags |= RT6_LOOKUP_F_HAS_SADDR;
2731 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2732 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2734 /* if table lookup failed, fall back to full lookup */
2735 if (rt == net->ipv6.ip6_null_entry) {
2743 static int ip6_route_check_nh_onlink(struct net *net,
2744 struct fib6_config *cfg,
2745 const struct net_device *dev,
2746 struct netlink_ext_ack *extack)
2748 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2749 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2750 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2751 struct fib6_info *from;
2752 struct rt6_info *grt;
2756 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2759 from = rcu_dereference(grt->from);
2760 if (!grt->dst.error &&
2761 /* ignore match if it is the default route */
2762 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2763 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2764 NL_SET_ERR_MSG(extack,
2765 "Nexthop has invalid gateway or device mismatch");
2776 static int ip6_route_check_nh(struct net *net,
2777 struct fib6_config *cfg,
2778 struct net_device **_dev,
2779 struct inet6_dev **idev)
2781 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2782 struct net_device *dev = _dev ? *_dev : NULL;
2783 struct rt6_info *grt = NULL;
2784 int err = -EHOSTUNREACH;
2786 if (cfg->fc_table) {
2787 int flags = RT6_LOOKUP_F_IFACE;
2789 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2790 cfg->fc_table, flags);
2792 if (grt->rt6i_flags & RTF_GATEWAY ||
2793 (dev && dev != grt->dst.dev)) {
2801 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2807 if (dev != grt->dst.dev) {
2812 *_dev = dev = grt->dst.dev;
2813 *idev = grt->rt6i_idev;
2815 in6_dev_hold(grt->rt6i_idev);
2818 if (!(grt->rt6i_flags & RTF_GATEWAY))
2827 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2828 struct net_device **_dev, struct inet6_dev **idev,
2829 struct netlink_ext_ack *extack)
2831 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2832 int gwa_type = ipv6_addr_type(gw_addr);
2833 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2834 const struct net_device *dev = *_dev;
2835 bool need_addr_check = !dev;
2838 /* if gw_addr is a local address, we can fail to detect that here
2839 * when the address is still TENTATIVE (DAD in progress). rt6_lookup()
2840 * will return the already-added prefix route via the interface the
2841 * prefix route was assigned to, which might be non-loopback.
2844 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2845 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2849 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2850 /* IPv6 strictly forbids using non-link-local
2851 * addresses as the nexthop address.
2852 * Otherwise, a router will not be able to send redirects.
2853 * That is generally desirable, but in some (rare!) circumstances
2854 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2855 * some exceptions. --ANK
2856 * We allow IPv4-mapped nexthops to support RFC4798-type addressing.
2859 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2860 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2864 if (cfg->fc_flags & RTNH_F_ONLINK)
2865 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2867 err = ip6_route_check_nh(net, cfg, _dev, idev);
2873 /* reload in case device was changed */
2878 NL_SET_ERR_MSG(extack, "Egress device not specified");
2880 } else if (dev->flags & IFF_LOOPBACK) {
2881 NL_SET_ERR_MSG(extack,
2882 "Egress device can not be loopback device for this route");
2886 /* if we did not check gw_addr above, do so now that the
2887 * egress device has been resolved.
2889 if (need_addr_check &&
2890 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2891 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
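/* Validate a fib6_config and build the corresponding fib6_info. The
 * route is fully constructed here but not yet linked into a table;
 * ip6_route_add() below performs the insertion via __ip6_ins_rt()
 * and then drops the creation reference.
 */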
2900 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2902 struct netlink_ext_ack *extack)
2904 struct net *net = cfg->fc_nlinfo.nl_net;
2905 struct fib6_info *rt = NULL;
2906 struct net_device *dev = NULL;
2907 struct inet6_dev *idev = NULL;
2908 struct fib6_table *table;
2912 /* RTF_PCPU is an internal flag; can not be set by userspace */
2913 if (cfg->fc_flags & RTF_PCPU) {
2914 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2918 /* RTF_CACHE is an internal flag; can not be set by userspace */
2919 if (cfg->fc_flags & RTF_CACHE) {
2920 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2924 if (cfg->fc_type > RTN_MAX) {
2925 NL_SET_ERR_MSG(extack, "Invalid route type");
2929 if (cfg->fc_dst_len > 128) {
2930 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2933 if (cfg->fc_src_len > 128) {
2934 NL_SET_ERR_MSG(extack, "Invalid source address length");
2937 #ifndef CONFIG_IPV6_SUBTREES
2938 if (cfg->fc_src_len) {
2939 NL_SET_ERR_MSG(extack,
2940 "Specifying source address requires IPV6_SUBTREES to be enabled");
2944 if (cfg->fc_ifindex) {
2946 dev = dev_get_by_index(net, cfg->fc_ifindex);
2949 idev = in6_dev_get(dev);
2954 if (cfg->fc_metric == 0)
2955 cfg->fc_metric = IP6_RT_PRIO_USER;
2957 if (cfg->fc_flags & RTNH_F_ONLINK) {
2959 NL_SET_ERR_MSG(extack,
2960 "Nexthop device required for onlink");
2965 if (!(dev->flags & IFF_UP)) {
2966 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2973 if (cfg->fc_nlinfo.nlh &&
2974 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2975 table = fib6_get_table(net, cfg->fc_table);
2977 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2978 table = fib6_new_table(net, cfg->fc_table);
2981 table = fib6_new_table(net, cfg->fc_table);
2988 rt = fib6_info_alloc(gfp_flags);
2992 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2994 if (IS_ERR(rt->fib6_metrics)) {
2995 err = PTR_ERR(rt->fib6_metrics);
2996 /* Do not leave garbage there. */
2997 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3001 if (cfg->fc_flags & RTF_ADDRCONF)
3002 rt->dst_nocount = true;
3004 if (cfg->fc_flags & RTF_EXPIRES)
3005 fib6_set_expires(rt, jiffies +
3006 clock_t_to_jiffies(cfg->fc_expires));
3008 fib6_clean_expires(rt);
3010 if (cfg->fc_protocol == RTPROT_UNSPEC)
3011 cfg->fc_protocol = RTPROT_BOOT;
3012 rt->fib6_protocol = cfg->fc_protocol;
3014 addr_type = ipv6_addr_type(&cfg->fc_dst);
3016 if (cfg->fc_encap) {
3017 struct lwtunnel_state *lwtstate;
3019 err = lwtunnel_build_state(cfg->fc_encap_type,
3020 cfg->fc_encap, AF_INET6, cfg,
3024 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3027 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3028 rt->fib6_dst.plen = cfg->fc_dst_len;
3029 if (rt->fib6_dst.plen == 128)
3030 rt->dst_host = true;
3032 #ifdef CONFIG_IPV6_SUBTREES
3033 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3034 rt->fib6_src.plen = cfg->fc_src_len;
3037 rt->fib6_metric = cfg->fc_metric;
3038 rt->fib6_nh.nh_weight = 1;
3040 rt->fib6_type = cfg->fc_type;
3042 /* We cannot add true routes via loopback here;
3043 * they would result in kernel looping. Promote them to reject routes.
3045 if ((cfg->fc_flags & RTF_REJECT) ||
3046 (dev && (dev->flags & IFF_LOOPBACK) &&
3047 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3048 !(cfg->fc_flags & RTF_LOCAL))) {
3049 /* hold loopback dev/idev if we haven't done so. */
3050 if (dev != net->loopback_dev) {
3055 dev = net->loopback_dev;
3057 idev = in6_dev_get(dev);
3063 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3067 if (cfg->fc_flags & RTF_GATEWAY) {
3068 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3072 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3079 if (idev->cnf.disable_ipv6) {
3080 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3085 if (!(dev->flags & IFF_UP)) {
3086 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3091 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3092 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3093 NL_SET_ERR_MSG(extack, "Invalid source address");
3097 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3098 rt->fib6_prefsrc.plen = 128;
3100 rt->fib6_prefsrc.plen = 0;
3102 rt->fib6_flags = cfg->fc_flags;
3105 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3106 !netif_carrier_ok(dev))
3107 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3108 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3109 rt->fib6_nh.nh_dev = dev;
3110 rt->fib6_table = table;
3122 fib6_info_release(rt);
3123 return ERR_PTR(err);
3126 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3127 struct netlink_ext_ack *extack)
3129 struct fib6_info *rt;
3132 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3136 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3137 fib6_info_release(rt);
3142 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3144 struct net *net = info->nl_net;
3145 struct fib6_table *table;
3148 if (rt == net->ipv6.fib6_null_entry) {
3153 table = rt->fib6_table;
3154 spin_lock_bh(&table->tb6_lock);
3155 err = fib6_del(rt, info);
3156 spin_unlock_bh(&table->tb6_lock);
3159 fib6_info_release(rt);
3163 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3165 struct nl_info info = { .nl_net = net };
3167 return __ip6_del_rt(rt, &info);
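/* Delete a route and, when fc_delete_all_nh is set, all of its
 * multipath siblings under a single table lock, preferring one
 * RTM_DELROUTE notification that carries every hop.
 */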
3170 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3172 struct nl_info *info = &cfg->fc_nlinfo;
3173 struct net *net = info->nl_net;
3174 struct sk_buff *skb = NULL;
3175 struct fib6_table *table;
3178 if (rt == net->ipv6.fib6_null_entry)
3180 table = rt->fib6_table;
3181 spin_lock_bh(&table->tb6_lock);
3183 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3184 struct fib6_info *sibling, *next_sibling;
3186 /* prefer to send a single notification with all hops */
3187 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3189 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3191 if (rt6_fill_node(net, skb, rt, NULL,
3192 NULL, NULL, 0, RTM_DELROUTE,
3193 info->portid, seq, 0) < 0) {
3197 info->skip_notify = 1;
3200 list_for_each_entry_safe(sibling, next_sibling,
3203 err = fib6_del(sibling, info);
3209 err = fib6_del(rt, info);
3211 spin_unlock_bh(&table->tb6_lock);
3213 fib6_info_release(rt);
3216 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3217 info->nlh, gfp_any());
3222 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3226 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3229 if (cfg->fc_flags & RTF_GATEWAY &&
3230 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3233 rc = rt6_remove_exception_rt(rt);
3238 static int ip6_route_del(struct fib6_config *cfg,
3239 struct netlink_ext_ack *extack)
3241 struct rt6_info *rt_cache;
3242 struct fib6_table *table;
3243 struct fib6_info *rt;
3244 struct fib6_node *fn;
3247 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3249 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3255 fn = fib6_locate(&table->tb6_root,
3256 &cfg->fc_dst, cfg->fc_dst_len,
3257 &cfg->fc_src, cfg->fc_src_len,
3258 !(cfg->fc_flags & RTF_CACHE));
3261 for_each_fib6_node_rt_rcu(fn) {
3262 if (cfg->fc_flags & RTF_CACHE) {
3265 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3268 rc = ip6_del_cached_rt(rt_cache, cfg);
3276 if (cfg->fc_ifindex &&
3277 (!rt->fib6_nh.nh_dev ||
3278 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3280 if (cfg->fc_flags & RTF_GATEWAY &&
3281 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3283 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3285 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3287 if (!fib6_info_hold_safe(rt))
3291 /* if a gateway was specified, only delete that one hop */
3292 if (cfg->fc_flags & RTF_GATEWAY)
3293 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3295 return __ip6_del_rt_siblings(rt, cfg);
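/* Process an ICMPv6 redirect received on this dst: run the RFC 4861
 * sanity checks on the rd_msg, update the neighbour entry for the new
 * first hop, and install an RTF_CACHE exception route pointing at the
 * redirected gateway.
 */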
3303 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3305 struct netevent_redirect netevent;
3306 struct rt6_info *rt, *nrt = NULL;
3307 struct ndisc_options ndopts;
3308 struct inet6_dev *in6_dev;
3309 struct neighbour *neigh;
3310 struct fib6_info *from;
3312 int optlen, on_link;
3315 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3316 optlen -= sizeof(*msg);
3319 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3323 msg = (struct rd_msg *)icmp6_hdr(skb);
3325 if (ipv6_addr_is_multicast(&msg->dest)) {
3326 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3331 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3333 } else if (ipv6_addr_type(&msg->target) !=
3334 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3335 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3339 in6_dev = __in6_dev_get(skb->dev);
3342 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3346 * The IP source address of the Redirect MUST be the same as the current
3347 * first-hop router for the specified ICMP Destination Address.
3350 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3351 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3356 if (ndopts.nd_opts_tgt_lladdr) {
3357 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3360 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3365 rt = (struct rt6_info *) dst;
3366 if (rt->rt6i_flags & RTF_REJECT) {
3367 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3371 /* Redirect received -> path was valid.
3372 * Redirects are sent only in response to data packets,
3373 * so this nexthop apparently is reachable. --ANK
3375 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3377 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3382 * We have finally decided to accept it.
3385 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3386 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3387 NEIGH_UPDATE_F_OVERRIDE|
3388 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3389 NEIGH_UPDATE_F_ISROUTER)),
3390 NDISC_REDIRECT, &ndopts);
3393 from = rcu_dereference(rt->from);
3394 /* This fib6_info_hold() is safe here because we hold a reference
3395 * to rt, and rt already holds a reference to its fib6_info.
3397 fib6_info_hold(from);
3400 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3404 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3406 nrt->rt6i_flags &= ~RTF_GATEWAY;
3408 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3410 /* No need to remove rt from the exception table if rt is
3411 * a cached route because rt6_insert_exception() will
3414 if (rt6_insert_exception(nrt, from)) {
3415 dst_release_immediate(&nrt->dst);
3419 netevent.old = &rt->dst;
3420 netevent.new = &nrt->dst;
3421 netevent.daddr = &msg->dest;
3422 netevent.neigh = neigh;
3423 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3426 fib6_info_release(from);
3427 neigh_release(neigh);
3430 #ifdef CONFIG_IPV6_ROUTE_INFO
3431 static struct fib6_info *rt6_get_route_info(struct net *net,
3432 const struct in6_addr *prefix, int prefixlen,
3433 const struct in6_addr *gwaddr,
3434 struct net_device *dev)
3436 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3437 int ifindex = dev->ifindex;
3438 struct fib6_node *fn;
3439 struct fib6_info *rt = NULL;
3440 struct fib6_table *table;
3442 table = fib6_get_table(net, tb_id);
3447 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3451 for_each_fib6_node_rt_rcu(fn) {
3452 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3454 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3456 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3458 if (!fib6_info_hold_safe(rt))
3467 static struct fib6_info *rt6_add_route_info(struct net *net,
3468 const struct in6_addr *prefix, int prefixlen,
3469 const struct in6_addr *gwaddr,
3470 struct net_device *dev,
3473 struct fib6_config cfg = {
3474 .fc_metric = IP6_RT_PRIO_USER,
3475 .fc_ifindex = dev->ifindex,
3476 .fc_dst_len = prefixlen,
3477 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3478 RTF_UP | RTF_PREF(pref),
3479 .fc_protocol = RTPROT_RA,
3480 .fc_type = RTN_UNICAST,
3481 .fc_nlinfo.portid = 0,
3482 .fc_nlinfo.nlh = NULL,
3483 .fc_nlinfo.nl_net = net,
3486 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3487 cfg.fc_dst = *prefix;
3488 cfg.fc_gateway = *gwaddr;
3490 /* We should treat it as a default route if the prefix length is 0. */
3492 cfg.fc_flags |= RTF_DEFAULT;
3494 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3496 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
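/* RA-learned default routers are flagged RTF_ADDRCONF | RTF_DEFAULT
 * and live in RT6_TABLE_DFLT (or the l3mdev table). The lookup below
 * finds the entry for a given gateway and device so that router
 * discovery can refresh or expire it.
 */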
3500 struct fib6_info *rt6_get_dflt_router(struct net *net,
3501 const struct in6_addr *addr,
3502 struct net_device *dev)
3504 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3505 struct fib6_info *rt;
3506 struct fib6_table *table;
3508 table = fib6_get_table(net, tb_id);
3513 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3514 if (dev == rt->fib6_nh.nh_dev &&
3515 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3516 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3519 if (rt && !fib6_info_hold_safe(rt))
3525 struct fib6_info *rt6_add_dflt_router(struct net *net,
3526 const struct in6_addr *gwaddr,
3527 struct net_device *dev,
3530 struct fib6_config cfg = {
3531 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3532 .fc_metric = IP6_RT_PRIO_USER,
3533 .fc_ifindex = dev->ifindex,
3534 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3535 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3536 .fc_protocol = RTPROT_RA,
3537 .fc_type = RTN_UNICAST,
3538 .fc_nlinfo.portid = 0,
3539 .fc_nlinfo.nlh = NULL,
3540 .fc_nlinfo.nl_net = net,
3543 cfg.fc_gateway = *gwaddr;
3545 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3546 struct fib6_table *table;
3548 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3550 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3553 return rt6_get_dflt_router(net, gwaddr, dev);
3556 static void __rt6_purge_dflt_routers(struct net *net,
3557 struct fib6_table *table)
3559 struct fib6_info *rt;
3563 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3564 struct net_device *dev = fib6_info_nh_dev(rt);
3565 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3567 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3568 (!idev || idev->cnf.accept_ra != 2) &&
3569 fib6_info_hold_safe(rt)) {
3571 ip6_del_rt(net, rt);
3577 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3580 void rt6_purge_dflt_routers(struct net *net)
3582 struct fib6_table *table;
3583 struct hlist_head *head;
3588 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3589 head = &net->ipv6.fib_table_hash[h];
3590 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3591 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3592 __rt6_purge_dflt_routers(net, table);
3599 static void rtmsg_to_fib6_config(struct net *net,
3600 struct in6_rtmsg *rtmsg,
3601 struct fib6_config *cfg)
3603 *cfg = (struct fib6_config){
3604 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3606 .fc_ifindex = rtmsg->rtmsg_ifindex,
3607 .fc_metric = rtmsg->rtmsg_metric,
3608 .fc_expires = rtmsg->rtmsg_info,
3609 .fc_dst_len = rtmsg->rtmsg_dst_len,
3610 .fc_src_len = rtmsg->rtmsg_src_len,
3611 .fc_flags = rtmsg->rtmsg_flags,
3612 .fc_type = rtmsg->rtmsg_type,
3614 .fc_nlinfo.nl_net = net,
3616 .fc_dst = rtmsg->rtmsg_dst,
3617 .fc_src = rtmsg->rtmsg_src,
3618 .fc_gateway = rtmsg->rtmsg_gateway,
3622 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3624 struct fib6_config cfg;
3625 struct in6_rtmsg rtmsg;
3629 case SIOCADDRT: /* Add a route */
3630 case SIOCDELRT: /* Delete a route */
3631 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3633 err = copy_from_user(&rtmsg, arg,
3634 sizeof(struct in6_rtmsg));
3638 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3643 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3646 err = ip6_route_del(&cfg, NULL);
3660 * Drop the packet on the floor
3663 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3666 struct dst_entry *dst = skb_dst(skb);
3667 switch (ipstats_mib_noroutes) {
3668 case IPSTATS_MIB_INNOROUTES:
3669 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3670 if (type == IPV6_ADDR_ANY) {
3671 IP6_INC_STATS(dev_net(dst->dev),
3672 __in6_dev_get_safely(skb->dev),
3673 IPSTATS_MIB_INADDRERRORS);
3677 case IPSTATS_MIB_OUTNOROUTES:
3678 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3679 ipstats_mib_noroutes);
3682 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3687 static int ip6_pkt_discard(struct sk_buff *skb)
3689 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3692 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3694 skb->dev = skb_dst(skb)->dev;
3695 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3698 static int ip6_pkt_prohibit(struct sk_buff *skb)
3700 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3703 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3705 skb->dev = skb_dst(skb)->dev;
3706 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3710 * Allocate a dst for local (unicast / anycast) address.
3713 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3714 struct inet6_dev *idev,
3715 const struct in6_addr *addr,
3716 bool anycast, gfp_t gfp_flags)
3719 struct net_device *dev = idev->dev;
3720 struct fib6_info *f6i;
3722 f6i = fib6_info_alloc(gfp_flags);
3724 return ERR_PTR(-ENOMEM);
3726 f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3727 f6i->dst_nocount = true;
3728 f6i->dst_host = true;
3729 f6i->fib6_protocol = RTPROT_KERNEL;
3730 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3732 f6i->fib6_type = RTN_ANYCAST;
3733 f6i->fib6_flags |= RTF_ANYCAST;
3735 f6i->fib6_type = RTN_LOCAL;
3736 f6i->fib6_flags |= RTF_LOCAL;
3739 f6i->fib6_nh.nh_gw = *addr;
3741 f6i->fib6_nh.nh_dev = dev;
3742 f6i->fib6_dst.addr = *addr;
3743 f6i->fib6_dst.plen = 128;
3744 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3745 f6i->fib6_table = fib6_get_table(net, tb_id);
3750 /* remove deleted ip from prefsrc entries */
3751 struct arg_dev_net_ip {
3752 struct net_device *dev;
3754 struct in6_addr *addr;
3757 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3759 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3760 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3761 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3763 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3764 rt != net->ipv6.fib6_null_entry &&
3765 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3766 spin_lock_bh(&rt6_exception_lock);
3767 /* remove prefsrc entry */
3768 rt->fib6_prefsrc.plen = 0;
3769 spin_unlock_bh(&rt6_exception_lock);
3774 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3776 struct net *net = dev_net(ifp->idev->dev);
3777 struct arg_dev_net_ip adni = {
3778 .dev = ifp->idev->dev,
3782 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3785 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3787 /* Remove routers and update dst entries when a gateway turns into a host. */
3788 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3790 struct in6_addr *gateway = (struct in6_addr *)arg;
3792 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3793 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3797 /* Further clean up cached routes in the exception table.
3798 * This is needed because a cached route may have a different
3799 * gateway than its 'parent' in the case of an ip redirect.
3801 rt6_exceptions_clean_tohost(rt, gateway);
3806 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3808 fib6_clean_all(net, fib6_clean_tohost, gateway);
3811 struct arg_netdev_event {
3812 const struct net_device *dev;
3814 unsigned int nh_flags;
3815 unsigned long event;
3819 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3821 struct fib6_info *iter;
3822 struct fib6_node *fn;
3824 fn = rcu_dereference_protected(rt->fib6_node,
3825 lockdep_is_held(&rt->fib6_table->tb6_lock));
3826 iter = rcu_dereference_protected(fn->leaf,
3827 lockdep_is_held(&rt->fib6_table->tb6_lock));
3829 if (iter->fib6_metric == rt->fib6_metric &&
3830 rt6_qualify_for_ecmp(iter))
3832 iter = rcu_dereference_protected(iter->fib6_next,
3833 lockdep_is_held(&rt->fib6_table->tb6_lock));
3839 static bool rt6_is_dead(const struct fib6_info *rt)
3841 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3842 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3843 fib6_ignore_linkdown(rt)))
3849 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3851 struct fib6_info *iter;
3854 if (!rt6_is_dead(rt))
3855 total += rt->fib6_nh.nh_weight;
3857 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3858 if (!rt6_is_dead(iter))
3859 total += iter->fib6_nh.nh_weight;
3865 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3867 int upper_bound = -1;
3869 if (!rt6_is_dead(rt)) {
3870 *weight += rt->fib6_nh.nh_weight;
3871 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3874 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3877 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3879 struct fib6_info *iter;
3882 rt6_upper_bound_set(rt, &weight, total);
3884 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3885 rt6_upper_bound_set(iter, &weight, total);
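/* Weighted ECMP bookkeeping: each live nexthop is assigned an upper
 * bound proportional to its cumulative weight, scaled into [0, 2^31),
 * while a dead nexthop keeps -1 so the flow hash can never select it.
 * For example, two siblings with weights 1 and 3 end up with bounds
 * of roughly 2^31/4 - 1 and 2^31 - 1, giving the second nexthop three
 * quarters of the hash space.
 */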
3888 void rt6_multipath_rebalance(struct fib6_info *rt)
3890 struct fib6_info *first;
3893 /* If the entire multipath route was marked for flushing, there is
3894 * no need to rebalance upon the removal of every sibling route.
3897 if (!rt->fib6_nsiblings || rt->should_flush)
3900 /* During lookup routes are evaluated in order, so we need to make
3901 * sure upper bounds are assigned from the first sibling onwards.
3904 first = rt6_multipath_first_sibling(rt);
3905 if (WARN_ON_ONCE(!first))
3908 total = rt6_multipath_total_weight(first);
3909 rt6_multipath_upper_bound_set(first, total);
3912 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3914 const struct arg_netdev_event *arg = p_arg;
3915 struct net *net = dev_net(arg->dev);
3917 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3918 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3919 fib6_update_sernum_upto_root(net, rt);
3920 rt6_multipath_rebalance(rt);
3926 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3928 struct arg_netdev_event arg = {
3931 .nh_flags = nh_flags,
3935 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3936 arg.nh_flags |= RTNH_F_LINKDOWN;
3938 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3941 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3942 const struct net_device *dev)
3944 struct fib6_info *iter;
3946 if (rt->fib6_nh.nh_dev == dev)
3948 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3949 if (iter->fib6_nh.nh_dev == dev)
3955 static void rt6_multipath_flush(struct fib6_info *rt)
3957 struct fib6_info *iter;
3959 rt->should_flush = 1;
3960 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3961 iter->should_flush = 1;
3964 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3965 const struct net_device *down_dev)
3967 struct fib6_info *iter;
3968 unsigned int dead = 0;
3970 if (rt->fib6_nh.nh_dev == down_dev ||
3971 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3973 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3974 if (iter->fib6_nh.nh_dev == down_dev ||
3975 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3981 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3982 const struct net_device *dev,
3983 unsigned int nh_flags)
3985 struct fib6_info *iter;
3987 if (rt->fib6_nh.nh_dev == dev)
3988 rt->fib6_nh.nh_flags |= nh_flags;
3989 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3990 if (iter->fib6_nh.nh_dev == dev)
3991 iter->fib6_nh.nh_flags |= nh_flags;
3994 /* called with write lock held for table with rt */
3995 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3997 const struct arg_netdev_event *arg = p_arg;
3998 const struct net_device *dev = arg->dev;
3999 struct net *net = dev_net(dev);
4001 if (rt == net->ipv6.fib6_null_entry)
4004 switch (arg->event) {
4005 case NETDEV_UNREGISTER:
4006 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4008 if (rt->should_flush)
4010 if (!rt->fib6_nsiblings)
4011 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4012 if (rt6_multipath_uses_dev(rt, dev)) {
4015 count = rt6_multipath_dead_count(rt, dev);
4016 if (rt->fib6_nsiblings + 1 == count) {
4017 rt6_multipath_flush(rt);
4020 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4022 fib6_update_sernum(net, rt);
4023 rt6_multipath_rebalance(rt);
4027 if (rt->fib6_nh.nh_dev != dev ||
4028 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4030 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4031 rt6_multipath_rebalance(rt);
4038 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4040 struct arg_netdev_event arg = {
4046 struct net *net = dev_net(dev);
4048 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4049 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4051 fib6_clean_all(net, fib6_ifdown, &arg);
4054 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4056 rt6_sync_down_dev(dev, event);
4057 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4058 neigh_ifdown(&nd_tbl, dev);
4061 struct rt6_mtu_change_arg {
4062 struct net_device *dev;
4066 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4068 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4069 struct inet6_dev *idev;
4071 /* In IPv6 pmtu discovery is not optional,
4072 so the RTAX_MTU lock cannot disable it.
4073 We still use this lock to block changes
4074 caused by addrconf/ndisc.
4077 idev = __in6_dev_get(arg->dev);
4081 /* For an administrative MTU increase, there is no way to discover
4082 the IPv6 PMTU increase, so the PMTU increase should be updated here.
4083 Since RFC 1981 doesn't cover administrative MTU increases,
4084 updating the PMTU on increase is a MUST (e.g. for jumbo frames).
4086 if (rt->fib6_nh.nh_dev == arg->dev &&
4087 !fib6_metric_locked(rt, RTAX_MTU)) {
4088 u32 mtu = rt->fib6_pmtu;
4090 if (mtu >= arg->mtu ||
4091 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4092 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4094 spin_lock_bh(&rt6_exception_lock);
4095 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4096 spin_unlock_bh(&rt6_exception_lock);
4101 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4103 struct rt6_mtu_change_arg arg = {
4108 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4111 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4112 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4113 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4114 [RTA_OIF] = { .type = NLA_U32 },
4115 [RTA_IIF] = { .type = NLA_U32 },
4116 [RTA_PRIORITY] = { .type = NLA_U32 },
4117 [RTA_METRICS] = { .type = NLA_NESTED },
4118 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4119 [RTA_PREF] = { .type = NLA_U8 },
4120 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4121 [RTA_ENCAP] = { .type = NLA_NESTED },
4122 [RTA_EXPIRES] = { .type = NLA_U32 },
4123 [RTA_UID] = { .type = NLA_U32 },
4124 [RTA_MARK] = { .type = NLA_U32 },
4125 [RTA_TABLE] = { .type = NLA_U32 },
4126 [RTA_IP_PROTO] = { .type = NLA_U8 },
4127 [RTA_SPORT] = { .type = NLA_U16 },
4128 [RTA_DPORT] = { .type = NLA_U16 },
4131 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4132 struct fib6_config *cfg,
4133 struct netlink_ext_ack *extack)
4136 struct nlattr *tb[RTA_MAX+1];
4140 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4146 rtm = nlmsg_data(nlh);
4148 *cfg = (struct fib6_config){
4149 .fc_table = rtm->rtm_table,
4150 .fc_dst_len = rtm->rtm_dst_len,
4151 .fc_src_len = rtm->rtm_src_len,
4153 .fc_protocol = rtm->rtm_protocol,
4154 .fc_type = rtm->rtm_type,
4156 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
4157 .fc_nlinfo.nlh = nlh,
4158 .fc_nlinfo.nl_net = sock_net(skb->sk),
4161 if (rtm->rtm_type == RTN_UNREACHABLE ||
4162 rtm->rtm_type == RTN_BLACKHOLE ||
4163 rtm->rtm_type == RTN_PROHIBIT ||
4164 rtm->rtm_type == RTN_THROW)
4165 cfg->fc_flags |= RTF_REJECT;
4167 if (rtm->rtm_type == RTN_LOCAL)
4168 cfg->fc_flags |= RTF_LOCAL;
4170 if (rtm->rtm_flags & RTM_F_CLONED)
4171 cfg->fc_flags |= RTF_CACHE;
4173 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4175 if (tb[RTA_GATEWAY]) {
4176 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4177 cfg->fc_flags |= RTF_GATEWAY;
4181 int plen = (rtm->rtm_dst_len + 7) >> 3;
4183 if (nla_len(tb[RTA_DST]) < plen)
4186 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4190 int plen = (rtm->rtm_src_len + 7) >> 3;
4192 if (nla_len(tb[RTA_SRC]) < plen)
4195 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4198 if (tb[RTA_PREFSRC])
4199 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4202 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4204 if (tb[RTA_PRIORITY])
4205 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4207 if (tb[RTA_METRICS]) {
4208 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4209 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4213 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4215 if (tb[RTA_MULTIPATH]) {
4216 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4217 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4219 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4220 cfg->fc_mp_len, extack);
4226 pref = nla_get_u8(tb[RTA_PREF]);
4227 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4228 pref != ICMPV6_ROUTER_PREF_HIGH)
4229 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4230 cfg->fc_flags |= RTF_PREF(pref);
4234 cfg->fc_encap = tb[RTA_ENCAP];
4236 if (tb[RTA_ENCAP_TYPE]) {
4237 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4239 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4244 if (tb[RTA_EXPIRES]) {
4245 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4247 if (addrconf_finite_timeout(timeout)) {
4248 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4249 cfg->fc_flags |= RTF_EXPIRES;
4259 struct fib6_info *fib6_info;
4260 struct fib6_config r_cfg;
4261 struct list_head next;
4264 static int ip6_route_info_append(struct net *net,
4265 struct list_head *rt6_nh_list,
4266 struct fib6_info *rt,
4267 struct fib6_config *r_cfg)
4272 list_for_each_entry(nh, rt6_nh_list, next) {
4273 /* check if fib6_info already exists */
4274 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4278 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4282 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4283 list_add_tail(&nh->next, rt6_nh_list);
4288 static void ip6_route_mpath_notify(struct fib6_info *rt,
4289 struct fib6_info *rt_last,
4290 struct nl_info *info,
4293 /* if this is an APPEND route, then rt points to the first route
4294 * inserted and rt_last points to the last route inserted. Userspace
4295 * wants a consistent dump of the route which starts at the first
4296 * nexthop. Since sibling routes are always added at the end of
4297 * the list, find the first sibling of the last route appended
4299 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4300 rt = list_first_entry(&rt_last->fib6_siblings,
4306 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
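/* RTA_MULTIPATH add/replace: walk the rtnexthop array, create one
 * fib6_info per hop via ip6_route_info_create(), and insert them in
 * order so they become siblings of the first route. If any insertion
 * fails, the hops added so far are rolled back with ip6_route_del().
 */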
4309 static int ip6_route_multipath_add(struct fib6_config *cfg,
4310 struct netlink_ext_ack *extack)
4312 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4313 struct nl_info *info = &cfg->fc_nlinfo;
4314 struct fib6_config r_cfg;
4315 struct rtnexthop *rtnh;
4316 struct fib6_info *rt;
4317 struct rt6_nh *err_nh;
4318 struct rt6_nh *nh, *nh_safe;
4324 int replace = (cfg->fc_nlinfo.nlh &&
4325 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4326 LIST_HEAD(rt6_nh_list);
4328 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4329 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4330 nlflags |= NLM_F_APPEND;
4332 remaining = cfg->fc_mp_len;
4333 rtnh = (struct rtnexthop *)cfg->fc_mp;
4335 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4336 * fib6_info structs per nexthop
4338 while (rtnh_ok(rtnh, remaining)) {
4339 memcpy(&r_cfg, cfg, sizeof(*cfg));
4340 if (rtnh->rtnh_ifindex)
4341 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4343 attrlen = rtnh_attrlen(rtnh);
4345 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4347 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4349 r_cfg.fc_gateway = nla_get_in6_addr(nla);
4350 r_cfg.fc_flags |= RTF_GATEWAY;
4352 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4353 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4355 r_cfg.fc_encap_type = nla_get_u16(nla);
4358 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4359 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4365 if (!rt6_qualify_for_ecmp(rt)) {
4367 NL_SET_ERR_MSG(extack,
4368 "Device only routes can not be added for IPv6 using the multipath API.");
4369 fib6_info_release(rt);
4373 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4375 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4378 fib6_info_release(rt);
4382 rtnh = rtnh_next(rtnh, &remaining);
4385 /* for add and replace send one notification with all nexthops.
4386 * Skip the notification in fib6_add_rt2node and send one with
4387 * the full route when done
4389 info->skip_notify = 1;
4392 list_for_each_entry(nh, &rt6_nh_list, next) {
4393 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4394 fib6_info_release(nh->fib6_info);
4397 /* save reference to last route successfully inserted */
4398 rt_last = nh->fib6_info;
4400 /* save reference to first route for notification */
4402 rt_notif = nh->fib6_info;
4405 /* nh->fib6_info is used or freed at this point, reset to NULL*/
4406 nh->fib6_info = NULL;
4409 NL_SET_ERR_MSG_MOD(extack,
4410 "multipath route replace failed (check consistency of installed routes)");
4415 /* Because each route is added like a single route, we remove
4416 * these flags after the first nexthop: if there is a collision,
4417 * we have already failed to add the first nexthop because
4418 * fib6_add_rt2node() has rejected it; when replacing, old
4419 * nexthops have been replaced by the first new one, and the rest should be added to it.
4422 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4427 /* success ... tell user about new route */
4428 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4432 /* send notification for routes that were added so that
4433 * the delete notifications sent by ip6_route_del are coherent.
4437 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4439 /* Delete routes that were already added */
4440 list_for_each_entry(nh, &rt6_nh_list, next) {
4443 ip6_route_del(&nh->r_cfg, extack);
4447 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4449 fib6_info_release(nh->fib6_info);
4450 list_del(&nh->next);
4457 static int ip6_route_multipath_del(struct fib6_config *cfg,
4458 struct netlink_ext_ack *extack)
4460 struct fib6_config r_cfg;
4461 struct rtnexthop *rtnh;
4464 int err = 1, last_err = 0;
4466 remaining = cfg->fc_mp_len;
4467 rtnh = (struct rtnexthop *)cfg->fc_mp;
4469 /* Parse a Multipath Entry */
4470 while (rtnh_ok(rtnh, remaining)) {
4471 memcpy(&r_cfg, cfg, sizeof(*cfg));
4472 if (rtnh->rtnh_ifindex)
4473 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4475 attrlen = rtnh_attrlen(rtnh);
4477 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4479 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4481 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4482 r_cfg.fc_flags |= RTF_GATEWAY;
4485 err = ip6_route_del(&r_cfg, extack);
4489 rtnh = rtnh_next(rtnh, &remaining);
4495 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4496 struct netlink_ext_ack *extack)
4498 struct fib6_config cfg;
4501 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4506 return ip6_route_multipath_del(&cfg, extack);
4508 cfg.fc_delete_all_nh = 1;
4509 return ip6_route_del(&cfg, extack);
4513 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4514 struct netlink_ext_ack *extack)
4516 struct fib6_config cfg;
4519 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4524 return ip6_route_multipath_add(&cfg, extack);
4526 return ip6_route_add(&cfg, GFP_KERNEL, extack);
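/* Worst-case netlink message size for a notification about this
 * route; used to size the skb in inet6_rt_notify() and
 * __ip6_del_rt_siblings(). rt6_fill_node() returning -EMSGSIZE
 * against this budget indicates a bug (see the WARN_ON further down).
 */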
4529 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4531 int nexthop_len = 0;
4533 if (rt->fib6_nsiblings) {
4534 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4535 + NLA_ALIGN(sizeof(struct rtnexthop))
4536 + nla_total_size(16) /* RTA_GATEWAY */
4537 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4539 nexthop_len *= rt->fib6_nsiblings;
4542 return NLMSG_ALIGN(sizeof(struct rtmsg))
4543 + nla_total_size(16) /* RTA_SRC */
4544 + nla_total_size(16) /* RTA_DST */
4545 + nla_total_size(16) /* RTA_GATEWAY */
4546 + nla_total_size(16) /* RTA_PREFSRC */
4547 + nla_total_size(4) /* RTA_TABLE */
4548 + nla_total_size(4) /* RTA_IIF */
4549 + nla_total_size(4) /* RTA_OIF */
4550 + nla_total_size(4) /* RTA_PRIORITY */
4551 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4552 + nla_total_size(sizeof(struct rta_cacheinfo))
4553 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4554 + nla_total_size(1) /* RTA_PREF */
4555 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4559 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4560 unsigned int *flags, bool skip_oif)
4562 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4563 *flags |= RTNH_F_DEAD;
4565 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4566 *flags |= RTNH_F_LINKDOWN;
4569 if (fib6_ignore_linkdown(rt))
4570 *flags |= RTNH_F_DEAD;
4574 if (rt->fib6_flags & RTF_GATEWAY) {
4575 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4576 goto nla_put_failure;
4579 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4580 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4581 *flags |= RTNH_F_OFFLOAD;
4583 /* not needed for multipath encoding because it has a rtnexthop struct */
4584 if (!skip_oif && rt->fib6_nh.nh_dev &&
4585 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4586 goto nla_put_failure;
4588 if (rt->fib6_nh.nh_lwtstate &&
4589 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4590 goto nla_put_failure;
4598 /* add one multipath next hop; rtnh_hops carries the nexthop weight minus one, per rtnetlink convention */
4599 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4601 const struct net_device *dev = rt->fib6_nh.nh_dev;
4602 struct rtnexthop *rtnh;
4603 unsigned int flags = 0;
4605 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4607 goto nla_put_failure;
4609 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4610 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4612 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4613 goto nla_put_failure;
4615 rtnh->rtnh_flags = flags;
4617 /* length of rtnetlink header + attributes */
4618 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4626 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4627 struct fib6_info *rt, struct dst_entry *dst,
4628 struct in6_addr *dest, struct in6_addr *src,
4629 int iif, int type, u32 portid, u32 seq,
4632 struct rt6_info *rt6 = (struct rt6_info *)dst;
4633 struct rt6key *rt6_dst, *rt6_src;
4634 u32 *pmetrics, table, rt6_flags;
4635 struct nlmsghdr *nlh;
4639 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4644 rt6_dst = &rt6->rt6i_dst;
4645 rt6_src = &rt6->rt6i_src;
4646 rt6_flags = rt6->rt6i_flags;
4648 rt6_dst = &rt->fib6_dst;
4649 rt6_src = &rt->fib6_src;
4650 rt6_flags = rt->fib6_flags;
4653 rtm = nlmsg_data(nlh);
4654 rtm->rtm_family = AF_INET6;
4655 rtm->rtm_dst_len = rt6_dst->plen;
4656 rtm->rtm_src_len = rt6_src->plen;
4659 table = rt->fib6_table->tb6_id;
4661 table = RT6_TABLE_UNSPEC;
4662 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4663 if (nla_put_u32(skb, RTA_TABLE, table))
4664 goto nla_put_failure;
4666 rtm->rtm_type = rt->fib6_type;
4668 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4669 rtm->rtm_protocol = rt->fib6_protocol;
4671 if (rt6_flags & RTF_CACHE)
4672 rtm->rtm_flags |= RTM_F_CLONED;
4675 if (nla_put_in6_addr(skb, RTA_DST, dest))
4676 goto nla_put_failure;
4677 rtm->rtm_dst_len = 128;
4678 } else if (rtm->rtm_dst_len)
4679 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4680 goto nla_put_failure;
4681 #ifdef CONFIG_IPV6_SUBTREES
4683 if (nla_put_in6_addr(skb, RTA_SRC, src))
4684 goto nla_put_failure;
4685 rtm->rtm_src_len = 128;
4686 } else if (rtm->rtm_src_len &&
4687 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4688 goto nla_put_failure;
4691 #ifdef CONFIG_IPV6_MROUTE
4692 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4693 int err = ip6mr_get_route(net, skb, rtm, portid);
4698 goto nla_put_failure;
4701 if (nla_put_u32(skb, RTA_IIF, iif))
4702 goto nla_put_failure;
4704 struct in6_addr saddr_buf;
4705 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4706 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4707 goto nla_put_failure;
4710 if (rt->fib6_prefsrc.plen) {
4711 struct in6_addr saddr_buf;
4712 saddr_buf = rt->fib6_prefsrc.addr;
4713 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4714 goto nla_put_failure;
4717 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4718 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4719 goto nla_put_failure;
4721 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4722 goto nla_put_failure;
4724 /* For multipath routes, walk the siblings list and add
4725 * each as a nexthop within RTA_MULTIPATH.
4728 if (rt6_flags & RTF_GATEWAY &&
4729 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4730 goto nla_put_failure;
4732 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4733 goto nla_put_failure;
4734 } else if (rt->fib6_nsiblings) {
4735 struct fib6_info *sibling, *next_sibling;
4738 mp = nla_nest_start(skb, RTA_MULTIPATH);
4740 goto nla_put_failure;
4742 if (rt6_add_nexthop(skb, rt) < 0)
4743 goto nla_put_failure;
4745 list_for_each_entry_safe(sibling, next_sibling,
4746 &rt->fib6_siblings, fib6_siblings) {
4747 if (rt6_add_nexthop(skb, sibling) < 0)
4748 goto nla_put_failure;
4751 nla_nest_end(skb, mp);
4753 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4754 goto nla_put_failure;
4757 if (rt6_flags & RTF_EXPIRES) {
4758 expires = dst ? dst->expires : rt->expires;
4762 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4763 goto nla_put_failure;
4765 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4766 goto nla_put_failure;
4769 nlmsg_end(skb, nlh);
4773 nlmsg_cancel(skb, nlh);
4777 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4778 const struct net_device *dev)
4780 if (f6i->fib6_nh.nh_dev == dev)
4783 if (f6i->fib6_nsiblings) {
4784 struct fib6_info *sibling, *next_sibling;
4786 list_for_each_entry_safe(sibling, next_sibling,
4787 &f6i->fib6_siblings, fib6_siblings) {
4788 if (sibling->fib6_nh.nh_dev == dev)
4796 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4798 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4799 struct fib_dump_filter *filter = &arg->filter;
4800 unsigned int flags = NLM_F_MULTI;
4801 struct net *net = arg->net;
4803 if (rt == net->ipv6.fib6_null_entry)
4806 if ((filter->flags & RTM_F_PREFIX) &&
4807 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4808 /* success since this is not a prefix route */
4811 if (filter->filter_set) {
4812 if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4813 (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4814 (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4817 flags |= NLM_F_DUMP_FILTERED;
4820 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4821 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4822 arg->cb->nlh->nlmsg_seq, flags);
4825 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4826 const struct nlmsghdr *nlh,
4828 struct netlink_ext_ack *extack)
4833 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4834 NL_SET_ERR_MSG_MOD(extack,
4835 "Invalid header for get route request");
4839 if (!netlink_strict_get_check(skb))
4840 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4841 rtm_ipv6_policy, extack);
4843 rtm = nlmsg_data(nlh);
4844 if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4845 (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4846 rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4848 NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4851 if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4852 NL_SET_ERR_MSG_MOD(extack,
4853 "Invalid flags for get route request");
4857 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4858 rtm_ipv6_policy, extack);
4862 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4863 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4864 NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4868 for (i = 0; i <= RTA_MAX; i++) {
4884 NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4892 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4893 struct netlink_ext_ack *extack)
4895 struct net *net = sock_net(in_skb->sk);
4896 struct nlattr *tb[RTA_MAX+1];
4897 int err, iif = 0, oif = 0;
4898 struct fib6_info *from;
4899 struct dst_entry *dst;
4900 struct rt6_info *rt;
4901 struct sk_buff *skb;
4903 struct flowi6 fl6 = {};
4906 err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4911 rtm = nlmsg_data(nlh);
4912 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4913 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4916 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4919 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4923 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4926 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4930 iif = nla_get_u32(tb[RTA_IIF]);
4933 oif = nla_get_u32(tb[RTA_OIF]);
4936 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4939 fl6.flowi6_uid = make_kuid(current_user_ns(),
4940 nla_get_u32(tb[RTA_UID]));
4942 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4945 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4948 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4950 if (tb[RTA_IP_PROTO]) {
4951 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4952 &fl6.flowi6_proto, extack);
4958 struct net_device *dev;
4963 dev = dev_get_by_index_rcu(net, iif);
4970 fl6.flowi6_iif = iif;
4972 if (!ipv6_addr_any(&fl6.saddr))
4973 flags |= RT6_LOOKUP_F_HAS_SADDR;
4975 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4979 fl6.flowi6_oif = oif;
4981 dst = ip6_route_output(net, NULL, &fl6);
4985 rt = container_of(dst, struct rt6_info, dst);
4986 if (rt->dst.error) {
4987 err = rt->dst.error;
4992 if (rt == net->ipv6.ip6_null_entry) {
4993 err = rt->dst.error;
4998 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5005 skb_dst_set(skb, &rt->dst);
5008 from = rcu_dereference(rt->from);
5011 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5012 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5015 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5016 &fl6.saddr, iif, RTM_NEWROUTE,
5017 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5026 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5031 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5032 unsigned int nlm_flags)
5034 struct sk_buff *skb;
5035 struct net *net = info->nl_net;
5040 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5042 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5046 err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5047 event, info->portid, seq, nlm_flags);
5049 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5050 WARN_ON(err == -EMSGSIZE);
5054 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5055 info->nlh, gfp_any());
5059 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
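/* The null/prohibit/blackhole entries have no nexthop device of
 * their own, so point them at the loopback device once it registers
 * and drop those references again when it unregisters.
 */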
5062 static int ip6_route_dev_notify(struct notifier_block *this,
5063 unsigned long event, void *ptr)
5065 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5066 struct net *net = dev_net(dev);
5068 if (!(dev->flags & IFF_LOOPBACK))
5071 if (event == NETDEV_REGISTER) {
5072 net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5073 net->ipv6.ip6_null_entry->dst.dev = dev;
5074 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5075 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5076 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5077 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5078 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5079 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5081 } else if (event == NETDEV_UNREGISTER &&
5082 dev->reg_state != NETREG_UNREGISTERED) {
5083 /* NETDEV_UNREGISTER could be fired multiple times by
5084 * netdev_wait_allrefs(). Make sure we only call this once.
5086 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5087 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5088 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5089 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
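
/* rt6_stats_seq_show() above backs /proc/net/rt6_stats.  All seven
 * fields are printed in hex; an illustrative (made-up) sample:
 *
 *	$ cat /proc/net/rt6_stats
 *	0031 0058 05e2 02dd 0000 0003 0000
 *
 * in order: fib nodes, route nodes, route allocs, route entries,
 * cached routes, dst entries, discarded routes.
 */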
#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
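
/* The handler above is write-only: a read attempt returns -EINVAL,
 * while a write triggers a garbage-collection cycle over the FIB tree,
 * e.g. (assuming the usual procfs mount):
 *
 *	# echo 1 > /proc/sys/net/ipv6/route/flush
 */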
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "skip_notify_on_dev_down",
		.data		= &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
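
/* ipv6_route_sysctl_init() below patches the .data pointers of the
 * per-namespace copy by array index, so the order of the entries above
 * must stay in sync with the table[N] assignments there.
 */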
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
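
/* These knobs surface under /proc/sys/net/ipv6/route/ in each netns;
 * for example (illustrative):
 *
 *	$ sysctl net.ipv6.route.gc_thresh
 *	# sysctl -w net.ipv6.route.max_size=16384
 */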
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
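
/* Note the unwind order above: each failure label releases only what
 * was set up before the corresponding failure point, walking the
 * allocations back in reverse.
 */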
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			       rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
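
/* Bring-up order in ip6_route_init() below matters: the dst kmem cache
 * and the pernet subsystems must be in place before fib6/xfrm6/rules
 * init, and the rtnetlink handlers and the netdev notifier are only
 * registered once everything they can be called against exists.
 */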
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}