2 * Linux INET6 implementation
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
// Prototypes for static functions that are referenced before their
// definitions: the route-copy helper (ip6_rt_copy), dst_ops callbacks that the
// ops tables below point at, the garbage collector entry point, and the
// packet discard handlers used by the route templates.
65 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
66 const struct in6_addr *dest);
67 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
68 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
69 static unsigned int ip6_mtu(const struct dst_entry *dst);
70 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
71 static void ip6_dst_destroy(struct dst_entry *);
72 static void ip6_dst_ifdown(struct dst_entry *,
73 struct net_device *dev, int how);
74 static int ip6_dst_gc(struct dst_ops *ops);
76 static int ip6_pkt_discard(struct sk_buff *skb);
77 static int ip6_pkt_discard_out(struct sk_buff *skb);
78 static void ip6_link_failure(struct sk_buff *skb);
79 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
// Prototypes for the Route Information helpers called by rt6_route_rcv().
// They are only declared when CONFIG_IPV6_ROUTE_INFO is enabled. Both take
// the namespace, a prefix with its length, the advertising gateway and the
// interface index.
81 #ifdef CONFIG_IPV6_ROUTE_INFO
82 static struct rt6_info *rt6_add_route_info(struct net *net,
83 const struct in6_addr *prefix, int prefixlen,
84 const struct in6_addr *gwaddr, int ifindex,
86 static struct rt6_info *rt6_get_route_info(struct net *net,
87 const struct in6_addr *prefix, int prefixlen,
88 const struct in6_addr *gwaddr, int ifindex);
// cow_metrics callback installed in ip6_dst_ops_template.
// @dst: route whose metrics are about to be written.
// @old: current raw value of dst->_metrics (pointer plus flag bits).
// Steps visible here: the DST_HOST flag of the route is tested first; an
// inet_peer is bound with rt6_bind_peer(rt, 1); when inet_metrics_new(peer)
// is true the old metrics (RTAX_MAX u32 values) are copied into p; p is then
// published into dst->_metrics with cmpxchg() against @old. The value that
// cmpxchg() returns (prev) can be turned back into a metrics pointer and is
// checked for DST_METRICS_READ_ONLY.
91 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
93 struct rt6_info *rt = (struct rt6_info *) dst;
94 struct inet_peer *peer;
97 if (!(rt->dst.flags & DST_HOST))
101 rt6_bind_peer(rt, 1);
103 peer = rt->rt6i_peer;
105 u32 *old_p = __DST_METRICS_PTR(old);
106 unsigned long prev, new;
109 if (inet_metrics_new(peer))
110 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
112 new = (unsigned long) p;
113 prev = cmpxchg(&dst->_metrics, old, new);
116 p = __DST_METRICS_PTR(prev);
117 if (prev & DST_METRICS_READ_ONLY)
// neigh_lookup callback for both dst_ops tables in this file.
// Looks @daddr up in the IPv6 neighbour table (nd_tbl) on dst->dev with
// __ipv6_neigh_lookup(); the other return path creates the entry with
// neigh_create() on the same device.
124 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
126 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
129 return neigh_create(&nd_tbl, daddr, dst->dev);
// Attach a neighbour entry for the route's gateway (rt->rt6i_gateway) on @dev.
// The entry is looked up in nd_tbl, can be created with neigh_create(), and
// is stored on the route with dst_set_neighbour(). Callers in this file treat
// a non-zero return value as failure.
132 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
134 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
136 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
140 dst_set_neighbour(&rt->dst, n);
// dst_ops table for regular IPv6 routes. protocol is ETH_P_IPV6 in network
// byte order; the callbacks are the static functions of this file, except
// local_out which uses __ip6_local_out.
// NOTE(review): other code here uses net->ipv6.ip6_dst_ops, presumably a
// per-namespace copy of this template - confirm in the namespace init code.
145 static struct dst_ops ip6_dst_ops_template = {
147 .protocol = cpu_to_be16(ETH_P_IPV6),
150 .check = ip6_dst_check,
151 .default_advmss = ip6_default_advmss,
153 .cow_metrics = ipv6_cow_metrics,
154 .destroy = ip6_dst_destroy,
155 .ifdown = ip6_dst_ifdown,
156 .negative_advice = ip6_negative_advice,
157 .link_failure = ip6_link_failure,
158 .update_pmtu = ip6_rt_update_pmtu,
159 .local_out = __ip6_local_out,
160 .neigh_lookup = ip6_neigh_lookup,
// mtu callback for ip6_dst_blackhole_ops.
// Returns the raw RTAX_MTU metric of @dst, or the MTU of dst->dev when that
// metric is zero.
163 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
165 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
167 return mtu ? : dst->dev->mtu;
// update_pmtu callback registered in ip6_dst_blackhole_ops.
170 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
// cow_metrics callback registered in ip6_dst_blackhole_ops.
174 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
// dst_ops table handed to dst_alloc() by ip6_blackhole_route(). It reuses
// ip6_dst_destroy, ip6_dst_check, ip6_default_advmss and ip6_neigh_lookup
// from the regular ops and substitutes blackhole-specific mtu, update_pmtu
// and cow_metrics callbacks.
180 static struct dst_ops ip6_dst_blackhole_ops = {
182 .protocol = cpu_to_be16(ETH_P_IPV6),
183 .destroy = ip6_dst_destroy,
184 .check = ip6_dst_check,
185 .mtu = ip6_blackhole_mtu,
186 .default_advmss = ip6_default_advmss,
187 .update_pmtu = ip6_rt_blackhole_update_pmtu,
188 .cow_metrics = ip6_rt_blackhole_cow_metrics,
189 .neigh_lookup = ip6_neigh_lookup,
// Read-only metrics array of RTAX_MAX entries; the hop limit slot (index
// RTAX_HOPLIMIT - 1) is preset to 255.
192 static const u32 ip6_template_metrics[RTAX_MAX] = {
193 [RTAX_HOPLIMIT - 1] = 255,
// Template for the "no route" entry: a reject route without next hop
// (RTF_REJECT | RTF_NONEXTHOP) that reports -ENETUNREACH and discards traffic
// through ip6_pkt_discard / ip6_pkt_discard_out. Its metric is the largest
// u32 value and both reference counters start at 1.
// NOTE(review): presumably the prototype for net->ipv6.ip6_null_entry, which
// lookups in this file return when nothing matches - confirm in the init code.
196 static struct rt6_info ip6_null_entry_template = {
198 .__refcnt = ATOMIC_INIT(1),
201 .error = -ENETUNREACH,
202 .input = ip6_pkt_discard,
203 .output = ip6_pkt_discard_out,
205 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
206 .rt6i_protocol = RTPROT_KERNEL,
207 .rt6i_metric = ~(u32) 0,
208 .rt6i_ref = ATOMIC_INIT(1),
// Only built with CONFIG_IPV6_MULTIPLE_TABLES: template for a "prohibit"
// reject route whose input and output handlers are ip6_pkt_prohibit and
// ip6_pkt_prohibit_out (declared right here, defined elsewhere in the file).
// Flags, protocol, metric and reference counts mirror the null entry.
211 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
213 static int ip6_pkt_prohibit(struct sk_buff *skb);
214 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
216 static struct rt6_info ip6_prohibit_entry_template = {
218 .__refcnt = ATOMIC_INIT(1),
222 .input = ip6_pkt_prohibit,
223 .output = ip6_pkt_prohibit_out,
225 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
226 .rt6i_protocol = RTPROT_KERNEL,
227 .rt6i_metric = ~(u32) 0,
228 .rt6i_ref = ATOMIC_INIT(1),
// Template for a blackhole reject route: packets in both directions go to
// dst_discard. Flags, protocol, metric and reference counts mirror the null
// entry.
231 static struct rt6_info ip6_blk_hole_entry_template = {
233 .__refcnt = ATOMIC_INIT(1),
237 .input = dst_discard,
238 .output = dst_discard,
240 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
241 .rt6i_protocol = RTPROT_KERNEL,
242 .rt6i_metric = ~(u32) 0,
243 .rt6i_ref = ATOMIC_INIT(1),
// Allocation helper for rt6_info objects.
// Calls dst_alloc(ops, dev, 0, 0, flags) and then zeroes
// sizeof(*rt) - sizeof(struct dst_entry) bytes starting at rt->rt6i_table.
// NOTE(review): this relies on rt6i_table being the first member after the
// embedded dst_entry - confirm against struct rt6_info.
248 /* allocate dst with ip6_dst_ops */
249 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
250 struct net_device *dev,
253 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
256 memset(&rt->rt6i_table, 0,
257 sizeof(*rt) - sizeof(struct dst_entry));
// destroy callback for both dst_ops tables.
// For routes without DST_HOST the metrics are released through
// dst_destroy_metrics_generic(). The route's inet6_dev and inet_peer pointers
// are read into locals and the fields are reset to NULL.
262 static void ip6_dst_destroy(struct dst_entry *dst)
264 struct rt6_info *rt = (struct rt6_info *)dst;
265 struct inet6_dev *idev = rt->rt6i_idev;
266 struct inet_peer *peer = rt->rt6i_peer;
268 if (!(rt->dst.flags & DST_HOST))
269 dst_destroy_metrics_generic(dst);
272 rt->rt6i_idev = NULL;
276 rt->rt6i_peer = NULL;
// Generation counter for inet_peer bindings, starting at 0, and its reader.
// Routes record the value in rt6i_peer_genid; ip6_dst_check() compares the
// two to detect a stale binding.
281 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
283 static u32 rt6_peer_genid(void)
285 return atomic_read(&__rt6_peer_genid);
// Bind an inet_peer to a route.
// @rt:     route to bind; the peer is keyed by rt->rt6i_dst.addr.
// @create: passed through to inet_getpeer_v6().
// The peer is installed with cmpxchg() only if rt->rt6i_peer is still NULL,
// so a concurrent binder that won the race is detected by the non-NULL
// return. rt->rt6i_peer_genid is stamped with the current generation.
288 void rt6_bind_peer(struct rt6_info *rt, int create)
290 struct inet_peer *peer;
292 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
293 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
296 rt->rt6i_peer_genid = rt6_peer_genid();
// ifdown callback for ip6_dst_ops_template.
// When @dev is not the namespace's loopback device and the route's inet6_dev
// belongs to @dev, the route's rt6i_idev is re-pointed at the loopback
// device's inet6_dev obtained with in6_dev_get().
299 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
302 struct rt6_info *rt = (struct rt6_info *)dst;
303 struct inet6_dev *idev = rt->rt6i_idev;
304 struct net_device *loopback_dev =
305 dev_net(dev)->loopback_dev;
307 if (dev != loopback_dev && idev && idev->dev == dev) {
308 struct inet6_dev *loopback_idev =
309 in6_dev_get(loopback_dev);
311 rt->rt6i_idev = loopback_idev;
// Returns non-zero when the route carries RTF_EXPIRES and the current jiffies
// value is past rt->dst.expires; routes without RTF_EXPIRES never expire.
317 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
319 return (rt->rt6i_flags & RTF_EXPIRES) &&
320 time_after(jiffies, rt->dst.expires);
// Returns non-zero when @daddr is a multicast, link-local or loopback
// address. Callers in this file use the result to add RT6_LOOKUP_F_IFACE to
// their lookup flags.
323 static inline int rt6_need_strict(const struct in6_addr *daddr)
325 return ipv6_addr_type(daddr) &
326 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
330 * Route lookup. Any table->tb6_lock is implied.
// Pick, from the sibling list starting at rt, the route that fits the
// requested outgoing interface and/or source address.
// Nothing needs filtering when no oif is given and saddr is the unspecified
// address. Otherwise the list is walked through dst.rt6_next: a device whose
// ifindex equals oif is a match; loopback devices are judged through the
// ifindex of their rt6i_idev and a remembered "local" candidate; the source
// address is checked on the device with ipv6_chk_addr(). If
// RT6_LOOKUP_F_IFACE is set and nothing matched, the namespace's
// ip6_null_entry is returned.
333 static inline struct rt6_info *rt6_device_match(struct net *net,
335 const struct in6_addr *saddr,
339 struct rt6_info *local = NULL;
340 struct rt6_info *sprt;
342 if (!oif && ipv6_addr_any(saddr))
345 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
346 struct net_device *dev = sprt->dst.dev;
349 if (dev->ifindex == oif)
351 if (dev->flags & IFF_LOOPBACK) {
352 if (!sprt->rt6i_idev ||
353 sprt->rt6i_idev->dev->ifindex != oif) {
354 if (flags & RT6_LOOKUP_F_IFACE && oif)
356 if (local && (!oif ||
357 local->rt6i_idev->dev->ifindex == oif))
363 if (ipv6_chk_addr(net, saddr, dev,
364 flags & RT6_LOOKUP_F_IFACE))
373 if (flags & RT6_LOOKUP_F_IFACE)
374 return net->ipv6.ip6_null_entry;
// Router reachability probing; the real implementation is only built with
// CONFIG_IPV6_ROUTER_PREF, and a second inline definition of the same name
// follows for the other configuration.
// The route's neighbour is fetched without taking a reference. Nothing is
// done when there is no neighbour or it is already NUD_VALID. Otherwise,
// under read_lock_bh(&neigh->lock), the state is re-checked and the time of
// the last update is compared against the interface's rtr_probe_interval.
// If a probe is due, neigh->updated is refreshed, the lock is dropped, and a
// Neighbour Solicitation for the neighbour's primary key is sent to its
// solicited-node multicast address with ndisc_send_ns().
380 #ifdef CONFIG_IPV6_ROUTER_PREF
381 static void rt6_probe(struct rt6_info *rt)
383 struct neighbour *neigh;
385 * Okay, this does not seem to be appropriate
386 * for now, however, we need to check if it
387 * is really so; aka Router Reachability Probing.
389 * Router Reachability Probe MUST be rate-limited
390 * to no more than one per minute.
393 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
394 if (!neigh || (neigh->nud_state & NUD_VALID))
396 read_lock_bh(&neigh->lock);
397 if (!(neigh->nud_state & NUD_VALID) &&
398 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
399 struct in6_addr mcaddr;
400 struct in6_addr *target;
402 neigh->updated = jiffies;
403 read_unlock_bh(&neigh->lock);
405 target = (struct in6_addr *)&neigh->primary_key;
406 addrconf_addr_solict_mult(target, &mcaddr);
407 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
409 read_unlock_bh(&neigh->lock);
415 static inline void rt6_probe(struct rt6_info *rt)
421 * Default Router Selection (RFC 2461 6.3.6)
// Device part of the route score.
// Two cases are distinguished: either no oif was requested or the route's
// device has that ifindex; or the route sits on a loopback device whose
// rt6i_idev belongs to the interface with ifindex oif.
423 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
425 struct net_device *dev = rt->dst.dev;
426 if (!oif || dev->ifindex == oif)
428 if ((dev->flags & IFF_LOOPBACK) &&
429 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
// Neighbour part of the route score.
// The neighbour is fetched without taking a reference. Routes flagged
// RTF_NONEXTHOP, or lacking RTF_GATEWAY, are decided without looking at the
// neighbour state. Otherwise nud_state is examined under
// read_lock_bh(&neigh->lock): NUD_VALID is one case and, with
// CONFIG_IPV6_ROUTER_PREF, NUD_FAILED is handled separately.
434 static inline int rt6_check_neigh(struct rt6_info *rt)
436 struct neighbour *neigh;
440 neigh = dst_get_neighbour_noref(&rt->dst);
441 if (rt->rt6i_flags & RTF_NONEXTHOP ||
442 !(rt->rt6i_flags & RTF_GATEWAY))
445 read_lock_bh(&neigh->lock);
446 if (neigh->nud_state & NUD_VALID)
448 #ifdef CONFIG_IPV6_ROUTER_PREF
449 else if (neigh->nud_state & NUD_FAILED)
454 read_unlock_bh(&neigh->lock);
// Combine the device and neighbour checks into one score for a route.
// m starts as the rt6_check_dev() result; a zero result is special-cased
// when RT6_LOOKUP_F_IFACE is requested. With CONFIG_IPV6_ROUTER_PREF the
// decoded router preference from rt6i_flags is OR-ed in, shifted left by two
// bits. n is the rt6_check_neigh() result; a zero result is special-cased
// when RT6_LOOKUP_F_REACHABLE is requested.
461 static int rt6_score_route(struct rt6_info *rt, int oif,
466 m = rt6_check_dev(rt, oif);
467 if (!m && (strict & RT6_LOOKUP_F_IFACE))
469 #ifdef CONFIG_IPV6_ROUTER_PREF
470 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
472 n = rt6_check_neigh(rt);
473 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
// Evaluate one candidate route for find_rr_leaf().
// The candidate is first tested with rt6_check_expired() and then scored with
// rt6_score_route(); two branches depend on RT6_LOOKUP_F_REACHABLE.
// NOTE(review): mpri and match are presumably the best score and best route
// seen so far, updated through this function's return value - confirm.
478 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
479 int *mpri, struct rt6_info *match)
483 if (rt6_check_expired(rt))
486 m = rt6_score_route(rt, oif, strict);
491 if (strict & RT6_LOOKUP_F_REACHABLE)
495 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
// Scan all routes of a node that share the given metric, beginning at the
// round-robin head.
// The first loop runs from rr_head to the end of the equal-metric run; the
// second wraps around, running from fn->leaf up to (not including) rr_head.
// Every visited route goes through find_match(), which carries the running
// best in match and mpri.
503 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
504 struct rt6_info *rr_head,
505 u32 metric, int oif, int strict)
507 struct rt6_info *rt, *match;
511 for (rt = rr_head; rt && rt->rt6i_metric == metric;
512 rt = rt->dst.rt6_next)
513 match = find_match(rt, oif, strict, &mpri, match);
514 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
515 rt = rt->dst.rt6_next)
516 match = find_match(rt, oif, strict, &mpri, match);
// Default router selection for one fib6 node.
// rt0 is the node's round-robin pointer, which can be (re)initialised from
// fn->leaf. find_rr_leaf() is asked for the best route among those sharing
// rt0's metric. In the branch guarded by RT6_LOOKUP_F_REACHABLE the successor
// of rt0 is examined; a missing successor or one with a different metric
// marks the end of the equal-metric run for round-robin purposes.
// Returns the match, or the namespace's ip6_null_entry when there is none.
521 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
523 struct rt6_info *match, *rt0;
528 fn->rr_ptr = rt0 = fn->leaf;
530 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
533 (strict & RT6_LOOKUP_F_REACHABLE)) {
534 struct rt6_info *next = rt0->dst.rt6_next;
536 /* no entries matched; do round-robin */
537 if (!next || next->rt6i_metric != rt0->rt6i_metric)
544 net = dev_net(rt0->dst.dev);
545 return match ? match : net->ipv6.ip6_null_entry;
// Handle a Route Information option (only built with
// CONFIG_IPV6_ROUTE_INFO).
// @dev:    receiving device.
// @opt:    raw option bytes, interpreted as struct route_info.
// @len:    number of bytes available at @opt.
// @gwaddr: address of the advertising router.
// Validation: @len must cover struct route_info; rinfo->length above 3 and
// prefix_len above 128 are special-cased; a prefix_len above 64 is checked
// against length < 2 and a non-zero prefix_len against length < 1. The
// preference is compared with ICMPV6_ROUTER_PREF_INVALID.
// The lifetime is converted with addrconf_timeout_fixup(). With length 3 the
// prefix is used in place, otherwise it is truncated into prefix_buf with
// ipv6_addr_prefix(). An existing route is looked up with
// rt6_get_route_info(); a zero lifetime on an existing route is a separate
// case, and a route can be created with rt6_add_route_info(). The route gets
// RTF_ROUTEINFO plus the new preference bits, its expiry is cleared for an
// infinite lifetime or set to jiffies + HZ * lifetime, and the reference is
// dropped with dst_release().
548 #ifdef CONFIG_IPV6_ROUTE_INFO
549 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
550 const struct in6_addr *gwaddr)
552 struct net *net = dev_net(dev);
553 struct route_info *rinfo = (struct route_info *) opt;
554 struct in6_addr prefix_buf, *prefix;
556 unsigned long lifetime;
559 if (len < sizeof(struct route_info)) {
563 /* Sanity check for prefix_len and length */
564 if (rinfo->length > 3) {
566 } else if (rinfo->prefix_len > 128) {
568 } else if (rinfo->prefix_len > 64) {
569 if (rinfo->length < 2) {
572 } else if (rinfo->prefix_len > 0) {
573 if (rinfo->length < 1) {
578 pref = rinfo->route_pref;
579 if (pref == ICMPV6_ROUTER_PREF_INVALID)
582 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
584 if (rinfo->length == 3)
585 prefix = (struct in6_addr *)rinfo->prefix;
587 /* this function is safe */
588 ipv6_addr_prefix(&prefix_buf,
589 (struct in6_addr *)rinfo->prefix,
591 prefix = &prefix_buf;
594 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
597 if (rt && !lifetime) {
603 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
606 rt->rt6i_flags = RTF_ROUTEINFO |
607 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
610 if (!addrconf_finite_timeout(lifetime)) {
611 rt->rt6i_flags &= ~RTF_EXPIRES;
613 rt->dst.expires = jiffies + HZ * lifetime;
614 rt->rt6i_flags |= RTF_EXPIRES;
616 dst_release(&rt->dst);
// Lookup fallback used after a node produced no usable route.
// The macro operates on the variables rt and fn of the function that expands
// it. It only acts when rt is the namespace's ip6_null_entry: the walk stops
// at a node flagged RTN_TL_ROOT; if the node pn has a source subtree that is
// not fn itself, the subtree is searched with fib6_lookup() using saddr; and
// a node flagged RTN_RTINFO is a point where the walk can stop.
622 #define BACKTRACK(__net, saddr) \
624 if (rt == __net->ipv6.ip6_null_entry) { \
625 struct fib6_node *pn; \
627 if (fn->fn_flags & RTN_TL_ROOT) \
630 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
631 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
634 if (fn->fn_flags & RTN_RTINFO) \
// Per-table lookup callback passed to fib6_rule_lookup() by
// ip6_route_lookup() and rt6_lookup().
// Under read_lock_bh(&table->tb6_lock): the node for the flow's
// daddr/saddr is found with fib6_lookup(), the route is narrowed with
// rt6_device_match() using fl6->flowi6_oif, BACKTRACK handles a null result,
// and dst_use() is called on the result before the lock is released.
640 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
641 struct fib6_table *table,
642 struct flowi6 *fl6, int flags)
644 struct fib6_node *fn;
647 read_lock_bh(&table->tb6_lock);
648 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
651 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
652 BACKTRACK(net, &fl6->saddr);
654 dst_use(&rt->dst, jiffies);
655 read_unlock_bh(&table->tb6_lock);
// Exported (GPL-only) lookup entry point: runs the policy rule lookup for
// @fl6 with ip6_pol_route_lookup() as the per-table callback and returns its
// result.
660 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
663 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
665 EXPORT_SYMBOL_GPL(ip6_route_lookup);
// Exported convenience lookup by address.
// @strict: non-zero requests RT6_LOOKUP_F_IFACE.
// A flowi6 is built on the stack; on the path that has a source address,
// @saddr is copied into it and RT6_LOOKUP_F_HAS_SADDR is added. The lookup
// goes through fib6_rule_lookup() with ip6_pol_route_lookup() and the
// resulting dst_entry is returned as a struct rt6_info pointer.
667 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
668 const struct in6_addr *saddr, int oif, int strict)
670 struct flowi6 fl6 = {
674 struct dst_entry *dst;
675 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
678 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
679 flags |= RT6_LOOKUP_F_HAS_SADDR;
682 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
684 return (struct rt6_info *) dst;
691 EXPORT_SYMBOL(rt6_lookup);
693 /* ip6_ins_rt is called with table->tb6_lock NOT held.
694 It takes a new route entry; if the addition fails for any reason, the
695 route is freed. In any case, if the caller does not hold it, it may
// Insert @rt into its own table (rt->rt6i_table).
// fib6_add() is called on the table root while the table's tb6_lock is held
// for writing (bottom halves disabled); @info is passed through to it.
699 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
702 struct fib6_table *table;
704 table = rt->rt6i_table;
705 write_lock_bh(&table->tb6_lock);
706 err = fib6_add(&table->tb6_root, rt, info);
707 write_unlock_bh(&table->tb6_lock);
// Insert @rt using a locally built nl_info whose namespace is taken from the
// route's device (dev_net(rt->dst.dev)). Returns the __ip6_ins_rt() result.
712 int ip6_ins_rt(struct rt6_info *rt)
714 struct nl_info info = {
715 .nl_net = dev_net(rt->dst.dev),
717 return __ip6_ins_rt(rt, &info);
// Create a cached copy of @ort for destination @daddr and bind a neighbour.
// The copy comes from ip6_rt_copy(). For a route without RTF_GATEWAY the
// destination itself becomes the gateway, and RTF_ANYCAST is set when the
// original prefix is shorter than /128 but its address equals @daddr. The
// copy is flagged RTF_CACHE. With CONFIG_IPV6_SUBTREES a source-specific
// route is pinned to @saddr with prefix length 128.
// When rt6_bind_neighbour() fails, the garbage collector can be forced once:
// attempts is 1 only outside softirq context. The namespace's
// gc_elasticity and gc_min_interval sysctls are saved, set to 1 and 0 around
// an ip6_dst_gc() run, and restored. "Neighbour table overflow" is the
// message for the failure case.
720 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
721 const struct in6_addr *daddr,
722 const struct in6_addr *saddr)
730 rt = ip6_rt_copy(ort, daddr);
733 int attempts = !in_softirq();
735 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
736 if (ort->rt6i_dst.plen != 128 &&
737 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
738 rt->rt6i_flags |= RTF_ANYCAST;
739 rt->rt6i_gateway = *daddr;
742 rt->rt6i_flags |= RTF_CACHE;
744 #ifdef CONFIG_IPV6_SUBTREES
745 if (rt->rt6i_src.plen && saddr) {
746 rt->rt6i_src.addr = *saddr;
747 rt->rt6i_src.plen = 128;
752 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
753 struct net *net = dev_net(rt->dst.dev);
754 int saved_rt_min_interval =
755 net->ipv6.sysctl.ip6_rt_gc_min_interval;
756 int saved_rt_elasticity =
757 net->ipv6.sysctl.ip6_rt_gc_elasticity;
759 if (attempts-- > 0) {
760 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
761 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
763 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
765 net->ipv6.sysctl.ip6_rt_gc_elasticity =
767 net->ipv6.sysctl.ip6_rt_gc_min_interval =
768 saved_rt_min_interval;
774 "ipv6: Neighbour table overflow.\n");
// Create a cached copy of @ort for @daddr that shares the original's
// neighbour: the copy from ip6_rt_copy() is flagged RTF_CACHE and receives a
// neigh_clone() of the neighbour attached to @ort.
783 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
784 const struct in6_addr *daddr)
786 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
789 rt->rt6i_flags |= RTF_CACHE;
790 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
// Core per-table route resolution shared by the input and output paths.
// @oif: interface index used for matching (callers pass the flow's iif or
//       oif).
// RT6_LOOKUP_F_REACHABLE is requested only when forwarding is disabled in
// devconf_all; RT6_LOOKUP_F_IFACE is inherited from @flags.
// Under the table read lock the node is found with fib6_lookup() and a route
// chosen with rt6_select(), with BACKTRACK handling a null result. A null
// entry or an RTF_CACHE route is a separate case from the copy path below.
// After the lock is dropped a per-destination copy is made: rt6_alloc_cow()
// when the route has no neighbour and is not RTF_NONEXTHOP, rt6_alloc_clone()
// when it is not a host route. The original is released and the copy (or the
// null entry if the copy failed) is inserted with ip6_ins_rt(). Because the
// table lock was released in between, a concurrent insert of the same route
// is possible and leads to a fresh lookup. lastuse is set to jiffies on the
// result.
795 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
796 struct flowi6 *fl6, int flags)
798 struct fib6_node *fn;
799 struct rt6_info *rt, *nrt;
803 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805 strict |= flags & RT6_LOOKUP_F_IFACE;
808 read_lock_bh(&table->tb6_lock);
811 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
814 rt = rt6_select(fn, oif, strict | reachable);
816 BACKTRACK(net, &fl6->saddr);
817 if (rt == net->ipv6.ip6_null_entry ||
818 rt->rt6i_flags & RTF_CACHE)
822 read_unlock_bh(&table->tb6_lock);
824 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
825 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
826 else if (!(rt->dst.flags & DST_HOST))
827 nrt = rt6_alloc_clone(rt, &fl6->daddr);
831 dst_release(&rt->dst);
832 rt = nrt ? : net->ipv6.ip6_null_entry;
836 err = ip6_ins_rt(nrt);
845 * Race condition! In the gap, when table->tb6_lock was
846 * released someone could insert this route. Relookup.
848 dst_release(&rt->dst);
857 read_unlock_bh(&table->tb6_lock);
859 rt->dst.lastuse = jiffies;
// Input-path adapter for fib6_rule_lookup(): resolves the flow with
// ip6_pol_route(), matching on the incoming interface (fl6->flowi6_iif).
865 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
866 struct flowi6 *fl6, int flags)
868 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
// Route a received packet and attach the result to the skb.
// The flow key is built from the packet: incoming ifindex, the flow label
// bits of the first header word (masked with IPV6_FLOWINFO_MASK), the skb
// mark and the next-header value. RT6_LOOKUP_F_HAS_SADDR is always set;
// RT6_LOOKUP_F_IFACE is added when the destination needs a strict interface
// match (rt6_need_strict) and the device is not of type ARPHRD_PIMREG.
871 void ip6_route_input(struct sk_buff *skb)
873 const struct ipv6hdr *iph = ipv6_hdr(skb);
874 struct net *net = dev_net(skb->dev);
875 int flags = RT6_LOOKUP_F_HAS_SADDR;
876 struct flowi6 fl6 = {
877 .flowi6_iif = skb->dev->ifindex,
880 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
881 .flowi6_mark = skb->mark,
882 .flowi6_proto = iph->nexthdr,
885 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
886 flags |= RT6_LOOKUP_F_IFACE;
888 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
// Output-path adapter for fib6_rule_lookup(): resolves the flow with
// ip6_pol_route(), matching on the outgoing interface (fl6->flowi6_oif).
891 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
892 struct flowi6 *fl6, int flags)
894 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
// Exported output route lookup.
// Lookup flags are derived from the socket and flow: RT6_LOOKUP_F_IFACE when
// the socket is bound to a device or the destination needs a strict
// interface match; RT6_LOOKUP_F_HAS_SADDR when the flow has a specific source
// address; and the socket's source address preferences, converted with
// rt6_srcprefs2flags(). Returns the fib6_rule_lookup() result obtained with
// ip6_pol_route_output().
897 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
902 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
903 flags |= RT6_LOOKUP_F_IFACE;
905 if (!ipv6_addr_any(&fl6->saddr))
906 flags |= RT6_LOOKUP_F_HAS_SADDR;
908 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
910 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
913 EXPORT_SYMBOL(ip6_route_output);
// Build a blackhole twin of @dst_orig.
// A new entry is allocated with ip6_dst_blackhole_ops on the original's
// device and its rt6_info part is zeroed. Input and output are dst_discard.
// Metrics are shared by pointer when the original's are read-only, otherwise
// copied with dst_copy_metrics(). The inet6_dev (with a reference taken via
// in6_dev_hold), gateway, flags without RTF_EXPIRES, destination key and,
// with CONFIG_IPV6_SUBTREES, source key are copied over.
// @dst_orig is always released. Returns the new entry, or
// ERR_PTR(-ENOMEM) when none was created.
915 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
917 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
918 struct dst_entry *new = NULL;
920 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
922 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
927 new->input = dst_discard;
928 new->output = dst_discard;
930 if (dst_metrics_read_only(&ort->dst))
931 new->_metrics = ort->dst._metrics;
933 dst_copy_metrics(new, &ort->dst);
934 rt->rt6i_idev = ort->rt6i_idev;
936 in6_dev_hold(rt->rt6i_idev);
939 rt->rt6i_gateway = ort->rt6i_gateway;
940 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
943 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
944 #ifdef CONFIG_IPV6_SUBTREES
945 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
951 dst_release(dst_orig);
952 return new ? new : ERR_PTR(-ENOMEM);
956 * Destination cache support functions
// check callback for both dst_ops tables.
// The route counts as current when it is still linked to a fib6 node whose
// fn_sernum equals @cookie. In that case a stale peer binding (rt6i_peer_genid
// differing from rt6_peer_genid()) is refreshed with rt6_bind_peer(rt, 0) and
// the route's generation is updated.
959 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
963 rt = (struct rt6_info *) dst;
965 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
966 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
968 rt6_bind_peer(rt, 0);
969 rt->rt6i_peer_genid = rt6_peer_genid();
// negative_advice callback for ip6_dst_ops_template.
// Only RTF_CACHE routes are examined, and for those the expiry state from
// rt6_check_expired() decides what happens next.
976 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
978 struct rt6_info *rt = (struct rt6_info *) dst;
981 if (rt->rt6i_flags & RTF_CACHE) {
982 if (rt6_check_expired(rt)) {
// link_failure callback for ip6_dst_ops_template.
// Sends an ICMPv6 Destination Unreachable (address unreachable) for the
// packet, then invalidates the route attached to the skb: a cached route is
// expired immediately (dst_set_expires(..., 0) plus RTF_EXPIRES); otherwise,
// for a default route still linked to a node, the node's fn_sernum is set to
// -1.
994 static void ip6_link_failure(struct sk_buff *skb)
998 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1000 rt = (struct rt6_info *) skb_dst(skb);
1002 if (rt->rt6i_flags & RTF_CACHE) {
1003 dst_set_expires(&rt->dst, 0);
1004 rt->rt6i_flags |= RTF_EXPIRES;
1005 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1006 rt->rt6i_node->fn_sernum = -1;
// update_pmtu callback for ip6_dst_ops_template.
// Acts only when @mtu is smaller than the current dst_mtu() and the route is
// a /128 host route. The route is flagged RTF_MODIFIED; an @mtu below
// IPV6_MIN_MTU additionally sets RTAX_FEATURE_ALLFRAG in the features metric.
// The RTAX_MTU metric is then updated.
1010 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1012 struct rt6_info *rt6 = (struct rt6_info*)dst;
1014 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1015 rt6->rt6i_flags |= RTF_MODIFIED;
1016 if (mtu < IPV6_MIN_MTU) {
1017 u32 features = dst_metric(dst, RTAX_FEATURES);
1019 features |= RTAX_FEATURE_ALLFRAG;
1020 dst_metric_set(dst, RTAX_FEATURES, features);
1022 dst_metric_set(dst, RTAX_MTU, mtu);
// default_advmss callback for both dst_ops tables.
// Starts from dst_mtu(), subtracts the fixed IPv6 and TCP header sizes, and
// raises the result to the namespace's ip6_rt_min_advmss sysctl if it is
// below it. A value above IPV6_MAXPLEN - sizeof(struct tcphdr) is handled as
// a special case.
1026 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1028 struct net_device *dev = dst->dev;
1029 unsigned int mtu = dst_mtu(dst);
1030 struct net *net = dev_net(dev);
1032 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1034 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1035 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1038 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1039 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1040 * IPV6_MAXPLEN is also valid and means: "any MSS,
1041 * rely only on pmtu discovery"
1043 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
// MTU helper for regular routes.
// Reads the raw RTAX_MTU metric; the per-interface value idev->cnf.mtu6
// (inet6_dev obtained with __in6_dev_get, which takes no reference) is the
// alternative source.
1048 static unsigned int ip6_mtu(const struct dst_entry *dst)
1050 struct inet6_dev *idev;
1051 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1059 idev = __in6_dev_get(dst->dev);
1061 mtu = idev->cnf.mtu6;
// Singly linked list (through dst.next) of entries created by
// icmp6_dst_alloc(), and the spinlock that protects it.
1067 static struct dst_entry *icmp6_dst_gc_list;
1068 static DEFINE_SPINLOCK(icmp6_dst_lock);
// Allocate a standalone host route for ICMPv6/ndisc output.
// A reference on the device's inet6_dev is taken first. The route comes from
// ip6_dst_alloc() with the namespace's ip6_dst_ops; allocation failure yields
// ERR_PTR(-ENOMEM). A neighbour can be resolved with ip6_neigh_lookup() for
// the flow's destination, and an error from that lookup is returned through
// ERR_CAST().
// The route is set up as a /128 host route (DST_HOST) to fl6->daddr with
// ip6_output as output handler, refcount 1, the inet6_dev reference and a hop
// limit metric of 255. It is pushed onto icmp6_dst_gc_list under
// icmp6_dst_lock and the fib6 garbage collector is started. The entry is
// finally passed through xfrm_lookup().
1070 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1071 struct neighbour *neigh,
1074 struct dst_entry *dst;
1075 struct rt6_info *rt;
1076 struct inet6_dev *idev = in6_dev_get(dev);
1077 struct net *net = dev_net(dev);
1079 if (unlikely(!idev))
1082 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1083 if (unlikely(!rt)) {
1085 dst = ERR_PTR(-ENOMEM);
1092 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1093 if (IS_ERR(neigh)) {
1095 return ERR_CAST(neigh);
1099 rt->dst.flags |= DST_HOST;
1100 rt->dst.output = ip6_output;
1101 dst_set_neighbour(&rt->dst, neigh);
1102 atomic_set(&rt->dst.__refcnt, 1);
1103 rt->rt6i_dst.addr = fl6->daddr;
1104 rt->rt6i_dst.plen = 128;
1105 rt->rt6i_idev = idev;
1106 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1108 spin_lock_bh(&icmp6_dst_lock);
1109 rt->dst.next = icmp6_dst_gc_list;
1110 icmp6_dst_gc_list = &rt->dst;
1111 spin_unlock_bh(&icmp6_dst_lock);
1113 fib6_force_start_gc(net);
1115 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
// Walk icmp6_dst_gc_list under icmp6_dst_lock using a pointer-to-link
// cursor (pprev); entries whose reference count has dropped to zero are the
// ones singled out.
1121 int icmp6_dst_gc(void)
1123 struct dst_entry *dst, **pprev;
1126 spin_lock_bh(&icmp6_dst_lock);
1127 pprev = &icmp6_dst_gc_list;
1129 while ((dst = *pprev) != NULL) {
1130 if (!atomic_read(&dst->__refcnt)) {
1139 spin_unlock_bh(&icmp6_dst_lock);
// Apply @func to every entry on icmp6_dst_gc_list while holding
// icmp6_dst_lock. Each entry is passed as a struct rt6_info together with
// the opaque argument; a non-zero return from @func selects the entry for
// special handling.
1144 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1147 struct dst_entry *dst, **pprev;
1149 spin_lock_bh(&icmp6_dst_lock);
1150 pprev = &icmp6_dst_gc_list;
1151 while ((dst = *pprev) != NULL) {
1152 struct rt6_info *rt = (struct rt6_info *) dst;
1153 if (func(rt, arg)) {
1160 spin_unlock_bh(&icmp6_dst_lock);
// Garbage collection entry point for the per-namespace IPv6 dst_ops.
// The namespace is recovered from @ops with container_of() and the gc sysctls
// are snapshotted. The run can be skipped while the minimum interval since
// the last run has not passed and the fast entry count is within
// ip6_rt_max_size. Otherwise the expiry aggressiveness (ip6_rt_gc_expire) is
// bumped, fib6_run_gc() is invoked and the last-run time is recorded. When
// the slow entry count is below gc_thresh the aggressiveness is reset to half
// the gc timeout. Before returning it is decayed by
// (value >> elasticity).
// Returns non-zero while the table still holds more than ip6_rt_max_size
// entries.
1163 static int ip6_dst_gc(struct dst_ops *ops)
1165 unsigned long now = jiffies;
1166 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1167 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1168 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1169 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1170 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1171 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1174 entries = dst_entries_get_fast(ops);
1175 if (time_after(rt_last_gc + rt_min_interval, now) &&
1176 entries <= rt_max_size)
1179 net->ipv6.ip6_rt_gc_expire++;
1180 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1181 net->ipv6.ip6_rt_last_gc = now;
1182 entries = dst_entries_get_slow(ops);
1183 if (entries < ops->gc_thresh)
1184 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1186 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1187 return entries > rt_max_size;
1190 /* Clean the host part of a prefix. Not necessary in a radix tree,
1191 but results in cleaner routing tables.
1193 Remove it only once everything works!
// Exported helper returning the hop limit to use for @dst.
// The raw RTAX_HOPLIMIT metric is used when it is non-zero. When it is zero,
// the value comes from the device's inet6_dev (idev->cnf.hop_limit) or from
// the namespace-wide devconf_all->hop_limit.
1196 int ip6_dst_hoplimit(struct dst_entry *dst)
1198 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1199 if (hoplimit == 0) {
1200 struct net_device *dev = dst->dev;
1201 struct inet6_dev *idev;
1204 idev = __in6_dev_get(dev);
1206 hoplimit = idev->cnf.hop_limit;
1208 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1213 EXPORT_SYMBOL(ip6_dst_hoplimit);
// Create a route from a fib6_config and insert it into its table.
// Outline of the steps that appear below, in order:
//  - Prefix lengths are checked against 128; without CONFIG_IPV6_SUBTREES a
//    non-zero source prefix length is checked as well.
//  - If an ifindex is given, the device and its inet6_dev are looked up
//    (both lookups take references).
//  - A zero metric defaults to IP6_RT_PRIO_USER.
//  - Table selection: a netlink request without NLM_F_CREATE first tries
//    fib6_get_table(); a warning is printed and the table is created anyway.
//    All other requests use fib6_new_table().
//  - The rt6_info is allocated with DST_NOCOUNT and no device; obsolete is
//    -1; expiry is derived from fc_expires when RTF_EXPIRES is set; an
//    unspecified protocol becomes RTPROT_BOOT.
//  - Input handler by destination type: multicast -> ip6_mc_input, RTF_LOCAL
//    -> ip6_input, otherwise ip6_forward. Output is ip6_output.
//  - The destination key is the prefix-truncated fc_dst; /128 sets DST_HOST.
//    Non-host routes with metrics attributes get a kzalloc'ed metrics array.
//  - Reject routes, and non-local routes through a loopback device to a
//    non-loopback destination, become RTF_REJECT | RTF_NONEXTHOP routes on
//    the loopback device that discard packets with -ENETUNREACH.
//  - Gateway routes: a gateway that is not a link-local unicast address must
//    at least be unicast and is resolved with rt6_lookup(); the device and
//    inet6_dev of the resolving route can be adopted, and that route must not
//    itself be a gateway route. A usable, non-loopback device is required.
//  - A preferred source address must pass ipv6_chk_addr() on the device.
//  - Routes with RTF_GATEWAY or RTF_NONEXTHOP get a neighbour bound.
//  - Metrics attributes are applied with dst_metric_set(); attribute types
//    above RTAX_MAX are checked for.
//  - The route is completed (idev, table, namespace from the device) and
//    inserted with __ip6_ins_rt().
1219 int ip6_route_add(struct fib6_config *cfg)
1222 struct net *net = cfg->fc_nlinfo.nl_net;
1223 struct rt6_info *rt = NULL;
1224 struct net_device *dev = NULL;
1225 struct inet6_dev *idev = NULL;
1226 struct fib6_table *table;
1229 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1231 #ifndef CONFIG_IPV6_SUBTREES
1232 if (cfg->fc_src_len)
1235 if (cfg->fc_ifindex) {
1237 dev = dev_get_by_index(net, cfg->fc_ifindex);
1240 idev = in6_dev_get(dev);
1245 if (cfg->fc_metric == 0)
1246 cfg->fc_metric = IP6_RT_PRIO_USER;
1249 if (cfg->fc_nlinfo.nlh &&
1250 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1251 table = fib6_get_table(net, cfg->fc_table);
1253 printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1254 table = fib6_new_table(net, cfg->fc_table);
1257 table = fib6_new_table(net, cfg->fc_table);
1263 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1270 rt->dst.obsolete = -1;
1271 rt->dst.expires = (cfg->fc_flags & RTF_EXPIRES) ?
1272 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1275 if (cfg->fc_protocol == RTPROT_UNSPEC)
1276 cfg->fc_protocol = RTPROT_BOOT;
1277 rt->rt6i_protocol = cfg->fc_protocol;
1279 addr_type = ipv6_addr_type(&cfg->fc_dst);
1281 if (addr_type & IPV6_ADDR_MULTICAST)
1282 rt->dst.input = ip6_mc_input;
1283 else if (cfg->fc_flags & RTF_LOCAL)
1284 rt->dst.input = ip6_input;
1286 rt->dst.input = ip6_forward;
1288 rt->dst.output = ip6_output;
1290 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1291 rt->rt6i_dst.plen = cfg->fc_dst_len;
1292 if (rt->rt6i_dst.plen == 128)
1293 rt->dst.flags |= DST_HOST;
1295 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1296 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1301 dst_init_metrics(&rt->dst, metrics, 0);
1303 #ifdef CONFIG_IPV6_SUBTREES
1304 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1305 rt->rt6i_src.plen = cfg->fc_src_len;
1308 rt->rt6i_metric = cfg->fc_metric;
1310 /* We cannot add true routes via loopback here,
1311 they would result in kernel looping; promote them to reject routes
1313 if ((cfg->fc_flags & RTF_REJECT) ||
1314 (dev && (dev->flags & IFF_LOOPBACK) &&
1315 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1316 !(cfg->fc_flags & RTF_LOCAL))) {
1317 /* hold loopback dev/idev if we haven't done so. */
1318 if (dev != net->loopback_dev) {
1323 dev = net->loopback_dev;
1325 idev = in6_dev_get(dev);
1331 rt->dst.output = ip6_pkt_discard_out;
1332 rt->dst.input = ip6_pkt_discard;
1333 rt->dst.error = -ENETUNREACH;
1334 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1338 if (cfg->fc_flags & RTF_GATEWAY) {
1339 const struct in6_addr *gw_addr;
1342 gw_addr = &cfg->fc_gateway;
1343 rt->rt6i_gateway = *gw_addr;
1344 gwa_type = ipv6_addr_type(gw_addr);
1346 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1347 struct rt6_info *grt;
1349 /* IPv6 strictly inhibits using not link-local
1350 addresses as nexthop address.
1351 Otherwise, router will not able to send redirects.
1352 It is very good, but in some (rare!) circumstances
1353 (SIT, PtP, NBMA NOARP links) it is handy to allow
1354 some exceptions. --ANK
1357 if (!(gwa_type & IPV6_ADDR_UNICAST))
1360 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1362 err = -EHOSTUNREACH;
1366 if (dev != grt->dst.dev) {
1367 dst_release(&grt->dst);
1372 idev = grt->rt6i_idev;
1374 in6_dev_hold(grt->rt6i_idev);
1376 if (!(grt->rt6i_flags & RTF_GATEWAY))
1378 dst_release(&grt->dst);
1384 if (!dev || (dev->flags & IFF_LOOPBACK))
1392 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1393 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1397 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1398 rt->rt6i_prefsrc.plen = 128;
1400 rt->rt6i_prefsrc.plen = 0;
1402 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1403 err = rt6_bind_neighbour(rt, dev);
1408 rt->rt6i_flags = cfg->fc_flags;
1415 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1416 int type = nla_type(nla);
1419 if (type > RTAX_MAX) {
1424 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1430 rt->rt6i_idev = idev;
1431 rt->rt6i_table = table;
1433 cfg->fc_nlinfo.nl_net = dev_net(dev);
1435 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
// Remove @rt from its table.
// The namespace's ip6_null_entry is special-cased up front. Otherwise
// fib6_del() runs with the table's tb6_lock held for writing, and the
// caller's reference on the route is dropped with dst_release() before the
// lock is released.
1447 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1450 struct fib6_table *table;
1451 struct net *net = dev_net(rt->dst.dev);
1453 if (rt == net->ipv6.ip6_null_entry)
1456 table = rt->rt6i_table;
1457 write_lock_bh(&table->tb6_lock);
1459 err = fib6_del(rt, info);
1460 dst_release(&rt->dst);
1462 write_unlock_bh(&table->tb6_lock);
// Delete @rt using a locally built nl_info whose namespace is taken from the
// route's device. Returns the __ip6_del_rt() result.
1467 int ip6_del_rt(struct rt6_info *rt)
1469 struct nl_info info = {
1470 .nl_net = dev_net(rt->dst.dev),
1472 return __ip6_del_rt(rt, &info);
// Delete the route described by @cfg.
// The table is looked up (not created) with fib6_get_table(). Under the
// table read lock the exact node is located with fib6_locate() for the
// destination and source prefixes, and its routes are filtered by interface
// index, by gateway (only when RTF_GATEWAY is requested) and by metric (only
// when non-zero). On a match the read lock is dropped and the route is
// removed with __ip6_del_rt(); without a match the lock is simply released.
1475 static int ip6_route_del(struct fib6_config *cfg)
1477 struct fib6_table *table;
1478 struct fib6_node *fn;
1479 struct rt6_info *rt;
1482 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1486 read_lock_bh(&table->tb6_lock);
1488 fn = fib6_locate(&table->tb6_root,
1489 &cfg->fc_dst, cfg->fc_dst_len,
1490 &cfg->fc_src, cfg->fc_src_len);
1493 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1494 if (cfg->fc_ifindex &&
1496 rt->dst.dev->ifindex != cfg->fc_ifindex))
1498 if (cfg->fc_flags & RTF_GATEWAY &&
1499 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1501 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1504 read_unlock_bh(&table->tb6_lock);
1506 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1509 read_unlock_bh(&table->tb6_lock);
// Lookup key for redirect handling: carries the address of the router that
// sent the redirect (gateway). ip6_route_redirect() also accesses an fl6
// member of this struct and passes its address to fib6_rule_lookup();
// __ip6_route_redirect() casts that pointer back to this type.
1517 struct ip6rd_flowi {
1519 struct in6_addr gateway;
// Per-table callback for redirect validation.
// The flow pointer is reinterpreted as struct ip6rd_flowi to reach the
// redirecting router's address. Under the table read lock the node for the
// flow is found and its routes are scanned for one that is not expired, is a
// gateway route, uses the interface the redirect arrived on
// (fl6->flowi6_oif) and whose gateway equals the redirect's sender. When
// the scan yields nothing, rt becomes the namespace's ip6_null_entry and
// BACKTRACK continues the search.
1522 static struct rt6_info *__ip6_route_redirect(struct net *net,
1523 struct fib6_table *table,
1527 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1528 struct rt6_info *rt;
1529 struct fib6_node *fn;
1532 * Get the "current" route for this destination and
1533 * check if the redirect has come from approriate router.
1535 * RFC 2461 specifies that redirects should only be
1536 * accepted if they come from the nexthop to the target.
1537 * Due to the way the routes are chosen, this notion
1538 * is a bit fuzzy and one might need to check all possible
1542 read_lock_bh(&table->tb6_lock);
1543 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1545 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1547 * Current route is on-link; redirect is always invalid.
1549 * Seems, previous statement is not true. It could
1550 * be node, which looks for us as on-link (f.e. proxy ndisc)
1551 * But then router serving it might decide, that we should
1552 * know truth 8)8) --ANK (980726).
1554 if (rt6_check_expired(rt))
1556 if (!(rt->rt6i_flags & RTF_GATEWAY))
1558 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1560 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1566 rt = net->ipv6.ip6_null_entry;
1567 BACKTRACK(net, &fl6->saddr);
1571 read_unlock_bh(&table->tb6_lock);
// Find the route a redirect refers to.
// Builds an ip6rd_flowi on the stack (outgoing interface is @dev's ifindex,
// gateway is copied from @gateway), always sets RT6_LOOKUP_F_HAS_SADDR, adds
// RT6_LOOKUP_F_IFACE when @dest needs a strict interface match, and runs
// fib6_rule_lookup() with __ip6_route_redirect() as callback.
1576 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1577 const struct in6_addr *src,
1578 const struct in6_addr *gateway,
1579 struct net_device *dev)
1581 int flags = RT6_LOOKUP_F_HAS_SADDR;
1582 struct net *net = dev_net(dev);
1583 struct ip6rd_flowi rdfl = {
1585 .flowi6_oif = dev->ifindex,
1591 rdfl.gateway = *gateway;
1593 if (rt6_need_strict(dest))
1594 flags |= RT6_LOOKUP_F_IFACE;
1596 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1597 flags, __ip6_route_redirect);
/*
 * Process a redirect for @dest whose new next hop is @neigh.
 *
 * Visible steps, in order:
 *  - look up the current route with ip6_route_redirect(); a result equal to
 *    the namespace's null entry produces a rate-limited debug message;
 *  - update @neigh with @lladdr as NUD_STALE (router flags are requested
 *    only when @on_link is zero);
 *  - confirm the looked-up dst;
 *  - compare @neigh with the route's current neighbour (duplicate check);
 *  - make a host-route copy for @dest with ip6_rt_copy(), point it at
 *    @neigh and insert it with ip6_ins_rt();
 *  - fire NETEVENT_REDIRECT with the old and new dst;
 *  - test the old route for RTF_CACHE and release the reference on it.
 * NOTE(review): the jumps/returns taken after each check and the body of
 * the RTF_CACHE branch are not visible in this listing.
 */
1600 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1601 const struct in6_addr *saddr,
1602 struct neighbour *neigh, u8 *lladdr, int on_link)
1604 struct rt6_info *rt, *nrt = NULL;
1605 struct netevent_redirect netevent;
1606 struct net *net = dev_net(neigh->dev);
/* @saddr is what gets passed as ip6_route_redirect()'s gateway argument. */
1608 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1610 if (rt == net->ipv6.ip6_null_entry) {
1611 if (net_ratelimit())
1612 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1613 "for redirect target\n");
1618 * We have finally decided to accept it.
1621 neigh_update(neigh, lladdr, NUD_STALE,
1622 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1623 NEIGH_UPDATE_F_OVERRIDE|
1624 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1625 NEIGH_UPDATE_F_ISROUTER))
1629 * Redirect received -> path was valid.
1630 * Look, redirects are sent only in response to data packets,
1631 * so that this nexthop apparently is reachable. --ANK
1633 dst_confirm(&rt->dst);
1635 /* Duplicate redirect: silently ignore. */
1636 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1639 nrt = ip6_rt_copy(rt, dest);
/*
 * The copy gets a fixed flag set; RTF_GATEWAY is cleared again just below
 * under a condition that is not visible here (presumably on_link - confirm).
 */
1643 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1645 nrt->rt6i_flags &= ~RTF_GATEWAY;
/* The new route's gateway and neighbour both come from @neigh. */
1647 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1648 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1650 if (ip6_ins_rt(nrt))
1653 netevent.old = &rt->dst;
1654 netevent.new = &nrt->dst;
1655 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1657 if (rt->rt6i_flags & RTF_CACHE) {
1663 dst_release(&rt->dst);
1667 * Handle ICMP "packet too big" messages
1668 * i.e. Path MTU discovery
/*
 * Apply a reported path MTU (@pmtu) to the route that rt6_lookup() returns
 * for @daddr / @saddr with @ifindex.
 *
 * Visible behaviour:
 *  - the route is tested with rt6_check_expired();
 *  - @pmtu is compared against the route's current dst_mtu() (pmtu >= mtu);
 *  - a @pmtu below IPV6_MIN_MTU is replaced by IPV6_MIN_MTU;
 *  - RTAX_FEATURE_ALLFRAG is OR-ed into RTAX_FEATURES on the route that
 *    receives the new MTU (the guarding condition is not visible);
 *  - an RTF_CACHE route is updated in place: RTAX_MTU set, expiry set to
 *    ip6_rt_mtu_expires, RTF_MODIFIED|RTF_EXPIRES added;
 *  - otherwise a new route is obtained from rt6_alloc_cow() (no neighbour
 *    and not RTF_NONEXTHOP) or rt6_alloc_clone(), given the MTU and expiry,
 *    and flagged RTF_DYNAMIC|RTF_EXPIRES;
 *  - the reference on the looked-up route is released at the end.
 * NOTE(review): the early exits, the NULL checks and the insertion of the
 * new route are on lines not visible in this listing.
 */
1671 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1672 struct net *net, u32 pmtu, int ifindex)
1674 struct rt6_info *rt, *nrt;
1677 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1681 if (rt6_check_expired(rt)) {
1686 if (pmtu >= dst_mtu(&rt->dst))
1689 if (pmtu < IPV6_MIN_MTU) {
1691 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1692 * MTU (1280) and a fragment header should always be included
1693 * after a node receiving Too Big message reporting PMTU is
1694 * less than the IPv6 Minimum Link MTU.
1696 pmtu = IPV6_MIN_MTU;
1700 /* New mtu received -> path was valid.
1701 They are sent only in response to data packets,
1702 so that this nexthop apparently is reachable. --ANK
1704 dst_confirm(&rt->dst);
1706 /* Host route. If it is static, it would be better
1707 not to override it, but add new one, so that
1708 when cache entry will expire old pmtu
1709 would return automatically.
1711 if (rt->rt6i_flags & RTF_CACHE) {
1712 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1714 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1715 features |= RTAX_FEATURE_ALLFRAG;
1716 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1718 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1719 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1724 Two cases are possible:
1725 1. It is connected route. Action: COW
1726 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1728 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1729 nrt = rt6_alloc_cow(rt, daddr, saddr);
1731 nrt = rt6_alloc_clone(rt, daddr);
1734 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1736 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1737 features |= RTAX_FEATURE_ALLFRAG;
1738 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1741 /* According to RFC 1981, detecting PMTU increase shouldn't be
1742 * happened within 5 mins, the recommended timer is 10 mins.
1743 * Here this route expiration time is set to ip6_rt_mtu_expires
1744 * which is 10 mins. After 10 mins the decreased pmtu is expired
1745 * and detecting PMTU increase will be automatically happened.
1747 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1748 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1753 dst_release(&rt->dst);
/*
 * Entry point for a path-MTU report received on @dev.
 * Runs rt6_do_pmtu_disc() twice in @dev's network namespace: first with
 * ifindex 0, then with @dev's own ifindex, so both the route selected
 * without an interface constraint and the route through the receiving
 * interface are updated (rationale in the original comment below).
 */
1756 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1757 struct net_device *dev, u32 pmtu)
1759 struct net *net = dev_net(dev);
1762 * RFC 1981 states that a node "MUST reduce the size of the packets it
1763 * is sending along the path" that caused the Packet Too Big message.
1764 * Since it's not possible in the general case to determine which
1765 * interface was used to send the original packet, we update the MTU
1766 * on the interface that will be used to send future packets. We also
1767 * update the MTU on the interface that received the Packet Too Big in
1768 * case the original packet was forced out that interface with
1769 * SO_BINDTODEVICE or similar. This is the next best thing to the
1770 * correct behaviour, which would be to update the MTU on all
1773 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1774 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1778 * Misc support functions
/*
 * Allocate a new rt6_info modelled on @ort but keyed to the single
 * address @dest.
 *
 * Copied from @ort: input/output handlers, metrics, dst.error, rt6i_idev
 * (with a reference taken via in6_dev_hold()), gateway, flags (minus
 * RTF_EXPIRES), prefsrc, table and - under CONFIG_IPV6_SUBTREES - the
 * source key. Set fresh: DST_HOST, destination = *dest with plen 128,
 * lastuse = jiffies, expires = 0, metric = 0.
 * NOTE(review): the remaining ip6_dst_alloc() arguments, the allocation
 * failure check and the return statement are not visible in this listing.
 */
1781 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1782 const struct in6_addr *dest)
1784 struct net *net = dev_net(ort->dst.dev);
1785 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1789 rt->dst.input = ort->dst.input;
1790 rt->dst.output = ort->dst.output;
1791 rt->dst.flags |= DST_HOST;
/* The copy is always a /128 entry for dest, whatever ort's prefix was. */
1793 rt->rt6i_dst.addr = *dest;
1794 rt->rt6i_dst.plen = 128;
1795 dst_copy_metrics(&rt->dst, &ort->dst);
1796 rt->dst.error = ort->dst.error;
1797 rt->rt6i_idev = ort->rt6i_idev;
1799 in6_dev_hold(rt->rt6i_idev);
1800 rt->dst.lastuse = jiffies;
1801 rt->dst.expires = 0;
1803 rt->rt6i_gateway = ort->rt6i_gateway;
/* Expiry state is not inherited: the flag is masked out and expires is 0. */
1804 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1805 rt->rt6i_metric = 0;
1807 #ifdef CONFIG_IPV6_SUBTREES
1808 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1810 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1811 rt->rt6i_table = ort->rt6i_table;
1816 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Search table RT6_TABLE_INFO for a route to @prefix/@prefixlen that goes
 * out through interface @ifindex, carries both RTF_ROUTEINFO and
 * RTF_GATEWAY, and has gateway @gwaddr.
 * The node is located with fib6_locate() (no source prefix) and its leaf
 * list is scanned with tb6_lock held for writing.
 * NOTE(review): what happens on a match (presumably a reference is taken
 * and the route returned), the NULL checks on table/fn and the return
 * statement are not visible in this listing.
 */
1817 static struct rt6_info *rt6_get_route_info(struct net *net,
1818 const struct in6_addr *prefix, int prefixlen,
1819 const struct in6_addr *gwaddr, int ifindex)
1821 struct fib6_node *fn;
1822 struct rt6_info *rt = NULL;
1823 struct fib6_table *table;
1825 table = fib6_get_table(net, RT6_TABLE_INFO);
1829 write_lock_bh(&table->tb6_lock);
1830 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1834 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1835 if (rt->dst.dev->ifindex != ifindex)
1837 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1839 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1845 write_unlock_bh(&table->tb6_lock);
/*
 * Install a gateway route for @prefix/@prefixlen via @gwaddr on interface
 * @ifindex in table RT6_TABLE_INFO, then look it up again.
 *
 * The route is created with metric IP6_RT_PRIO_USER and flags
 * RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
 * RTF_DEFAULT is added under the condition described by the inline comment
 * (the test itself is not visible here). The result of ip6_route_add() is
 * not examined in the visible code; the return value is whatever
 * rt6_get_route_info() finds afterwards.
 * NOTE(review): the declaration of the `pref` parameter is on a line not
 * visible in this listing.
 */
1849 static struct rt6_info *rt6_add_route_info(struct net *net,
1850 const struct in6_addr *prefix, int prefixlen,
1851 const struct in6_addr *gwaddr, int ifindex,
1854 struct fib6_config cfg = {
1855 .fc_table = RT6_TABLE_INFO,
1856 .fc_metric = IP6_RT_PRIO_USER,
1857 .fc_ifindex = ifindex,
1858 .fc_dst_len = prefixlen,
1859 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1860 RTF_UP | RTF_PREF(pref),
1862 .fc_nlinfo.nlh = NULL,
1863 .fc_nlinfo.nl_net = net,
1866 cfg.fc_dst = *prefix;
1867 cfg.fc_gateway = *gwaddr;
1869 /* We should treat it as a default route if prefix length is 0. */
1871 cfg.fc_flags |= RTF_DEFAULT;
1873 ip6_route_add(&cfg);
1875 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Look for a default-router entry in table RT6_TABLE_DFLT.
 * Scans the leaf list of the table root, with tb6_lock held for writing,
 * for a route that uses @dev, has BOTH RTF_ADDRCONF and RTF_DEFAULT set,
 * and whose gateway equals @addr.
 * NOTE(review): the action on a match (presumably stop and take a
 * reference), the NULL check on table and the return statement are not
 * visible in this listing.
 */
1879 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1881 struct rt6_info *rt;
1882 struct fib6_table *table;
1884 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1888 write_lock_bh(&table->tb6_lock);
1889 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1890 if (dev == rt->dst.dev &&
1891 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1892 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1897 write_unlock_bh(&table->tb6_lock);
/*
 * Add a default route via @gwaddr on @dev to table RT6_TABLE_DFLT and
 * return the entry found by a follow-up rt6_get_dflt_router() lookup.
 *
 * The route uses metric IP6_RT_PRIO_USER and flags RTF_GATEWAY |
 * RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref).
 * No destination prefix is set in the visible initialiser. The result of
 * ip6_route_add() is not examined in the visible code.
 * NOTE(review): the declaration of the `pref` parameter is on a line not
 * visible in this listing.
 */
1901 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1902 struct net_device *dev,
1905 struct fib6_config cfg = {
1906 .fc_table = RT6_TABLE_DFLT,
1907 .fc_metric = IP6_RT_PRIO_USER,
1908 .fc_ifindex = dev->ifindex,
1909 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1910 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1912 .fc_nlinfo.nlh = NULL,
1913 .fc_nlinfo.nl_net = dev_net(dev),
1916 cfg.fc_gateway = *gwaddr;
1918 ip6_route_add(&cfg);
1920 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Walk the root leaf list of table RT6_TABLE_DFLT in @net under the read
 * lock and act on routes flagged as default/addrconf. The read lock is
 * dropped inside the matching branch as well as after the loop.
 * NOTE(review): the flag test here is true when EITHER RTF_DEFAULT or
 * RTF_ADDRCONF is set, whereas rt6_get_dflt_router() requires both -
 * confirm that the difference is intended.
 * NOTE(review): the statements executed for a matching route (presumably
 * deleting it and restarting the scan) are not visible in this listing.
 */
1923 void rt6_purge_dflt_routers(struct net *net)
1925 struct rt6_info *rt;
1926 struct fib6_table *table;
1928 /* NOTE: Keep consistent with rt6_get_dflt_router */
1929 table = fib6_get_table(net, RT6_TABLE_DFLT);
1934 read_lock_bh(&table->tb6_lock);
1935 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1936 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1938 read_unlock_bh(&table->tb6_lock);
1943 read_unlock_bh(&table->tb6_lock);
/*
 * Translate an ioctl-style struct in6_rtmsg into a struct fib6_config.
 * @cfg is zeroed first, so every field not assigned below stays 0.
 * The target table is always RT6_TABLE_MAIN and the netlink info carries
 * only the namespace @net. Note that rtmsg_info supplies fc_expires.
 */
1946 static void rtmsg_to_fib6_config(struct net *net,
1947 struct in6_rtmsg *rtmsg,
1948 struct fib6_config *cfg)
1950 memset(cfg, 0, sizeof(*cfg));
1952 cfg->fc_table = RT6_TABLE_MAIN;
1953 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1954 cfg->fc_metric = rtmsg->rtmsg_metric;
1955 cfg->fc_expires = rtmsg->rtmsg_info;
1956 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1957 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1958 cfg->fc_flags = rtmsg->rtmsg_flags;
1960 cfg->fc_nlinfo.nl_net = net;
1962 cfg->fc_dst = rtmsg->rtmsg_dst;
1963 cfg->fc_src = rtmsg->rtmsg_src;
1964 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ioctl handler for adding (SIOCADDRT) and deleting (SIOCDELRT) routes.
 * Both commands share one path: a CAP_NET_ADMIN capability test, a copy of
 * a struct in6_rtmsg from user pointer @arg, conversion with
 * rtmsg_to_fib6_config(), then ip6_route_add() or ip6_route_del().
 * NOTE(review): the switch statements, the error returns after the
 * capability and copy checks, and any locking around the add/del calls are
 * on lines not visible in this listing.
 */
1967 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1969 struct fib6_config cfg;
1970 struct in6_rtmsg rtmsg;
1974 case SIOCADDRT: /* Add a route */
1975 case SIOCDELRT: /* Delete a route */
1976 if (!capable(CAP_NET_ADMIN))
1978 err = copy_from_user(&rtmsg, arg,
1979 sizeof(struct in6_rtmsg));
1983 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1988 err = ip6_route_add(&cfg);
1991 err = ip6_route_del(&cfg);
2005 * Drop the packet on the floor
/*
 * Common drop path for packets that hit a reject-type route.
 * @code is the ICMPv6 Destination Unreachable code sent back to the
 * sender; @ipstats_mib_noroutes selects which counter is bumped.
 * For IPSTATS_MIB_INNOROUTES the destination address type is examined
 * first, and an unspecified destination (IPV6_ADDR_ANY) is counted as
 * IPSTATS_MIB_INADDRERRORS instead. Counters are charged to the device
 * and inet6_dev of the skb's dst.
 * NOTE(review): the break/fall-through layout of the switch, the freeing
 * of the skb and the return value are not visible in this listing.
 */
2008 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2011 struct dst_entry *dst = skb_dst(skb);
2012 switch (ipstats_mib_noroutes) {
2013 case IPSTATS_MIB_INNOROUTES:
2014 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2015 if (type == IPV6_ADDR_ANY) {
2016 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2017 IPSTATS_MIB_INADDRERRORS);
2021 case IPSTATS_MIB_OUTNOROUTES:
2022 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2023 ipstats_mib_noroutes);
2026 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input-side drop: ICMPV6_NOROUTE, accounted as IPSTATS_MIB_INNOROUTES. */
2031 static int ip6_pkt_discard(struct sk_buff *skb)
2033 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/*
 * Output-side drop: ICMPV6_NOROUTE, accounted as IPSTATS_MIB_OUTNOROUTES.
 * skb->dev is first pointed at the dst's device before the common drop
 * path runs.
 */
2036 static int ip6_pkt_discard_out(struct sk_buff *skb)
2038 skb->dev = skb_dst(skb)->dev;
2039 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2042 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * Input-side drop for prohibit routes: ICMPV6_ADM_PROHIBITED, accounted as
 * IPSTATS_MIB_INNOROUTES.
 */
2044 static int ip6_pkt_prohibit(struct sk_buff *skb)
2046 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/*
 * Output-side drop for prohibit routes: ICMPV6_ADM_PROHIBITED, accounted
 * as IPSTATS_MIB_OUTNOROUTES. skb->dev is first pointed at the dst's
 * device before the common drop path runs.
 */
2049 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2051 skb->dev = skb_dst(skb)->dev;
2052 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2058 * Allocate a dst for local (unicast / anycast) address.
/*
 * Build a /128 host route for a locally configured address @addr.
 *
 * The dst is allocated against the namespace's loopback device, marked
 * DST_HOST with ip6_input / ip6_output handlers, tied to @idev, given
 * dst.obsolete = -1 and flags RTF_UP | RTF_NONEXTHOP plus either
 * RTF_ANYCAST or RTF_LOCAL. A neighbour is bound with rt6_bind_neighbour(),
 * the route is assigned to table RT6_TABLE_LOCAL and its refcount is set
 * to 1.
 * Returns ERR_PTR(-ENOMEM) after a rate-limited warning on the allocation
 * failure path, or ERR_PTR(err) on the path following rt6_bind_neighbour().
 * NOTE(review): the third parameter (which presumably selects ANYCAST vs
 * LOCAL), the failure tests themselves, cleanup on the error path and the
 * success return are on lines not visible in this listing.
 */
2061 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2062 const struct in6_addr *addr,
2065 struct net *net = dev_net(idev->dev);
2066 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2067 net->loopback_dev, 0);
2071 if (net_ratelimit())
2072 pr_warning("IPv6: Maximum number of routes reached,"
2073 " consider increasing route/max_size.\n");
2074 return ERR_PTR(-ENOMEM);
2079 rt->dst.flags |= DST_HOST;
2080 rt->dst.input = ip6_input;
2081 rt->dst.output = ip6_output;
2082 rt->rt6i_idev = idev;
2083 rt->dst.obsolete = -1;
2085 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2087 rt->rt6i_flags |= RTF_ANYCAST;
2089 rt->rt6i_flags |= RTF_LOCAL;
2090 err = rt6_bind_neighbour(rt, rt->dst.dev);
2093 return ERR_PTR(err);
2096 rt->rt6i_dst.addr = *addr;
2097 rt->rt6i_dst.plen = 128;
2098 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2100 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Choose a source address for traffic to @daddr over route @rt and store
 * it in *@saddr.
 * If the route has a preferred source configured (rt6i_prefsrc.plen is
 * non-zero) that address is used; the other visible path asks
 * ipv6_dev_get_saddr(), passing the device of the route's inet6_dev (or
 * NULL when the route has none) together with @daddr and `prefs`.
 * NOTE(review): the `prefs` parameter declaration, the else keyword and
 * the return statement are on lines not visible in this listing.
 */
2105 int ip6_route_get_saddr(struct net *net,
2106 struct rt6_info *rt,
2107 const struct in6_addr *daddr,
2109 struct in6_addr *saddr)
2111 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2113 if (rt->rt6i_prefsrc.plen)
2114 *saddr = rt->rt6i_prefsrc.addr;
2116 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2117 daddr, prefs, saddr);
2121 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all():
 * the device to match (NULL matches any device) and the address being
 * removed. fib6_remove_prefsrc() also reads a ->net member whose
 * declaration is not visible in this listing.
 */
2122 struct arg_dev_net_ip {
2123 struct net_device *dev;
2125 struct in6_addr *addr;
/*
 * fib6_clean_all() callback: clear the preferred-source setting of @rt
 * when it names the address being removed.
 * @arg is a struct arg_dev_net_ip. A route qualifies when its device
 * matches arg->dev (or arg->dev is NULL), it is not the namespace's null
 * entry, and its rt6i_prefsrc address equals arg->addr. Clearing is done
 * by setting rt6i_prefsrc.plen to 0; the stored address is left as is.
 * NOTE(review): the return statement is not visible in this listing.
 */
2128 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2130 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2131 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2132 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2134 if (((void *)rt->dst.dev == dev || !dev) &&
2135 rt != net->ipv6.ip6_null_entry &&
2136 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2137 /* remove prefsrc entry */
2138 rt->rt6i_prefsrc.plen = 0;
/*
 * Run fib6_remove_prefsrc() over every route in the namespace of @ifp's
 * device, restricted to that device.
 * NOTE(review): the initialisers for the net and addr members of adni are
 * on lines not visible in this listing.
 */
2143 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2145 struct net *net = dev_net(ifp->idev->dev);
2146 struct arg_dev_net_ip adni = {
2147 .dev = ifp->idev->dev,
2151 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/*
 * Argument bundle for fib6_ifdown(): the device going down (NULL matches
 * every device). fib6_ifdown() also reads a ->net member whose declaration
 * is not visible in this listing.
 */
2154 struct arg_dev_net {
2155 struct net_device *dev;
/*
 * Tree-walk callback used by rt6_ifdown(). A route is selected when it
 * uses adn->dev (or adn->dev is NULL) and is not the namespace's null
 * entry.
 * NOTE(review): the values returned for selected / unselected routes are
 * on lines not visible here; presumably a selected route is reported for
 * removal - confirm.
 */
2159 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2161 const struct arg_dev_net *adn = arg;
2162 const struct net_device *dev = adn->dev;
2164 if ((rt->dst.dev == dev || !dev) &&
2165 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Apply fib6_ifdown() for @dev to all FIB tables of @net and, via
 * icmp6_clean_all(), to a second set of routes kept outside them.
 * NOTE(review): the initialisers of adn are on lines not visible in this
 * listing.
 */
2171 void rt6_ifdown(struct net *net, struct net_device *dev)
2173 struct arg_dev_net adn = {
2178 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2179 icmp6_clean_all(fib6_ifdown, &adn);
/*
 * Argument bundle for rt6_mtu_change_route(): the device whose MTU changed.
 * rt6_mtu_change_route() also reads a ->mtu member whose declaration is
 * not visible in this listing.
 */
2182 struct rt6_mtu_change_arg
2184 struct net_device *dev;
/*
 * fib6_clean_all() callback run after a device MTU change.
 * @p_arg is a struct rt6_mtu_change_arg (device and new MTU).
 *
 * A route's RTAX_MTU metric is overwritten with arg->mtu when all of the
 * following hold:
 *  - the route uses arg->dev;
 *  - RTAX_MTU is not locked on the route;
 *  - either the route's current MTU is >= the new MTU (a decrease), or it
 *    is smaller than the new MTU but equal to idev->cnf.mtu6 (an increase
 *    of a route that was tracking the device MTU).
 * NOTE(review): the check on idev being NULL and the return value are on
 * lines not visible in this listing.
 */
2188 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2190 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2191 struct inet6_dev *idev;
2193 /* In IPv6 pmtu discovery is not optional,
2194 so that RTAX_MTU lock cannot disable it.
2195 We still use this lock to block changes
2196 caused by addrconf/ndisc.
2199 idev = __in6_dev_get(arg->dev);
2203 /* For administrative MTU increase, there is no way to discover
2204 IPv6 PMTU increase, so PMTU increase should be updated here.
2205 Since RFC 1981 doesn't include administrative MTU increase
2206 update PMTU increase is a MUST. (i.e. jumbo frame)
2209 If new MTU is less than route PMTU, this new MTU will be the
2210 lowest MTU in the path, update the route PMTU to reflect PMTU
2211 decreases; if new MTU is greater than route PMTU, and the
2212 old MTU is the lowest MTU in the path, update the route PMTU
2213 to reflect the increase. In this case if the other nodes' MTU
2214 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2217 if (rt->dst.dev == arg->dev &&
2218 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2219 (dst_mtu(&rt->dst) >= arg->mtu ||
2220 (dst_mtu(&rt->dst) < arg->mtu &&
2221 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2222 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * Propagate a new device MTU to routes: runs rt6_mtu_change_route() over
 * every route in @dev's network namespace.
 * NOTE(review): the initialisers of arg (presumably dev and mtu) are on
 * lines not visible in this listing.
 */
2227 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2229 struct rt6_mtu_change_arg arg = {
2234 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/*
 * Netlink attribute policy used by the nlmsg_parse() calls in
 * rtm_to_fib6_config() and inet6_rtm_getroute(). RTA_GATEWAY is
 * constrained by length (one struct in6_addr); the interface and priority
 * attributes are u32; RTA_METRICS is a nested attribute. Attributes not
 * listed here have no policy entry in the visible initialiser.
 */
2237 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2238 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2239 [RTA_OIF] = { .type = NLA_U32 },
2240 [RTA_IIF] = { .type = NLA_U32 },
2241 [RTA_PRIORITY] = { .type = NLA_U32 },
2242 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Translate an rtnetlink route message (@nlh, received in @skb) into a
 * struct fib6_config.
 *
 * The message is parsed against rtm_ipv6_policy, @cfg is zeroed, and then
 * filled from the rtmsg header (table, prefix lengths, protocol) and from
 * the attributes. Flags start as RTF_UP; RTN_UNREACHABLE adds RTF_REJECT,
 * RTN_LOCAL adds RTF_LOCAL and the presence of RTA_GATEWAY adds
 * RTF_GATEWAY. RTA_DST / RTA_SRC must carry at least as many bytes as the
 * prefix length rounded up to whole bytes, and only that many bytes are
 * copied. A table id supplied as an attribute overrides the header's
 * rtm_table.
 * fc_mx points into the attribute payload of @nlh rather than at a copy,
 * so @cfg must not outlive the message.
 * NOTE(review): the presence tests for RTA_DST, RTA_SRC, RTA_OIF and
 * RTA_TABLE, the error exits and the return statement are on lines not
 * visible in this listing.
 */
2245 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2246 struct fib6_config *cfg)
2249 struct nlattr *tb[RTA_MAX+1];
2252 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2257 rtm = nlmsg_data(nlh);
2258 memset(cfg, 0, sizeof(*cfg));
2260 cfg->fc_table = rtm->rtm_table;
2261 cfg->fc_dst_len = rtm->rtm_dst_len;
2262 cfg->fc_src_len = rtm->rtm_src_len;
2263 cfg->fc_flags = RTF_UP;
2264 cfg->fc_protocol = rtm->rtm_protocol;
2266 if (rtm->rtm_type == RTN_UNREACHABLE)
2267 cfg->fc_flags |= RTF_REJECT;
2269 if (rtm->rtm_type == RTN_LOCAL)
2270 cfg->fc_flags |= RTF_LOCAL;
2272 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2273 cfg->fc_nlinfo.nlh = nlh;
2274 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2276 if (tb[RTA_GATEWAY]) {
2277 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2278 cfg->fc_flags |= RTF_GATEWAY;
/* Prefix length in bits is rounded up to whole bytes before the copy. */
2282 int plen = (rtm->rtm_dst_len + 7) >> 3;
2284 if (nla_len(tb[RTA_DST]) < plen)
2287 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2291 int plen = (rtm->rtm_src_len + 7) >> 3;
2293 if (nla_len(tb[RTA_SRC]) < plen)
2296 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2299 if (tb[RTA_PREFSRC])
2300 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2303 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2305 if (tb[RTA_PRIORITY])
2306 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2308 if (tb[RTA_METRICS]) {
2309 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2310 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2314 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/*
 * RTM_DELROUTE handler (registered in ip6_route_init()): converts the
 * request with rtm_to_fib6_config() and passes it to ip6_route_del().
 * NOTE(review): the check of `err` after the conversion is on a line not
 * visible in this listing.
 */
2321 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2323 struct fib6_config cfg;
2326 err = rtm_to_fib6_config(skb, nlh, &cfg);
2330 return ip6_route_del(&cfg);
/*
 * RTM_NEWROUTE handler (registered in ip6_route_init()): converts the
 * request with rtm_to_fib6_config() and passes it to ip6_route_add().
 * NOTE(review): the check of `err` after the conversion is on a line not
 * visible in this listing.
 */
2333 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2335 struct fib6_config cfg;
2338 err = rtm_to_fib6_config(skb, nlh, &cfg);
2342 return ip6_route_add(&cfg);
/*
 * Upper bound on the size of a route notification built by
 * rt6_fill_node(): the aligned rtmsg header plus one slot for each
 * attribute listed below. inet6_rt_notify() sizes its skb with this value
 * and treats -EMSGSIZE from rt6_fill_node() as a bug here, so any attribute
 * added to rt6_fill_node() needs a matching term.
 */
2345 static inline size_t rt6_nlmsg_size(void)
2347 return NLMSG_ALIGN(sizeof(struct rtmsg))
2348 + nla_total_size(16) /* RTA_SRC */
2349 + nla_total_size(16) /* RTA_DST */
2350 + nla_total_size(16) /* RTA_GATEWAY */
2351 + nla_total_size(16) /* RTA_PREFSRC */
2352 + nla_total_size(4) /* RTA_TABLE */
2353 + nla_total_size(4) /* RTA_IIF */
2354 + nla_total_size(4) /* RTA_OIF */
2355 + nla_total_size(4) /* RTA_PRIORITY */
2356 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2357 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialise route @rt into @skb as one rtnetlink message of type @type.
 *
 * @dst / @src: when given, reported as /128 RTA_DST / RTA_SRC instead of
 *              the route's own prefix (RTA_SRC only with
 *              CONFIG_IPV6_SUBTREES).
 * @iif:        input interface to report; also used as the argument of
 *              RTA_IIF.
 * @pid, @seq, @flags: copied into the netlink header by nlmsg_put().
 * @prefix:     non-zero restricts output to RTF_PREFIX_RT routes.
 * @nowait:     forwarded to ip6mr_get_route() for multicast destinations.
 *
 * Header fields: rtm_type is UNREACHABLE for RTF_REJECT, LOCAL for
 * RTF_LOCAL or a loopback device, otherwise UNICAST; rtm_protocol starts
 * from rt6i_protocol and is overridden by RTF_DYNAMIC (REDIRECT),
 * RTF_ADDRCONF (KERNEL) or RTF_DEFAULT (RA), tested in that order;
 * RTF_CACHE adds RTM_F_CLONED.
 * Attributes emitted in the visible code: RTA_TABLE, RTA_DST, RTA_SRC,
 * RTA_IIF, RTA_PREFSRC, the metrics, RTA_GATEWAY (from the route's
 * neighbour), RTA_OIF, RTA_PRIORITY and the cache info.
 * Returns the result of nlmsg_end() on success; the failure path cancels
 * the partially built message.
 * NOTE(review): many conditions, the declarations of rtm/table/expires/
 * ts/tsage, the early return for non-prefix routes and the value returned
 * after nlmsg_cancel() are on lines not visible in this listing.
 */
2360 static int rt6_fill_node(struct net *net,
2361 struct sk_buff *skb, struct rt6_info *rt,
2362 struct in6_addr *dst, struct in6_addr *src,
2363 int iif, int type, u32 pid, u32 seq,
2364 int prefix, int nowait, unsigned int flags)
2366 const struct inet_peer *peer;
2368 struct nlmsghdr *nlh;
2371 struct neighbour *n;
2374 if (prefix) { /* user wants prefix routes only */
2375 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2376 /* success since this is not a prefix route */
2381 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2385 rtm = nlmsg_data(nlh);
2386 rtm->rtm_family = AF_INET6;
2387 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2388 rtm->rtm_src_len = rt->rt6i_src.plen;
/* The table id goes into both the header field and an RTA_TABLE attribute. */
2391 table = rt->rt6i_table->tb6_id;
2393 table = RT6_TABLE_UNSPEC;
2394 rtm->rtm_table = table;
2395 NLA_PUT_U32(skb, RTA_TABLE, table);
2396 if (rt->rt6i_flags & RTF_REJECT)
2397 rtm->rtm_type = RTN_UNREACHABLE;
2398 else if (rt->rt6i_flags & RTF_LOCAL)
2399 rtm->rtm_type = RTN_LOCAL;
2400 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2401 rtm->rtm_type = RTN_LOCAL;
2403 rtm->rtm_type = RTN_UNICAST;
2405 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2406 rtm->rtm_protocol = rt->rt6i_protocol;
2407 if (rt->rt6i_flags & RTF_DYNAMIC)
2408 rtm->rtm_protocol = RTPROT_REDIRECT;
2409 else if (rt->rt6i_flags & RTF_ADDRCONF)
2410 rtm->rtm_protocol = RTPROT_KERNEL;
2411 else if (rt->rt6i_flags & RTF_DEFAULT)
2412 rtm->rtm_protocol = RTPROT_RA;
2414 if (rt->rt6i_flags & RTF_CACHE)
2415 rtm->rtm_flags |= RTM_F_CLONED;
2418 NLA_PUT(skb, RTA_DST, 16, dst);
2419 rtm->rtm_dst_len = 128;
2420 } else if (rtm->rtm_dst_len)
2421 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2422 #ifdef CONFIG_IPV6_SUBTREES
2424 NLA_PUT(skb, RTA_SRC, 16, src);
2425 rtm->rtm_src_len = 128;
2426 } else if (rtm->rtm_src_len)
2427 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2430 #ifdef CONFIG_IPV6_MROUTE
2431 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2432 int err = ip6mr_get_route(net, skb, rtm, nowait);
2437 goto nla_put_failure;
2439 if (err == -EMSGSIZE)
2440 goto nla_put_failure;
2445 NLA_PUT_U32(skb, RTA_IIF, iif);
2447 struct in6_addr saddr_buf;
2448 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2449 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2452 if (rt->rt6i_prefsrc.plen) {
2453 struct in6_addr saddr_buf;
2454 saddr_buf = rt->rt6i_prefsrc.addr;
2455 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2458 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2459 goto nla_put_failure;
2462 n = dst_get_neighbour_noref(&rt->dst);
2464 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2468 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2470 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Remaining lifetime is only computed for routes carrying RTF_EXPIRES. */
2472 if (!(rt->rt6i_flags & RTF_EXPIRES))
2474 else if (rt->dst.expires - jiffies < INT_MAX)
2475 expires = rt->dst.expires - jiffies;
2479 peer = rt->rt6i_peer;
2481 if (peer && peer->tcp_ts_stamp) {
2483 tsage = get_seconds() - peer->tcp_ts_stamp;
2486 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2487 expires, rt->dst.error) < 0)
2488 goto nla_put_failure;
2490 return nlmsg_end(skb, nlh);
2493 nlmsg_cancel(skb, nlh);
/*
 * Dump callback: emit @rt as an RTM_NEWROUTE message flagged NLM_F_MULTI
 * into the dump skb described by @p_arg (a struct rt6_rtnl_dump_arg).
 * If the dump request carries a full rtmsg header, its RTM_F_PREFIX flag
 * selects prefix-routes-only output. The pid and sequence number are taken
 * from the original request.
 * NOTE(review): the declaration of `prefix` and the else branch for a
 * short request are on lines not visible in this listing.
 */
2497 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2499 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2502 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2503 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2504 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2508 return rt6_fill_node(arg->net,
2509 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2510 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2511 prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler (registered in ip6_route_init()): resolve a route
 * for the addresses/interfaces given in the request and unicast the answer
 * back to the requester.
 *
 * The request is parsed against rtm_ipv6_policy. RTA_SRC and RTA_DST must
 * each hold at least a full struct in6_addr and are copied into a zeroed
 * flowi6; RTA_IIF and RTA_OIF supply the interface indices. When an input
 * interface is involved it is looked up with __dev_get_by_index(). A reply
 * skb of NLMSG_GOODSIZE is allocated, the route is obtained from
 * ip6_route_output(), attached to the skb with skb_dst_set(), serialised
 * by rt6_fill_node() as RTM_NEWROUTE and sent with rtnl_unicast().
 * NOTE(review): the attribute presence tests, all error exits and the
 * cleanup of the skb on failure are on lines not visible in this listing.
 */
2514 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2516 struct net *net = sock_net(in_skb->sk);
2517 struct nlattr *tb[RTA_MAX+1];
2518 struct rt6_info *rt;
2519 struct sk_buff *skb;
2524 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2529 memset(&fl6, 0, sizeof(fl6));
2532 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2535 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2539 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2542 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2546 iif = nla_get_u32(tb[RTA_IIF]);
2549 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2552 struct net_device *dev;
2553 dev = __dev_get_by_index(net, iif);
2560 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2566 /* Reserve room for dummy headers, this skb can pass
2567 through good chunk of routing engine.
2569 skb_reset_mac_header(skb);
2570 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2572 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2573 skb_dst_set(skb, &rt->dst);
2575 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2576 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2577 nlh->nlmsg_seq, 0, 0, 0);
2583 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route change (@event, e.g. a new or deleted route) for @rt
 * to the RTNLGRP_IPV6_ROUTE netlink group.
 * The message is sized by rt6_nlmsg_size(), filled by rt6_fill_node() with
 * the requester's pid and sequence number from @info (sequence 0 when
 * there is no originating nlmsghdr) and sent with rtnl_notify(). The error
 * path reports `err` on the group through rtnl_set_sk_err().
 * NOTE(review): the allocation-failure check, the test on `err` after
 * rt6_fill_node() and the jump structure between the success and error
 * paths are on lines not visible in this listing.
 */
2588 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2590 struct sk_buff *skb;
2591 struct net *net = info->nl_net;
2596 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2598 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2602 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2603 event, info->pid, seq, 0, 0, 0);
2605 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2606 WARN_ON(err == -EMSGSIZE);
2610 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2611 info->nlh, gfp_any());
2615 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier callback. When a loopback device registers
 * (NETDEV_REGISTER with IFF_LOOPBACK), the namespace's special route
 * entries - the null entry and, with CONFIG_IPV6_MULTIPLE_TABLES, the
 * prohibit and blackhole entries - are pointed at that device and each
 * gets its own in6_dev_get() reference on the device's inet6_dev.
 * NOTE(review): the return value is on a line not visible in this listing.
 */
2618 static int ip6_route_dev_notify(struct notifier_block *this,
2619 unsigned long event, void *data)
2621 struct net_device *dev = (struct net_device *)data;
2622 struct net *net = dev_net(dev);
2624 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2625 net->ipv6.ip6_null_entry->dst.dev = dev;
2626 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2627 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2628 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2629 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2630 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2631 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2642 #ifdef CONFIG_PROC_FS
/*
 * Print one route as a line of the "ipv6_route" proc file.
 * @p_arg is the seq_file being written.
 *
 * Columns, in order: destination address and prefix length; source
 * address and prefix length (all zeros in the alternative to the
 * CONFIG_IPV6_SUBTREES branch); the next hop taken from the route's
 * neighbour entry, or all zeros; then metric, reference count, use
 * count and flags as 8-digit hex, and the device name (empty when the
 * route has no device).
 * NOTE(review): the test on `n`, the #else/#endif lines and the return
 * value are on lines not visible in this listing.
 */
2653 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2655 struct seq_file *m = p_arg;
2656 struct neighbour *n;
2658 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2660 #ifdef CONFIG_IPV6_SUBTREES
2661 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2663 seq_puts(m, "00000000000000000000000000000000 00 ");
2666 n = dst_get_neighbour_noref(&rt->dst);
2668 seq_printf(m, "%pi6", n->primary_key);
2670 seq_puts(m, "00000000000000000000000000000000");
2673 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2674 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2675 rt->dst.__use, rt->rt6i_flags,
2676 rt->dst.dev ? rt->dst.dev->name : "");
/*
 * seq_file show handler for "ipv6_route": walks every route of the
 * namespace stored in m->private with the read-only tree walker and prints
 * each through rt6_info_route().
 */
2680 static int ipv6_route_show(struct seq_file *m, void *v)
2682 struct net *net = (struct net *)m->private;
2683 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/* open() for "ipv6_route": binds ipv6_route_show via single_open_net(). */
2687 static int ipv6_route_open(struct inode *inode, struct file *file)
2689 return single_open_net(inode, file, ipv6_route_show);
/*
 * File operations for the per-namespace "ipv6_route" proc entry created in
 * ip6_route_net_init(). Open and release use the single_*_net helpers.
 * NOTE(review): the .read member is on a line not visible in this listing.
 */
2692 static const struct file_operations ipv6_route_proc_fops = {
2693 .owner = THIS_MODULE,
2694 .open = ipv6_route_open,
2696 .llseek = seq_lseek,
2697 .release = single_release_net,
/*
 * seq_file show handler for "rt6_stats": prints one line of seven
 * hexadecimal counters for the namespace in seq->private. Six come from
 * net->ipv6.rt6_stats (fib_nodes, fib_route_nodes, fib_rt_alloc,
 * fib_rt_entries, fib_rt_cache, fib_discarded_routes); the sixth column is
 * the dst entry count from dst_entries_get_slow().
 */
2700 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2702 struct net *net = (struct net *)seq->private;
2703 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2704 net->ipv6.rt6_stats->fib_nodes,
2705 net->ipv6.rt6_stats->fib_route_nodes,
2706 net->ipv6.rt6_stats->fib_rt_alloc,
2707 net->ipv6.rt6_stats->fib_rt_entries,
2708 net->ipv6.rt6_stats->fib_rt_cache,
2709 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2710 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for "rt6_stats": binds rt6_stats_seq_show via single_open_net(). */
2715 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2717 return single_open_net(inode, file, rt6_stats_seq_show);
/*
 * File operations for the per-namespace "rt6_stats" proc entry created in
 * ip6_route_net_init(). Open and release use the single_*_net helpers.
 * NOTE(review): the .read member is on a line not visible in this listing.
 */
2720 static const struct file_operations rt6_stats_seq_fops = {
2721 .owner = THIS_MODULE,
2722 .open = rt6_stats_seq_open,
2724 .llseek = seq_lseek,
2725 .release = single_release_net,
2727 #endif /* CONFIG_PROC_FS */
2729 #ifdef CONFIG_SYSCTL
/*
 * proc handler for the "flush" sysctl. The owning namespace is taken from
 * ctl->extra1 (set in ipv6_route_sysctl_init()). After the integer has
 * been parsed by proc_dointvec(), the route garbage collector is run with
 * an expiry of ~0UL when the delay is <= 0, or of the delay itself
 * otherwise.
 * NOTE(review): `delay` is sampled from flush_delay BEFORE proc_dointvec()
 * stores the newly written value, so the GC run uses the previous setting,
 * and the proc_dointvec() result is not examined in the visible code -
 * confirm both are intended.
 * NOTE(review): the declarations, any check on `write` and the return
 * value are on lines not visible in this listing.
 */
2732 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2733 void __user *buffer, size_t *lenp, loff_t *ppos)
2740 net = (struct net *)ctl->extra1;
2741 delay = net->ipv6.sysctl.flush_delay;
2742 proc_dointvec(ctl, write, buffer, lenp, ppos);
2743 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-namespace IPv6 route sysctl table. The .data
 * pointers here reference init_net (or the dst ops template);
 * ipv6_route_sysctl_init() duplicates the table and re-points each entry
 * BY INDEX, so the order of entries must stay in sync with that function:
 *   0 flush, 1 gc_thresh, 2 max_size, 3 gc_min_interval, 4 gc_timeout,
 *   5 gc_interval, 6 gc_elasticity, 7 mtu_expires, 8 min_adv_mss,
 *   9 gc_min_interval_ms.
 * gc_min_interval and gc_min_interval_ms expose the same variable through
 * different handlers (proc_dointvec_jiffies vs proc_dointvec_ms_jiffies).
 * NOTE(review): the .mode members, the braces around each entry and the
 * terminating entry are on lines not visible in this listing.
 */
2747 ctl_table ipv6_route_table_template[] = {
2749 .procname = "flush",
2750 .data = &init_net.ipv6.sysctl.flush_delay,
2751 .maxlen = sizeof(int),
2753 .proc_handler = ipv6_sysctl_rtcache_flush
2756 .procname = "gc_thresh",
2757 .data = &ip6_dst_ops_template.gc_thresh,
2758 .maxlen = sizeof(int),
2760 .proc_handler = proc_dointvec,
2763 .procname = "max_size",
2764 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2765 .maxlen = sizeof(int),
2767 .proc_handler = proc_dointvec,
2770 .procname = "gc_min_interval",
2771 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2772 .maxlen = sizeof(int),
2774 .proc_handler = proc_dointvec_jiffies,
2777 .procname = "gc_timeout",
2778 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2779 .maxlen = sizeof(int),
2781 .proc_handler = proc_dointvec_jiffies,
2784 .procname = "gc_interval",
2785 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2786 .maxlen = sizeof(int),
2788 .proc_handler = proc_dointvec_jiffies,
2791 .procname = "gc_elasticity",
2792 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2793 .maxlen = sizeof(int),
2795 .proc_handler = proc_dointvec,
2798 .procname = "mtu_expires",
2799 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2800 .maxlen = sizeof(int),
2802 .proc_handler = proc_dointvec_jiffies,
2805 .procname = "min_adv_mss",
2806 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2807 .maxlen = sizeof(int),
2809 .proc_handler = proc_dointvec,
2812 .procname = "gc_min_interval_ms",
2813 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2814 .maxlen = sizeof(int),
2816 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Create the route sysctl table for namespace @net: duplicate
 * ipv6_route_table_template with kmemdup() and redirect every entry's
 * .data pointer to @net's own variables. Entry 0 ("flush") also stores
 * @net in .extra1, which ipv6_sysctl_rtcache_flush() reads back.
 * The indices used here depend on the entry order of the template;
 * entries 3 and 9 intentionally share ip6_rt_gc_min_interval.
 * NOTE(review): the allocation flags, the NULL check on the duplicate and
 * the return statement are on lines not visible in this listing.
 */
2821 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2823 struct ctl_table *table;
2825 table = kmemdup(ipv6_route_table_template,
2826 sizeof(ipv6_route_table_template),
2830 table[0].data = &net->ipv6.sysctl.flush_delay;
2831 table[0].extra1 = net;
2832 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2833 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2834 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2835 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2836 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2837 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2838 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2839 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2840 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-namespace initialisation of IPv6 routing state.
 *
 * Steps:
 *  1. copy ip6_dst_ops_template into the namespace and initialise its
 *     entry counter;
 *  2. duplicate the null entry template (and, with
 *     CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole templates);
 *     each copy gets dst.path pointing at itself, dst.ops pointing at the
 *     namespace's dst ops and read-only metrics from ip6_template_metrics;
 *  3. set the sysctl defaults (max_size 4096, gc_min_interval HZ/2,
 *     gc_timeout 60s, gc_interval 30s, gc_elasticity 9, mtu_expires
 *     10 min, min_advmss IPV6_MIN_MTU - 20 - 40, flush_delay 0);
 *  4. create the "ipv6_route" and "rt6_stats" proc entries;
 *  5. set ip6_rt_gc_expire to 30s.
 * Failures unwind through the labels at the bottom in reverse order of
 * allocation.
 * NOTE(review): the allocation flags, the initial value of the return
 * code, some unwind labels and the return statements are on lines not
 * visible in this listing. The results of the proc_net_fops_create()
 * calls are not examined in the visible code.
 */
2847 static int __net_init ip6_route_net_init(struct net *net)
2851 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2852 sizeof(net->ipv6.ip6_dst_ops));
2854 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2855 goto out_ip6_dst_ops;
2857 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2858 sizeof(*net->ipv6.ip6_null_entry),
2860 if (!net->ipv6.ip6_null_entry)
2861 goto out_ip6_dst_entries;
/* The special entries are their own dst path. */
2862 net->ipv6.ip6_null_entry->dst.path =
2863 (struct dst_entry *)net->ipv6.ip6_null_entry;
2864 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2865 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2866 ip6_template_metrics, true);
2868 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2869 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2870 sizeof(*net->ipv6.ip6_prohibit_entry),
2872 if (!net->ipv6.ip6_prohibit_entry)
2873 goto out_ip6_null_entry;
2874 net->ipv6.ip6_prohibit_entry->dst.path =
2875 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2876 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2877 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2878 ip6_template_metrics, true);
2880 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2881 sizeof(*net->ipv6.ip6_blk_hole_entry),
2883 if (!net->ipv6.ip6_blk_hole_entry)
2884 goto out_ip6_prohibit_entry;
2885 net->ipv6.ip6_blk_hole_entry->dst.path =
2886 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2887 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2888 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2889 ip6_template_metrics, true);
/* Default values for the tunables exposed by ipv6_route_table_template. */
2892 net->ipv6.sysctl.flush_delay = 0;
2893 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2894 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2895 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2896 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2897 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2898 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2899 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2901 #ifdef CONFIG_PROC_FS
2902 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2903 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2905 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2911 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2912 out_ip6_prohibit_entry:
2913 kfree(net->ipv6.ip6_prohibit_entry);
2915 kfree(net->ipv6.ip6_null_entry);
2917 out_ip6_dst_entries:
2918 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-namespace teardown, mirroring ip6_route_net_init(): remove the two
 * proc entries, free the special route entries (null, and with
 * CONFIG_IPV6_MULTIPLE_TABLES prohibit and blackhole) and destroy the dst
 * entry counter.
 */
2923 static void __net_exit ip6_route_net_exit(struct net *net)
2925 #ifdef CONFIG_PROC_FS
2926 proc_net_remove(net, "ipv6_route");
2927 proc_net_remove(net, "rt6_stats");
2929 kfree(net->ipv6.ip6_null_entry);
2930 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2931 kfree(net->ipv6.ip6_prohibit_entry);
2932 kfree(net->ipv6.ip6_blk_hole_entry);
2934 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-namespace hooks, registered by ip6_route_init() with
 * register_pernet_subsys().
 */
2937 static struct pernet_operations ip6_route_net_ops = {
2938 .init = ip6_route_net_init,
2939 .exit = ip6_route_net_exit,
/*
 * Netdevice notifier block whose callback is ip6_route_dev_notify.
 * Registered with register_netdevice_notifier() in ip6_route_init() and
 * unregistered in ip6_route_cleanup().
 */
2942 static struct notifier_block ip6_route_dev_notifier = {
2943 .notifier_call = ip6_route_dev_notify,
/*
 * One-time initialisation of the IPv6 routing code.
 *
 * Visible setup steps, in order:
 *   - create the "ip6_dst_cache" slab cache, sized for struct rt6_info;
 *   - dst_entries_init() for ip6_dst_blackhole_ops;
 *   - register the per-namespace operations (ip6_route_net_ops);
 *   - let ip6_dst_blackhole_ops share the same slab cache;
 *   - attach the loopback device and its inet6 device to init_net's
 *     special route entries;
 *   - fib6_rules_init();
 *   - register the RTM_NEWROUTE / RTM_DELROUTE / RTM_GETROUTE handlers
 *     for PF_INET6;
 *   - register the netdevice notifier.
 *
 * The calls at the end of the function undo these steps in reverse order
 * and are reached through the goto targets used above.
 *
 * Returns an int status taken from the called helpers.
 * NOTE(review): presumably 0 on success and a negative errno on failure;
 * the return statement and several conditions are not visible here --
 * confirm against the full function.
 */
2947 int __init ip6_route_init(void)
/* Slab cache backing rt6_info allocations; hardware-cacheline aligned. */
2952 ip6_dst_ops_template.kmem_cachep =
2953 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954 SLAB_HWCACHE_ALIGN, NULL);
2955 if (!ip6_dst_ops_template.kmem_cachep)
2958 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2960 goto out_kmem_cache;
2962 ret = register_pernet_subsys(&ip6_route_net_ops);
2964 goto out_dst_entries;
/* The blackhole dst_ops allocate from the same slab cache as the template. */
2966 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2968 /* Registering of the loopback is done before this portion of code,
2969 * the loopback reference in rt6_info will not be taken, do it
2970 * manually for init_net */
2971 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Same loopback fix-up for the prohibit and blackhole entries. */
2974 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2981 goto out_register_subsys;
2987 ret = fib6_rules_init();
/* Any one of the three rtnetlink registrations failing triggers the unwind. */
2992 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2993 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2994 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2995 goto fib6_rules_init;
2997 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2999 goto fib6_rules_init;
/* Unwind: the calls below mirror the setup calls above in reverse order. */
3005 fib6_rules_cleanup();
3010 out_register_subsys:
3011 unregister_pernet_subsys(&ip6_route_net_ops);
3013 dst_entries_destroy(&ip6_dst_blackhole_ops);
3015 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Tear down what ip6_route_init() set up, in reverse order of setup:
 * unregister the netdevice notifier, clean up the fib6 rules, unregister
 * the per-namespace operations, destroy the blackhole dst entry
 * accounting and finally destroy the shared slab cache.
 */
3019 void ip6_route_cleanup(void)
3021 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3022 fib6_rules_cleanup();
3025 unregister_pernet_subsys(&ip6_route_net_ops);
3026 dst_entries_destroy(&ip6_dst_blackhole_ops);
/* The cache is destroyed last, after everything allocating from it is gone. */
3027 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);