1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * ROUTE - implementation of the IP router.
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
36 * our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
49 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
61 #define pr_fmt(fmt) "IPv4: " fmt
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
107 #include <linux/sysctl.h>
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
113 #include "fib_lookup.h"
115 #define RT_FL_TOS(oldflp4) \
116 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
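/* RT_FL_TOS() keeps only the routing-relevant TOS bits (IPTOS_RT_MASK) plus
 * the legacy RTO_ONLINK flag; ip_route_output_key_hash() below translates
 * RTO_ONLINK into RT_SCOPE_LINK and then masks the flag off again.
 */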
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly = 9;
122 static int ip_rt_redirect_load __read_mostly = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly = HZ;
125 static int ip_rt_error_burst __read_mostly = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
127 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly = 256;
130 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
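/* Built-in defaults for the routing tunables above: ICMP redirect rate
 * limiting, ICMP error cost/burst, PMTU expiry and the minimum PMTU/advmss
 * clamps.  Most of these are also exposed via the net.ipv4.route sysctl tree.
 */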
133 * Interface to generic destination cache.
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void ipv4_link_failure(struct sk_buff *skb);
141 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142 struct sk_buff *skb, u32 mtu,
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159 static struct dst_ops ipv4_dst_ops = {
161 .check = ipv4_dst_check,
162 .default_advmss = ipv4_default_advmss,
164 .cow_metrics = ipv4_cow_metrics,
165 .destroy = ipv4_dst_destroy,
166 .negative_advice = ipv4_negative_advice,
167 .link_failure = ipv4_link_failure,
168 .update_pmtu = ip_rt_update_pmtu,
169 .redirect = ip_do_redirect,
170 .local_out = __ip_local_out,
171 .neigh_lookup = ipv4_neigh_lookup,
172 .confirm_neigh = ipv4_confirm_neigh,
175 #define ECN_OR_COST(class) TC_PRIO_##class
177 const __u8 ip_tos2prio[16] = {
179 ECN_OR_COST(BESTEFFORT),
181 ECN_OR_COST(BESTEFFORT),
187 ECN_OR_COST(INTERACTIVE),
189 ECN_OR_COST(INTERACTIVE),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK),
192 TC_PRIO_INTERACTIVE_BULK,
193 ECN_OR_COST(INTERACTIVE_BULK)
195 EXPORT_SYMBOL(ip_tos2prio);
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
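/* rt_cache_stat is a per-CPU block of counters; RT_CACHE_STAT_INC() bumps a
 * field without atomics, and the values are reported one line per CPU through
 * /proc/net/stat/rt_cache (see rt_cpu_seq_show() below).
 */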
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
205 return SEQ_START_TOKEN;
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 if (v == SEQ_START_TOKEN)
221 seq_printf(seq, "%-127s\n",
222 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
228 static const struct seq_operations rt_cache_seq_ops = {
229 .start = rt_cache_seq_start,
230 .next = rt_cache_seq_next,
231 .stop = rt_cache_seq_stop,
232 .show = rt_cache_seq_show,
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 return seq_open(file, &rt_cache_seq_ops);
240 static const struct proc_ops rt_cache_proc_ops = {
241 .proc_open = rt_cache_seq_open,
242 .proc_read = seq_read,
243 .proc_lseek = seq_lseek,
244 .proc_release = seq_release,
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
253 return SEQ_START_TOKEN;
255 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256 if (!cpu_possible(cpu))
259 return &per_cpu(rt_cache_stat, cpu);
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269 if (!cpu_possible(cpu))
272 return &per_cpu(rt_cache_stat, cpu);
279 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 struct rt_cache_stat *st = v;
288 if (v == SEQ_START_TOKEN) {
289 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
293 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
294 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
295 dst_entries_get_slow(&ipv4_dst_ops),
308 0, /* st->gc_total */
309 0, /* st->gc_ignored */
310 0, /* st->gc_goal_miss */
311 0, /* st->gc_dst_overflow */
312 0, /* st->in_hlist_search */
313 0 /* st->out_hlist_search */
318 static const struct seq_operations rt_cpu_seq_ops = {
319 .start = rt_cpu_seq_start,
320 .next = rt_cpu_seq_next,
321 .stop = rt_cpu_seq_stop,
322 .show = rt_cpu_seq_show,
326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 return seq_open(file, &rt_cpu_seq_ops);
331 static const struct proc_ops rt_cpu_proc_ops = {
332 .proc_open = rt_cpu_seq_open,
333 .proc_read = seq_read,
334 .proc_lseek = seq_lseek,
335 .proc_release = seq_release,
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 struct ip_rt_acct *dst, *src;
344 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348 for_each_possible_cpu(i) {
349 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350 for (j = 0; j < 256; j++) {
351 dst[j].o_bytes += src[j].o_bytes;
352 dst[j].o_packets += src[j].o_packets;
353 dst[j].i_bytes += src[j].i_bytes;
354 dst[j].i_packets += src[j].i_packets;
358 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
364 static int __net_init ip_rt_do_proc_init(struct net *net)
366 struct proc_dir_entry *pde;
368 pde = proc_create("rt_cache", 0444, net->proc_net,
373 pde = proc_create("rt_cache", 0444,
374 net->proc_net_stat, &rt_cpu_proc_ops);
378 #ifdef CONFIG_IP_ROUTE_CLASSID
379 pde = proc_create_single("rt_acct", 0, net->proc_net,
386 #ifdef CONFIG_IP_ROUTE_CLASSID
388 remove_proc_entry("rt_cache", net->proc_net_stat);
391 remove_proc_entry("rt_cache", net->proc_net);
396 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 remove_proc_entry("rt_cache", net->proc_net_stat);
399 remove_proc_entry("rt_cache", net->proc_net);
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401 remove_proc_entry("rt_acct", net->proc_net);
405 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
406 .init = ip_rt_do_proc_init,
407 .exit = ip_rt_do_proc_exit,
410 static int __init ip_rt_proc_init(void)
412 return register_pernet_subsys(&ip_rt_proc_ops);
416 static inline int ip_rt_proc_init(void)
420 #endif /* CONFIG_PROC_FS */
422 static inline bool rt_is_expired(const struct rtable *rth)
424 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
427 void rt_cache_flush(struct net *net)
429 rt_genid_bump_ipv4(net);
432 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
436 const struct rtable *rt = container_of(dst, struct rtable, dst);
437 struct net_device *dev = dst->dev;
442 if (likely(rt->rt_gw_family == AF_INET)) {
443 n = ip_neigh_gw4(dev, rt->rt_gw4);
444 } else if (rt->rt_gw_family == AF_INET6) {
445 n = ip_neigh_gw6(dev, &rt->rt_gw6);
449 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
450 n = ip_neigh_gw4(dev, pkey);
453 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
456 rcu_read_unlock_bh();
461 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 const struct rtable *rt = container_of(dst, struct rtable, dst);
464 struct net_device *dev = dst->dev;
465 const __be32 *pkey = daddr;
467 if (rt->rt_gw_family == AF_INET) {
468 pkey = (const __be32 *)&rt->rt_gw4;
469 } else if (rt->rt_gw_family == AF_INET6) {
470 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
473 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
476 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
479 #define IP_IDENTS_SZ 2048u
481 static atomic_t *ip_idents __read_mostly;
482 static u32 *ip_tstamps __read_mostly;
484 /* In order to protect privacy, we add a perturbation to identifiers
485 * if one generator is seldom used. This makes it hard for an attacker
486 * to infer how many packets were sent between two points in time.
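 *
 * Illustration: if a generator bucket was last used "now - old" jiffies ago,
 * ip_idents_reserve() below skips the ID space ahead by a random delta in
 * [0, now - old), so sampling the IP ID of a flow at two points in time
 * does not reveal the exact number of packets sent in between.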
488 u32 ip_idents_reserve(u32 hash, int segs)
490 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
491 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
492 u32 old = READ_ONCE(*p_tstamp);
493 u32 now = (u32)jiffies;
496 if (old != now && cmpxchg(p_tstamp, old, now) == old)
497 delta = prandom_u32_max(now - old);
499 /* If UBSAN reports an error there, please make sure your compiler
500 * supports -fno-strict-overflow before reporting it; that was a bug
501 * in UBSAN, and it has been fixed in GCC-8.
503 return atomic_add_return(segs + delta, p_id) - segs;
505 EXPORT_SYMBOL(ip_idents_reserve);
507 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
511 /* Note: the following lazy key initialization is racy, but that is acceptable here. */
512 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
513 get_random_bytes(&net->ipv4.ip_id_key,
514 sizeof(net->ipv4.ip_id_key));
516 hash = siphash_3u32((__force u32)iph->daddr,
517 (__force u32)iph->saddr,
519 &net->ipv4.ip_id_key);
520 id = ip_idents_reserve(hash, segs);
523 EXPORT_SYMBOL(__ip_select_ident);
525 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
526 const struct sock *sk,
527 const struct iphdr *iph,
529 u8 prot, u32 mark, int flow_flags)
532 const struct inet_sock *inet = inet_sk(sk);
534 oif = sk->sk_bound_dev_if;
536 tos = RT_CONN_FLAGS(sk);
537 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
539 flowi4_init_output(fl4, oif, mark, tos,
540 RT_SCOPE_UNIVERSE, prot,
542 iph->daddr, iph->saddr, 0, 0,
543 sock_net_uid(net, sk));
546 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
547 const struct sock *sk)
549 const struct net *net = dev_net(skb->dev);
550 const struct iphdr *iph = ip_hdr(skb);
551 int oif = skb->dev->ifindex;
552 u8 tos = RT_TOS(iph->tos);
553 u8 prot = iph->protocol;
554 u32 mark = skb->mark;
556 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
559 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
561 const struct inet_sock *inet = inet_sk(sk);
562 const struct ip_options_rcu *inet_opt;
563 __be32 daddr = inet->inet_daddr;
566 inet_opt = rcu_dereference(inet->inet_opt);
567 if (inet_opt && inet_opt->opt.srr)
568 daddr = inet_opt->opt.faddr;
569 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
570 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
571 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
572 inet_sk_flowi_flags(sk),
573 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
577 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
578 const struct sk_buff *skb)
581 build_skb_flow_key(fl4, skb, sk);
583 build_sk_flow_key(fl4, sk);
586 static DEFINE_SPINLOCK(fnhe_lock);
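/* fnhe_lock serializes all writers of the per-nexthop exception hash
 * (update_or_create_fnhe(), ip_del_fnhe(), rt_bind_exception()); lookups
 * in find_exception() walk the chains under RCU only.
 */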
588 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
592 rt = rcu_dereference(fnhe->fnhe_rth_input);
594 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
595 dst_dev_put(&rt->dst);
596 dst_release(&rt->dst);
598 rt = rcu_dereference(fnhe->fnhe_rth_output);
600 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
601 dst_dev_put(&rt->dst);
602 dst_release(&rt->dst);
606 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
608 struct fib_nh_exception *fnhe, *oldest;
610 oldest = rcu_dereference(hash->chain);
611 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
612 fnhe = rcu_dereference(fnhe->fnhe_next)) {
613 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
616 fnhe_flush_routes(oldest);
620 static inline u32 fnhe_hashfun(__be32 daddr)
622 static u32 fnhe_hashrnd __read_mostly;
625 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
626 hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
627 return hash_32(hval, FNHE_HASH_SHIFT);
630 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
632 rt->rt_pmtu = fnhe->fnhe_pmtu;
633 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
634 rt->dst.expires = fnhe->fnhe_expires;
637 rt->rt_flags |= RTCF_REDIRECTED;
638 rt->rt_uses_gateway = 1;
639 rt->rt_gw_family = AF_INET;
640 rt->rt_gw4 = fnhe->fnhe_gw;
644 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
645 __be32 gw, u32 pmtu, bool lock,
646 unsigned long expires)
648 struct fnhe_hash_bucket *hash;
649 struct fib_nh_exception *fnhe;
655 genid = fnhe_genid(dev_net(nhc->nhc_dev));
656 hval = fnhe_hashfun(daddr);
658 spin_lock_bh(&fnhe_lock);
660 hash = rcu_dereference(nhc->nhc_exceptions);
662 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
665 rcu_assign_pointer(nhc->nhc_exceptions, hash);
671 for (fnhe = rcu_dereference(hash->chain); fnhe;
672 fnhe = rcu_dereference(fnhe->fnhe_next)) {
673 if (fnhe->fnhe_daddr == daddr)
679 if (fnhe->fnhe_genid != genid)
680 fnhe->fnhe_genid = genid;
684 fnhe->fnhe_pmtu = pmtu;
685 fnhe->fnhe_mtu_locked = lock;
687 fnhe->fnhe_expires = max(1UL, expires);
688 /* Update all cached dsts too */
689 rt = rcu_dereference(fnhe->fnhe_rth_input);
691 fill_route_from_fnhe(rt, fnhe);
692 rt = rcu_dereference(fnhe->fnhe_rth_output);
694 fill_route_from_fnhe(rt, fnhe);
696 if (depth > FNHE_RECLAIM_DEPTH)
697 fnhe = fnhe_oldest(hash);
699 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
703 fnhe->fnhe_next = hash->chain;
704 rcu_assign_pointer(hash->chain, fnhe);
706 fnhe->fnhe_genid = genid;
707 fnhe->fnhe_daddr = daddr;
709 fnhe->fnhe_pmtu = pmtu;
710 fnhe->fnhe_mtu_locked = lock;
711 fnhe->fnhe_expires = max(1UL, expires);
713 /* Exception created; mark the cached routes for the nexthop
714 * stale, so anyone caching it rechecks if this exception still applies.
717 rt = rcu_dereference(nhc->nhc_rth_input);
719 rt->dst.obsolete = DST_OBSOLETE_KILL;
721 for_each_possible_cpu(i) {
722 struct rtable __rcu **prt;
723 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
724 rt = rcu_dereference(*prt);
726 rt->dst.obsolete = DST_OBSOLETE_KILL;
730 fnhe->fnhe_stamp = jiffies;
733 spin_unlock_bh(&fnhe_lock);
736 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
739 __be32 new_gw = icmp_hdr(skb)->un.gateway;
740 __be32 old_gw = ip_hdr(skb)->saddr;
741 struct net_device *dev = skb->dev;
742 struct in_device *in_dev;
743 struct fib_result res;
747 switch (icmp_hdr(skb)->code & 7) {
749 case ICMP_REDIR_NETTOS:
750 case ICMP_REDIR_HOST:
751 case ICMP_REDIR_HOSTTOS:
758 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
761 in_dev = __in_dev_get_rcu(dev);
766 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768 ipv4_is_zeronet(new_gw))
769 goto reject_redirect;
771 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773 goto reject_redirect;
774 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775 goto reject_redirect;
777 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
778 goto reject_redirect;
781 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
783 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
785 if (!(n->nud_state & NUD_VALID)) {
786 neigh_event_send(n, NULL);
788 if (fib_lookup(net, fl4, &res, 0) == 0) {
789 struct fib_nh_common *nhc = FIB_RES_NHC(res);
791 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
793 jiffies + ip_rt_gc_timeout);
796 rt->dst.obsolete = DST_OBSOLETE_KILL;
797 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
804 #ifdef CONFIG_IP_ROUTE_VERBOSE
805 if (IN_DEV_LOG_MARTIANS(in_dev)) {
806 const struct iphdr *iph = (const struct iphdr *) skb->data;
807 __be32 daddr = iph->daddr;
808 __be32 saddr = iph->saddr;
810 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
811 " Advised path = %pI4 -> %pI4\n",
812 &old_gw, dev->name, &new_gw,
819 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
823 const struct iphdr *iph = (const struct iphdr *) skb->data;
824 struct net *net = dev_net(skb->dev);
825 int oif = skb->dev->ifindex;
826 u8 tos = RT_TOS(iph->tos);
827 u8 prot = iph->protocol;
828 u32 mark = skb->mark;
830 rt = (struct rtable *) dst;
832 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
833 __ip_do_redirect(rt, skb, &fl4, true);
836 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
838 struct rtable *rt = (struct rtable *)dst;
839 struct dst_entry *ret = dst;
842 if (dst->obsolete > 0) {
845 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
856 * 1. The first ip_rt_redirect_number redirects are sent
857 * with exponential backoff, then we stop sending them at all,
858 * assuming that the host ignores our redirects.
859 * 2. If we did not see packets requiring redirects
860 * during ip_rt_redirect_silence, we assume that the host
861 * forgot the redirected route and start sending redirects again.
863 * This algorithm is much cheaper and more intelligent than dumb load limiting in icmp.c.
866 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
867 * and "frag. need" (breaks PMTU discovery) in icmp.c.
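 *
 * Rough illustration, assuming HZ=1000 and the defaults above: the first
 * redirect is sent immediately; after n redirects the next one is sent only
 * once (HZ/50) << n jiffies (40 ms, 80 ms, 160 ms, ...) have passed since the
 * last one; after ip_rt_redirect_number (9) redirects we stop entirely, until
 * ip_rt_redirect_silence (~20.5 s) elapses and the counters are reset.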
870 void ip_rt_send_redirect(struct sk_buff *skb)
872 struct rtable *rt = skb_rtable(skb);
873 struct in_device *in_dev;
874 struct inet_peer *peer;
880 in_dev = __in_dev_get_rcu(rt->dst.dev);
881 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
885 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
886 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
889 net = dev_net(rt->dst.dev);
890 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
892 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
893 rt_nexthop(rt, ip_hdr(skb)->daddr));
897 /* No redirected packets during ip_rt_redirect_silence;
898 * reset the algorithm.
900 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
901 peer->rate_tokens = 0;
902 peer->n_redirects = 0;
905 /* Too many ignored redirects; do not send anything;
906 * set peer->rate_last to the last seen redirected packet.
908 if (peer->n_redirects >= ip_rt_redirect_number) {
909 peer->rate_last = jiffies;
913 /* Check for load limit; set rate_last to the latest sent redirect packet.
916 if (peer->n_redirects == 0 ||
919 (ip_rt_redirect_load << peer->n_redirects)))) {
920 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
922 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
923 peer->rate_last = jiffies;
925 #ifdef CONFIG_IP_ROUTE_VERBOSE
927 peer->n_redirects == ip_rt_redirect_number)
928 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
929 &ip_hdr(skb)->saddr, inet_iif(skb),
930 &ip_hdr(skb)->daddr, &gw);
937 static int ip_error(struct sk_buff *skb)
939 struct rtable *rt = skb_rtable(skb);
940 struct net_device *dev = skb->dev;
941 struct in_device *in_dev;
942 struct inet_peer *peer;
948 if (netif_is_l3_master(skb->dev)) {
949 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
954 in_dev = __in_dev_get_rcu(dev);
956 /* IP on this device is disabled. */
960 net = dev_net(rt->dst.dev);
961 if (!IN_DEV_FORWARD(in_dev)) {
962 switch (rt->dst.error) {
964 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
968 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
974 switch (rt->dst.error) {
979 code = ICMP_HOST_UNREACH;
982 code = ICMP_NET_UNREACH;
983 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
986 code = ICMP_PKT_FILTERED;
990 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
991 l3mdev_master_ifindex(skb->dev), 1);
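	/* Classic token bucket: tokens accrue at one per jiffy, capped at
	 * ip_rt_error_burst, and each ICMP error sent costs ip_rt_error_cost
	 * tokens.  With the defaults above this allows roughly one error per
	 * second in steady state, with bursts of up to five.
	 */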
996 peer->rate_tokens += now - peer->rate_last;
997 if (peer->rate_tokens > ip_rt_error_burst)
998 peer->rate_tokens = ip_rt_error_burst;
999 peer->rate_last = now;
1000 if (peer->rate_tokens >= ip_rt_error_cost)
1001 peer->rate_tokens -= ip_rt_error_cost;
1007 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1009 out: kfree_skb(skb);
1013 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1015 struct dst_entry *dst = &rt->dst;
1016 struct fib_result res;
1020 if (ip_mtu_locked(dst))
1023 old_mtu = ipv4_mtu(dst);
1027 if (mtu < ip_rt_min_pmtu) {
1029 mtu = min(old_mtu, ip_rt_min_pmtu);
1032 if (rt->rt_pmtu == mtu && !lock &&
1033 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1037 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1040 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1041 jiffies + ip_rt_mtu_expires);
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047 struct sk_buff *skb, u32 mtu,
1050 struct rtable *rt = (struct rtable *) dst;
1053 ip_rt_build_flow_key(&fl4, sk, skb);
1055 /* Don't make lookup fail for bridged encapsulations */
1056 if (skb && netif_is_any_bridge_port(skb->dev))
1059 __ip_rt_update_pmtu(rt, &fl4, mtu);
1062 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1063 int oif, u8 protocol)
1065 const struct iphdr *iph = (const struct iphdr *)skb->data;
1068 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1070 __build_flow_key(net, &fl4, NULL, iph, oif,
1071 RT_TOS(iph->tos), protocol, mark, 0);
1072 rt = __ip_route_output_key(net, &fl4);
1074 __ip_rt_update_pmtu(rt, &fl4, mtu);
1078 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1080 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1082 const struct iphdr *iph = (const struct iphdr *)skb->data;
1086 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1088 if (!fl4.flowi4_mark)
1089 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1091 rt = __ip_route_output_key(sock_net(sk), &fl4);
1093 __ip_rt_update_pmtu(rt, &fl4, mtu);
1098 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1100 const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 struct dst_entry *odst = NULL;
1105 struct net *net = sock_net(sk);
1109 if (!ip_sk_accept_pmtu(sk))
1112 odst = sk_dst_get(sk);
1114 if (sock_owned_by_user(sk) || !odst) {
1115 __ipv4_sk_update_pmtu(skb, sk, mtu);
1119 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1121 rt = (struct rtable *)odst;
1122 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1123 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1132 if (!dst_check(&rt->dst, 0)) {
1134 dst_release(&rt->dst);
1136 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 sk_dst_set(sk, &rt->dst);
1150 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1152 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1153 int oif, u8 protocol)
1155 const struct iphdr *iph = (const struct iphdr *)skb->data;
1159 __build_flow_key(net, &fl4, NULL, iph, oif,
1160 RT_TOS(iph->tos), protocol, 0, 0);
1161 rt = __ip_route_output_key(net, &fl4);
1163 __ip_do_redirect(rt, skb, &fl4, false);
1167 EXPORT_SYMBOL_GPL(ipv4_redirect);
1169 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1171 const struct iphdr *iph = (const struct iphdr *)skb->data;
1174 struct net *net = sock_net(sk);
1176 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1177 rt = __ip_route_output_key(net, &fl4);
1179 __ip_do_redirect(rt, skb, &fl4, false);
1183 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1185 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1187 struct rtable *rt = (struct rtable *) dst;
1189 /* All IPV4 dsts are created with ->obsolete set to the value
1190 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1191 * into this function always.
1193 * When a PMTU/redirect information update invalidates a route,
1194 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1195 * DST_OBSOLETE_DEAD.
1197 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1202 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1204 struct ip_options opt;
1207 /* Recompile ip options since IPCB may not be valid anymore.
1208 * Also check we have a reasonable ipv4 header.
1210 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1211 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1214 memset(&opt, 0, sizeof(opt));
1215 if (ip_hdr(skb)->ihl > 5) {
1216 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1218 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1221 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1227 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1230 static void ipv4_link_failure(struct sk_buff *skb)
1234 ipv4_send_dest_unreach(skb);
1236 rt = skb_rtable(skb);
1238 dst_set_expires(&rt->dst, 0);
1241 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1243 pr_debug("%s: %pI4 -> %pI4, %s\n",
1244 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1245 skb->dev ? skb->dev->name : "?");
1252 We do not cache source address of outgoing interface,
1253 because it is used only by IP RR, TS and SRR options,
1254 so that it is out of the fast path.
1256 BTW remember: "addr" is allowed to be unaligned in IP options!
1260 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1264 if (rt_is_output_route(rt))
1265 src = ip_hdr(skb)->saddr;
1267 struct fib_result res;
1268 struct iphdr *iph = ip_hdr(skb);
1269 struct flowi4 fl4 = {
1270 .daddr = iph->daddr,
1271 .saddr = iph->saddr,
1272 .flowi4_tos = RT_TOS(iph->tos),
1273 .flowi4_oif = rt->dst.dev->ifindex,
1274 .flowi4_iif = skb->dev->ifindex,
1275 .flowi4_mark = skb->mark,
1279 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1280 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1282 src = inet_select_addr(rt->dst.dev,
1283 rt_nexthop(rt, iph->daddr),
1287 memcpy(addr, &src, 4);
1290 #ifdef CONFIG_IP_ROUTE_CLASSID
1291 static void set_class_tag(struct rtable *rt, u32 tag)
1293 if (!(rt->dst.tclassid & 0xFFFF))
1294 rt->dst.tclassid |= tag & 0xFFFF;
1295 if (!(rt->dst.tclassid & 0xFFFF0000))
1296 rt->dst.tclassid |= tag & 0xFFFF0000;
1300 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1302 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1303 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1306 return min(advmss, IPV4_MAX_PMTU - header_size);
1309 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1311 const struct rtable *rt = (const struct rtable *)dst;
1312 unsigned int mtu = rt->rt_pmtu;
1314 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1315 mtu = dst_metric_raw(dst, RTAX_MTU);
1320 mtu = READ_ONCE(dst->dev->mtu);
1322 if (unlikely(ip_mtu_locked(dst))) {
1323 if (rt->rt_uses_gateway && mtu > 576)
1327 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1329 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1332 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1334 struct fnhe_hash_bucket *hash;
1335 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1336 u32 hval = fnhe_hashfun(daddr);
1338 spin_lock_bh(&fnhe_lock);
1340 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1341 lockdep_is_held(&fnhe_lock));
1344 fnhe_p = &hash->chain;
1345 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1347 if (fnhe->fnhe_daddr == daddr) {
1348 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1349 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1350 /* set fnhe_daddr to 0 to ensure it won't bind with
1351 * new dsts in rt_bind_exception().
1353 fnhe->fnhe_daddr = 0;
1354 fnhe_flush_routes(fnhe);
1355 kfree_rcu(fnhe, rcu);
1358 fnhe_p = &fnhe->fnhe_next;
1359 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1360 lockdep_is_held(&fnhe_lock));
1363 spin_unlock_bh(&fnhe_lock);
1366 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1369 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1370 struct fib_nh_exception *fnhe;
1376 hval = fnhe_hashfun(daddr);
1378 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1379 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1380 if (fnhe->fnhe_daddr == daddr) {
1381 if (fnhe->fnhe_expires &&
1382 time_after(jiffies, fnhe->fnhe_expires)) {
1383 ip_del_fnhe(nhc, daddr);
1393 * 1. mtu on route is locked - use it
1394 * 2. mtu from nexthop exception
1395 * 3. mtu from egress device
1398 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1400 struct fib_nh_common *nhc = res->nhc;
1401 struct net_device *dev = nhc->nhc_dev;
1402 struct fib_info *fi = res->fi;
1405 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1406 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1410 struct fib_nh_exception *fnhe;
1412 fnhe = find_exception(nhc, daddr);
1413 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1414 mtu = fnhe->fnhe_pmtu;
1418 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1420 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1423 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1424 __be32 daddr, const bool do_cache)
1428 spin_lock_bh(&fnhe_lock);
1430 if (daddr == fnhe->fnhe_daddr) {
1431 struct rtable __rcu **porig;
1432 struct rtable *orig;
1433 int genid = fnhe_genid(dev_net(rt->dst.dev));
1435 if (rt_is_input_route(rt))
1436 porig = &fnhe->fnhe_rth_input;
1438 porig = &fnhe->fnhe_rth_output;
1439 orig = rcu_dereference(*porig);
1441 if (fnhe->fnhe_genid != genid) {
1442 fnhe->fnhe_genid = genid;
1444 fnhe->fnhe_pmtu = 0;
1445 fnhe->fnhe_expires = 0;
1446 fnhe->fnhe_mtu_locked = false;
1447 fnhe_flush_routes(fnhe);
1450 fill_route_from_fnhe(rt, fnhe);
1453 rt->rt_gw_family = AF_INET;
1458 rcu_assign_pointer(*porig, rt);
1460 dst_dev_put(&orig->dst);
1461 dst_release(&orig->dst);
1466 fnhe->fnhe_stamp = jiffies;
1468 spin_unlock_bh(&fnhe_lock);
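/* Cache @rt in the nexthop: input routes go into nhc_rth_input, output routes
 * into this CPU's nhc_pcpu_rth_output slot.  cmpxchg() is used so that a
 * racing writer simply wins and this route stays uncached; a displaced old
 * route is moved to the uncached list before release so rt_flush_dev() can
 * still find it.
 */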
1473 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1475 struct rtable *orig, *prev, **p;
1478 if (rt_is_input_route(rt)) {
1479 p = (struct rtable **)&nhc->nhc_rth_input;
1481 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1485 /* hold dst before doing cmpxchg() to avoid race condition
1489 prev = cmpxchg(p, orig, rt);
1492 rt_add_uncached_list(orig);
1493 dst_release(&orig->dst);
1496 dst_release(&rt->dst);
1503 struct uncached_list {
1505 struct list_head head;
1508 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
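/* Routes that are not stored in a nexthop cache sit on these per-CPU lists,
 * so that rt_flush_dev() can still find them and repoint them at
 * blackhole_netdev when their device disappears.
 */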
1510 void rt_add_uncached_list(struct rtable *rt)
1512 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1514 rt->rt_uncached_list = ul;
1516 spin_lock_bh(&ul->lock);
1517 list_add_tail(&rt->rt_uncached, &ul->head);
1518 spin_unlock_bh(&ul->lock);
1521 void rt_del_uncached_list(struct rtable *rt)
1523 if (!list_empty(&rt->rt_uncached)) {
1524 struct uncached_list *ul = rt->rt_uncached_list;
1526 spin_lock_bh(&ul->lock);
1527 list_del(&rt->rt_uncached);
1528 spin_unlock_bh(&ul->lock);
1532 static void ipv4_dst_destroy(struct dst_entry *dst)
1534 struct rtable *rt = (struct rtable *)dst;
1536 ip_dst_metrics_put(dst);
1537 rt_del_uncached_list(rt);
1540 void rt_flush_dev(struct net_device *dev)
1545 for_each_possible_cpu(cpu) {
1546 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1548 spin_lock_bh(&ul->lock);
1549 list_for_each_entry(rt, &ul->head, rt_uncached) {
1550 if (rt->dst.dev != dev)
1552 rt->dst.dev = blackhole_netdev;
1553 dev_hold(rt->dst.dev);
1556 spin_unlock_bh(&ul->lock);
1560 static bool rt_cache_valid(const struct rtable *rt)
1563 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1567 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1568 const struct fib_result *res,
1569 struct fib_nh_exception *fnhe,
1570 struct fib_info *fi, u16 type, u32 itag,
1571 const bool do_cache)
1573 bool cached = false;
1576 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1578 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1579 rt->rt_uses_gateway = 1;
1580 rt->rt_gw_family = nhc->nhc_gw_family;
1581 /* only INET and INET6 are supported */
1582 if (likely(nhc->nhc_gw_family == AF_INET))
1583 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1585 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1588 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1590 #ifdef CONFIG_IP_ROUTE_CLASSID
1591 if (nhc->nhc_family == AF_INET) {
1594 nh = container_of(nhc, struct fib_nh, nh_common);
1595 rt->dst.tclassid = nh->nh_tclassid;
1598 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1600 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1602 cached = rt_cache_route(nhc, rt);
1603 if (unlikely(!cached)) {
1604 /* Routes we intend to cache in nexthop exception or
1605 * FIB nexthop have the DST_NOCACHE bit clear.
1606 * However, if we are unsuccessful at storing this
1607 * route into the cache we really need to set it.
1610 rt->rt_gw_family = AF_INET;
1613 rt_add_uncached_list(rt);
1616 rt_add_uncached_list(rt);
1618 #ifdef CONFIG_IP_ROUTE_CLASSID
1619 #ifdef CONFIG_IP_MULTIPLE_TABLES
1620 set_class_tag(rt, res->tclassid);
1622 set_class_tag(rt, itag);
1626 struct rtable *rt_dst_alloc(struct net_device *dev,
1627 unsigned int flags, u16 type,
1628 bool nopolicy, bool noxfrm)
1632 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1633 (nopolicy ? DST_NOPOLICY : 0) |
1634 (noxfrm ? DST_NOXFRM : 0));
1637 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1638 rt->rt_flags = flags;
1640 rt->rt_is_input = 0;
1643 rt->rt_mtu_locked = 0;
1644 rt->rt_uses_gateway = 0;
1645 rt->rt_gw_family = 0;
1647 INIT_LIST_HEAD(&rt->rt_uncached);
1649 rt->dst.output = ip_output;
1650 if (flags & RTCF_LOCAL)
1651 rt->dst.input = ip_local_deliver;
1656 EXPORT_SYMBOL(rt_dst_alloc);
1658 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1660 struct rtable *new_rt;
1662 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1666 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1667 new_rt->rt_flags = rt->rt_flags;
1668 new_rt->rt_type = rt->rt_type;
1669 new_rt->rt_is_input = rt->rt_is_input;
1670 new_rt->rt_iif = rt->rt_iif;
1671 new_rt->rt_pmtu = rt->rt_pmtu;
1672 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1673 new_rt->rt_gw_family = rt->rt_gw_family;
1674 if (rt->rt_gw_family == AF_INET)
1675 new_rt->rt_gw4 = rt->rt_gw4;
1676 else if (rt->rt_gw_family == AF_INET6)
1677 new_rt->rt_gw6 = rt->rt_gw6;
1678 INIT_LIST_HEAD(&new_rt->rt_uncached);
1680 new_rt->dst.input = rt->dst.input;
1681 new_rt->dst.output = rt->dst.output;
1682 new_rt->dst.error = rt->dst.error;
1683 new_rt->dst.lastuse = jiffies;
1684 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1688 EXPORT_SYMBOL(rt_dst_clone);
1690 /* called in rcu_read_lock() section */
1691 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692 u8 tos, struct net_device *dev,
1693 struct in_device *in_dev, u32 *itag)
1697 /* Primary sanity checks. */
1701 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702 skb->protocol != htons(ETH_P_IP))
1705 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1708 if (ipv4_is_zeronet(saddr)) {
1709 if (!ipv4_is_local_multicast(daddr) &&
1710 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1713 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1721 /* called in rcu_read_lock() section */
1722 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723 u8 tos, struct net_device *dev, int our)
1725 struct in_device *in_dev = __in_dev_get_rcu(dev);
1726 unsigned int flags = RTCF_MULTICAST;
1731 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1736 flags |= RTCF_LOCAL;
1738 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1739 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1743 #ifdef CONFIG_IP_ROUTE_CLASSID
1744 rth->dst.tclassid = itag;
1746 rth->dst.output = ip_rt_bug;
1747 rth->rt_is_input = 1;
1749 #ifdef CONFIG_IP_MROUTE
1750 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1751 rth->dst.input = ip_mr_input;
1753 RT_CACHE_STAT_INC(in_slow_mc);
1755 skb_dst_set(skb, &rth->dst);
1760 static void ip_handle_martian_source(struct net_device *dev,
1761 struct in_device *in_dev,
1762 struct sk_buff *skb,
1766 RT_CACHE_STAT_INC(in_martian_src);
1767 #ifdef CONFIG_IP_ROUTE_VERBOSE
1768 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1770 * RFC1812 recommendation, if source is martian,
1771 * the only hint is MAC header.
1773 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1774 &daddr, &saddr, dev->name);
1775 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1776 print_hex_dump(KERN_WARNING, "ll header: ",
1777 DUMP_PREFIX_OFFSET, 16, 1,
1778 skb_mac_header(skb),
1779 dev->hard_header_len, false);
1785 /* called in rcu_read_lock() section */
1786 static int __mkroute_input(struct sk_buff *skb,
1787 const struct fib_result *res,
1788 struct in_device *in_dev,
1789 __be32 daddr, __be32 saddr, u32 tos)
1791 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1792 struct net_device *dev = nhc->nhc_dev;
1793 struct fib_nh_exception *fnhe;
1796 struct in_device *out_dev;
1800 /* get a working reference to the output device */
1801 out_dev = __in_dev_get_rcu(dev);
1803 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1807 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1808 in_dev->dev, in_dev, &itag);
1810 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1816 do_cache = res->fi && !itag;
1817 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1818 skb->protocol == htons(ETH_P_IP)) {
1821 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1822 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1823 inet_addr_onlink(out_dev, saddr, gw))
1824 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1827 if (skb->protocol != htons(ETH_P_IP)) {
1828 /* Not IP (i.e. ARP). Do not create a route if it is
1829 * invalid for proxy arp. DNAT routes are always valid.
1831 * The proxy arp feature has been extended to allow ARP
1832 * replies back to the same interface, to support
1833 * Private VLAN switch technologies. See arp.c.
1835 if (out_dev == in_dev &&
1836 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1842 fnhe = find_exception(nhc, daddr);
1845 rth = rcu_dereference(fnhe->fnhe_rth_input);
1847 rth = rcu_dereference(nhc->nhc_rth_input);
1848 if (rt_cache_valid(rth)) {
1849 skb_dst_set_noref(skb, &rth->dst);
1854 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1855 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1856 IN_DEV_CONF_GET(out_dev, NOXFRM));
1862 rth->rt_is_input = 1;
1863 RT_CACHE_STAT_INC(in_slow_tot);
1865 rth->dst.input = ip_forward;
1867 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1869 lwtunnel_set_redirect(&rth->dst);
1870 skb_dst_set(skb, &rth->dst);
1877 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1878 /* To make ICMP packets follow the right flow, the multipath hash is
1879 * calculated from the inner IP addresses.
1881 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1882 struct flow_keys *hash_keys)
1884 const struct iphdr *outer_iph = ip_hdr(skb);
1885 const struct iphdr *key_iph = outer_iph;
1886 const struct iphdr *inner_iph;
1887 const struct icmphdr *icmph;
1888 struct iphdr _inner_iph;
1889 struct icmphdr _icmph;
1891 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1894 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1897 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1902 if (!icmp_is_err(icmph->type))
1905 inner_iph = skb_header_pointer(skb,
1906 outer_iph->ihl * 4 + sizeof(_icmph),
1907 sizeof(_inner_iph), &_inner_iph);
1911 key_iph = inner_iph;
1913 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1914 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1917 /* if skb is set it will be used and fl4 can be NULL */
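/* Rough map of the hash policies handled below (fib_multipath_hash_policy):
 * 0 hashes on the L3 addresses (using the inner header for ICMP errors),
 * 1 hashes on the L4 five-tuple, and 2 hashes on the inner L3 header of
 * encapsulated packets.  A flow-provided multipath_hash, if any, is folded
 * in at the end.
 */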
1918 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1919 const struct sk_buff *skb, struct flow_keys *flkeys)
1921 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1922 struct flow_keys hash_keys;
1925 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1927 memset(&hash_keys, 0, sizeof(hash_keys));
1928 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1930 ip_multipath_l3_keys(skb, &hash_keys);
1932 hash_keys.addrs.v4addrs.src = fl4->saddr;
1933 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1937 /* skb is currently provided only when forwarding */
1939 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1940 struct flow_keys keys;
1942 /* short-circuit if we already have L4 hash present */
1944 return skb_get_hash_raw(skb) >> 1;
1946 memset(&hash_keys, 0, sizeof(hash_keys));
1949 skb_flow_dissect_flow_keys(skb, &keys, flag);
1953 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1954 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1955 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1956 hash_keys.ports.src = flkeys->ports.src;
1957 hash_keys.ports.dst = flkeys->ports.dst;
1958 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1960 memset(&hash_keys, 0, sizeof(hash_keys));
1961 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1962 hash_keys.addrs.v4addrs.src = fl4->saddr;
1963 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1964 hash_keys.ports.src = fl4->fl4_sport;
1965 hash_keys.ports.dst = fl4->fl4_dport;
1966 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1970 memset(&hash_keys, 0, sizeof(hash_keys));
1971 /* skb is currently provided only when forwarding */
1973 struct flow_keys keys;
1975 skb_flow_dissect_flow_keys(skb, &keys, 0);
1976 /* Inner can be v4 or v6 */
1977 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1978 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1979 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1980 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1984 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1985 hash_keys.tags.flow_label = keys.tags.flow_label;
1986 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1988 /* Same as case 0 */
1989 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1990 ip_multipath_l3_keys(skb, &hash_keys);
1993 /* Same as case 0 */
1994 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995 hash_keys.addrs.v4addrs.src = fl4->saddr;
1996 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2000 mhash = flow_hash_from_keys(&hash_keys);
2003 mhash = jhash_2words(mhash, multipath_hash, 0);
2007 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2009 static int ip_mkroute_input(struct sk_buff *skb,
2010 struct fib_result *res,
2011 struct in_device *in_dev,
2012 __be32 daddr, __be32 saddr, u32 tos,
2013 struct flow_keys *hkeys)
2015 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2016 if (res->fi && fib_info_num_path(res->fi) > 1) {
2017 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2019 fib_select_multipath(res, h);
2023 /* create a routing cache entry */
2024 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2027 /* Implements all the saddr-related checks as ip_route_input_slow(),
2028 * assuming daddr is valid and the destination is not a local broadcast one.
2029 * Uses the provided hint instead of performing a route lookup.
2031 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2032 u8 tos, struct net_device *dev,
2033 const struct sk_buff *hint)
2035 struct in_device *in_dev = __in_dev_get_rcu(dev);
2036 struct rtable *rt = skb_rtable(hint);
2037 struct net *net = dev_net(dev);
2041 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2042 goto martian_source;
2044 if (ipv4_is_zeronet(saddr))
2045 goto martian_source;
2047 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2048 goto martian_source;
2050 if (rt->rt_type != RTN_LOCAL)
2051 goto skip_validate_source;
2053 tos &= IPTOS_RT_MASK;
2054 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2056 goto martian_source;
2058 skip_validate_source:
2059 skb_dst_copy(skb, hint);
2063 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2068 * NOTE. We drop all the packets that have local source
2069 * addresses, because every properly looped back packet
2070 * must have correct destination already attached by output routine.
2071 * Changes in the enforced policies must be applied also to
2072 * ip_route_use_hint().
2074 * Such an approach solves two big problems:
2075 * 1. Non-simplex devices are handled properly.
2076 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2077 * called with rcu_read_lock()
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081 u8 tos, struct net_device *dev,
2082 struct fib_result *res)
2084 struct in_device *in_dev = __in_dev_get_rcu(dev);
2085 struct flow_keys *flkeys = NULL, _flkeys;
2086 struct net *net = dev_net(dev);
2087 struct ip_tunnel_info *tun_info;
2089 unsigned int flags = 0;
2093 bool do_cache = true;
2095 /* IP on this device is disabled. */
2100 /* Check for the most weird martians, which may not be detected by fib_validate_source().
2104 tun_info = skb_tunnel_info(skb);
2105 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2106 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2108 fl4.flowi4_tun_key.tun_id = 0;
2111 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2112 goto martian_source;
2116 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2119 /* Accept zero addresses only to limited broadcast;
2120 * I am not even sure whether to fix this or not. Waiting for complaints :-)
2122 if (ipv4_is_zeronet(saddr))
2123 goto martian_source;
2125 if (ipv4_is_zeronet(daddr))
2126 goto martian_destination;
2128 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2129 * and calls it at most once, when daddr and/or saddr are loopback addresses
2131 if (ipv4_is_loopback(daddr)) {
2132 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2133 goto martian_destination;
2134 } else if (ipv4_is_loopback(saddr)) {
2135 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2136 goto martian_source;
2140 * Now we are ready to route the packet.
2143 fl4.flowi4_iif = dev->ifindex;
2144 fl4.flowi4_mark = skb->mark;
2145 fl4.flowi4_tos = tos;
2146 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2147 fl4.flowi4_flags = 0;
2150 fl4.flowi4_uid = sock_net_uid(net, NULL);
2152 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2155 fl4.flowi4_proto = 0;
2160 err = fib_lookup(net, &fl4, res, 0);
2162 if (!IN_DEV_FORWARD(in_dev))
2163 err = -EHOSTUNREACH;
2167 if (res->type == RTN_BROADCAST) {
2168 if (IN_DEV_BFORWARD(in_dev))
2170 /* do not cache if bc_forwarding is enabled */
2171 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2176 if (res->type == RTN_LOCAL) {
2177 err = fib_validate_source(skb, saddr, daddr, tos,
2178 0, dev, in_dev, &itag);
2180 goto martian_source;
2184 if (!IN_DEV_FORWARD(in_dev)) {
2185 err = -EHOSTUNREACH;
2188 if (res->type != RTN_UNICAST)
2189 goto martian_destination;
2192 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2196 if (skb->protocol != htons(ETH_P_IP))
2199 if (!ipv4_is_zeronet(saddr)) {
2200 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2203 goto martian_source;
2205 flags |= RTCF_BROADCAST;
2206 res->type = RTN_BROADCAST;
2207 RT_CACHE_STAT_INC(in_brd);
2210 do_cache &= res->fi && !itag;
2212 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2214 rth = rcu_dereference(nhc->nhc_rth_input);
2215 if (rt_cache_valid(rth)) {
2216 skb_dst_set_noref(skb, &rth->dst);
2222 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2223 flags | RTCF_LOCAL, res->type,
2224 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2228 rth->dst.output = ip_rt_bug;
2229 #ifdef CONFIG_IP_ROUTE_CLASSID
2230 rth->dst.tclassid = itag;
2232 rth->rt_is_input = 1;
2234 RT_CACHE_STAT_INC(in_slow_tot);
2235 if (res->type == RTN_UNREACHABLE) {
2236 rth->dst.input = ip_error;
2237 rth->dst.error = -err;
2238 rth->rt_flags &= ~RTCF_LOCAL;
2242 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2244 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2245 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2246 WARN_ON(rth->dst.input == lwtunnel_input);
2247 rth->dst.lwtstate->orig_input = rth->dst.input;
2248 rth->dst.input = lwtunnel_input;
2251 if (unlikely(!rt_cache_route(nhc, rth)))
2252 rt_add_uncached_list(rth);
2254 skb_dst_set(skb, &rth->dst);
2259 RT_CACHE_STAT_INC(in_no_route);
2260 res->type = RTN_UNREACHABLE;
2266 * Do not cache martian addresses: they should be logged (RFC1812)
2268 martian_destination:
2269 RT_CACHE_STAT_INC(in_martian_dst);
2270 #ifdef CONFIG_IP_ROUTE_VERBOSE
2271 if (IN_DEV_LOG_MARTIANS(in_dev))
2272 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2273 &daddr, &saddr, dev->name);
2285 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2289 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2290 u8 tos, struct net_device *dev)
2292 struct fib_result res;
2295 tos &= IPTOS_RT_MASK;
2297 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2302 EXPORT_SYMBOL(ip_route_input_noref);
2304 /* called with rcu_read_lock held */
2305 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2306 u8 tos, struct net_device *dev, struct fib_result *res)
2308 /* Multicast recognition logic is moved from route cache to here.
2309 The problem was that too many Ethernet cards have broken/missing
2310 hardware multicast filters :-( As a result, a host on a multicast
2311 network acquires a lot of useless route cache entries, sort of
2312 SDR messages from all over the world. Now we try to get rid of them.
2313 Really, provided the software IP multicast filter is organized
2314 reasonably (at least, hashed), it does not result in a slowdown
2315 compared with route cache reject entries.
2316 Note that multicast routers are not affected, because a
2317 route cache entry is created eventually.
2319 if (ipv4_is_multicast(daddr)) {
2320 struct in_device *in_dev = __in_dev_get_rcu(dev);
2326 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2327 ip_hdr(skb)->protocol);
2329 /* check l3 master if no match yet */
2330 if (!our && netif_is_l3_slave(dev)) {
2331 struct in_device *l3_in_dev;
2333 l3_in_dev = __in_dev_get_rcu(skb->dev);
2335 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2336 ip_hdr(skb)->protocol);
2340 #ifdef CONFIG_IP_MROUTE
2342 (!ipv4_is_local_multicast(daddr) &&
2343 IN_DEV_MFORWARD(in_dev))
2346 err = ip_route_input_mc(skb, daddr, saddr,
2352 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2355 /* called with rcu_read_lock() */
2356 static struct rtable *__mkroute_output(const struct fib_result *res,
2357 const struct flowi4 *fl4, int orig_oif,
2358 struct net_device *dev_out,
2361 struct fib_info *fi = res->fi;
2362 struct fib_nh_exception *fnhe;
2363 struct in_device *in_dev;
2364 u16 type = res->type;
2368 in_dev = __in_dev_get_rcu(dev_out);
2370 return ERR_PTR(-EINVAL);
2372 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2373 if (ipv4_is_loopback(fl4->saddr) &&
2374 !(dev_out->flags & IFF_LOOPBACK) &&
2375 !netif_is_l3_master(dev_out))
2376 return ERR_PTR(-EINVAL);
2378 if (ipv4_is_lbcast(fl4->daddr))
2379 type = RTN_BROADCAST;
2380 else if (ipv4_is_multicast(fl4->daddr))
2381 type = RTN_MULTICAST;
2382 else if (ipv4_is_zeronet(fl4->daddr))
2383 return ERR_PTR(-EINVAL);
2385 if (dev_out->flags & IFF_LOOPBACK)
2386 flags |= RTCF_LOCAL;
2389 if (type == RTN_BROADCAST) {
2390 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2392 } else if (type == RTN_MULTICAST) {
2393 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2394 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2396 flags &= ~RTCF_LOCAL;
2399 /* If a multicast route does not exist, use the
2400 * default one, but do not gateway in this case.
2403 if (fi && res->prefixlen < 4)
2405 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2406 (orig_oif != dev_out->ifindex)) {
2407 /* For local routes that require a particular output interface
2408 * we do not want to cache the result. Caching the result
2409 * causes incorrect behaviour when there are multiple source
2410 * addresses on the interface, the end result being that if the
2411 * intended recipient is waiting on that interface for the
2412 * packet he won't receive it because it will be delivered on
2413 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2414 * be set to the loopback interface as well.
2420 do_cache &= fi != NULL;
2422 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2423 struct rtable __rcu **prth;
2425 fnhe = find_exception(nhc, fl4->daddr);
2429 prth = &fnhe->fnhe_rth_output;
2431 if (unlikely(fl4->flowi4_flags &
2432 FLOWI_FLAG_KNOWN_NH &&
2433 !(nhc->nhc_gw_family &&
2434 nhc->nhc_scope == RT_SCOPE_LINK))) {
2438 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2440 rth = rcu_dereference(*prth);
2441 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2446 rth = rt_dst_alloc(dev_out, flags, type,
2447 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2448 IN_DEV_CONF_GET(in_dev, NOXFRM));
2450 return ERR_PTR(-ENOBUFS);
2452 rth->rt_iif = orig_oif;
2454 RT_CACHE_STAT_INC(out_slow_tot);
2456 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2457 if (flags & RTCF_LOCAL &&
2458 !(dev_out->flags & IFF_LOOPBACK)) {
2459 rth->dst.output = ip_mc_output;
2460 RT_CACHE_STAT_INC(out_slow_mc);
2462 #ifdef CONFIG_IP_MROUTE
2463 if (type == RTN_MULTICAST) {
2464 if (IN_DEV_MFORWARD(in_dev) &&
2465 !ipv4_is_local_multicast(fl4->daddr)) {
2466 rth->dst.input = ip_mr_input;
2467 rth->dst.output = ip_mc_output;
2473 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2474 lwtunnel_set_redirect(&rth->dst);
2479 /*
2480 * Major route resolver routine.
2481 */
2483 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2484 const struct sk_buff *skb)
2485 {
2486 __u8 tos = RT_FL_TOS(fl4);
2487 struct fib_result res = {
2488 .type = RTN_UNSPEC,
2492 };
2493 struct rtable *rth;
2495 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2496 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2497 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2498 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2500 rcu_read_lock();
2501 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2502 rcu_read_unlock();
2504 return rth;
2505 }
2506 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
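/* The _rcu variant below does the actual resolution.  It must be called with
 * rcu_read_lock() held (as the wrapper above does) and, unlike the wrapper,
 * hands the fib_result back to the caller; that is what lets
 * inet_rtm_getroute() report table and prefix details for RTM_F_FIB_MATCH.
 */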
2508 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2509 struct fib_result *res,
2510 const struct sk_buff *skb)
2511 {
2512 struct net_device *dev_out = NULL;
2513 int orig_oif = fl4->flowi4_oif;
2514 unsigned int flags = 0;
2515 struct rtable *rth;
2516 int err;
2518 if (fl4->saddr) {
2519 if (ipv4_is_multicast(fl4->saddr) ||
2520 ipv4_is_lbcast(fl4->saddr) ||
2521 ipv4_is_zeronet(fl4->saddr)) {
2522 rth = ERR_PTR(-EINVAL);
2523 goto out;
2524 }
2526 rth = ERR_PTR(-ENETUNREACH);
2528 /* I removed the check for oif == dev_out->oif here.
2529 It was wrong for two reasons:
2530 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2531 is assigned to multiple interfaces.
2532 2. Moreover, we are allowed to send packets with a saddr
2533 of another iface. --ANK
2534 */
2536 if (fl4->flowi4_oif == 0 &&
2537 (ipv4_is_multicast(fl4->daddr) ||
2538 ipv4_is_lbcast(fl4->daddr))) {
2539 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2540 dev_out = __ip_dev_find(net, fl4->saddr, false);
2541 if (!dev_out)
2542 goto out;
2544 /* Special hack: the user can direct multicasts
2545 and limited broadcasts via the necessary interface
2546 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2547 This hack is not just for fun, it allows
2548 vic, vat and friends to work.
2549 They bind a socket to loopback, set ttl to zero
2550 and expect that it will work.
2551 From the viewpoint of the routing cache they are broken,
2552 because we are not allowed to build a multicast path
2553 with a loopback source addr (look, the routing cache
2554 cannot know that ttl is zero, so the packet
2555 will not leave this host and the route is valid).
2556 Luckily, this hack is a good workaround.
2557 */
2559 fl4->flowi4_oif = dev_out->ifindex;
2560 goto make_route;
2561 }
2563 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2564 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2565 if (!__ip_dev_find(net, fl4->saddr, false))
2566 goto out;
2567 }
2568 }
2571 if (fl4->flowi4_oif) {
2572 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2573 rth = ERR_PTR(-ENODEV);
2574 if (!dev_out)
2575 goto out;
2577 /* RACE: Check return value of inet_select_addr instead. */
2578 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2579 rth = ERR_PTR(-ENETUNREACH);
2580 goto out;
2581 }
2582 if (ipv4_is_local_multicast(fl4->daddr) ||
2583 ipv4_is_lbcast(fl4->daddr) ||
2584 fl4->flowi4_proto == IPPROTO_IGMP) {
2586 fl4->saddr = inet_select_addr(dev_out, 0,
2591 if (ipv4_is_multicast(fl4->daddr))
2592 fl4->saddr = inet_select_addr(dev_out, 0,
2594 else if (!fl4->daddr)
2595 fl4->saddr = inet_select_addr(dev_out, 0,
2601 fl4->daddr = fl4->saddr;
2603 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2604 dev_out = net->loopback_dev;
2605 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2606 res->type = RTN_LOCAL;
2607 flags |= RTCF_LOCAL;
2611 err = fib_lookup(net, fl4, res, 0);
2615 if (fl4->flowi4_oif &&
2616 (ipv4_is_multicast(fl4->daddr) ||
2617 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2618 /* Apparently, the routing tables are wrong. Assume
2619 that the destination is on-link.
2622 Because we are allowed to send to an iface
2623 even if it has NO routes and NO assigned
2624 addresses. When oif is specified, the routing
2625 tables are looked up with only one purpose:
2626 to catch if the destination is gatewayed, rather than
2627 direct. Moreover, if MSG_DONTROUTE is set,
2628 we send the packet, ignoring both the routing tables
2629 and the ifaddr state. --ANK
2632 We could make it even if oif is unknown,
2633 likely IPv6, but we do not.
2634 */
2636 if (fl4->saddr == 0)
2637 fl4->saddr = inet_select_addr(dev_out, 0,
2639 res->type = RTN_UNICAST;
2646 if (res->type == RTN_LOCAL) {
2647 if (!fl4->saddr) {
2648 if (res->fi->fib_prefsrc)
2649 fl4->saddr = res->fi->fib_prefsrc;
2650 else
2651 fl4->saddr = fl4->daddr;
2652 }
2654 /* L3 master device is the loopback for that domain */
2655 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2656 net->loopback_dev;
2658 /* make sure orig_oif points to fib result device even
2659 * though packet rx/tx happens over loopback or l3mdev
2660 */
2661 orig_oif = FIB_RES_OIF(*res);
2663 fl4->flowi4_oif = dev_out->ifindex;
2664 flags |= RTCF_LOCAL;
2665 goto make_route;
2666 }
2668 fib_select_path(net, res, fl4, skb);
2670 dev_out = FIB_RES_DEV(*res);
2671 fl4->flowi4_oif = dev_out->ifindex;
2674 make_route:
2675 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2677 out:
2678 return rth;
2679 }
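/* The "blackhole" dst_ops below back ipv4_blackhole_route(), which clones an
 * existing route into a dst that silently discards everything sent through it
 * (input/output are wired to the dst_discard helpers and the device is the
 * per-namespace loopback).  Callers such as the xfrm/IPsec code use it when a
 * usable route cannot (yet) be handed out, so packets are dropped cleanly
 * instead of leaking past policy.
 */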
2681 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2686 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2688 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2690 return mtu ? : dst->dev->mtu;
2693 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2694 struct sk_buff *skb, u32 mtu,
2699 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2700 struct sk_buff *skb)
2704 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2710 static struct dst_ops ipv4_dst_blackhole_ops = {
2711 .family = AF_INET,
2712 .check = ipv4_blackhole_dst_check,
2713 .mtu = ipv4_blackhole_mtu,
2714 .default_advmss = ipv4_default_advmss,
2715 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2716 .redirect = ipv4_rt_blackhole_redirect,
2717 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2718 .neigh_lookup = ipv4_neigh_lookup,
2719 };
2721 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2722 {
2723 struct rtable *ort = (struct rtable *) dst_orig;
2724 struct rtable *rt;
2726 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2727 if (rt) {
2728 struct dst_entry *new = &rt->dst;
2730 new->__use = 1;
2731 new->input = dst_discard;
2732 new->output = dst_discard_out;
2734 new->dev = net->loopback_dev;
2735 if (new->dev)
2736 dev_hold(new->dev);
2738 rt->rt_is_input = ort->rt_is_input;
2739 rt->rt_iif = ort->rt_iif;
2740 rt->rt_pmtu = ort->rt_pmtu;
2741 rt->rt_mtu_locked = ort->rt_mtu_locked;
2743 rt->rt_genid = rt_genid_ipv4(net);
2744 rt->rt_flags = ort->rt_flags;
2745 rt->rt_type = ort->rt_type;
2746 rt->rt_uses_gateway = ort->rt_uses_gateway;
2747 rt->rt_gw_family = ort->rt_gw_family;
2748 if (rt->rt_gw_family == AF_INET)
2749 rt->rt_gw4 = ort->rt_gw4;
2750 else if (rt->rt_gw_family == AF_INET6)
2751 rt->rt_gw6 = ort->rt_gw6;
2753 INIT_LIST_HEAD(&rt->rt_uncached);
2754 }
2756 dst_release(dst_orig);
2758 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2759 }
2761 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2762 const struct sock *sk)
2764 struct rtable *rt = __ip_route_output_key(net, flp4);
2769 if (flp4->flowi4_proto)
2770 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2771 flowi4_to_flowi(flp4),
2776 EXPORT_SYMBOL_GPL(ip_route_output_flow);
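/* Typical (illustrative, not compiled) use of the output-lookup API: build a
 * flowi4 describing the packet, resolve it, and check for an error pointer.
 * The values below are placeholders; real callers usually go through helpers
 * such as ip_route_output_key() or ip_route_connect().
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_tos	= RT_TOS(tos),
 *		.flowi4_oif	= oif,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */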
2778 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2779 struct net_device *dev,
2780 struct net *net, __be32 *saddr,
2781 const struct ip_tunnel_info *info,
2782 u8 protocol, bool use_cache)
2784 #ifdef CONFIG_DST_CACHE
2785 struct dst_cache *dst_cache;
2786 #endif
2787 struct rtable *rt = NULL;
2788 struct flowi4 fl4;
2789 __u8 tos;
2791 #ifdef CONFIG_DST_CACHE
2792 dst_cache = (struct dst_cache *)&info->dst_cache;
2793 if (use_cache) {
2794 rt = dst_cache_get_ip4(dst_cache, saddr);
2795 if (rt)
2796 return rt;
2797 }
2798 #endif
2799 memset(&fl4, 0, sizeof(fl4));
2800 fl4.flowi4_mark = skb->mark;
2801 fl4.flowi4_proto = protocol;
2802 fl4.daddr = info->key.u.ipv4.dst;
2803 fl4.saddr = info->key.u.ipv4.src;
2804 tos = info->key.tos;
2805 fl4.flowi4_tos = RT_TOS(tos);
2807 rt = ip_route_output_key(net, &fl4);
2808 if (IS_ERR(rt)) {
2809 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2810 return ERR_PTR(-ENETUNREACH);
2811 }
2812 if (rt->dst.dev == dev) { /* is this necessary? */
2813 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2814 ip_rt_put(rt);
2815 return ERR_PTR(-ELOOP);
2816 }
2817 #ifdef CONFIG_DST_CACHE
2818 if (use_cache)
2819 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2820 #endif
2821 *saddr = fl4.saddr;
2822 return rt;
2823 }
2824 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
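/* ip_route_output_tunnel() is a convenience wrapper for tunnel drivers doing
 * metadata-based (collect_md) encapsulation: the outer addresses and TOS come
 * straight from the ip_tunnel_info key, and the per-tunnel dst_cache is
 * consulted and updated when use_cache is true.  An illustrative caller
 * (names are placeholders, not taken from a specific driver):
 *
 *	__be32 saddr;
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
 *				    IPPROTO_UDP, true);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...  saddr now holds the chosen outer source address.
 */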
2826 /* called with rcu_read_lock held */
2827 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2828 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2829 struct sk_buff *skb, u32 portid, u32 seq,
2833 struct nlmsghdr *nlh;
2834 unsigned long expires = 0;
2836 u32 metrics[RTAX_MAX];
2838 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2842 r = nlmsg_data(nlh);
2843 r->rtm_family = AF_INET;
2844 r->rtm_dst_len = 32;
2846 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
2847 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2848 if (nla_put_u32(skb, RTA_TABLE, table_id))
2849 goto nla_put_failure;
2850 r->rtm_type = rt->rt_type;
2851 r->rtm_scope = RT_SCOPE_UNIVERSE;
2852 r->rtm_protocol = RTPROT_UNSPEC;
2853 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2854 if (rt->rt_flags & RTCF_NOTIFY)
2855 r->rtm_flags |= RTM_F_NOTIFY;
2856 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2857 r->rtm_flags |= RTCF_DOREDIRECT;
2859 if (nla_put_in_addr(skb, RTA_DST, dst))
2860 goto nla_put_failure;
2861 if (src) {
2862 r->rtm_src_len = 32;
2863 if (nla_put_in_addr(skb, RTA_SRC, src))
2864 goto nla_put_failure;
2865 }
2866 if (rt->dst.dev &&
2867 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2868 goto nla_put_failure;
2869 #ifdef CONFIG_IP_ROUTE_CLASSID
2870 if (rt->dst.tclassid &&
2871 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2872 goto nla_put_failure;
2873 #endif
2874 if (fl4 && !rt_is_input_route(rt) &&
2875 fl4->saddr != src) {
2876 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2877 goto nla_put_failure;
2879 if (rt->rt_uses_gateway) {
2880 if (rt->rt_gw_family == AF_INET &&
2881 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2882 goto nla_put_failure;
2883 } else if (rt->rt_gw_family == AF_INET6) {
2884 int alen = sizeof(struct in6_addr);
2885 struct nlattr *nla;
2886 struct rtvia *via;
2888 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2889 if (!nla)
2890 goto nla_put_failure;
2892 via = nla_data(nla);
2893 via->rtvia_family = AF_INET6;
2894 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2895 }
2896 }
2898 expires = rt->dst.expires;
2899 if (expires) {
2900 unsigned long now = jiffies;
2902 if (time_before(now, expires))
2903 expires -= now;
2904 else
2905 expires = 0;
2906 }
2908 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2909 if (rt->rt_pmtu && expires)
2910 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2911 if (rt->rt_mtu_locked && expires)
2912 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2913 if (rtnetlink_put_metrics(skb, metrics) < 0)
2914 goto nla_put_failure;
2917 if (fl4->flowi4_mark &&
2918 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2919 goto nla_put_failure;
2921 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2922 nla_put_u32(skb, RTA_UID,
2923 from_kuid_munged(current_user_ns(),
2925 goto nla_put_failure;
2927 if (rt_is_input_route(rt)) {
2928 #ifdef CONFIG_IP_MROUTE
2929 if (ipv4_is_multicast(dst) &&
2930 !ipv4_is_local_multicast(dst) &&
2931 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2932 int err = ipmr_get_route(net, skb,
2933 fl4->saddr, fl4->daddr,
2939 goto nla_put_failure;
2943 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2944 goto nla_put_failure;
2948 error = rt->dst.error;
2950 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2951 goto nla_put_failure;
2953 nlmsg_end(skb, nlh);
2957 nlmsg_cancel(skb, nlh);
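/* Summary of what rt_fill_info() above emits: an RTM_NEWROUTE message whose
 * rtmsg header is followed by RTA_TABLE, RTA_DST, optionally RTA_SRC and
 * RTA_OIF, RTA_FLOW (classid), RTA_PREFSRC, RTA_GATEWAY or RTA_VIA for an
 * IPv6 nexthop, the metrics nest, RTA_MARK, RTA_UID, RTA_IIF (or the
 * multicast resolution via ipmr_get_route()), and the cache-info blob with
 * the expiry and error fields.  fnhe_dump_bucket() below reuses it to dump
 * cached exception routes.
 */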
2961 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2962 struct netlink_callback *cb, u32 table_id,
2963 struct fnhe_hash_bucket *bucket, int genid,
2964 int *fa_index, int fa_start, unsigned int flags)
2968 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2969 struct fib_nh_exception *fnhe;
2971 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2972 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2976 if (*fa_index < fa_start)
2979 if (fnhe->fnhe_genid != genid)
2982 if (fnhe->fnhe_expires &&
2983 time_after(jiffies, fnhe->fnhe_expires))
2986 rt = rcu_dereference(fnhe->fnhe_rth_input);
2988 rt = rcu_dereference(fnhe->fnhe_rth_output);
2992 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2993 table_id, NULL, skb,
2994 NETLINK_CB(cb->skb).portid,
2995 cb->nlh->nlmsg_seq, flags);
3006 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3007 u32 table_id, struct fib_info *fi,
3008 int *fa_index, int fa_start, unsigned int flags)
3010 struct net *net = sock_net(cb->skb->sk);
3011 int nhsel, genid = fnhe_genid(net);
3013 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3014 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3015 struct fnhe_hash_bucket *bucket;
3018 if (nhc->nhc_flags & RTNH_F_DEAD)
3022 bucket = rcu_dereference(nhc->nhc_exceptions);
3025 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3026 genid, fa_index, fa_start,
3036 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3037 u8 ip_proto, __be16 sport,
3040 struct sk_buff *skb;
3043 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3047 /* Reserve room for dummy headers; this skb can pass
3048 * through a good chunk of the routing engine.
3049 */
3050 skb_reset_mac_header(skb);
3051 skb_reset_network_header(skb);
3052 skb->protocol = htons(ETH_P_IP);
3053 iph = skb_put(skb, sizeof(struct iphdr));
3054 iph->protocol = ip_proto;
3060 skb_set_transport_header(skb, skb->len);
3062 switch (iph->protocol) {
3064 struct udphdr *udph;
3066 udph = skb_put_zero(skb, sizeof(struct udphdr));
3067 udph->source = sport;
3069 udph->len = sizeof(struct udphdr);
3074 struct tcphdr *tcph;
3076 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3077 tcph->source = sport;
3079 tcph->doff = sizeof(struct tcphdr) / 4;
3081 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3085 case IPPROTO_ICMP: {
3086 struct icmphdr *icmph;
3088 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3089 icmph->type = ICMP_ECHO;
3097 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3098 const struct nlmsghdr *nlh,
3100 struct netlink_ext_ack *extack)
3105 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3106 NL_SET_ERR_MSG(extack,
3107 "ipv4: Invalid header for route get request");
3111 if (!netlink_strict_get_check(skb))
3112 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3113 rtm_ipv4_policy, extack);
3115 rtm = nlmsg_data(nlh);
3116 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3117 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3118 rtm->rtm_table || rtm->rtm_protocol ||
3119 rtm->rtm_scope || rtm->rtm_type) {
3120 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3124 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3125 RTM_F_LOOKUP_TABLE |
3127 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3131 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3132 rtm_ipv4_policy, extack);
3136 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3137 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3138 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3142 for (i = 0; i <= RTA_MAX; i++) {
3158 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3166 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3167 struct netlink_ext_ack *extack)
3169 struct net *net = sock_net(in_skb->sk);
3170 struct nlattr *tb[RTA_MAX+1];
3171 u32 table_id = RT_TABLE_MAIN;
3172 __be16 sport = 0, dport = 0;
3173 struct fib_result res = {};
3174 u8 ip_proto = IPPROTO_UDP;
3175 struct rtable *rt = NULL;
3176 struct sk_buff *skb;
3178 struct flowi4 fl4 = {};
3186 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3190 rtm = nlmsg_data(nlh);
3191 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3192 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3193 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3194 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3196 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3198 uid = (iif ? INVALID_UID : current_uid());
3200 if (tb[RTA_IP_PROTO]) {
3201 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3202 &ip_proto, AF_INET, extack);
3208 sport = nla_get_be16(tb[RTA_SPORT]);
3211 dport = nla_get_be16(tb[RTA_DPORT]);
3213 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3219 fl4.flowi4_tos = rtm->rtm_tos;
3220 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3221 fl4.flowi4_mark = mark;
3222 fl4.flowi4_uid = uid;
3224 fl4.fl4_sport = sport;
3226 fl4.fl4_dport = dport;
3227 fl4.flowi4_proto = ip_proto;
3232 struct net_device *dev;
3234 dev = dev_get_by_index_rcu(net, iif);
3240 fl4.flowi4_iif = iif; /* for rt_fill_info */
3243 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3246 rt = skb_rtable(skb);
3247 if (err == 0 && rt->dst.error)
3248 err = -rt->dst.error;
3250 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3251 skb->dev = net->loopback_dev;
3252 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3257 skb_dst_set(skb, &rt->dst);
3263 if (rtm->rtm_flags & RTM_F_NOTIFY)
3264 rt->rt_flags |= RTCF_NOTIFY;
3266 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3267 table_id = res.table ? res.table->tb_id : 0;
3269 /* reset skb for netlink reply msg */
3271 skb_reset_network_header(skb);
3272 skb_reset_transport_header(skb);
3273 skb_reset_mac_header(skb);
3275 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3276 struct fib_rt_info fri;
3279 err = fib_props[res.type].error;
3281 err = -EHOSTUNREACH;
3285 fri.tb_id = table_id;
3286 fri.dst = res.prefix;
3287 fri.dst_len = res.prefixlen;
3288 fri.tos = fl4.flowi4_tos;
3289 fri.type = rt->rt_type;
3293 struct fib_alias *fa;
3295 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3296 u8 slen = 32 - fri.dst_len;
3298 if (fa->fa_slen == slen &&
3299 fa->tb_id == fri.tb_id &&
3300 fa->fa_tos == fri.tos &&
3301 fa->fa_info == res.fi &&
3302 fa->fa_type == fri.type) {
3303 fri.offload = fa->offload;
3304 fri.trap = fa->trap;
3309 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3310 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3312 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3313 NETLINK_CB(in_skb).portid,
3321 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
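/* inet_rtm_getroute() above is the doit handler behind RTM_GETROUTE requests,
 * i.e. what "ip route get" issues: it synthesizes a dummy skb, resolves it
 * through the input or output path, and answers with a single RTM_NEWROUTE
 * built by fib_dump_info() or rt_fill_info().  For example (userspace,
 * illustrative): "ip route get 192.0.2.1 from 198.51.100.1 iif eth0".
 */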
3331 void ip_rt_multicast_event(struct in_device *in_dev)
3333 rt_cache_flush(dev_net(in_dev->dev));
3336 #ifdef CONFIG_SYSCTL
3337 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3338 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3339 static int ip_rt_gc_elasticity __read_mostly = 8;
3340 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
3342 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3343 void *buffer, size_t *lenp, loff_t *ppos)
3345 struct net *net = (struct net *)__ctl->extra1;
3348 rt_cache_flush(net);
3349 fnhe_genid_bump(net);
3356 static struct ctl_table ipv4_route_table[] = {
3358 .procname = "gc_thresh",
3359 .data = &ipv4_dst_ops.gc_thresh,
3360 .maxlen = sizeof(int),
3362 .proc_handler = proc_dointvec,
3365 .procname = "max_size",
3366 .data = &ip_rt_max_size,
3367 .maxlen = sizeof(int),
3369 .proc_handler = proc_dointvec,
3372 /* Deprecated. Use gc_min_interval_ms */
3374 .procname = "gc_min_interval",
3375 .data = &ip_rt_gc_min_interval,
3376 .maxlen = sizeof(int),
3378 .proc_handler = proc_dointvec_jiffies,
3381 .procname = "gc_min_interval_ms",
3382 .data = &ip_rt_gc_min_interval,
3383 .maxlen = sizeof(int),
3385 .proc_handler = proc_dointvec_ms_jiffies,
3388 .procname = "gc_timeout",
3389 .data = &ip_rt_gc_timeout,
3390 .maxlen = sizeof(int),
3392 .proc_handler = proc_dointvec_jiffies,
3395 .procname = "gc_interval",
3396 .data = &ip_rt_gc_interval,
3397 .maxlen = sizeof(int),
3399 .proc_handler = proc_dointvec_jiffies,
3402 .procname = "redirect_load",
3403 .data = &ip_rt_redirect_load,
3404 .maxlen = sizeof(int),
3406 .proc_handler = proc_dointvec,
3409 .procname = "redirect_number",
3410 .data = &ip_rt_redirect_number,
3411 .maxlen = sizeof(int),
3413 .proc_handler = proc_dointvec,
3416 .procname = "redirect_silence",
3417 .data = &ip_rt_redirect_silence,
3418 .maxlen = sizeof(int),
3420 .proc_handler = proc_dointvec,
3423 .procname = "error_cost",
3424 .data = &ip_rt_error_cost,
3425 .maxlen = sizeof(int),
3427 .proc_handler = proc_dointvec,
3430 .procname = "error_burst",
3431 .data = &ip_rt_error_burst,
3432 .maxlen = sizeof(int),
3434 .proc_handler = proc_dointvec,
3437 .procname = "gc_elasticity",
3438 .data = &ip_rt_gc_elasticity,
3439 .maxlen = sizeof(int),
3441 .proc_handler = proc_dointvec,
3444 .procname = "mtu_expires",
3445 .data = &ip_rt_mtu_expires,
3446 .maxlen = sizeof(int),
3448 .proc_handler = proc_dointvec_jiffies,
3451 .procname = "min_pmtu",
3452 .data = &ip_rt_min_pmtu,
3453 .maxlen = sizeof(int),
3455 .proc_handler = proc_dointvec_minmax,
3456 .extra1 = &ip_min_valid_pmtu,
3459 .procname = "min_adv_mss",
3460 .data = &ip_rt_min_advmss,
3461 .maxlen = sizeof(int),
3463 .proc_handler = proc_dointvec,
3468 static const char ipv4_route_flush_procname[] = "flush";
3470 static struct ctl_table ipv4_route_flush_table[] = {
3472 .procname = ipv4_route_flush_procname,
3473 .maxlen = sizeof(int),
3475 .proc_handler = ipv4_sysctl_rtcache_flush,
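/* The tables above surface under /proc/sys/net/ipv4/route/.  For example
 * (illustrative): writing any value to the per-namespace "flush" entry
 * invalidates cached routing state via rt_cache_flush()/fnhe_genid_bump():
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *	sysctl -w net.ipv4.route.min_pmtu=552
 */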
3480 static __net_init int sysctl_route_net_init(struct net *net)
3482 struct ctl_table *tbl;
3484 tbl = ipv4_route_flush_table;
3485 if (!net_eq(net, &init_net)) {
3486 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3490 /* Don't export non-whitelisted sysctls to unprivileged users */
3491 if (net->user_ns != &init_user_ns) {
3492 if (tbl[0].procname != ipv4_route_flush_procname)
3493 tbl[0].procname = NULL;
3496 tbl[0].extra1 = net;
3498 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3499 if (!net->ipv4.route_hdr)
3504 if (tbl != ipv4_route_flush_table)
3510 static __net_exit void sysctl_route_net_exit(struct net *net)
3512 struct ctl_table *tbl;
3514 tbl = net->ipv4.route_hdr->ctl_table_arg;
3515 unregister_net_sysctl_table(net->ipv4.route_hdr);
3516 BUG_ON(tbl == ipv4_route_flush_table);
3520 static __net_initdata struct pernet_operations sysctl_route_ops = {
3521 .init = sysctl_route_net_init,
3522 .exit = sysctl_route_net_exit,
3526 static __net_init int rt_genid_init(struct net *net)
3528 atomic_set(&net->ipv4.rt_genid, 0);
3529 atomic_set(&net->fnhe_genid, 0);
3530 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3534 static __net_initdata struct pernet_operations rt_genid_ops = {
3535 .init = rt_genid_init,
3538 static int __net_init ipv4_inetpeer_init(struct net *net)
3540 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3544 inet_peer_base_init(bp);
3545 net->ipv4.peers = bp;
3549 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3551 struct inet_peer_base *bp = net->ipv4.peers;
3553 net->ipv4.peers = NULL;
3554 inetpeer_invalidate_tree(bp);
3558 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3559 .init = ipv4_inetpeer_init,
3560 .exit = ipv4_inetpeer_exit,
3563 #ifdef CONFIG_IP_ROUTE_CLASSID
3564 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3565 #endif /* CONFIG_IP_ROUTE_CLASSID */
3567 int __init ip_rt_init(void)
3571 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3572 GFP_KERNEL);
3573 if (!ip_idents)
3574 panic("IP: failed to allocate ip_idents\n");
3576 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3578 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3579 if (!ip_tstamps)
3580 panic("IP: failed to allocate ip_tstamps\n");
3582 for_each_possible_cpu(cpu) {
3583 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3585 INIT_LIST_HEAD(&ul->head);
3586 spin_lock_init(&ul->lock);
3588 #ifdef CONFIG_IP_ROUTE_CLASSID
3589 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3590 if (!ip_rt_acct)
3591 panic("IP: failed to allocate ip_rt_acct\n");
3592 #endif
3594 ipv4_dst_ops.kmem_cachep =
3595 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3596 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3598 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3600 if (dst_entries_init(&ipv4_dst_ops) < 0)
3601 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3603 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3604 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3606 ipv4_dst_ops.gc_thresh = ~0;
3607 ip_rt_max_size = INT_MAX;
3612 if (ip_rt_proc_init())
3613 pr_err("Unable to create route proc files\n");
3618 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3619 RTNL_FLAG_DOIT_UNLOCKED);
3621 #ifdef CONFIG_SYSCTL
3622 register_pernet_subsys(&sysctl_route_ops);
3623 #endif
3624 register_pernet_subsys(&rt_genid_ops);
3625 register_pernet_subsys(&ipv4_inetpeer_ops);
3626 return 0;
3627 }
3629 #ifdef CONFIG_SYSCTL
3630 /*
3631 * We really need to sanitize the damn ipv4 init order, then all
3632 * this nonsense will go away.
3633 */
3634 void __init ip_static_sysctl_init(void)
3635 {
3636 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3637 }
3638 #endif