2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/skbuff.h>
83 #include <linux/inetdevice.h>
84 #include <linux/igmp.h>
85 #include <linux/pkt_sched.h>
86 #include <linux/mroute.h>
87 #include <linux/netfilter_ipv4.h>
88 #include <linux/random.h>
89 #include <linux/rcupdate.h>
90 #include <linux/times.h>
91 #include <linux/slab.h>
92 #include <linux/jhash.h>
94 #include <net/dst_metadata.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/lwtunnel.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
111 #include <linux/kmemleak.h>
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
117 #define RT_FL_TOS(oldflp4) \
118 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
120 #define RT_GC_TIMEOUT (300*HZ)
122 static int ip_rt_max_size;
123 static int ip_rt_redirect_number __read_mostly = 9;
124 static int ip_rt_redirect_load __read_mostly = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly = HZ;
127 static int ip_rt_error_burst __read_mostly = 5 * HZ;
128 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly = 256;
132 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
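/* Note: several of these tunables are exposed at runtime by the
 * ipv4_route_table sysctl array near the end of this file (under
 * CONFIG_SYSCTL), e.g. /proc/sys/net/ipv4/route/redirect_number,
 * redirect_load and gc_timeout; the initializers above are only the
 * boot-time defaults.
 */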
134 * Interface to generic destination cache.
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu);
144 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
158 static struct dst_ops ipv4_dst_ops = {
160 .check = ipv4_dst_check,
161 .default_advmss = ipv4_default_advmss,
163 .cow_metrics = ipv4_cow_metrics,
164 .destroy = ipv4_dst_destroy,
165 .negative_advice = ipv4_negative_advice,
166 .link_failure = ipv4_link_failure,
167 .update_pmtu = ip_rt_update_pmtu,
168 .redirect = ip_do_redirect,
169 .local_out = __ip_local_out,
170 .neigh_lookup = ipv4_neigh_lookup,
173 #define ECN_OR_COST(class) TC_PRIO_##class
175 const __u8 ip_tos2prio[16] = {
177 ECN_OR_COST(BESTEFFORT),
179 ECN_OR_COST(BESTEFFORT),
185 ECN_OR_COST(INTERACTIVE),
187 ECN_OR_COST(INTERACTIVE),
188 TC_PRIO_INTERACTIVE_BULK,
189 ECN_OR_COST(INTERACTIVE_BULK),
190 TC_PRIO_INTERACTIVE_BULK,
191 ECN_OR_COST(INTERACTIVE_BULK)
193 EXPORT_SYMBOL(ip_tos2prio);
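/* Callers index this table with the four TOS bits shifted down by one
 * (see rt_tos2priority() in <net/route.h>), so consecutive entry pairs
 * cover a TOS class with and without the low bit set, which is what the
 * ECN_OR_COST() entries above encode.
 */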
195 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
196 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
198 #ifdef CONFIG_PROC_FS
199 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 return SEQ_START_TOKEN;
206 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
212 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 static int rt_cache_seq_show(struct seq_file *seq, void *v)
218 if (v == SEQ_START_TOKEN)
219 seq_printf(seq, "%-127s\n",
220 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
221 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226 static const struct seq_operations rt_cache_seq_ops = {
227 .start = rt_cache_seq_start,
228 .next = rt_cache_seq_next,
229 .stop = rt_cache_seq_stop,
230 .show = rt_cache_seq_show,
233 static int rt_cache_seq_open(struct inode *inode, struct file *file)
235 return seq_open(file, &rt_cache_seq_ops);
238 static const struct file_operations rt_cache_seq_fops = {
239 .owner = THIS_MODULE,
240 .open = rt_cache_seq_open,
243 .release = seq_release,
247 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252 return SEQ_START_TOKEN;
254 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
255 if (!cpu_possible(cpu))
258 return &per_cpu(rt_cache_stat, cpu);
263 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
267 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
268 if (!cpu_possible(cpu))
271 return &per_cpu(rt_cache_stat, cpu);
277 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
282 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
284 struct rt_cache_stat *st = v;
286 if (v == SEQ_START_TOKEN) {
287 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
292 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
293 dst_entries_get_slow(&ipv4_dst_ops),
306 0, /* st->gc_total */
307 0, /* st->gc_ignored */
308 0, /* st->gc_goal_miss */
309 0, /* st->gc_dst_overflow */
310 0, /* st->in_hlist_search */
311 0 /* st->out_hlist_search */
316 static const struct seq_operations rt_cpu_seq_ops = {
317 .start = rt_cpu_seq_start,
318 .next = rt_cpu_seq_next,
319 .stop = rt_cpu_seq_stop,
320 .show = rt_cpu_seq_show,
324 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
326 return seq_open(file, &rt_cpu_seq_ops);
329 static const struct file_operations rt_cpu_seq_fops = {
330 .owner = THIS_MODULE,
331 .open = rt_cpu_seq_open,
334 .release = seq_release,
337 #ifdef CONFIG_IP_ROUTE_CLASSID
338 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 struct ip_rt_acct *dst, *src;
343 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
347 for_each_possible_cpu(i) {
348 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
349 for (j = 0; j < 256; j++) {
350 dst[j].o_bytes += src[j].o_bytes;
351 dst[j].o_packets += src[j].o_packets;
352 dst[j].i_bytes += src[j].i_bytes;
353 dst[j].i_packets += src[j].i_packets;
357 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362 static int rt_acct_proc_open(struct inode *inode, struct file *file)
364 return single_open(file, rt_acct_proc_show, NULL);
367 static const struct file_operations rt_acct_proc_fops = {
368 .owner = THIS_MODULE,
369 .open = rt_acct_proc_open,
372 .release = single_release,
376 static int __net_init ip_rt_do_proc_init(struct net *net)
378 struct proc_dir_entry *pde;
380 pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385 pde = proc_create("rt_cache", S_IRUGO,
386 net->proc_net_stat, &rt_cpu_seq_fops);
390 #ifdef CONFIG_IP_ROUTE_CLASSID
391 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
397 #ifdef CONFIG_IP_ROUTE_CLASSID
399 remove_proc_entry("rt_cache", net->proc_net_stat);
402 remove_proc_entry("rt_cache", net->proc_net);
407 static void __net_exit ip_rt_do_proc_exit(struct net *net)
409 remove_proc_entry("rt_cache", net->proc_net_stat);
410 remove_proc_entry("rt_cache", net->proc_net);
411 #ifdef CONFIG_IP_ROUTE_CLASSID
412 remove_proc_entry("rt_acct", net->proc_net);
416 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
417 .init = ip_rt_do_proc_init,
418 .exit = ip_rt_do_proc_exit,
421 static int __init ip_rt_proc_init(void)
423 return register_pernet_subsys(&ip_rt_proc_ops);
427 static inline int ip_rt_proc_init(void)
431 #endif /* CONFIG_PROC_FS */
433 static inline bool rt_is_expired(const struct rtable *rth)
435 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
438 void rt_cache_flush(struct net *net)
440 rt_genid_bump_ipv4(net);
443 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
447 struct net_device *dev = dst->dev;
448 const __be32 *pkey = daddr;
449 const struct rtable *rt;
452 rt = (const struct rtable *) dst;
454 pkey = (const __be32 *) &rt->rt_gateway;
456 pkey = &ip_hdr(skb)->daddr;
458 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
461 return neigh_create(&arp_tbl, pkey, dev);
464 #define IP_IDENTS_SZ 2048u
466 static atomic_t *ip_idents __read_mostly;
467 static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
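 *
 * Rough illustrative example (not from the original text): if this bucket
 * was last used 250 jiffies ago, the reservation below adds a random delta
 * uniform in [0, 250) on top of the requested segment count, so the IDs
 * seen on the wire no longer reveal how many packets this host generated
 * in the meantime.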
473 u32 ip_idents_reserve(u32 hash, int segs)
475 u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
476 atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
477 u32 old = ACCESS_ONCE(*p_tstamp);
478 u32 now = (u32)jiffies;
481 if (old != now && cmpxchg(p_tstamp, old, now) == old)
482 delta = prandom_u32_max(now - old);
484 /* Do not use atomic_add_return() as it makes UBSAN unhappy */
486 old = (u32)atomic_read(p_id);
487 new = old + delta + segs;
488 } while (atomic_cmpxchg(p_id, old, new) != old);
492 EXPORT_SYMBOL(ip_idents_reserve);
494 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
496 static u32 ip_idents_hashrnd __read_mostly;
499 net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
501 hash = jhash_3words((__force u32)iph->daddr,
502 (__force u32)iph->saddr,
503 iph->protocol ^ net_hash_mix(net),
505 id = ip_idents_reserve(hash, segs);
508 EXPORT_SYMBOL(__ip_select_ident);
510 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
511 const struct sock *sk,
512 const struct iphdr *iph,
514 u8 prot, u32 mark, int flow_flags)
517 const struct inet_sock *inet = inet_sk(sk);
519 oif = sk->sk_bound_dev_if;
521 tos = RT_CONN_FLAGS(sk);
522 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
524 flowi4_init_output(fl4, oif, mark, tos,
525 RT_SCOPE_UNIVERSE, prot,
527 iph->daddr, iph->saddr, 0, 0,
528 sock_net_uid(net, sk));
531 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
532 const struct sock *sk)
534 const struct iphdr *iph = ip_hdr(skb);
535 int oif = skb->dev->ifindex;
536 u8 tos = RT_TOS(iph->tos);
537 u8 prot = iph->protocol;
538 u32 mark = skb->mark;
540 __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0);
543 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
545 const struct inet_sock *inet = inet_sk(sk);
546 const struct ip_options_rcu *inet_opt;
547 __be32 daddr = inet->inet_daddr;
550 inet_opt = rcu_dereference(inet->inet_opt);
551 if (inet_opt && inet_opt->opt.srr)
552 daddr = inet_opt->opt.faddr;
553 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
554 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
555 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
556 inet_sk_flowi_flags(sk),
557 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
561 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
562 const struct sk_buff *skb)
565 build_skb_flow_key(fl4, skb, sk);
567 build_sk_flow_key(fl4, sk);
570 static inline void rt_free(struct rtable *rt)
572 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
575 static DEFINE_SPINLOCK(fnhe_lock);
577 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
581 rt = rcu_dereference(fnhe->fnhe_rth_input);
583 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
586 rt = rcu_dereference(fnhe->fnhe_rth_output);
588 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
593 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
595 struct fib_nh_exception *fnhe, *oldest;
597 oldest = rcu_dereference(hash->chain);
598 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
599 fnhe = rcu_dereference(fnhe->fnhe_next)) {
600 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
603 fnhe_flush_routes(oldest);
607 static inline u32 fnhe_hashfun(__be32 daddr)
609 static u32 fnhe_hashrnd __read_mostly;
612 net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
613 hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
614 return hash_32(hval, FNHE_HASH_SHIFT);
617 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
619 rt->rt_pmtu = fnhe->fnhe_pmtu;
620 rt->dst.expires = fnhe->fnhe_expires;
623 rt->rt_flags |= RTCF_REDIRECTED;
624 rt->rt_gateway = fnhe->fnhe_gw;
625 rt->rt_uses_gateway = 1;
629 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
630 u32 pmtu, unsigned long expires)
632 struct fnhe_hash_bucket *hash;
633 struct fib_nh_exception *fnhe;
637 u32 hval = fnhe_hashfun(daddr);
639 spin_lock_bh(&fnhe_lock);
641 hash = rcu_dereference(nh->nh_exceptions);
643 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
646 rcu_assign_pointer(nh->nh_exceptions, hash);
652 for (fnhe = rcu_dereference(hash->chain); fnhe;
653 fnhe = rcu_dereference(fnhe->fnhe_next)) {
654 if (fnhe->fnhe_daddr == daddr)
663 fnhe->fnhe_pmtu = pmtu;
664 fnhe->fnhe_expires = max(1UL, expires);
666 /* Update all cached dsts too */
667 rt = rcu_dereference(fnhe->fnhe_rth_input);
669 fill_route_from_fnhe(rt, fnhe);
670 rt = rcu_dereference(fnhe->fnhe_rth_output);
672 fill_route_from_fnhe(rt, fnhe);
674 if (depth > FNHE_RECLAIM_DEPTH)
675 fnhe = fnhe_oldest(hash);
677 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
681 fnhe->fnhe_next = hash->chain;
682 rcu_assign_pointer(hash->chain, fnhe);
684 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
685 fnhe->fnhe_daddr = daddr;
687 fnhe->fnhe_pmtu = pmtu;
688 fnhe->fnhe_expires = expires;
690 /* Exception created; mark the cached routes for the nexthop
691 * stale, so anyone caching it rechecks if this exception
694 rt = rcu_dereference(nh->nh_rth_input);
696 rt->dst.obsolete = DST_OBSOLETE_KILL;
698 for_each_possible_cpu(i) {
699 struct rtable __rcu **prt;
700 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
701 rt = rcu_dereference(*prt);
703 rt->dst.obsolete = DST_OBSOLETE_KILL;
707 fnhe->fnhe_stamp = jiffies;
710 spin_unlock_bh(&fnhe_lock);
713 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
716 __be32 new_gw = icmp_hdr(skb)->un.gateway;
717 __be32 old_gw = ip_hdr(skb)->saddr;
718 struct net_device *dev = skb->dev;
719 struct in_device *in_dev;
720 struct fib_result res;
724 switch (icmp_hdr(skb)->code & 7) {
726 case ICMP_REDIR_NETTOS:
727 case ICMP_REDIR_HOST:
728 case ICMP_REDIR_HOSTTOS:
735 if (rt->rt_gateway != old_gw)
738 in_dev = __in_dev_get_rcu(dev);
743 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
744 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
745 ipv4_is_zeronet(new_gw))
746 goto reject_redirect;
748 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
749 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
750 goto reject_redirect;
751 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
752 goto reject_redirect;
754 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
755 goto reject_redirect;
758 n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
760 if (!(n->nud_state & NUD_VALID)) {
761 neigh_event_send(n, NULL);
763 if (fib_lookup(net, fl4, &res, 0) == 0) {
764 struct fib_nh *nh = &FIB_RES_NH(res);
766 update_or_create_fnhe(nh, fl4->daddr, new_gw,
767 0, jiffies + ip_rt_gc_timeout);
770 rt->dst.obsolete = DST_OBSOLETE_KILL;
771 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
778 #ifdef CONFIG_IP_ROUTE_VERBOSE
779 if (IN_DEV_LOG_MARTIANS(in_dev)) {
780 const struct iphdr *iph = (const struct iphdr *) skb->data;
781 __be32 daddr = iph->daddr;
782 __be32 saddr = iph->saddr;
784 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
785 " Advised path = %pI4 -> %pI4\n",
786 &old_gw, dev->name, &new_gw,
793 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
797 const struct iphdr *iph = (const struct iphdr *) skb->data;
798 int oif = skb->dev->ifindex;
799 u8 tos = RT_TOS(iph->tos);
800 u8 prot = iph->protocol;
801 u32 mark = skb->mark;
803 rt = (struct rtable *) dst;
805 __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0);
806 __ip_do_redirect(rt, skb, &fl4, true);
809 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
811 struct rtable *rt = (struct rtable *)dst;
812 struct dst_entry *ret = dst;
815 if (dst->obsolete > 0) {
818 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 * 1. The first ip_rt_redirect_number redirects are sent
 * with exponential backoff, then we stop sending them at all,
 * assuming that the host ignores our redirects.
 * 2. If we did not see packets requiring redirects
 * during ip_rt_redirect_silence, we assume that the host
 * forgot the redirected route, and we start sending redirects again.
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
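 *
 * Rough timing sketch under the defaults above (an illustration, not part
 * of the original text): with ip_rt_redirect_load = HZ/50 and rate_tokens
 * currently at k, the next redirect is only sent once jiffies has passed
 * rate_last + (HZ/50 << k), so the gap between redirects roughly doubles
 * each time. After ip_rt_redirect_number (9) unanswered redirects we go
 * silent until ip_rt_redirect_silence ((HZ/50) << 10, about 20 seconds)
 * elapses, which resets rate_tokens to 0.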
843 void ip_rt_send_redirect(struct sk_buff *skb)
845 struct rtable *rt = skb_rtable(skb);
846 struct in_device *in_dev;
847 struct inet_peer *peer;
853 in_dev = __in_dev_get_rcu(rt->dst.dev);
854 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
858 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
859 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
862 net = dev_net(rt->dst.dev);
863 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
865 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
866 rt_nexthop(rt, ip_hdr(skb)->daddr));
870 /* No redirected packets during ip_rt_redirect_silence;
871 * reset the algorithm.
873 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
874 peer->rate_tokens = 0;
/* Too many ignored redirects; do not send anything,
 * just set peer->rate_last to the last seen redirected packet.
879 if (peer->rate_tokens >= ip_rt_redirect_number) {
880 peer->rate_last = jiffies;
884 /* Check for load limit; set rate_last to the latest sent
887 if (peer->rate_tokens == 0 ||
890 (ip_rt_redirect_load << peer->rate_tokens)))) {
891 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
893 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
894 peer->rate_last = jiffies;
896 #ifdef CONFIG_IP_ROUTE_VERBOSE
898 peer->rate_tokens == ip_rt_redirect_number)
899 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
900 &ip_hdr(skb)->saddr, inet_iif(skb),
901 &ip_hdr(skb)->daddr, &gw);
908 static int ip_error(struct sk_buff *skb)
910 struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
911 struct rtable *rt = skb_rtable(skb);
912 struct inet_peer *peer;
918 /* IP on this device is disabled. */
922 net = dev_net(rt->dst.dev);
923 if (!IN_DEV_FORWARD(in_dev)) {
924 switch (rt->dst.error) {
926 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
930 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
936 switch (rt->dst.error) {
941 code = ICMP_HOST_UNREACH;
944 code = ICMP_NET_UNREACH;
945 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
948 code = ICMP_PKT_FILTERED;
952 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
953 l3mdev_master_ifindex(skb->dev), 1);
958 peer->rate_tokens += now - peer->rate_last;
959 if (peer->rate_tokens > ip_rt_error_burst)
960 peer->rate_tokens = ip_rt_error_burst;
961 peer->rate_last = now;
962 if (peer->rate_tokens >= ip_rt_error_cost)
963 peer->rate_tokens -= ip_rt_error_cost;
969 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
975 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
977 struct dst_entry *dst = &rt->dst;
978 struct fib_result res;
980 if (dst_metric_locked(dst, RTAX_MTU))
983 if (ipv4_mtu(dst) < mtu)
986 if (mtu < ip_rt_min_pmtu)
987 mtu = ip_rt_min_pmtu;
989 if (rt->rt_pmtu == mtu &&
990 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
994 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
995 struct fib_nh *nh = &FIB_RES_NH(res);
997 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
998 jiffies + ip_rt_mtu_expires);
1003 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1004 struct sk_buff *skb, u32 mtu)
1006 struct rtable *rt = (struct rtable *) dst;
1009 ip_rt_build_flow_key(&fl4, sk, skb);
1010 __ip_rt_update_pmtu(rt, &fl4, mtu);
1013 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1014 int oif, u32 mark, u8 protocol, int flow_flags)
1016 const struct iphdr *iph = (const struct iphdr *) skb->data;
1021 mark = IP4_REPLY_MARK(net, skb->mark);
1023 __build_flow_key(net, &fl4, NULL, iph, oif,
1024 RT_TOS(iph->tos), protocol, mark, flow_flags);
1025 rt = __ip_route_output_key(net, &fl4);
1027 __ip_rt_update_pmtu(rt, &fl4, mtu);
1031 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1033 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1035 const struct iphdr *iph = (const struct iphdr *) skb->data;
1039 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1041 if (!fl4.flowi4_mark)
1042 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1044 rt = __ip_route_output_key(sock_net(sk), &fl4);
1046 __ip_rt_update_pmtu(rt, &fl4, mtu);
1051 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1053 const struct iphdr *iph = (const struct iphdr *) skb->data;
1056 struct dst_entry *odst = NULL;
1058 struct net *net = sock_net(sk);
1062 if (!ip_sk_accept_pmtu(sk))
1065 odst = sk_dst_get(sk);
1067 if (sock_owned_by_user(sk) || !odst) {
1068 __ipv4_sk_update_pmtu(skb, sk, mtu);
1072 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1074 rt = (struct rtable *)odst;
1075 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1076 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1083 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1085 if (!dst_check(&rt->dst, 0)) {
1087 dst_release(&rt->dst);
1089 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1097 sk_dst_set(sk, &rt->dst);
1103 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1105 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1106 int oif, u32 mark, u8 protocol, int flow_flags)
1108 const struct iphdr *iph = (const struct iphdr *) skb->data;
1112 __build_flow_key(net, &fl4, NULL, iph, oif,
1113 RT_TOS(iph->tos), protocol, mark, flow_flags);
1114 rt = __ip_route_output_key(net, &fl4);
1116 __ip_do_redirect(rt, skb, &fl4, false);
1120 EXPORT_SYMBOL_GPL(ipv4_redirect);
1122 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1124 const struct iphdr *iph = (const struct iphdr *) skb->data;
1127 struct net *net = sock_net(sk);
1129 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1130 rt = __ip_route_output_key(net, &fl4);
1132 __ip_do_redirect(rt, skb, &fl4, false);
1136 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1138 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1140 struct rtable *rt = (struct rtable *) dst;
1142 /* All IPV4 dsts are created with ->obsolete set to the value
1143 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1144 * into this function always.
1146 * When a PMTU/redirect information update invalidates a route,
1147 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1148 * DST_OBSOLETE_DEAD by dst_free().
1150 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1155 static void ipv4_link_failure(struct sk_buff *skb)
1159 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1161 rt = skb_rtable(skb);
1163 dst_set_expires(&rt->dst, 0);
1166 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1168 pr_debug("%s: %pI4 -> %pI4, %s\n",
1169 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1170 skb->dev ? skb->dev->name : "?");
We do not cache the source address of the outgoing interface,
because it is used only by the IP RR, TS and SRR options,
so it is out of the fast path.
BTW remember: "addr" is allowed to be unaligned
1185 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1189 if (rt_is_output_route(rt))
1190 src = ip_hdr(skb)->saddr;
1192 struct fib_result res;
1198 memset(&fl4, 0, sizeof(fl4));
1199 fl4.daddr = iph->daddr;
1200 fl4.saddr = iph->saddr;
1201 fl4.flowi4_tos = RT_TOS(iph->tos);
1202 fl4.flowi4_oif = rt->dst.dev->ifindex;
1203 fl4.flowi4_iif = skb->dev->ifindex;
1204 fl4.flowi4_mark = skb->mark;
1207 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1208 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1210 src = inet_select_addr(rt->dst.dev,
1211 rt_nexthop(rt, iph->daddr),
1215 memcpy(addr, &src, 4);
1218 #ifdef CONFIG_IP_ROUTE_CLASSID
1219 static void set_class_tag(struct rtable *rt, u32 tag)
1221 if (!(rt->dst.tclassid & 0xFFFF))
1222 rt->dst.tclassid |= tag & 0xFFFF;
1223 if (!(rt->dst.tclassid & 0xFFFF0000))
1224 rt->dst.tclassid |= tag & 0xFFFF0000;
1228 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1230 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1233 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1235 if (advmss > 65535 - 40)
1236 advmss = 65535 - 40;
1241 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1243 const struct rtable *rt = (const struct rtable *) dst;
1244 unsigned int mtu = rt->rt_pmtu;
1246 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1247 mtu = dst_metric_raw(dst, RTAX_MTU);
1252 mtu = dst->dev->mtu;
1254 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1255 if (rt->rt_uses_gateway && mtu > 576)
1259 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1261 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
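/* Summary of the selection order above: a still-unexpired learned rt_pmtu
 * wins, then an explicit RTAX_MTU route metric, then the device MTU, with a
 * special clamp for locked metrics on gateway routes (the 576-byte check);
 * the result is limited to IP_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */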
1264 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1266 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1267 struct fib_nh_exception *fnhe;
1273 hval = fnhe_hashfun(daddr);
1275 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1276 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1277 if (fnhe->fnhe_daddr == daddr)
1283 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1288 spin_lock_bh(&fnhe_lock);
1290 if (daddr == fnhe->fnhe_daddr) {
1291 struct rtable __rcu **porig;
1292 struct rtable *orig;
1293 int genid = fnhe_genid(dev_net(rt->dst.dev));
1295 if (rt_is_input_route(rt))
1296 porig = &fnhe->fnhe_rth_input;
1298 porig = &fnhe->fnhe_rth_output;
1299 orig = rcu_dereference(*porig);
1301 if (fnhe->fnhe_genid != genid) {
1302 fnhe->fnhe_genid = genid;
1304 fnhe->fnhe_pmtu = 0;
1305 fnhe->fnhe_expires = 0;
1306 fnhe_flush_routes(fnhe);
1309 fill_route_from_fnhe(rt, fnhe);
1310 if (!rt->rt_gateway)
1311 rt->rt_gateway = daddr;
1313 if (!(rt->dst.flags & DST_NOCACHE)) {
1314 rcu_assign_pointer(*porig, rt);
1320 fnhe->fnhe_stamp = jiffies;
1322 spin_unlock_bh(&fnhe_lock);
1327 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1329 struct rtable *orig, *prev, **p;
1332 if (rt_is_input_route(rt)) {
1333 p = (struct rtable **)&nh->nh_rth_input;
1335 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1339 prev = cmpxchg(p, orig, rt);
1349 struct uncached_list {
1351 struct list_head head;
1354 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1356 static void rt_add_uncached_list(struct rtable *rt)
1358 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1360 rt->rt_uncached_list = ul;
1362 spin_lock_bh(&ul->lock);
1363 list_add_tail(&rt->rt_uncached, &ul->head);
1364 spin_unlock_bh(&ul->lock);
1367 static void ipv4_dst_destroy(struct dst_entry *dst)
1369 struct rtable *rt = (struct rtable *) dst;
1371 if (!list_empty(&rt->rt_uncached)) {
1372 struct uncached_list *ul = rt->rt_uncached_list;
1374 spin_lock_bh(&ul->lock);
1375 list_del(&rt->rt_uncached);
1376 spin_unlock_bh(&ul->lock);
1380 void rt_flush_dev(struct net_device *dev)
1382 struct net *net = dev_net(dev);
1386 for_each_possible_cpu(cpu) {
1387 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1389 spin_lock_bh(&ul->lock);
1390 list_for_each_entry(rt, &ul->head, rt_uncached) {
1391 if (rt->dst.dev != dev)
1393 rt->dst.dev = net->loopback_dev;
1394 dev_hold(rt->dst.dev);
1397 spin_unlock_bh(&ul->lock);
1401 static bool rt_cache_valid(const struct rtable *rt)
1404 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1408 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1409 const struct fib_result *res,
1410 struct fib_nh_exception *fnhe,
1411 struct fib_info *fi, u16 type, u32 itag)
1413 bool cached = false;
1416 struct fib_nh *nh = &FIB_RES_NH(*res);
1418 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1419 rt->rt_gateway = nh->nh_gw;
1420 rt->rt_uses_gateway = 1;
1422 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1423 #ifdef CONFIG_IP_ROUTE_CLASSID
1424 rt->dst.tclassid = nh->nh_tclassid;
1426 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1428 cached = rt_bind_exception(rt, fnhe, daddr);
1429 else if (!(rt->dst.flags & DST_NOCACHE))
1430 cached = rt_cache_route(nh, rt);
1431 if (unlikely(!cached)) {
1432 /* Routes we intend to cache in nexthop exception or
1433 * FIB nexthop have the DST_NOCACHE bit clear.
1434 * However, if we are unsuccessful at storing this
1435 * route into the cache we really need to set it.
1437 rt->dst.flags |= DST_NOCACHE;
1438 if (!rt->rt_gateway)
1439 rt->rt_gateway = daddr;
1440 rt_add_uncached_list(rt);
1443 rt_add_uncached_list(rt);
1445 #ifdef CONFIG_IP_ROUTE_CLASSID
1446 #ifdef CONFIG_IP_MULTIPLE_TABLES
1447 set_class_tag(rt, res->tclassid);
1449 set_class_tag(rt, itag);
1453 struct rtable *rt_dst_alloc(struct net_device *dev,
1454 unsigned int flags, u16 type,
1455 bool nopolicy, bool noxfrm, bool will_cache)
1459 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1460 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1461 (nopolicy ? DST_NOPOLICY : 0) |
1462 (noxfrm ? DST_NOXFRM : 0));
1465 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1466 rt->rt_flags = flags;
1468 rt->rt_is_input = 0;
1472 rt->rt_uses_gateway = 0;
1473 rt->rt_table_id = 0;
1474 INIT_LIST_HEAD(&rt->rt_uncached);
1476 rt->dst.output = ip_output;
1477 if (flags & RTCF_LOCAL)
1478 rt->dst.input = ip_local_deliver;
1483 EXPORT_SYMBOL(rt_dst_alloc);
1485 /* called in rcu_read_lock() section */
1486 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1487 u8 tos, struct net_device *dev, int our)
1490 struct in_device *in_dev = __in_dev_get_rcu(dev);
1491 unsigned int flags = RTCF_MULTICAST;
1495 /* Primary sanity checks. */
1500 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1501 skb->protocol != htons(ETH_P_IP))
1504 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1507 if (ipv4_is_zeronet(saddr)) {
1508 if (!ipv4_is_local_multicast(daddr))
1511 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1517 flags |= RTCF_LOCAL;
1519 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1520 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1524 #ifdef CONFIG_IP_ROUTE_CLASSID
1525 rth->dst.tclassid = itag;
1527 rth->dst.output = ip_rt_bug;
1528 rth->rt_is_input= 1;
1530 #ifdef CONFIG_IP_MROUTE
1531 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1532 rth->dst.input = ip_mr_input;
1534 RT_CACHE_STAT_INC(in_slow_mc);
1536 skb_dst_set(skb, &rth->dst);
1548 static void ip_handle_martian_source(struct net_device *dev,
1549 struct in_device *in_dev,
1550 struct sk_buff *skb,
1554 RT_CACHE_STAT_INC(in_martian_src);
1555 #ifdef CONFIG_IP_ROUTE_VERBOSE
1556 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
 * RFC1812 recommendation: if the source is martian,
 * the only hint is the MAC header.
1561 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1562 &daddr, &saddr, dev->name);
1563 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1564 print_hex_dump(KERN_WARNING, "ll header: ",
1565 DUMP_PREFIX_OFFSET, 16, 1,
1566 skb_mac_header(skb),
1567 dev->hard_header_len, true);
1573 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1575 struct fnhe_hash_bucket *hash;
1576 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1577 u32 hval = fnhe_hashfun(daddr);
1579 spin_lock_bh(&fnhe_lock);
1581 hash = rcu_dereference_protected(nh->nh_exceptions,
1582 lockdep_is_held(&fnhe_lock));
1585 fnhe_p = &hash->chain;
1586 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1588 if (fnhe->fnhe_daddr == daddr) {
1589 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1590 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1591 fnhe_flush_routes(fnhe);
1592 kfree_rcu(fnhe, rcu);
1595 fnhe_p = &fnhe->fnhe_next;
1596 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1597 lockdep_is_held(&fnhe_lock));
1600 spin_unlock_bh(&fnhe_lock);
1603 /* called in rcu_read_lock() section */
1604 static int __mkroute_input(struct sk_buff *skb,
1605 const struct fib_result *res,
1606 struct in_device *in_dev,
1607 __be32 daddr, __be32 saddr, u32 tos)
1609 struct fib_nh_exception *fnhe;
1612 struct in_device *out_dev;
1616 /* get a working reference to the output device */
1617 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1619 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1623 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1624 in_dev->dev, in_dev, &itag);
1626 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1632 do_cache = res->fi && !itag;
1633 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1634 skb->protocol == htons(ETH_P_IP) &&
1635 (IN_DEV_SHARED_MEDIA(out_dev) ||
1636 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1637 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1639 if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create a route if it is
 * invalid for proxy arp. DNAT routes are always valid.
 * The proxy arp feature has been extended to allow ARP
 * replies back to the same interface, to support
 * Private VLAN switch technologies. See arp.c.
1647 if (out_dev == in_dev &&
1648 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1654 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1657 rth = rcu_dereference(fnhe->fnhe_rth_input);
1658 if (rth && rth->dst.expires &&
1659 time_after(jiffies, rth->dst.expires)) {
1660 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1667 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1670 if (rt_cache_valid(rth)) {
1671 skb_dst_set_noref(skb, &rth->dst);
1676 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1677 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1678 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1684 rth->rt_is_input = 1;
1686 rth->rt_table_id = res->table->tb_id;
1687 RT_CACHE_STAT_INC(in_slow_tot);
1689 rth->dst.input = ip_forward;
1691 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1692 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1693 rth->dst.lwtstate->orig_output = rth->dst.output;
1694 rth->dst.output = lwtunnel_output;
1696 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1697 rth->dst.lwtstate->orig_input = rth->dst.input;
1698 rth->dst.input = lwtunnel_input;
1700 skb_dst_set(skb, &rth->dst);
1707 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1709 /* To make ICMP packets follow the right flow, the multipath hash is
1710 * calculated from the inner IP addresses in reverse order.
1712 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1714 const struct iphdr *outer_iph = ip_hdr(skb);
1715 struct icmphdr _icmph;
1716 const struct icmphdr *icmph;
1717 struct iphdr _inner_iph;
1718 const struct iphdr *inner_iph;
1720 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1723 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1728 if (icmph->type != ICMP_DEST_UNREACH &&
1729 icmph->type != ICMP_REDIRECT &&
1730 icmph->type != ICMP_TIME_EXCEEDED &&
1731 icmph->type != ICMP_PARAMETERPROB) {
1735 inner_iph = skb_header_pointer(skb,
1736 outer_iph->ihl * 4 + sizeof(_icmph),
1737 sizeof(_inner_iph), &_inner_iph);
1741 return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1744 return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1747 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1749 static int ip_mkroute_input(struct sk_buff *skb,
1750 struct fib_result *res,
1751 const struct flowi4 *fl4,
1752 struct in_device *in_dev,
1753 __be32 daddr, __be32 saddr, u32 tos)
1755 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1756 if (res->fi && res->fi->fib_nhs > 1) {
1759 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1760 h = ip_multipath_icmp_hash(skb);
1762 h = fib_multipath_hash(saddr, daddr);
1763 fib_select_multipath(res, h);
1767 /* create a routing cache entry */
1768 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 * NOTE. We drop all the packets that have a local source
 * address, because every properly looped-back packet
 * must have the correct destination already attached by the output routine.
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 * called with rcu_read_lock()
1782 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1783 u8 tos, struct net_device *dev)
1785 struct fib_result res;
1786 struct in_device *in_dev = __in_dev_get_rcu(dev);
1787 struct ip_tunnel_info *tun_info;
1789 unsigned int flags = 0;
1793 struct net *net = dev_net(dev);
1796 /* IP on this device is disabled. */
/* Check for the most weird martians, which may not be detected
1805 tun_info = skb_tunnel_info(skb);
1806 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1807 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1809 fl4.flowi4_tun_key.tun_id = 0;
1812 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1813 goto martian_source;
1817 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
/* Accept zero addresses only to limited broadcast;
 * I do not even know whether to fix it or not. Waiting for complaints :-)
1823 if (ipv4_is_zeronet(saddr))
1824 goto martian_source;
1826 if (ipv4_is_zeronet(daddr))
1827 goto martian_destination;
/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
 * calling it at most once if daddr and/or saddr are loopback addresses
1832 if (ipv4_is_loopback(daddr)) {
1833 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1834 goto martian_destination;
1835 } else if (ipv4_is_loopback(saddr)) {
1836 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1837 goto martian_source;
1841 * Now we are ready to route packet.
1844 fl4.flowi4_iif = dev->ifindex;
1845 fl4.flowi4_mark = skb->mark;
1846 fl4.flowi4_tos = tos;
1847 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1848 fl4.flowi4_flags = 0;
1851 err = fib_lookup(net, &fl4, &res, 0);
1853 if (!IN_DEV_FORWARD(in_dev))
1854 err = -EHOSTUNREACH;
1858 if (res.type == RTN_BROADCAST)
1861 if (res.type == RTN_LOCAL) {
1862 err = fib_validate_source(skb, saddr, daddr, tos,
1863 0, dev, in_dev, &itag);
1865 goto martian_source;
1869 if (!IN_DEV_FORWARD(in_dev)) {
1870 err = -EHOSTUNREACH;
1873 if (res.type != RTN_UNICAST)
1874 goto martian_destination;
1876 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1880 if (skb->protocol != htons(ETH_P_IP))
1883 if (!ipv4_is_zeronet(saddr)) {
1884 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1887 goto martian_source;
1889 flags |= RTCF_BROADCAST;
1890 res.type = RTN_BROADCAST;
1891 RT_CACHE_STAT_INC(in_brd);
1897 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1898 if (rt_cache_valid(rth)) {
1899 skb_dst_set_noref(skb, &rth->dst);
1907 rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1908 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1912 rth->dst.output= ip_rt_bug;
1913 #ifdef CONFIG_IP_ROUTE_CLASSID
1914 rth->dst.tclassid = itag;
1916 rth->rt_is_input = 1;
1918 rth->rt_table_id = res.table->tb_id;
1920 RT_CACHE_STAT_INC(in_slow_tot);
1921 if (res.type == RTN_UNREACHABLE) {
1922 rth->dst.input= ip_error;
1923 rth->dst.error= -err;
1924 rth->rt_flags &= ~RTCF_LOCAL;
1927 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1928 rth->dst.flags |= DST_NOCACHE;
1929 rt_add_uncached_list(rth);
1932 skb_dst_set(skb, &rth->dst);
1937 RT_CACHE_STAT_INC(in_no_route);
1938 res.type = RTN_UNREACHABLE;
1944 * Do not cache martian addresses: they should be logged (RFC1812)
1946 martian_destination:
1947 RT_CACHE_STAT_INC(in_martian_dst);
1948 #ifdef CONFIG_IP_ROUTE_VERBOSE
1949 if (IN_DEV_LOG_MARTIANS(in_dev))
1950 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1951 &daddr, &saddr, dev->name);
1963 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1967 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1968 u8 tos, struct net_device *dev)
/* Multicast recognition logic is moved from the route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As a result, a host on a multicast
network acquires a lot of useless route cache entries, such as
SDR messages from all over the world. Now we try to get rid of them.
Really, provided the software IP multicast filter is organized
reasonably (at least, hashed), it does not result in a slowdown
compared with route cache reject entries.
Note that multicast routers are not affected, because a
route cache entry is created eventually.
1985 if (ipv4_is_multicast(daddr)) {
1986 struct in_device *in_dev = __in_dev_get_rcu(dev);
1990 our = ip_check_mc_rcu(in_dev, daddr, saddr,
1991 ip_hdr(skb)->protocol);
1993 /* check l3 master if no match yet */
1994 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
1995 struct in_device *l3_in_dev;
1997 l3_in_dev = __in_dev_get_rcu(skb->dev);
1999 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2000 ip_hdr(skb)->protocol);
2005 #ifdef CONFIG_IP_MROUTE
2007 (!ipv4_is_local_multicast(daddr) &&
2008 IN_DEV_MFORWARD(in_dev))
2011 res = ip_route_input_mc(skb, daddr, saddr,
2017 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2021 EXPORT_SYMBOL(ip_route_input_noref);
2023 /* called with rcu_read_lock() */
2024 static struct rtable *__mkroute_output(const struct fib_result *res,
2025 const struct flowi4 *fl4, int orig_oif,
2026 struct net_device *dev_out,
2029 struct fib_info *fi = res->fi;
2030 struct fib_nh_exception *fnhe;
2031 struct in_device *in_dev;
2032 u16 type = res->type;
2036 in_dev = __in_dev_get_rcu(dev_out);
2038 return ERR_PTR(-EINVAL);
2040 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2041 if (ipv4_is_loopback(fl4->saddr) &&
2042 !(dev_out->flags & IFF_LOOPBACK) &&
2043 !netif_is_l3_master(dev_out))
2044 return ERR_PTR(-EINVAL);
2046 if (ipv4_is_lbcast(fl4->daddr))
2047 type = RTN_BROADCAST;
2048 else if (ipv4_is_multicast(fl4->daddr))
2049 type = RTN_MULTICAST;
2050 else if (ipv4_is_zeronet(fl4->daddr))
2051 return ERR_PTR(-EINVAL);
2053 if (dev_out->flags & IFF_LOOPBACK)
2054 flags |= RTCF_LOCAL;
2057 if (type == RTN_BROADCAST) {
2058 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2060 } else if (type == RTN_MULTICAST) {
2061 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2062 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2064 flags &= ~RTCF_LOCAL;
/* If a multicast route does not exist, use
 * the default one, but do not gateway in this case.
2071 if (fi && res->prefixlen < 4)
2073 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2074 (orig_oif != dev_out->ifindex)) {
2075 /* For local routes that require a particular output interface
2076 * we do not want to cache the result. Caching the result
2077 * causes incorrect behaviour when there are multiple source
2078 * addresses on the interface, the end result being that if the
2079 * intended recipient is waiting on that interface for the
2080 * packet he won't receive it because it will be delivered on
2081 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2082 * be set to the loopback interface as well.
2088 do_cache &= fi != NULL;
2090 struct rtable __rcu **prth;
2091 struct fib_nh *nh = &FIB_RES_NH(*res);
2093 fnhe = find_exception(nh, fl4->daddr);
2095 prth = &fnhe->fnhe_rth_output;
2096 rth = rcu_dereference(*prth);
2097 if (rth && rth->dst.expires &&
2098 time_after(jiffies, rth->dst.expires)) {
2099 ip_del_fnhe(nh, fl4->daddr);
2106 if (unlikely(fl4->flowi4_flags &
2107 FLOWI_FLAG_KNOWN_NH &&
2109 nh->nh_scope == RT_SCOPE_LINK))) {
2113 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2114 rth = rcu_dereference(*prth);
2117 if (rt_cache_valid(rth)) {
2118 dst_hold(&rth->dst);
2124 rth = rt_dst_alloc(dev_out, flags, type,
2125 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2126 IN_DEV_CONF_GET(in_dev, NOXFRM),
2129 return ERR_PTR(-ENOBUFS);
2131 rth->rt_iif = orig_oif ? : 0;
2133 rth->rt_table_id = res->table->tb_id;
2135 RT_CACHE_STAT_INC(out_slow_tot);
2137 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2138 if (flags & RTCF_LOCAL &&
2139 !(dev_out->flags & IFF_LOOPBACK)) {
2140 rth->dst.output = ip_mc_output;
2141 RT_CACHE_STAT_INC(out_slow_mc);
2143 #ifdef CONFIG_IP_MROUTE
2144 if (type == RTN_MULTICAST) {
2145 if (IN_DEV_MFORWARD(in_dev) &&
2146 !ipv4_is_local_multicast(fl4->daddr)) {
2147 rth->dst.input = ip_mr_input;
2148 rth->dst.output = ip_mc_output;
2154 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2155 if (lwtunnel_output_redirect(rth->dst.lwtstate))
2156 rth->dst.output = lwtunnel_output;
2162 * Major route resolver routine.
2165 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2168 struct net_device *dev_out = NULL;
2169 __u8 tos = RT_FL_TOS(fl4);
2170 unsigned int flags = 0;
2171 struct fib_result res;
2174 int err = -ENETUNREACH;
2180 orig_oif = fl4->flowi4_oif;
2182 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2183 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2184 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2185 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2189 rth = ERR_PTR(-EINVAL);
2190 if (ipv4_is_multicast(fl4->saddr) ||
2191 ipv4_is_lbcast(fl4->saddr) ||
2192 ipv4_is_zeronet(fl4->saddr))
/* I removed the check for oif == dev_out->oif here.
It was wrong for two reasons:
1. ip_dev_find(net, saddr) can return the wrong iface if saddr
is assigned to multiple interfaces.
2. Moreover, we are allowed to send packets with the saddr
of another iface. --ANK
2203 if (fl4->flowi4_oif == 0 &&
2204 (ipv4_is_multicast(fl4->daddr) ||
2205 ipv4_is_lbcast(fl4->daddr))) {
2206 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2207 dev_out = __ip_dev_find(net, fl4->saddr, false);
/* Special hack: a user can direct multicasts
and limited broadcasts via the necessary interface
without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
This hack is not just for fun, it allows
vic, vat and friends to work.
They bind a socket to loopback, set the ttl to zero
and expect that it will work.
From the viewpoint of the routing cache they are broken,
because we are not allowed to build a multicast path
with a loopback source addr (look: the routing cache
cannot know that the ttl is zero, so the packet
will not leave this host and the route is valid).
Luckily, this hack is a good workaround.
2226 fl4->flowi4_oif = dev_out->ifindex;
2230 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2231 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2232 if (!__ip_dev_find(net, fl4->saddr, false))
2238 if (fl4->flowi4_oif) {
2239 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2240 rth = ERR_PTR(-ENODEV);
2244 /* RACE: Check return value of inet_select_addr instead. */
2245 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2246 rth = ERR_PTR(-ENETUNREACH);
2249 if (ipv4_is_local_multicast(fl4->daddr) ||
2250 ipv4_is_lbcast(fl4->daddr) ||
2251 fl4->flowi4_proto == IPPROTO_IGMP) {
2253 fl4->saddr = inet_select_addr(dev_out, 0,
2258 if (ipv4_is_multicast(fl4->daddr))
2259 fl4->saddr = inet_select_addr(dev_out, 0,
2261 else if (!fl4->daddr)
2262 fl4->saddr = inet_select_addr(dev_out, 0,
2268 fl4->daddr = fl4->saddr;
2270 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2271 dev_out = net->loopback_dev;
2272 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2273 res.type = RTN_LOCAL;
2274 flags |= RTCF_LOCAL;
2278 err = fib_lookup(net, fl4, &res, 0);
2282 if (fl4->flowi4_oif &&
2283 (ipv4_is_multicast(fl4->daddr) ||
2284 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
/* Apparently, the routing tables are wrong. Assume
that the destination is on-link.
Because we are allowed to send to an iface
even if it has NO routes and NO assigned
addresses. When oif is specified, the routing
tables are looked up with only one purpose:
to catch whether the destination is gatewayed, rather than
direct. Moreover, if MSG_DONTROUTE is set,
we send the packet, ignoring both the routing tables
and the ifaddr state. --ANK
We could do this even if oif is unknown,
likely as IPv6 does, but we do not.
2303 if (fl4->saddr == 0)
2304 fl4->saddr = inet_select_addr(dev_out, 0,
2306 res.type = RTN_UNICAST;
2313 if (res.type == RTN_LOCAL) {
2315 if (res.fi->fib_prefsrc)
2316 fl4->saddr = res.fi->fib_prefsrc;
2318 fl4->saddr = fl4->daddr;
2321 /* L3 master device is the loopback for that domain */
2322 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2323 fl4->flowi4_oif = dev_out->ifindex;
2324 flags |= RTCF_LOCAL;
2328 fib_select_path(net, &res, fl4, mp_hash);
2330 dev_out = FIB_RES_DEV(res);
2331 fl4->flowi4_oif = dev_out->ifindex;
2335 rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2341 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2343 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2348 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2350 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2352 return mtu ? : dst->dev->mtu;
2355 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2356 struct sk_buff *skb, u32 mtu)
2360 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2361 struct sk_buff *skb)
2365 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2371 static struct dst_ops ipv4_dst_blackhole_ops = {
2373 .check = ipv4_blackhole_dst_check,
2374 .mtu = ipv4_blackhole_mtu,
2375 .default_advmss = ipv4_default_advmss,
2376 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2377 .redirect = ipv4_rt_blackhole_redirect,
2378 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2379 .neigh_lookup = ipv4_neigh_lookup,
2382 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2384 struct rtable *ort = (struct rtable *) dst_orig;
2387 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2389 struct dst_entry *new = &rt->dst;
2392 new->input = dst_discard;
2393 new->output = dst_discard_out;
2395 new->dev = ort->dst.dev;
2399 rt->rt_is_input = ort->rt_is_input;
2400 rt->rt_iif = ort->rt_iif;
2401 rt->rt_pmtu = ort->rt_pmtu;
2403 rt->rt_genid = rt_genid_ipv4(net);
2404 rt->rt_flags = ort->rt_flags;
2405 rt->rt_type = ort->rt_type;
2406 rt->rt_gateway = ort->rt_gateway;
2407 rt->rt_uses_gateway = ort->rt_uses_gateway;
2409 INIT_LIST_HEAD(&rt->rt_uncached);
2413 dst_release(dst_orig);
2415 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2418 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2419 const struct sock *sk)
2421 struct rtable *rt = __ip_route_output_key(net, flp4);
2426 if (flp4->flowi4_proto)
2427 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2428 flowi4_to_flowi(flp4),
2433 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2435 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
2436 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2437 u32 seq, int event, int nowait, unsigned int flags)
2439 struct rtable *rt = skb_rtable(skb);
2441 struct nlmsghdr *nlh;
2442 unsigned long expires = 0;
2444 u32 metrics[RTAX_MAX];
2446 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2450 r = nlmsg_data(nlh);
2451 r->rtm_family = AF_INET;
2452 r->rtm_dst_len = 32;
2454 r->rtm_tos = fl4->flowi4_tos;
2455 r->rtm_table = table_id;
2456 if (nla_put_u32(skb, RTA_TABLE, table_id))
2457 goto nla_put_failure;
2458 r->rtm_type = rt->rt_type;
2459 r->rtm_scope = RT_SCOPE_UNIVERSE;
2460 r->rtm_protocol = RTPROT_UNSPEC;
2461 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2462 if (rt->rt_flags & RTCF_NOTIFY)
2463 r->rtm_flags |= RTM_F_NOTIFY;
2464 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2465 r->rtm_flags |= RTCF_DOREDIRECT;
2467 if (nla_put_in_addr(skb, RTA_DST, dst))
2468 goto nla_put_failure;
2470 r->rtm_src_len = 32;
2471 if (nla_put_in_addr(skb, RTA_SRC, src))
2472 goto nla_put_failure;
2475 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2476 goto nla_put_failure;
2477 #ifdef CONFIG_IP_ROUTE_CLASSID
2478 if (rt->dst.tclassid &&
2479 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2480 goto nla_put_failure;
2482 if (!rt_is_input_route(rt) &&
2483 fl4->saddr != src) {
2484 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2485 goto nla_put_failure;
2487 if (rt->rt_uses_gateway &&
2488 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2489 goto nla_put_failure;
2491 expires = rt->dst.expires;
2493 unsigned long now = jiffies;
2495 if (time_before(now, expires))
2501 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2502 if (rt->rt_pmtu && expires)
2503 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2504 if (rtnetlink_put_metrics(skb, metrics) < 0)
2505 goto nla_put_failure;
2507 if (fl4->flowi4_mark &&
2508 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2509 goto nla_put_failure;
2511 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2512 nla_put_u32(skb, RTA_UID,
2513 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2514 goto nla_put_failure;
2516 error = rt->dst.error;
2518 if (rt_is_input_route(rt)) {
2519 #ifdef CONFIG_IP_MROUTE
2520 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2521 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2522 int err = ipmr_get_route(net, skb,
2523 fl4->saddr, fl4->daddr,
2530 goto nla_put_failure;
2532 if (err == -EMSGSIZE)
2533 goto nla_put_failure;
2539 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2540 goto nla_put_failure;
2543 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2544 goto nla_put_failure;
2546 nlmsg_end(skb, nlh);
2550 nlmsg_cancel(skb, nlh);

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;
	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}
	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;
	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err < 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
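
/*
 * Userspace sketch of the RTM_GETROUTE request this handler answers (the
 * same query "ip route get" issues).  The address is illustrative and all
 * error handling is omitted; the reply is the RTM_NEWROUTE message built
 * by rt_fill_info() above.
 *
 *	struct {
 *		struct nlmsghdr	nlh;
 *		struct rtmsg	rtm;
 *		struct rtattr	rta;
 *		struct in_addr	dst;
 *	} req = {
 *		.nlh.nlmsg_len	 = sizeof(req),
 *		.nlh.nlmsg_type	 = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family	 = AF_INET,
 *		.rta.rta_type	 = RTA_DST,
 *		.rta.rta_len	 = RTA_LENGTH(sizeof(struct in_addr)),
 *	};
 *	char buf[4096];
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *	inet_pton(AF_INET, "192.0.2.1", &req.dst);
 *	send(fd, &req, sizeof(req), 0);
 *	recv(fd, buf, sizeof(buf), 0);	(RTM_NEWROUTE reply, or NLMSG_ERROR)
 */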

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}

static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/*  Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
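
/*
 * The entries above surface under /proc/sys/net/ipv4/route/ once the tables
 * are registered below.  The per-namespace "flush" entry has no .data;
 * writing any value to it simply runs ipv4_sysctl_rtcache_flush().  A rough
 * userspace sketch (needs privilege, error handling omitted):
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *
 *	write(fd, "1", 1);	(bumps the rt/fnhe genids, invalidating cached routes)
 *	close(fd);
 */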

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;
		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int rc = 0;
	int cpu;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif