/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <[email protected]>
 *              Alan Cox, <[email protected]>
 *              Linus Torvalds, <[email protected]>
 *              Alexey Kuznetsov, <[email protected]>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              ([email protected])      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

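/* Map the four RFC 1349 TOS bits to packet-scheduler priority bands.
 * Historically the odd entries corresponded to the "minimise monetary
 * cost" bit, whose bit position has since been reused by ECN - hence
 * the ECN_OR_COST() name.  With this definition both variants map to
 * the same band.
 */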
#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

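/* Route cache invalidation works by generation counts: every cached
 * rtable records the per-netns genid it was created under, and
 * rt_cache_flush() simply bumps that counter.  Stale entries then fail
 * rt_is_expired() lazily on their next use instead of being walked and
 * freed eagerly.
 */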
static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

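/* Build a flowi4 lookup key.  When a socket is supplied, its bound
 * device, mark, connection TOS and protocol take precedence over the
 * values derived from the packet headers.
 */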
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

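/* FIB nexthop exceptions (fnhe) hold per-destination state learned from
 * ICMP - an alternate gateway from a redirect, or a reduced PMTU - in a
 * small per-nexthop hash table.  Updates are serialized by fnhe_lock;
 * lookups run under RCU.  When a bucket chain grows too long, the entry
 * with the oldest stamp is recycled.
 */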
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;
        struct rtable *orig;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        orig = rcu_dereference(oldest->fnhe_rth);
        if (orig) {
                RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
                rt_free(orig);
        }
        return oldest;
}

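/* Hash the destination address into a bucket index, folding the upper
 * bits down so that all 32 bits of the address contribute.
 */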
static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

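/* Record a gateway learned from a redirect and/or a PMTU for daddr on
 * this nexthop.  An existing exception is updated in place (together
 * with any route already bound to it); otherwise a new one is created,
 * recycling the oldest entry once the chain exceeds FNHE_RECLAIM_DEPTH,
 * and the per-cpu cached output routes are marked obsolete so their
 * users recheck whether the exception applies to them.
 */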
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

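/* Process an ICMP redirect.  The new gateway is accepted only if the
 * message comes from the route's current gateway and names a sane
 * on-link unicast address.  If the corresponding neighbour entry is not
 * yet valid we only kick resolution; once it is valid, the gateway is
 * recorded as a nexthop exception and the cached route may be killed.
 */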
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

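/* Worked example, assuming HZ=1000: ip_rt_redirect_load is then 20
 * jiffies, so successive redirects to one peer are spaced 20ms, 40ms,
 * 80ms, ... apart (load << rate_tokens).  After ip_rt_redirect_number
 * (9) of them we go silent; once no redirect-worthy packet is seen for
 * ip_rt_redirect_silence (20 << 10 jiffies, roughly 20s) the token
 * count resets and redirects resume.
 */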
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

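/* ICMP errors are rate limited per source using a token bucket kept in
 * the inet_peer: one token accrues per jiffy up to ip_rt_error_burst
 * (5*HZ) and each error sent costs ip_rt_error_cost (HZ), i.e. at most
 * one error per second per peer after an initial burst of five.
 */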
static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        /* IP is disabled on this device, so in_dev may be NULL here;
         * bail out before dereferencing it.
         */
        if (!in_dev)
                goto out;
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

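/* Learn a smaller path MTU for this flow.  The update is ignored if the
 * route's MTU metric is locked or the report exceeds the device MTU;
 * otherwise it is clamped to ip_rt_min_pmtu and stored as a nexthop
 * exception that expires after ip_rt_mtu_expires.
 */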
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

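/* Socket-aware PMTU update.  If the socket is owned by user context (or
 * has no cached route), fall back to the plain routed lookup above;
 * otherwise apply the PMTU to the socket's cached dst, re-doing the
 * route lookup whenever the cached dst no longer passes its validity
 * checks, and install any freshly obtained route back on the socket.
 */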
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);
        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

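/* Resolve the effective MTU: a still-valid learned PMTU first, then an
 * explicit RTAX_MTU metric, then the device MTU (clamped to 576 when
 * the metric is locked and the route uses a gateway), capped at
 * IP_MAX_MTU.
 */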
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                int genid = fnhe_genid(dev_net(rt->dst.dev));
                struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);

                fnhe->fnhe_stamp = jiffies;
                ret = true;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

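/* Cache a route in the nexthop without taking a lock: input routes use
 * the single nh_rth_input slot, output routes a per-cpu slot.  cmpxchg()
 * publishes the new entry; if another CPU won the race we leave its
 * route in place and return false so the caller marks ours uncached.
 */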
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

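/* Fill in the route from the FIB lookup result: gateway, metrics and
 * classid come from the nexthop, and the route is then cached either by
 * binding it to a nexthop exception or via rt_cache_route().  Anything
 * that could not be cached is flagged DST_NOCACHE and kept on the
 * uncached list so rt_flush_dev() can still reach it.
 */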
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}

static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation, if source is martian,
                 *      the only hint is MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP).  Do not create a route if it is
                 * invalid for proxy arp.  DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back out the same interface, to support
                 * Private VLAN switch technologies.  See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

1541         if (do_cache) {
1542                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1543                 if (rt_cache_valid(rth)) {
1544                         skb_dst_set_noref(skb, &rth->dst);
1545                         goto out;
1546                 }
1547         }
1548
1549         rth = rt_dst_alloc(out_dev->dev,
1550                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1551                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1552         if (!rth) {
1553                 err = -ENOBUFS;
1554                 goto cleanup;
1555         }
1556
1557         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1558         rth->rt_flags = flags;
1559         rth->rt_type = res->type;
1560         rth->rt_is_input = 1;
1561         rth->rt_iif     = 0;
1562         rth->rt_pmtu    = 0;
1563         rth->rt_gateway = 0;
1564         rth->rt_uses_gateway = 0;
1565         INIT_LIST_HEAD(&rth->rt_uncached);
1566
1567         rth->dst.input = ip_forward;
1568         rth->dst.output = ip_output;
1569
1570         rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1571         skb_dst_set(skb, &rth->dst);
1572 out:
1573         err = 0;
1574  cleanup:
1575         return err;
1576 }
1577
1578 static int ip_mkroute_input(struct sk_buff *skb,
1579                             struct fib_result *res,
1580                             const struct flowi4 *fl4,
1581                             struct in_device *in_dev,
1582                             __be32 daddr, __be32 saddr, u32 tos)
1583 {
1584 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1585         if (res->fi && res->fi->fib_nhs > 1)
1586                 fib_select_multipath(res);
1587 #endif
1588
1589         /* create a routing cache entry */
1590         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1591 }
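
/* With CONFIG_IP_ROUTE_MULTIPATH, fib_select_multipath() above chooses one
 * of the res->fi->fib_nhs next hops (recording the choice in res->nh_sel),
 * so __mkroute_input() then builds the route against a single nexthop.
 */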
1592
1593 /*
1594  *      NOTE. We drop all packets that have local source
1595  *      addresses, because every properly looped-back packet
1596  *      must already have the correct destination attached by the output routine.
1597  *
1598  *      This approach solves two big problems:
1599  *      1. Non-simplex devices are handled properly.
1600  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1601  *      Called with rcu_read_lock().
1602  */
1603
1604 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1605                                u8 tos, struct net_device *dev)
1606 {
1607         struct fib_result res;
1608         struct in_device *in_dev = __in_dev_get_rcu(dev);
1609         struct flowi4   fl4;
1610         unsigned int    flags = 0;
1611         u32             itag = 0;
1612         struct rtable   *rth;
1613         int             err = -EINVAL;
1614         struct net    *net = dev_net(dev);
1615         bool do_cache;
1616
1617         /* IP on this device is disabled. */
1618
1619         if (!in_dev)
1620                 goto out;
1621
1622         /* Check for the weirdest martians, which cannot be detected
1623            by fib_lookup.
1624          */
1625
1626         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1627                 goto martian_source;
1628
1629         res.fi = NULL;
1630         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1631                 goto brd_input;
1632
1633         /* Accept zero addresses only to limited broadcast;
1634          * I do not even know whether to fix this or not. Waiting for complaints :-)
1635          */
1636         if (ipv4_is_zeronet(saddr))
1637                 goto martian_source;
1638
1639         if (ipv4_is_zeronet(daddr))
1640                 goto martian_destination;
1641
1642         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1643          * calling it at most once and only if daddr and/or saddr are loopback addresses
1644          */
1645         if (ipv4_is_loopback(daddr)) {
1646                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1647                         goto martian_destination;
1648         } else if (ipv4_is_loopback(saddr)) {
1649                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1650                         goto martian_source;
1651         }
1652
1653         /*
1654          *      Now we are ready to route packet.
1655          */
1656         fl4.flowi4_oif = 0;
1657         fl4.flowi4_iif = dev->ifindex;
1658         fl4.flowi4_mark = skb->mark;
1659         fl4.flowi4_tos = tos;
1660         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1661         fl4.daddr = daddr;
1662         fl4.saddr = saddr;
1663         err = fib_lookup(net, &fl4, &res);
1664         if (err != 0)
1665                 goto no_route;
1666
1667         RT_CACHE_STAT_INC(in_slow_tot);
1668
1669         if (res.type == RTN_BROADCAST)
1670                 goto brd_input;
1671
1672         if (res.type == RTN_LOCAL) {
1673                 err = fib_validate_source(skb, saddr, daddr, tos,
1674                                           LOOPBACK_IFINDEX,
1675                                           dev, in_dev, &itag);
1676                 if (err < 0)
1677                         goto martian_source_keep_err;
1678                 goto local_input;
1679         }
1680
1681         if (!IN_DEV_FORWARD(in_dev))
1682                 goto no_route;
1683         if (res.type != RTN_UNICAST)
1684                 goto martian_destination;
1685
1686         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1687 out:    return err;
1688
1689 brd_input:
1690         if (skb->protocol != htons(ETH_P_IP))
1691                 goto e_inval;
1692
1693         if (!ipv4_is_zeronet(saddr)) {
1694                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1695                                           in_dev, &itag);
1696                 if (err < 0)
1697                         goto martian_source_keep_err;
1698         }
1699         flags |= RTCF_BROADCAST;
1700         res.type = RTN_BROADCAST;
1701         RT_CACHE_STAT_INC(in_brd);
1702
1703 local_input:
1704         do_cache = false;
1705         if (res.fi) {
1706                 if (!itag) {
1707                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1708                         if (rt_cache_valid(rth)) {
1709                                 skb_dst_set_noref(skb, &rth->dst);
1710                                 err = 0;
1711                                 goto out;
1712                         }
1713                         do_cache = true;
1714                 }
1715         }
1716
1717         rth = rt_dst_alloc(net->loopback_dev,
1718                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1719         if (!rth)
1720                 goto e_nobufs;
1721
1722         rth->dst.input = ip_local_deliver;
1723         rth->dst.output = ip_rt_bug;
1724 #ifdef CONFIG_IP_ROUTE_CLASSID
1725         rth->dst.tclassid = itag;
1726 #endif
1727
1728         rth->rt_genid = rt_genid(net);
1729         rth->rt_flags   = flags|RTCF_LOCAL;
1730         rth->rt_type    = res.type;
1731         rth->rt_is_input = 1;
1732         rth->rt_iif     = 0;
1733         rth->rt_pmtu    = 0;
1734         rth->rt_gateway = 0;
1735         rth->rt_uses_gateway = 0;
1736         INIT_LIST_HEAD(&rth->rt_uncached);
1737         if (res.type == RTN_UNREACHABLE) {
1738                 rth->dst.input = ip_error;
1739                 rth->dst.error = -err;
1740                 rth->rt_flags   &= ~RTCF_LOCAL;
1741         }
1742         if (do_cache)
1743                 rt_cache_route(&FIB_RES_NH(res), rth);
1744         skb_dst_set(skb, &rth->dst);
1745         err = 0;
1746         goto out;
1747
1748 no_route:
1749         RT_CACHE_STAT_INC(in_no_route);
1750         res.type = RTN_UNREACHABLE;
1751         if (err == -ESRCH)
1752                 err = -ENETUNREACH;
1753         goto local_input;
1754
1755         /*
1756          *      Do not cache martian addresses: they should be logged (RFC1812)
1757          */
1758 martian_destination:
1759         RT_CACHE_STAT_INC(in_martian_dst);
1760 #ifdef CONFIG_IP_ROUTE_VERBOSE
1761         if (IN_DEV_LOG_MARTIANS(in_dev))
1762                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1763                                      &daddr, &saddr, dev->name);
1764 #endif
1765
1766 e_inval:
1767         err = -EINVAL;
1768         goto out;
1769
1770 e_nobufs:
1771         err = -ENOBUFS;
1772         goto out;
1773
1774 martian_source:
1775         err = -EINVAL;
1776 martian_source_keep_err:
1777         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1778         goto out;
1779 }
1780
1781 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1782                          u8 tos, struct net_device *dev)
1783 {
1784         int res;
1785
1786         rcu_read_lock();
1787
1788         /* Multicast recognition logic was moved from the route cache to here.
1789            The problem was that too many Ethernet cards have broken/missing
1790            hardware multicast filters :-( As a result, a host on a multicast
1791            network acquires a lot of useless route cache entries, e.g. from
1792            SDR messages from all over the world. Now we try to get rid of them.
1793            Really, provided the software IP multicast filter is organized
1794            reasonably (at least, hashed), it does not result in a slowdown
1795            compared with route cache reject entries.
1796            Note that multicast routers are not affected, because a
1797            route cache entry is created eventually.
1798          */
1799         if (ipv4_is_multicast(daddr)) {
1800                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1801
1802                 if (in_dev) {
1803                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1804                                                   ip_hdr(skb)->protocol);
1805                         if (our
1806 #ifdef CONFIG_IP_MROUTE
1807                                 ||
1808                             (!ipv4_is_local_multicast(daddr) &&
1809                              IN_DEV_MFORWARD(in_dev))
1810 #endif
1811                            ) {
1812                                 int res = ip_route_input_mc(skb, daddr, saddr,
1813                                                             tos, dev, our);
1814                                 rcu_read_unlock();
1815                                 return res;
1816                         }
1817                 }
1818                 rcu_read_unlock();
1819                 return -EINVAL;
1820         }
1821         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1822         rcu_read_unlock();
1823         return res;
1824 }
1825 EXPORT_SYMBOL(ip_route_input_noref);
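
/* Usage sketch (illustrative, not part of this file): a receive-path caller
 * resolves the input route under RCU much like ip_rcv_finish() does. A
 * minimal sketch, assuming skb carries a valid IPv4 header and "drop" is a
 * hypothetical error label:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 */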
1826
1827 /* called with rcu_read_lock() */
1828 static struct rtable *__mkroute_output(const struct fib_result *res,
1829                                        const struct flowi4 *fl4, int orig_oif,
1830                                        struct net_device *dev_out,
1831                                        unsigned int flags)
1832 {
1833         struct fib_info *fi = res->fi;
1834         struct fib_nh_exception *fnhe;
1835         struct in_device *in_dev;
1836         u16 type = res->type;
1837         struct rtable *rth;
1838         bool do_cache;
1839
1840         in_dev = __in_dev_get_rcu(dev_out);
1841         if (!in_dev)
1842                 return ERR_PTR(-EINVAL);
1843
1844         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1845                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1846                         return ERR_PTR(-EINVAL);
1847
1848         if (ipv4_is_lbcast(fl4->daddr))
1849                 type = RTN_BROADCAST;
1850         else if (ipv4_is_multicast(fl4->daddr))
1851                 type = RTN_MULTICAST;
1852         else if (ipv4_is_zeronet(fl4->daddr))
1853                 return ERR_PTR(-EINVAL);
1854
1855         if (dev_out->flags & IFF_LOOPBACK)
1856                 flags |= RTCF_LOCAL;
1857
1858         do_cache = true;
1859         if (type == RTN_BROADCAST) {
1860                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1861                 fi = NULL;
1862         } else if (type == RTN_MULTICAST) {
1863                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1864                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1865                                      fl4->flowi4_proto))
1866                         flags &= ~RTCF_LOCAL;
1867                 else
1868                         do_cache = false;
1869                 /* If a multicast route does not exist, use the
1870                  * default one, but do not gateway in this case.
1871                  * Yes, it is a hack.
1872                  */
1873                 if (fi && res->prefixlen < 4)
1874                         fi = NULL;
1875         }
1876
1877         fnhe = NULL;
1878         do_cache &= fi != NULL;
1879         if (do_cache) {
1880                 struct rtable __rcu **prth;
1881                 struct fib_nh *nh = &FIB_RES_NH(*res);
1882
1883                 fnhe = find_exception(nh, fl4->daddr);
1884                 if (fnhe)
1885                         prth = &fnhe->fnhe_rth;
1886                 else {
1887                         if (unlikely(fl4->flowi4_flags &
1888                                      FLOWI_FLAG_KNOWN_NH &&
1889                                      !(nh->nh_gw &&
1890                                        nh->nh_scope == RT_SCOPE_LINK))) {
1891                                 do_cache = false;
1892                                 goto add;
1893                         }
1894                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1895                 }
1896                 rth = rcu_dereference(*prth);
1897                 if (rt_cache_valid(rth)) {
1898                         dst_hold(&rth->dst);
1899                         return rth;
1900                 }
1901         }
1902
1903 add:
1904         rth = rt_dst_alloc(dev_out,
1905                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1906                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1907                            do_cache);
1908         if (!rth)
1909                 return ERR_PTR(-ENOBUFS);
1910
1911         rth->dst.output = ip_output;
1912
1913         rth->rt_genid = rt_genid(dev_net(dev_out));
1914         rth->rt_flags   = flags;
1915         rth->rt_type    = type;
1916         rth->rt_is_input = 0;
1917         rth->rt_iif     = orig_oif ? : 0;
1918         rth->rt_pmtu    = 0;
1919         rth->rt_gateway = 0;
1920         rth->rt_uses_gateway = 0;
1921         INIT_LIST_HEAD(&rth->rt_uncached);
1922
1923         RT_CACHE_STAT_INC(out_slow_tot);
1924
1925         if (flags & RTCF_LOCAL)
1926                 rth->dst.input = ip_local_deliver;
1927         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1928                 if (flags & RTCF_LOCAL &&
1929                     !(dev_out->flags & IFF_LOOPBACK)) {
1930                         rth->dst.output = ip_mc_output;
1931                         RT_CACHE_STAT_INC(out_slow_mc);
1932                 }
1933 #ifdef CONFIG_IP_MROUTE
1934                 if (type == RTN_MULTICAST) {
1935                         if (IN_DEV_MFORWARD(in_dev) &&
1936                             !ipv4_is_local_multicast(fl4->daddr)) {
1937                                 rth->dst.input = ip_mr_input;
1938                                 rth->dst.output = ip_mc_output;
1939                         }
1940                 }
1941 #endif
1942         }
1943
1944         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1945
1946         return rth;
1947 }
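
/* Caching note: unicast output routes are cached per nexthop, either in a
 * fib_nh_exception (per-destination state such as a learned redirect or
 * PMTU value, found via find_exception() above) or in the per-cpu
 * nh_pcpu_rth_output slot; rt_cache_valid() refuses entries whose rt_genid
 * no longer matches the current generation.
 */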
1948
1949 /*
1950  * Major route resolver routine.
1951  */
1952
1953 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1954 {
1955         struct net_device *dev_out = NULL;
1956         __u8 tos = RT_FL_TOS(fl4);
1957         unsigned int flags = 0;
1958         struct fib_result res;
1959         struct rtable *rth;
1960         int orig_oif;
1961
1962         res.tclassid    = 0;
1963         res.fi          = NULL;
1964         res.table       = NULL;
1965
1966         orig_oif = fl4->flowi4_oif;
1967
1968         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1969         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1970         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1971                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1972
1973         rcu_read_lock();
1974         if (fl4->saddr) {
1975                 rth = ERR_PTR(-EINVAL);
1976                 if (ipv4_is_multicast(fl4->saddr) ||
1977                     ipv4_is_lbcast(fl4->saddr) ||
1978                     ipv4_is_zeronet(fl4->saddr))
1979                         goto out;
1980
1981                 /* I removed the check for oif == dev_out->oif here.
1982                    It was wrong for two reasons:
1983                    1. ip_dev_find(net, saddr) can return the wrong iface if
1984                       saddr is assigned to multiple interfaces.
1985                    2. Moreover, we are allowed to send packets with the saddr
1986                       of another iface. --ANK
1987                  */
1988
1989                 if (fl4->flowi4_oif == 0 &&
1990                     (ipv4_is_multicast(fl4->daddr) ||
1991                      ipv4_is_lbcast(fl4->daddr))) {
1992                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1993                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1994                         if (dev_out == NULL)
1995                                 goto out;
1996
1997                         /* Special hack: the user can direct multicasts
1998                            and limited broadcast via the desired interface
1999                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2000                            This hack is not just for fun, it allows
2001                            vic, vat and friends to work.
2002                            They bind a socket to loopback, set the ttl to zero
2003                            and expect that it will work.
2004                            From the viewpoint of the routing cache they are broken,
2005                            because we are not allowed to build a multicast path
2006                            with a loopback source addr (look, the routing cache
2007                            cannot know that the ttl is zero, so the packet
2008                            will not leave this host and the route looks valid).
2009                            Luckily, this hack is a good workaround.
2010                          */
2011
2012                         fl4->flowi4_oif = dev_out->ifindex;
2013                         goto make_route;
2014                 }
2015
2016                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2017                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2018                         if (!__ip_dev_find(net, fl4->saddr, false))
2019                                 goto out;
2020                 }
2021         }
2022
2023
2024         if (fl4->flowi4_oif) {
2025                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2026                 rth = ERR_PTR(-ENODEV);
2027                 if (dev_out == NULL)
2028                         goto out;
2029
2030                 /* RACE: Check return value of inet_select_addr instead. */
2031                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2032                         rth = ERR_PTR(-ENETUNREACH);
2033                         goto out;
2034                 }
2035                 if (ipv4_is_local_multicast(fl4->daddr) ||
2036                     ipv4_is_lbcast(fl4->daddr)) {
2037                         if (!fl4->saddr)
2038                                 fl4->saddr = inet_select_addr(dev_out, 0,
2039                                                               RT_SCOPE_LINK);
2040                         goto make_route;
2041                 }
2042                 if (fl4->saddr) {
2043                         if (ipv4_is_multicast(fl4->daddr))
2044                                 fl4->saddr = inet_select_addr(dev_out, 0,
2045                                                               fl4->flowi4_scope);
2046                         else if (!fl4->daddr)
2047                                 fl4->saddr = inet_select_addr(dev_out, 0,
2048                                                               RT_SCOPE_HOST);
2049                 }
2050         }
2051
2052         if (!fl4->daddr) {
2053                 fl4->daddr = fl4->saddr;
2054                 if (!fl4->daddr)
2055                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2056                 dev_out = net->loopback_dev;
2057                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2058                 res.type = RTN_LOCAL;
2059                 flags |= RTCF_LOCAL;
2060                 goto make_route;
2061         }
2062
2063         if (fib_lookup(net, fl4, &res)) {
2064                 res.fi = NULL;
2065                 res.table = NULL;
2066                 if (fl4->flowi4_oif) {
2067                         /* Apparently, the routing tables are wrong. Assume
2068                            that the destination is on-link.
2069
2070                            WHY? DW.
2071                            Because we are allowed to send to an iface
2072                            even if it has NO routes and NO assigned
2073                            addresses. When oif is specified, the routing
2074                            tables are looked up with only one purpose:
2075                            to catch whether the destination is gatewayed rather
2076                            than direct. Moreover, if MSG_DONTROUTE is set,
2077                            we send the packet, ignoring both the routing tables
2078                            and the ifaddr state. --ANK
2079
2080
2081                            We could do this even if oif is unknown
2082                            (as IPv6 likely does), but we do not.
2083                          */
2084
2085                         if (fl4->saddr == 0)
2086                                 fl4->saddr = inet_select_addr(dev_out, 0,
2087                                                               RT_SCOPE_LINK);
2088                         res.type = RTN_UNICAST;
2089                         goto make_route;
2090                 }
2091                 rth = ERR_PTR(-ENETUNREACH);
2092                 goto out;
2093         }
2094
2095         if (res.type == RTN_LOCAL) {
2096                 if (!fl4->saddr) {
2097                         if (res.fi->fib_prefsrc)
2098                                 fl4->saddr = res.fi->fib_prefsrc;
2099                         else
2100                                 fl4->saddr = fl4->daddr;
2101                 }
2102                 dev_out = net->loopback_dev;
2103                 fl4->flowi4_oif = dev_out->ifindex;
2104                 flags |= RTCF_LOCAL;
2105                 goto make_route;
2106         }
2107
2108 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2109         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2110                 fib_select_multipath(&res);
2111         else
2112 #endif
2113         if (!res.prefixlen &&
2114             res.table->tb_num_default > 1 &&
2115             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2116                 fib_select_default(&res);
2117
2118         if (!fl4->saddr)
2119                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2120
2121         dev_out = FIB_RES_DEV(res);
2122         fl4->flowi4_oif = dev_out->ifindex;
2123
2124
2125 make_route:
2126         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2127
2128 out:
2129         rcu_read_unlock();
2130         return rth;
2131 }
2132 EXPORT_SYMBOL_GPL(__ip_route_output_key);
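
/* Usage sketch (illustrative): a plain output-route lookup by key. The
 * caller fills a flowi4 and must check for an error pointer; the values
 * below are hypothetical:
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.saddr = saddr,
 *		.flowi4_tos = RT_TOS(tos),
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */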
2133
2134 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2135 {
2136         return NULL;
2137 }
2138
2139 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2140 {
2141         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2142
2143         return mtu ? : dst->dev->mtu;
2144 }
2145
2146 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2147                                           struct sk_buff *skb, u32 mtu)
2148 {
2149 }
2150
2151 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2152                                        struct sk_buff *skb)
2153 {
2154 }
2155
2156 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2157                                           unsigned long old)
2158 {
2159         return NULL;
2160 }
2161
2162 static struct dst_ops ipv4_dst_blackhole_ops = {
2163         .family                 =       AF_INET,
2164         .protocol               =       cpu_to_be16(ETH_P_IP),
2165         .check                  =       ipv4_blackhole_dst_check,
2166         .mtu                    =       ipv4_blackhole_mtu,
2167         .default_advmss         =       ipv4_default_advmss,
2168         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2169         .redirect               =       ipv4_rt_blackhole_redirect,
2170         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2171         .neigh_lookup           =       ipv4_neigh_lookup,
2172 };
2173
2174 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2175 {
2176         struct rtable *ort = (struct rtable *) dst_orig;
2177         struct rtable *rt;
2178
2179         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2180         if (rt) {
2181                 struct dst_entry *new = &rt->dst;
2182
2183                 new->__use = 1;
2184                 new->input = dst_discard;
2185                 new->output = dst_discard;
2186
2187                 new->dev = ort->dst.dev;
2188                 if (new->dev)
2189                         dev_hold(new->dev);
2190
2191                 rt->rt_is_input = ort->rt_is_input;
2192                 rt->rt_iif = ort->rt_iif;
2193                 rt->rt_pmtu = ort->rt_pmtu;
2194
2195                 rt->rt_genid = rt_genid(net);
2196                 rt->rt_flags = ort->rt_flags;
2197                 rt->rt_type = ort->rt_type;
2198                 rt->rt_gateway = ort->rt_gateway;
2199                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2200
2201                 INIT_LIST_HEAD(&rt->rt_uncached);
2202
2203                 dst_free(new);
2204         }
2205
2206         dst_release(dst_orig);
2207
2208         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2209 }
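
/* Note: the blackhole dst built above silently discards anything sent
 * through it (dst_discard on both input and output) while preserving the
 * original route's bookkeeping fields. The xfrm layer uses it (see
 * make_blackhole() in net/xfrm/xfrm_policy.c) for sockets that cannot
 * sleep while security associations are still being resolved.
 */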
2210
2211 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2212                                     struct sock *sk)
2213 {
2214         struct rtable *rt = __ip_route_output_key(net, flp4);
2215
2216         if (IS_ERR(rt))
2217                 return rt;
2218
2219         if (flp4->flowi4_proto)
2220                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2221                                                    flowi4_to_flowi(flp4),
2222                                                    sk, 0);
2223
2224         return rt;
2225 }
2226 EXPORT_SYMBOL_GPL(ip_route_output_flow);
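
/* Usage sketch (illustrative): a connected-socket style lookup, roughly
 * what helpers such as ip_route_connect() do; the flowi4_init_output()
 * arguments here are abbreviated and hypothetical:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 *			   protocol, flags, daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */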
2227
2228 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2229                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2230                         u32 seq, int event, int nowait, unsigned int flags)
2231 {
2232         struct rtable *rt = skb_rtable(skb);
2233         struct rtmsg *r;
2234         struct nlmsghdr *nlh;
2235         unsigned long expires = 0;
2236         u32 error;
2237         u32 metrics[RTAX_MAX];
2238
2239         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2240         if (nlh == NULL)
2241                 return -EMSGSIZE;
2242
2243         r = nlmsg_data(nlh);
2244         r->rtm_family    = AF_INET;
2245         r->rtm_dst_len  = 32;
2246         r->rtm_src_len  = 0;
2247         r->rtm_tos      = fl4->flowi4_tos;
2248         r->rtm_table    = RT_TABLE_MAIN;
2249         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2250                 goto nla_put_failure;
2251         r->rtm_type     = rt->rt_type;
2252         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2253         r->rtm_protocol = RTPROT_UNSPEC;
2254         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2255         if (rt->rt_flags & RTCF_NOTIFY)
2256                 r->rtm_flags |= RTM_F_NOTIFY;
2257
2258         if (nla_put_be32(skb, RTA_DST, dst))
2259                 goto nla_put_failure;
2260         if (src) {
2261                 r->rtm_src_len = 32;
2262                 if (nla_put_be32(skb, RTA_SRC, src))
2263                         goto nla_put_failure;
2264         }
2265         if (rt->dst.dev &&
2266             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2267                 goto nla_put_failure;
2268 #ifdef CONFIG_IP_ROUTE_CLASSID
2269         if (rt->dst.tclassid &&
2270             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2271                 goto nla_put_failure;
2272 #endif
2273         if (!rt_is_input_route(rt) &&
2274             fl4->saddr != src) {
2275                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2276                         goto nla_put_failure;
2277         }
2278         if (rt->rt_uses_gateway &&
2279             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2280                 goto nla_put_failure;
2281
2282         expires = rt->dst.expires;
2283         if (expires) {
2284                 unsigned long now = jiffies;
2285
2286                 if (time_before(now, expires))
2287                         expires -= now;
2288                 else
2289                         expires = 0;
2290         }
2291
2292         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2293         if (rt->rt_pmtu && expires)
2294                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2295         if (rtnetlink_put_metrics(skb, metrics) < 0)
2296                 goto nla_put_failure;
2297
2298         if (fl4->flowi4_mark &&
2299             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2300                 goto nla_put_failure;
2301
2302         error = rt->dst.error;
2303
2304         if (rt_is_input_route(rt)) {
2305 #ifdef CONFIG_IP_MROUTE
2306                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2307                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2308                         int err = ipmr_get_route(net, skb,
2309                                                  fl4->saddr, fl4->daddr,
2310                                                  r, nowait);
2311                         if (err <= 0) {
2312                                 if (!nowait) {
2313                                         if (err == 0)
2314                                                 return 0;
2315                                         goto nla_put_failure;
2316                                 } else {
2317                                         if (err == -EMSGSIZE)
2318                                                 goto nla_put_failure;
2319                                         error = err;
2320                                 }
2321                         }
2322                 } else
2323 #endif
2324                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2325                                 goto nla_put_failure;
2326         }
2327
2328         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2329                 goto nla_put_failure;
2330
2331         return nlmsg_end(skb, nlh);
2332
2333 nla_put_failure:
2334         nlmsg_cancel(skb, nlh);
2335         return -EMSGSIZE;
2336 }
2337
2338 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2339 {
2340         struct net *net = sock_net(in_skb->sk);
2341         struct rtmsg *rtm;
2342         struct nlattr *tb[RTA_MAX+1];
2343         struct rtable *rt = NULL;
2344         struct flowi4 fl4;
2345         __be32 dst = 0;
2346         __be32 src = 0;
2347         u32 iif;
2348         int err;
2349         int mark;
2350         struct sk_buff *skb;
2351
2352         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2353         if (err < 0)
2354                 goto errout;
2355
2356         rtm = nlmsg_data(nlh);
2357
2358         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2359         if (skb == NULL) {
2360                 err = -ENOBUFS;
2361                 goto errout;
2362         }
2363
2364         /* Reserve room for dummy headers; this skb can pass
2365            through a good chunk of the routing engine.
2366          */
2367         skb_reset_mac_header(skb);
2368         skb_reset_network_header(skb);
2369
2370         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2371         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2372         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2373
2374         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2375         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2376         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2377         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2378
2379         memset(&fl4, 0, sizeof(fl4));
2380         fl4.daddr = dst;
2381         fl4.saddr = src;
2382         fl4.flowi4_tos = rtm->rtm_tos;
2383         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2384         fl4.flowi4_mark = mark;
2385
2386         if (iif) {
2387                 struct net_device *dev;
2388
2389                 dev = __dev_get_by_index(net, iif);
2390                 if (dev == NULL) {
2391                         err = -ENODEV;
2392                         goto errout_free;
2393                 }
2394
2395                 skb->protocol   = htons(ETH_P_IP);
2396                 skb->dev        = dev;
2397                 skb->mark       = mark;
2398                 local_bh_disable();
2399                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2400                 local_bh_enable();
2401
2402                 rt = skb_rtable(skb);
2403                 if (err == 0 && rt->dst.error)
2404                         err = -rt->dst.error;
2405         } else {
2406                 rt = ip_route_output_key(net, &fl4);
2407
2408                 err = 0;
2409                 if (IS_ERR(rt))
2410                         err = PTR_ERR(rt);
2411         }
2412
2413         if (err)
2414                 goto errout_free;
2415
2416         skb_dst_set(skb, &rt->dst);
2417         if (rtm->rtm_flags & RTM_F_NOTIFY)
2418                 rt->rt_flags |= RTCF_NOTIFY;
2419
2420         err = rt_fill_info(net, dst, src, &fl4, skb,
2421                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2422                            RTM_NEWROUTE, 0, 0);
2423         if (err <= 0)
2424                 goto errout_free;
2425
2426         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2427 errout:
2428         return err;
2429
2430 errout_free:
2431         kfree_skb(skb);
2432         goto errout;
2433 }
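
/* This is the handler behind RTM_GETROUTE, i.e. the kernel side of
 * "ip route get <addr>". Sketch of the round trip (attribute names as
 * parsed above; the userspace framing is illustrative):
 *
 *	request: RTM_GETROUTE + RTA_DST, optionally RTA_SRC, RTA_IIF,
 *	         RTA_OIF and RTA_MARK
 *	reply:   an RTM_NEWROUTE message built by rt_fill_info() above
 */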
2434
2435 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2436 {
2437         return skb->len;
2438 }
2439
2440 void ip_rt_multicast_event(struct in_device *in_dev)
2441 {
2442         rt_cache_flush(dev_net(in_dev->dev));
2443 }
2444
2445 #ifdef CONFIG_SYSCTL
2446 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2447 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2448 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2449 static int ip_rt_gc_elasticity __read_mostly    = 8;
2450
2451 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2452                                         void __user *buffer,
2453                                         size_t *lenp, loff_t *ppos)
2454 {
2455         struct net *net = (struct net *)__ctl->extra1;
2456
2457         if (write) {
2458                 rt_cache_flush(net);
2459                 fnhe_genid_bump(net);
2460                 return 0;
2461         }
2462
2463         return -EINVAL;
2464 }
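
/* This is a write-only trigger, not a tunable. The usual way to fire it
 * from userspace (path assumes the standard procfs layout registered by
 * sysctl_route_net_init() below):
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * Reads fail with -EINVAL, as implemented above.
 */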
2465
2466 static ctl_table ipv4_route_table[] = {
2467         {
2468                 .procname       = "gc_thresh",
2469                 .data           = &ipv4_dst_ops.gc_thresh,
2470                 .maxlen         = sizeof(int),
2471                 .mode           = 0644,
2472                 .proc_handler   = proc_dointvec,
2473         },
2474         {
2475                 .procname       = "max_size",
2476                 .data           = &ip_rt_max_size,
2477                 .maxlen         = sizeof(int),
2478                 .mode           = 0644,
2479                 .proc_handler   = proc_dointvec,
2480         },
2481         {
2482                 /* Deprecated. Use gc_min_interval_ms */
2483
2484                 .procname       = "gc_min_interval",
2485                 .data           = &ip_rt_gc_min_interval,
2486                 .maxlen         = sizeof(int),
2487                 .mode           = 0644,
2488                 .proc_handler   = proc_dointvec_jiffies,
2489         },
2490         {
2491                 .procname       = "gc_min_interval_ms",
2492                 .data           = &ip_rt_gc_min_interval,
2493                 .maxlen         = sizeof(int),
2494                 .mode           = 0644,
2495                 .proc_handler   = proc_dointvec_ms_jiffies,
2496         },
2497         {
2498                 .procname       = "gc_timeout",
2499                 .data           = &ip_rt_gc_timeout,
2500                 .maxlen         = sizeof(int),
2501                 .mode           = 0644,
2502                 .proc_handler   = proc_dointvec_jiffies,
2503         },
2504         {
2505                 .procname       = "gc_interval",
2506                 .data           = &ip_rt_gc_interval,
2507                 .maxlen         = sizeof(int),
2508                 .mode           = 0644,
2509                 .proc_handler   = proc_dointvec_jiffies,
2510         },
2511         {
2512                 .procname       = "redirect_load",
2513                 .data           = &ip_rt_redirect_load,
2514                 .maxlen         = sizeof(int),
2515                 .mode           = 0644,
2516                 .proc_handler   = proc_dointvec,
2517         },
2518         {
2519                 .procname       = "redirect_number",
2520                 .data           = &ip_rt_redirect_number,
2521                 .maxlen         = sizeof(int),
2522                 .mode           = 0644,
2523                 .proc_handler   = proc_dointvec,
2524         },
2525         {
2526                 .procname       = "redirect_silence",
2527                 .data           = &ip_rt_redirect_silence,
2528                 .maxlen         = sizeof(int),
2529                 .mode           = 0644,
2530                 .proc_handler   = proc_dointvec,
2531         },
2532         {
2533                 .procname       = "error_cost",
2534                 .data           = &ip_rt_error_cost,
2535                 .maxlen         = sizeof(int),
2536                 .mode           = 0644,
2537                 .proc_handler   = proc_dointvec,
2538         },
2539         {
2540                 .procname       = "error_burst",
2541                 .data           = &ip_rt_error_burst,
2542                 .maxlen         = sizeof(int),
2543                 .mode           = 0644,
2544                 .proc_handler   = proc_dointvec,
2545         },
2546         {
2547                 .procname       = "gc_elasticity",
2548                 .data           = &ip_rt_gc_elasticity,
2549                 .maxlen         = sizeof(int),
2550                 .mode           = 0644,
2551                 .proc_handler   = proc_dointvec,
2552         },
2553         {
2554                 .procname       = "mtu_expires",
2555                 .data           = &ip_rt_mtu_expires,
2556                 .maxlen         = sizeof(int),
2557                 .mode           = 0644,
2558                 .proc_handler   = proc_dointvec_jiffies,
2559         },
2560         {
2561                 .procname       = "min_pmtu",
2562                 .data           = &ip_rt_min_pmtu,
2563                 .maxlen         = sizeof(int),
2564                 .mode           = 0644,
2565                 .proc_handler   = proc_dointvec,
2566         },
2567         {
2568                 .procname       = "min_adv_mss",
2569                 .data           = &ip_rt_min_advmss,
2570                 .maxlen         = sizeof(int),
2571                 .mode           = 0644,
2572                 .proc_handler   = proc_dointvec,
2573         },
2574         { }
2575 };
2576
2577 static struct ctl_table ipv4_route_flush_table[] = {
2578         {
2579                 .procname       = "flush",
2580                 .maxlen         = sizeof(int),
2581                 .mode           = 0200,
2582                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2583         },
2584         { },
2585 };
2586
2587 static __net_init int sysctl_route_net_init(struct net *net)
2588 {
2589         struct ctl_table *tbl;
2590
2591         tbl = ipv4_route_flush_table;
2592         if (!net_eq(net, &init_net)) {
2593                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2594                 if (tbl == NULL)
2595                         goto err_dup;
2596
2597                 /* Don't export sysctls to unprivileged users */
2598                 if (net->user_ns != &init_user_ns)
2599                         tbl[0].procname = NULL;
2600         }
2601         tbl[0].extra1 = net;
2602
2603         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2604         if (net->ipv4.route_hdr == NULL)
2605                 goto err_reg;
2606         return 0;
2607
2608 err_reg:
2609         if (tbl != ipv4_route_flush_table)
2610                 kfree(tbl);
2611 err_dup:
2612         return -ENOMEM;
2613 }
2614
2615 static __net_exit void sysctl_route_net_exit(struct net *net)
2616 {
2617         struct ctl_table *tbl;
2618
2619         tbl = net->ipv4.route_hdr->ctl_table_arg;
2620         unregister_net_sysctl_table(net->ipv4.route_hdr);
2621         BUG_ON(tbl == ipv4_route_flush_table);
2622         kfree(tbl);
2623 }
2624
2625 static __net_initdata struct pernet_operations sysctl_route_ops = {
2626         .init = sysctl_route_net_init,
2627         .exit = sysctl_route_net_exit,
2628 };
2629 #endif
2630
2631 static __net_init int rt_genid_init(struct net *net)
2632 {
2633         atomic_set(&net->rt_genid, 0);
2634         atomic_set(&net->fnhe_genid, 0);
2635         get_random_bytes(&net->ipv4.dev_addr_genid,
2636                          sizeof(net->ipv4.dev_addr_genid));
2637         return 0;
2638 }
2639
2640 static __net_initdata struct pernet_operations rt_genid_ops = {
2641         .init = rt_genid_init,
2642 };
2643
2644 static int __net_init ipv4_inetpeer_init(struct net *net)
2645 {
2646         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2647
2648         if (!bp)
2649                 return -ENOMEM;
2650         inet_peer_base_init(bp);
2651         net->ipv4.peers = bp;
2652         return 0;
2653 }
2654
2655 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2656 {
2657         struct inet_peer_base *bp = net->ipv4.peers;
2658
2659         net->ipv4.peers = NULL;
2660         inetpeer_invalidate_tree(bp);
2661         kfree(bp);
2662 }
2663
2664 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2665         .init   =       ipv4_inetpeer_init,
2666         .exit   =       ipv4_inetpeer_exit,
2667 };
2668
2669 #ifdef CONFIG_IP_ROUTE_CLASSID
2670 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2671 #endif /* CONFIG_IP_ROUTE_CLASSID */
2672
2673 int __init ip_rt_init(void)
2674 {
2675         int rc = 0;
2676
2677 #ifdef CONFIG_IP_ROUTE_CLASSID
2678         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2679         if (!ip_rt_acct)
2680                 panic("IP: failed to allocate ip_rt_acct\n");
2681 #endif
2682
2683         ipv4_dst_ops.kmem_cachep =
2684                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2685                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2686
2687         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2688
2689         if (dst_entries_init(&ipv4_dst_ops) < 0)
2690                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2691
2692         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2693                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2694
2695         ipv4_dst_ops.gc_thresh = ~0;
2696         ip_rt_max_size = INT_MAX;
2697
2698         devinet_init();
2699         ip_fib_init();
2700
2701         if (ip_rt_proc_init())
2702                 pr_err("Unable to create route proc files\n");
2703 #ifdef CONFIG_XFRM
2704         xfrm_init();
2705         xfrm4_init();
2706 #endif
2707         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2708
2709 #ifdef CONFIG_SYSCTL
2710         register_pernet_subsys(&sysctl_route_ops);
2711 #endif
2712         register_pernet_subsys(&rt_genid_ops);
2713         register_pernet_subsys(&ipv4_inetpeer_ops);
2714         return rc;
2715 }
2716
2717 #ifdef CONFIG_SYSCTL
2718 /*
2719  * We really need to sanitize the damn ipv4 init order, then all
2720  * this nonsense will go away.
2721  */
2722 void __init ip_static_sysctl_init(void)
2723 {
2724         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2725 }
2726 #endif