/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <[email protected]>
 *              Alan Cox, <[email protected]>
 *              Linus Torvalds, <[email protected]>
 *              Alexey Kuznetsov, <[email protected]>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              ([email protected])      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
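
/* The table above is indexed by the four RFC 1349 TOS bits of the IPv4 TOS
 * byte (mask 0x1E; the shift drops the low bit); with this file's identity
 * ECN_OR_COST() each pair of entries is equal. Below is a minimal standalone
 * userspace sketch of the lookup, modelled on the kernel's rt_tos2priority()
 * helper in include/net/route.h. The numeric table values are the TC_PRIO_*
 * constants written out by hand, an assumption for illustration; the block
 * is kept out of the build with #if 0.
 */
#if 0
#include <stdio.h>

#define IPTOS_TOS_MASK 0x1E     /* the four TOS bits of the TOS byte */

/* TC_PRIO_BESTEFFORT=0, TC_PRIO_BULK=2, TC_PRIO_INTERACTIVE_BULK=4,
 * TC_PRIO_INTERACTIVE=6 (from <linux/pkt_sched.h>) */
static const unsigned char tos2prio[16] = {
        0, 0, 0, 0,     /* best effort */
        2, 2, 2, 2,     /* bulk */
        6, 6, 6, 6,     /* interactive */
        4, 4, 4, 4,     /* interactive bulk */
};

/* Mirrors rt_tos2priority(): mask the TOS bits, drop the low bit. */
static unsigned char tos2priority(unsigned char tos)
{
        return tos2prio[(tos & IPTOS_TOS_MASK) >> 1];
}

int main(void)
{
        printf("prio(0x10) = %u\n", tos2priority(0x10)); /* LOWDELAY -> 6 */
        printf("prio(0x08) = %u\n", tos2priority(0x08)); /* THROUGHPUT -> 2 */
        return 0;
}
#endif /* standalone sketch */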

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
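
/* These two seq_files surface as /proc/net/rt_cache (header row only, since
 * the old routing cache is gone) and /proc/net/stat/rt_cache (one hex row
 * per possible CPU, in the field order printed above). A minimal userspace
 * sketch that dumps the per-CPU statistics file; kept out of the build with
 * #if 0.
 */
#if 0
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/net/stat/rt_cache", "r");
        char line[512];

        if (!f) {
                perror("fopen /proc/net/stat/rt_cache");
                return 1;
        }
        /* The first line is the header; every following line is one CPU.
         * Note the first column repeats the global dst entry count. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
#endif /* standalone sketch */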

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
        atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
        u32 old = ACCESS_ONCE(*p_tstamp);
        u32 now = (u32)jiffies;
        u32 new, delta = 0;

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* Do not use atomic_add_return() as it makes UBSAN unhappy */
        do {
                old = (u32)atomic_read(p_id);
                new = old + delta + segs;
        } while (atomic_cmpxchg(p_id, old, new) != old);

        return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
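
/* A standalone userspace model of ip_idents_reserve(): each of the 2048
 * buckets pairs an ID counter with a last-use timestamp, and a bucket that
 * has been idle gets a random forward jump bounded by its idle time, so
 * consecutive IDs do not reveal how many packets were sent in between.
 * C11 atomics, time() and rand() stand in for atomic_t, jiffies and
 * prandom_u32_max(); kept out of the build with #if 0.
 */
#if 0
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define IDENTS_SZ 2048u

static atomic_uint ids[IDENTS_SZ];
static atomic_uint tstamps[IDENTS_SZ];

static uint32_t idents_reserve(uint32_t hash, uint32_t segs)
{
        atomic_uint *p_ts = &tstamps[hash % IDENTS_SZ];
        atomic_uint *p_id = &ids[hash % IDENTS_SZ];
        uint32_t old = atomic_load(p_ts);
        uint32_t now = (uint32_t)time(NULL);    /* coarse jiffies stand-in */
        uint32_t delta = 0, cur, new;

        /* Idle bucket: add a random perturbation bounded by the idle time. */
        if (old != now && atomic_compare_exchange_strong(p_ts, &old, now))
                delta = (uint32_t)rand() % (now - old);

        do {
                cur = atomic_load(p_id);
                new = cur + delta + segs;
        } while (!atomic_compare_exchange_weak(p_id, &cur, new));

        return new - segs;      /* first ID of the freshly reserved run */
}

int main(void)
{
        srand((unsigned int)time(NULL));
        printf("id = %u\n", idents_reserve(0xdeadbeefu, 3));
        return 0;
}
#endif /* standalone sketch */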

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        static u32 ip_idents_hashrnd __read_mostly;
        u32 hash, id;

        net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

        hash = jhash_3words((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol ^ net_hash_mix(net),
                            ip_idents_hashrnd);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(sock_net(sk), fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}
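
/* A standalone sketch of the bucket selection above: hash the destination
 * once with a boot-time random key, then fold the result to FNHE_HASH_SHIFT
 * bits (11 in include/net/ip_fib.h, i.e. 2048 buckets) with the
 * multiplicative hash_32() trick from <linux/hash.h>. The keyed multiply
 * below merely stands in for the real jhash_1word(); kept out of the build
 * with #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define FNHE_HASH_SHIFT 11
#define GOLDEN_RATIO_32 0x61C88647u     /* as in <linux/hash.h> */

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
        /* High bits of a golden-ratio multiply are well mixed. */
        return (val * GOLDEN_RATIO_32) >> (32 - bits);
}

static uint32_t fnhe_bucket(uint32_t daddr, uint32_t hashrnd)
{
        uint32_t hval = (daddr ^ hashrnd) * GOLDEN_RATIO_32; /* not real jhash */

        return hash_32(hval, FNHE_HASH_SHIFT);
}

int main(void)
{
        uint32_t daddr = 0xc0a80001u;   /* 192.168.0.1, host byte order */

        printf("bucket %u of %u\n",
               fnhe_bucket(daddr, 0x12345678u), 1u << FNHE_HASH_SHIFT);
        return 0;
}
#endif /* standalone sketch */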

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE: do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (it breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;

        /* Too many ignored redirects; do not send anything and
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
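
/* Worked example of the backoff arithmetic above: after the first redirect
 * (sent when rate_tokens == 0), the k-th further redirect is allowed only
 * once ip_rt_redirect_load << k jiffies have passed since the previous one,
 * and after ip_rt_redirect_number (9) redirects we go silent until
 * ip_rt_redirect_silence of quiet resets the tokens. HZ = 1000 is assumed
 * here purely to print the gaps in milliseconds; kept out of the build
 * with #if 0.
 */
#if 0
#include <stdio.h>

#define HZ 1000
#define REDIRECT_LOAD (HZ / 50)         /* ip_rt_redirect_load */
#define REDIRECT_NUMBER 9               /* ip_rt_redirect_number */

int main(void)
{
        puts("redirect #1: sent immediately (rate_tokens == 0)");
        for (int tokens = 1; tokens < REDIRECT_NUMBER; tokens++)
                printf("redirect #%d: minimum gap %d ms\n", tokens + 1,
                       (REDIRECT_LOAD << tokens) * 1000 / HZ);
        /* 40, 80, 160, ... 5120 ms; ip_rt_redirect_silence is
         * (HZ / 50) << 10 = 20480 jiffies, ~20.5 s of quiet before
         * a full reset. */
        return 0;
}
#endif /* standalone sketch */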

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
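
/* The rate limit above is a classic token bucket: tokens refill at one per
 * jiffy of elapsed time, are capped at ip_rt_error_burst (5 * HZ), and each
 * ICMP error spends ip_rt_error_cost (HZ), so an idle peer gets a burst of
 * five errors and then roughly one per second. A standalone model with
 * HZ = 1000 assumed; kept out of the build with #if 0.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

#define HZ 1000
#define ERROR_COST HZ           /* ip_rt_error_cost */
#define ERROR_BURST (5 * HZ)    /* ip_rt_error_burst */

struct peer {
        unsigned long rate_tokens;
        unsigned long rate_last;
};

static bool may_send(struct peer *p, unsigned long now)
{
        p->rate_tokens += now - p->rate_last;
        if (p->rate_tokens > ERROR_BURST)
                p->rate_tokens = ERROR_BURST;
        p->rate_last = now;
        if (p->rate_tokens >= ERROR_COST) {
                p->rate_tokens -= ERROR_COST;
                return true;
        }
        return false;
}

int main(void)
{
        /* A long-idle peer has accumulated (and been capped at) a full burst. */
        struct peer p = { .rate_tokens = ERROR_BURST, .rate_last = 0 };

        /* Errors arriving every 100 jiffies: the first five ride the burst,
         * the rest are dropped until tokens build back up. */
        for (unsigned long now = 0; now < 1000; now += 100)
                printf("t=%4lu: %s\n", now, may_send(&p, now) ? "send" : "drop");
        return 0;
}
#endif /* standalone sketch */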

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (ipv4_mtu(dst) < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
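
/* The precedence in ipv4_mtu() is: a live learned PMTU, then an explicit
 * RTAX_MTU route metric, then the device MTU (clamped to 576 when the
 * metric is locked and the route goes via a gateway), finally capped at
 * IP_MAX_MTU. A standalone model of that decision; 0xFFF0 for IP_MAX_MTU
 * is an assumption from this kernel vintage, and the lwtunnel headroom
 * subtraction is left out. Kept out of the build with #if 0.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

#define IP_MAX_MTU 0xFFF0u      /* assumed value */

static unsigned int route_mtu(unsigned int pmtu, bool pmtu_expired,
                              unsigned int metric_mtu, unsigned int dev_mtu,
                              bool mtu_locked, bool uses_gateway)
{
        unsigned int mtu = pmtu_expired ? 0 : pmtu;

        if (!mtu)
                mtu = metric_mtu;
        if (mtu)
                return mtu;     /* learned or configured MTU wins outright */

        mtu = dev_mtu;
        if (mtu_locked && uses_gateway && mtu > 576)
                mtu = 576;      /* conservative default for locked routes */
        return mtu < IP_MAX_MTU ? mtu : IP_MAX_MTU;
}

int main(void)
{
        /* A live learned PMTU of 1400 beats the 1500-byte device MTU. */
        printf("%u\n", route_mtu(1400, false, 0, 1500, false, true));
        /* Expired PMTU, no metric, locked route via a gateway -> 576. */
        printf("%u\n", route_mtu(1400, true, 0, 1500, true, true));
        return 0;
}
#endif /* standalone sketch */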

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
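
/* rt_cache_route() publishes the new route into the per-nexthop slot with a
 * single lockless compare-and-swap: if another CPU changed the slot since we
 * read it, we lose the race and the caller falls back to the uncached list.
 * A standalone model with C11 atomics standing in for the kernel's cmpxchg();
 * the RCU-deferred freeing of a displaced route is only noted in a comment.
 * Kept out of the build with #if 0.
 */
#if 0
#include <stdatomic.h>
#include <stdio.h>

struct route { int id; };

static _Atomic(struct route *) cache_slot;

int main(void)
{
        struct route a = { 1 }, b = { 2 };
        struct route *orig = atomic_load(&cache_slot);  /* reads NULL */

        atomic_store(&cache_slot, &a);  /* simulate another CPU winning */

        /* The CAS fails: the slot no longer holds the NULL we read above,
         * so this route would go on the uncached list instead. The kernel
         * would rt_free() a displaced route via RCU on success. */
        printf("cache b: %s\n",
               atomic_compare_exchange_strong(&cache_slot, &orig, &b) ?
               "cached" : "raced");
        return 0;
}
#endif /* standalone sketch */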
1350
1351 struct uncached_list {
1352         spinlock_t              lock;
1353         struct list_head        head;
1354 };
1355
1356 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1357
1358 static void rt_add_uncached_list(struct rtable *rt)
1359 {
1360         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1361
1362         rt->rt_uncached_list = ul;
1363
1364         spin_lock_bh(&ul->lock);
1365         list_add_tail(&rt->rt_uncached, &ul->head);
1366         spin_unlock_bh(&ul->lock);
1367 }
1368
1369 static void ipv4_dst_destroy(struct dst_entry *dst)
1370 {
1371         struct rtable *rt = (struct rtable *) dst;
1372
1373         if (!list_empty(&rt->rt_uncached)) {
1374                 struct uncached_list *ul = rt->rt_uncached_list;
1375
1376                 spin_lock_bh(&ul->lock);
1377                 list_del(&rt->rt_uncached);
1378                 spin_unlock_bh(&ul->lock);
1379         }
1380 }
1381
1382 void rt_flush_dev(struct net_device *dev)
1383 {
1384         struct net *net = dev_net(dev);
1385         struct rtable *rt;
1386         int cpu;
1387
1388         for_each_possible_cpu(cpu) {
1389                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1390
1391                 spin_lock_bh(&ul->lock);
1392                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1393                         if (rt->dst.dev != dev)
1394                                 continue;
1395                         rt->dst.dev = net->loopback_dev;
1396                         dev_hold(rt->dst.dev);
1397                         dev_put(dev);
1398                 }
1399                 spin_unlock_bh(&ul->lock);
1400         }
1401 }
1402
1403 static bool rt_cache_valid(const struct rtable *rt)
1404 {
1405         return  rt &&
1406                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1407                 !rt_is_expired(rt);
1408 }
1409
1410 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1411                            const struct fib_result *res,
1412                            struct fib_nh_exception *fnhe,
1413                            struct fib_info *fi, u16 type, u32 itag)
1414 {
1415         bool cached = false;
1416
1417         if (fi) {
1418                 struct fib_nh *nh = &FIB_RES_NH(*res);
1419
1420                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1421                         rt->rt_gateway = nh->nh_gw;
1422                         rt->rt_uses_gateway = 1;
1423                 }
1424                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1425 #ifdef CONFIG_IP_ROUTE_CLASSID
1426                 rt->dst.tclassid = nh->nh_tclassid;
1427 #endif
1428                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1429                 if (unlikely(fnhe))
1430                         cached = rt_bind_exception(rt, fnhe, daddr);
1431                 else if (!(rt->dst.flags & DST_NOCACHE))
1432                         cached = rt_cache_route(nh, rt);
1433                 if (unlikely(!cached)) {
1434                         /* Routes we intend to cache in nexthop exception or
1435                          * FIB nexthop have the DST_NOCACHE bit clear.
1436                          * However, if we are unsuccessful at storing this
1437                          * route into the cache we really need to set it.
1438                          */
1439                         rt->dst.flags |= DST_NOCACHE;
1440                         if (!rt->rt_gateway)
1441                                 rt->rt_gateway = daddr;
1442                         rt_add_uncached_list(rt);
1443                 }
1444         } else
1445                 rt_add_uncached_list(rt);
1446
1447 #ifdef CONFIG_IP_ROUTE_CLASSID
1448 #ifdef CONFIG_IP_MULTIPLE_TABLES
1449         set_class_tag(rt, res->tclassid);
1450 #endif
1451         set_class_tag(rt, itag);
1452 #endif
1453 }
1454
1455 struct rtable *rt_dst_alloc(struct net_device *dev,
1456                             unsigned int flags, u16 type,
1457                             bool nopolicy, bool noxfrm, bool will_cache)
1458 {
1459         struct rtable *rt;
1460
1461         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1462                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1463                        (nopolicy ? DST_NOPOLICY : 0) |
1464                        (noxfrm ? DST_NOXFRM : 0));
1465
1466         if (rt) {
1467                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1468                 rt->rt_flags = flags;
1469                 rt->rt_type = type;
1470                 rt->rt_is_input = 0;
1471                 rt->rt_iif = 0;
1472                 rt->rt_pmtu = 0;
1473                 rt->rt_gateway = 0;
1474                 rt->rt_uses_gateway = 0;
1475                 rt->rt_table_id = 0;
1476                 INIT_LIST_HEAD(&rt->rt_uncached);
1477
1478                 rt->dst.output = ip_output;
1479                 if (flags & RTCF_LOCAL)
1480                         rt->dst.input = ip_local_deliver;
1481         }
1482
1483         return rt;
1484 }
1485 EXPORT_SYMBOL(rt_dst_alloc);
1486
1487 /* called in rcu_read_lock() section */
1488 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1489                                 u8 tos, struct net_device *dev, int our)
1490 {
1491         struct rtable *rth;
1492         struct in_device *in_dev = __in_dev_get_rcu(dev);
1493         unsigned int flags = RTCF_MULTICAST;
1494         u32 itag = 0;
1495         int err;
1496
1497         /* Primary sanity checks. */
1498
1499         if (!in_dev)
1500                 return -EINVAL;
1501
1502         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1503             skb->protocol != htons(ETH_P_IP))
1504                 goto e_inval;
1505
1506         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1507                 goto e_inval;
1508
1509         if (ipv4_is_zeronet(saddr)) {
1510                 if (!ipv4_is_local_multicast(daddr))
1511                         goto e_inval;
1512         } else {
1513                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1514                                           in_dev, &itag);
1515                 if (err < 0)
1516                         goto e_err;
1517         }
1518         if (our)
1519                 flags |= RTCF_LOCAL;
1520
1521         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1522                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1523         if (!rth)
1524                 goto e_nobufs;
1525
1526 #ifdef CONFIG_IP_ROUTE_CLASSID
1527         rth->dst.tclassid = itag;
1528 #endif
1529         rth->dst.output = ip_rt_bug;
1530         rth->rt_is_input= 1;
1531
1532 #ifdef CONFIG_IP_MROUTE
1533         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1534                 rth->dst.input = ip_mr_input;
1535 #endif
1536         RT_CACHE_STAT_INC(in_slow_mc);
1537
1538         skb_dst_set(skb, &rth->dst);
1539         return 0;
1540
1541 e_nobufs:
1542         return -ENOBUFS;
1543 e_inval:
1544         return -EINVAL;
1545 e_err:
1546         return err;
1547 }
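
/* Reached via ip_route_input_noref() below once daddr is known to be
 * multicast; "our" means ip_check_mc_rcu() matched a group joined on
 * this device, so RTCF_LOCAL is added and rt_dst_alloc() wires
 * dst.input to ip_local_deliver for local delivery.
 */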
1548
1549
1550 static void ip_handle_martian_source(struct net_device *dev,
1551                                      struct in_device *in_dev,
1552                                      struct sk_buff *skb,
1553                                      __be32 daddr,
1554                                      __be32 saddr)
1555 {
1556         RT_CACHE_STAT_INC(in_martian_src);
1557 #ifdef CONFIG_IP_ROUTE_VERBOSE
1558         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1559                 /*
1560                  *      RFC 1812 recommendation: if the source is martian,
1561                  *      the only hint we can log is the MAC header.
1562                  */
1563                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1564                         &daddr, &saddr, dev->name);
1565                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1566                         print_hex_dump(KERN_WARNING, "ll header: ",
1567                                        DUMP_PREFIX_OFFSET, 16, 1,
1568                                        skb_mac_header(skb),
1569                                        dev->hard_header_len, true);
1570                 }
1571         }
1572 #endif
1573 }
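
/* Illustrative only (hypothetical addresses and device): with
 * log_martians enabled, the warning above appears as
 *
 *	martian source 203.0.113.9 from 198.51.100.7, on dev eth0
 *	ll header: 00000000: ff ff ff ff ff ff 52 54 00 12 34 56 08 00
 *
 * where the hex dump shows the link-layer header, the only trustworthy
 * hint left for debugging the spoofed packet.
 */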
1574
1575 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1576 {
1577         struct fnhe_hash_bucket *hash;
1578         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1579         u32 hval = fnhe_hashfun(daddr);
1580
1581         spin_lock_bh(&fnhe_lock);
1582
1583         hash = rcu_dereference_protected(nh->nh_exceptions,
1584                                          lockdep_is_held(&fnhe_lock));
1585         hash += hval;
1586
1587         fnhe_p = &hash->chain;
1588         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1589         while (fnhe) {
1590                 if (fnhe->fnhe_daddr == daddr) {
1591                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1592                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1593                         fnhe_flush_routes(fnhe);
1594                         kfree_rcu(fnhe, rcu);
1595                         break;
1596                 }
1597                 fnhe_p = &fnhe->fnhe_next;
1598                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1599                                                  lockdep_is_held(&fnhe_lock));
1600         }
1601
1602         spin_unlock_bh(&fnhe_lock);
1603 }
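
/* The loop above is a classic singly-linked unlink under fnhe_lock:
 * fnhe_p always points at the link leading to the current node, so the
 * rcu_assign_pointer() splices the node out while concurrent RCU
 * readers may keep traversing it until kfree_rcu() reclaims it after a
 * grace period.
 */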
1604
1605 /* called in rcu_read_lock() section */
1606 static int __mkroute_input(struct sk_buff *skb,
1607                            const struct fib_result *res,
1608                            struct in_device *in_dev,
1609                            __be32 daddr, __be32 saddr, u32 tos)
1610 {
1611         struct fib_nh_exception *fnhe;
1612         struct rtable *rth;
1613         int err;
1614         struct in_device *out_dev;
1615         bool do_cache;
1616         u32 itag = 0;
1617
1618         /* get a working reference to the output device */
1619         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1620         if (!out_dev) {
1621                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1622                 return -EINVAL;
1623         }
1624
1625         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1626                                   in_dev->dev, in_dev, &itag);
1627         if (err < 0) {
1628                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1629                                          saddr);
1630
1631                 goto cleanup;
1632         }
1633
1634         do_cache = res->fi && !itag;
1635         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1636             skb->protocol == htons(ETH_P_IP) &&
1637             (IN_DEV_SHARED_MEDIA(out_dev) ||
1638              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1639                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1640
1641         if (skb->protocol != htons(ETH_P_IP)) {
1642                 /* Not IP (i.e. ARP). Do not create a route if it is
1643                  * invalid for proxy arp. DNAT routes are always valid.
1644                  *
1645                  * The proxy arp feature has been extended to allow ARP
1646                  * replies back to the same interface, to support
1647                  * Private VLAN switch technologies. See arp.c.
1648                  */
1649                 if (out_dev == in_dev &&
1650                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1651                         err = -EINVAL;
1652                         goto cleanup;
1653                 }
1654         }
1655
1656         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1657         if (do_cache) {
1658                 if (fnhe) {
1659                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1660                         if (rth && rth->dst.expires &&
1661                             time_after(jiffies, rth->dst.expires)) {
1662                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1663                                 fnhe = NULL;
1664                         } else {
1665                                 goto rt_cache;
1666                         }
1667                 }
1668
1669                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1670
1671 rt_cache:
1672                 if (rt_cache_valid(rth)) {
1673                         skb_dst_set_noref(skb, &rth->dst);
1674                         goto out;
1675                 }
1676         }
1677
1678         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1679                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1680                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1681         if (!rth) {
1682                 err = -ENOBUFS;
1683                 goto cleanup;
1684         }
1685
1686         rth->rt_is_input = 1;
1687         if (res->table)
1688                 rth->rt_table_id = res->table->tb_id;
1689         RT_CACHE_STAT_INC(in_slow_tot);
1690
1691         rth->dst.input = ip_forward;
1692
1693         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1694         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1695                 rth->dst.lwtstate->orig_output = rth->dst.output;
1696                 rth->dst.output = lwtunnel_output;
1697         }
1698         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1699                 rth->dst.lwtstate->orig_input = rth->dst.input;
1700                 rth->dst.input = lwtunnel_input;
1701         }
1702         skb_dst_set(skb, &rth->dst);
1703 out:
1704         err = 0;
1705  cleanup:
1706         return err;
1707 }
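
/* The lwtunnel redirection just above uses a save-and-wrap pattern:
 * the original dst.output/dst.input handlers are stashed in
 * lwtstate->orig_output/orig_input, so lwtunnel_output() and
 * lwtunnel_input() can apply the encapsulation state and then hand the
 * skb back to the saved handler.
 */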
1708
1709 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1710
1711 /* To make ICMP packets follow the right flow, the multipath hash is
1712  * calculated from the inner IP addresses in reverse order.
1713  */
1714 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1715 {
1716         const struct iphdr *outer_iph = ip_hdr(skb);
1717         struct icmphdr _icmph;
1718         const struct icmphdr *icmph;
1719         struct iphdr _inner_iph;
1720         const struct iphdr *inner_iph;
1721
1722         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1723                 goto standard_hash;
1724
1725         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1726                                    &_icmph);
1727         if (!icmph)
1728                 goto standard_hash;
1729
1730         if (icmph->type != ICMP_DEST_UNREACH &&
1731             icmph->type != ICMP_REDIRECT &&
1732             icmph->type != ICMP_TIME_EXCEEDED &&
1733             icmph->type != ICMP_PARAMETERPROB) {
1734                 goto standard_hash;
1735         }
1736
1737         inner_iph = skb_header_pointer(skb,
1738                                        outer_iph->ihl * 4 + sizeof(_icmph),
1739                                        sizeof(_inner_iph), &_inner_iph);
1740         if (!inner_iph)
1741                 goto standard_hash;
1742
1743         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1744
1745 standard_hash:
1746         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1747 }
1748
1749 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
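
/* Worked example (hypothetical addresses): a router forwards a flow
 * 192.0.2.1 -> 198.51.100.2 and a downstream hop returns an ICMP
 * TIME_EXCEEDED towards 192.0.2.1.  The error quotes the original
 * header, so inner saddr = 192.0.2.1 and inner daddr = 198.51.100.2;
 * hashing (inner daddr, inner saddr) yields the same value as the
 * standard (saddr, daddr) hash of ordinary 198.51.100.2 -> 192.0.2.1
 * packets, so the error takes the same multipath leg as traffic
 * flowing back to the sender.
 */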
1750
1751 static int ip_mkroute_input(struct sk_buff *skb,
1752                             struct fib_result *res,
1753                             const struct flowi4 *fl4,
1754                             struct in_device *in_dev,
1755                             __be32 daddr, __be32 saddr, u32 tos)
1756 {
1757 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1758         if (res->fi && res->fi->fib_nhs > 1) {
1759                 int h;
1760
1761                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1762                         h = ip_multipath_icmp_hash(skb);
1763                 else
1764                         h = fib_multipath_hash(saddr, daddr);
1765                 fib_select_multipath(res, h);
1766         }
1767 #endif
1768
1769         /* create a routing cache entry */
1770         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1771 }
1772
1773 /*
1774  *      NOTE. We drop all packets that have a local source
1775  *      address, because every properly looped-back packet
1776  *      must already have the correct destination attached by the output routine.
1777  *
1778  *      This approach solves two big problems:
1779  *      1. Non-simplex devices are handled properly.
1780  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1781  *      called with rcu_read_lock()
1782  */
1783
1784 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1785                                u8 tos, struct net_device *dev)
1786 {
1787         struct fib_result res;
1788         struct in_device *in_dev = __in_dev_get_rcu(dev);
1789         struct ip_tunnel_info *tun_info;
1790         struct flowi4   fl4;
1791         unsigned int    flags = 0;
1792         u32             itag = 0;
1793         struct rtable   *rth;
1794         int             err = -EINVAL;
1795         struct net    *net = dev_net(dev);
1796         bool do_cache;
1797
1798         /* IP on this device is disabled. */
1799
1800         if (!in_dev)
1801                 goto out;
1802
1803         /* Check for the weirdest martians, which cannot be detected
1804            by fib_lookup.
1805          */
1806
1807         tun_info = skb_tunnel_info(skb);
1808         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1809                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1810         else
1811                 fl4.flowi4_tun_key.tun_id = 0;
1812         skb_dst_drop(skb);
1813
1814         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1815                 goto martian_source;
1816
1817         res.fi = NULL;
1818         res.table = NULL;
1819         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1820                 goto brd_input;
1821
1822         /* Accept zero addresses only to limited broadcast;
1823          * I do not even know whether to fix it or not. Waiting for complaints :-)
1824          */
1825         if (ipv4_is_zeronet(saddr))
1826                 goto martian_source;
1827
1828         if (ipv4_is_zeronet(daddr))
1829                 goto martian_destination;
1830
1831         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1832          * more than once, calling it only when daddr and/or saddr is a loopback address.
1833          */
1834         if (ipv4_is_loopback(daddr)) {
1835                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1836                         goto martian_destination;
1837         } else if (ipv4_is_loopback(saddr)) {
1838                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1839                         goto martian_source;
1840         }
1841
1842         /*
1843          *      Now we are ready to route the packet.
1844          */
1845         fl4.flowi4_oif = 0;
1846         fl4.flowi4_iif = dev->ifindex;
1847         fl4.flowi4_mark = skb->mark;
1848         fl4.flowi4_tos = tos;
1849         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1850         fl4.flowi4_flags = 0;
1851         fl4.daddr = daddr;
1852         fl4.saddr = saddr;
1853         err = fib_lookup(net, &fl4, &res, 0);
1854         if (err != 0) {
1855                 if (!IN_DEV_FORWARD(in_dev))
1856                         err = -EHOSTUNREACH;
1857                 goto no_route;
1858         }
1859
1860         if (res.type == RTN_BROADCAST)
1861                 goto brd_input;
1862
1863         if (res.type == RTN_LOCAL) {
1864                 err = fib_validate_source(skb, saddr, daddr, tos,
1865                                           0, dev, in_dev, &itag);
1866                 if (err < 0)
1867                         goto martian_source;
1868                 goto local_input;
1869         }
1870
1871         if (!IN_DEV_FORWARD(in_dev)) {
1872                 err = -EHOSTUNREACH;
1873                 goto no_route;
1874         }
1875         if (res.type != RTN_UNICAST)
1876                 goto martian_destination;
1877
1878         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1879 out:    return err;
1880
1881 brd_input:
1882         if (skb->protocol != htons(ETH_P_IP))
1883                 goto e_inval;
1884
1885         if (!ipv4_is_zeronet(saddr)) {
1886                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1887                                           in_dev, &itag);
1888                 if (err < 0)
1889                         goto martian_source;
1890         }
1891         flags |= RTCF_BROADCAST;
1892         res.type = RTN_BROADCAST;
1893         RT_CACHE_STAT_INC(in_brd);
1894
1895 local_input:
1896         do_cache = false;
1897         if (res.fi) {
1898                 if (!itag) {
1899                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1900                         if (rt_cache_valid(rth)) {
1901                                 skb_dst_set_noref(skb, &rth->dst);
1902                                 err = 0;
1903                                 goto out;
1904                         }
1905                         do_cache = true;
1906                 }
1907         }
1908
1909         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1910                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1911         if (!rth)
1912                 goto e_nobufs;
1913
1914         rth->dst.output = ip_rt_bug;
1915 #ifdef CONFIG_IP_ROUTE_CLASSID
1916         rth->dst.tclassid = itag;
1917 #endif
1918         rth->rt_is_input = 1;
1919         if (res.table)
1920                 rth->rt_table_id = res.table->tb_id;
1921
1922         RT_CACHE_STAT_INC(in_slow_tot);
1923         if (res.type == RTN_UNREACHABLE) {
1924                 rth->dst.input = ip_error;
1925                 rth->dst.error = -err;
1926                 rth->rt_flags   &= ~RTCF_LOCAL;
1927         }
1928         if (do_cache) {
1929                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1930                         rth->dst.flags |= DST_NOCACHE;
1931                         rt_add_uncached_list(rth);
1932                 }
1933         }
1934         skb_dst_set(skb, &rth->dst);
1935         err = 0;
1936         goto out;
1937
1938 no_route:
1939         RT_CACHE_STAT_INC(in_no_route);
1940         res.type = RTN_UNREACHABLE;
1941         res.fi = NULL;
1942         res.table = NULL;
1943         goto local_input;
1944
1945         /*
1946          *      Do not cache martian addresses: they should be logged (RFC1812)
1947          */
1948 martian_destination:
1949         RT_CACHE_STAT_INC(in_martian_dst);
1950 #ifdef CONFIG_IP_ROUTE_VERBOSE
1951         if (IN_DEV_LOG_MARTIANS(in_dev))
1952                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1953                                      &daddr, &saddr, dev->name);
1954 #endif
1955
1956 e_inval:
1957         err = -EINVAL;
1958         goto out;
1959
1960 e_nobufs:
1961         err = -ENOBUFS;
1962         goto out;
1963
1964 martian_source:
1965         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1966         goto out;
1967 }
1968
1969 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1970                          u8 tos, struct net_device *dev)
1971 {
1972         int res;
1973
1974         rcu_read_lock();
1975
1976         /* Multicast recognition logic was moved from the route cache to here.
1977            The problem was that too many Ethernet cards have broken/missing
1978            hardware multicast filters :-( As a result, a host on a multicast
1979            network acquires a lot of useless route cache entries, e.g. for
1980            SDR messages from all over the world. Now we try to get rid of them.
1981            Really, provided the software IP multicast filter is organized
1982            reasonably (at least, hashed), it does not result in a slowdown
1983            compared with route cache reject entries.
1984            Note that multicast routers are not affected, because a
1985            route cache entry is created eventually.
1986          */
1987         if (ipv4_is_multicast(daddr)) {
1988                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1989                 int our = 0;
1990
1991                 if (in_dev)
1992                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
1993                                               ip_hdr(skb)->protocol);
1994
1995                 /* check l3 master if no match yet */
1996                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
1997                         struct in_device *l3_in_dev;
1998
1999                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2000                         if (l3_in_dev)
2001                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2002                                                       ip_hdr(skb)->protocol);
2003                 }
2004
2005                 res = -EINVAL;
2006                 if (our
2007 #ifdef CONFIG_IP_MROUTE
2008                         ||
2009                     (!ipv4_is_local_multicast(daddr) &&
2010                      IN_DEV_MFORWARD(in_dev))
2011 #endif
2012                    ) {
2013                         res = ip_route_input_mc(skb, daddr, saddr,
2014                                                 tos, dev, our);
2015                 }
2016                 rcu_read_unlock();
2017                 return res;
2018         }
2019         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2020         rcu_read_unlock();
2021         return res;
2022 }
2023 EXPORT_SYMBOL(ip_route_input_noref);
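
/* A hedged sketch of the typical caller, the IP receive path (assumed
 * here, not shown in this file):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * "noref" indicates the dst may be attached with skb_dst_set_noref(),
 * i.e. without a reference, which is only safe while the caller stays
 * inside an RCU read-side section.
 */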
2024
2025 /* called with rcu_read_lock() */
2026 static struct rtable *__mkroute_output(const struct fib_result *res,
2027                                        const struct flowi4 *fl4, int orig_oif,
2028                                        struct net_device *dev_out,
2029                                        unsigned int flags)
2030 {
2031         struct fib_info *fi = res->fi;
2032         struct fib_nh_exception *fnhe;
2033         struct in_device *in_dev;
2034         u16 type = res->type;
2035         struct rtable *rth;
2036         bool do_cache;
2037
2038         in_dev = __in_dev_get_rcu(dev_out);
2039         if (!in_dev)
2040                 return ERR_PTR(-EINVAL);
2041
2042         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2043                 if (ipv4_is_loopback(fl4->saddr) &&
2044                     !(dev_out->flags & IFF_LOOPBACK) &&
2045                     !netif_is_l3_master(dev_out))
2046                         return ERR_PTR(-EINVAL);
2047
2048         if (ipv4_is_lbcast(fl4->daddr))
2049                 type = RTN_BROADCAST;
2050         else if (ipv4_is_multicast(fl4->daddr))
2051                 type = RTN_MULTICAST;
2052         else if (ipv4_is_zeronet(fl4->daddr))
2053                 return ERR_PTR(-EINVAL);
2054
2055         if (dev_out->flags & IFF_LOOPBACK)
2056                 flags |= RTCF_LOCAL;
2057
2058         do_cache = true;
2059         if (type == RTN_BROADCAST) {
2060                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2061                 fi = NULL;
2062         } else if (type == RTN_MULTICAST) {
2063                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2064                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2065                                      fl4->flowi4_proto))
2066                         flags &= ~RTCF_LOCAL;
2067                 else
2068                         do_cache = false;
2069                 /* If a multicast route does not exist, use the
2070                  * default one, but do not use a gateway in this case.
2071                  * Yes, it is a hack.
2072                  */
2073                 if (fi && res->prefixlen < 4)
2074                         fi = NULL;
2075         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2076                    (orig_oif != dev_out->ifindex)) {
2077                 /* For local routes that require a particular output interface
2078                  * we do not want to cache the result.  Caching the result
2079                  * causes incorrect behaviour when there are multiple source
2080                  * addresses on the interface: if the intended recipient is
2081                  * waiting on that interface for the packet, it won't receive
2082                  * it, because the packet will be delivered on the loopback
2083                  * interface and the IP_PKTINFO ipi_ifindex will be set to
2084                  * the loopback interface as well.
2085                  */
2086                 fi = NULL;
2087         }
2088
2089         fnhe = NULL;
2090         do_cache &= fi != NULL;
2091         if (do_cache) {
2092                 struct rtable __rcu **prth;
2093                 struct fib_nh *nh = &FIB_RES_NH(*res);
2094
2095                 fnhe = find_exception(nh, fl4->daddr);
2096                 if (fnhe) {
2097                         prth = &fnhe->fnhe_rth_output;
2098                         rth = rcu_dereference(*prth);
2099                         if (rth && rth->dst.expires &&
2100                             time_after(jiffies, rth->dst.expires)) {
2101                                 ip_del_fnhe(nh, fl4->daddr);
2102                                 fnhe = NULL;
2103                         } else {
2104                                 goto rt_cache;
2105                         }
2106                 }
2107
2108                 if (unlikely(fl4->flowi4_flags &
2109                              FLOWI_FLAG_KNOWN_NH &&
2110                              !(nh->nh_gw &&
2111                                nh->nh_scope == RT_SCOPE_LINK))) {
2112                         do_cache = false;
2113                         goto add;
2114                 }
2115                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2116                 rth = rcu_dereference(*prth);
2117
2118 rt_cache:
2119                 if (rt_cache_valid(rth)) {
2120                         dst_hold(&rth->dst);
2121                         return rth;
2122                 }
2123         }
2124
2125 add:
2126         rth = rt_dst_alloc(dev_out, flags, type,
2127                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2128                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2129                            do_cache);
2130         if (!rth)
2131                 return ERR_PTR(-ENOBUFS);
2132
2133         rth->rt_iif     = orig_oif ? : 0;
2134         if (res->table)
2135                 rth->rt_table_id = res->table->tb_id;
2136
2137         RT_CACHE_STAT_INC(out_slow_tot);
2138
2139         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2140                 if (flags & RTCF_LOCAL &&
2141                     !(dev_out->flags & IFF_LOOPBACK)) {
2142                         rth->dst.output = ip_mc_output;
2143                         RT_CACHE_STAT_INC(out_slow_mc);
2144                 }
2145 #ifdef CONFIG_IP_MROUTE
2146                 if (type == RTN_MULTICAST) {
2147                         if (IN_DEV_MFORWARD(in_dev) &&
2148                             !ipv4_is_local_multicast(fl4->daddr)) {
2149                                 rth->dst.input = ip_mr_input;
2150                                 rth->dst.output = ip_mc_output;
2151                         }
2152                 }
2153 #endif
2154         }
2155
2156         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2157         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2158                 rth->dst.output = lwtunnel_output;
2159
2160         return rth;
2161 }
2162
2163 /*
2164  * Major route resolver routine.
2165  */
2166
2167 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2168                                           int mp_hash)
2169 {
2170         struct net_device *dev_out = NULL;
2171         __u8 tos = RT_FL_TOS(fl4);
2172         unsigned int flags = 0;
2173         struct fib_result res;
2174         struct rtable *rth;
2175         int orig_oif;
2176         int err = -ENETUNREACH;
2177
2178         res.tclassid    = 0;
2179         res.fi          = NULL;
2180         res.table       = NULL;
2181
2182         orig_oif = fl4->flowi4_oif;
2183
2184         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2185         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2186         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2187                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2188
2189         rcu_read_lock();
2190         if (fl4->saddr) {
2191                 rth = ERR_PTR(-EINVAL);
2192                 if (ipv4_is_multicast(fl4->saddr) ||
2193                     ipv4_is_lbcast(fl4->saddr) ||
2194                     ipv4_is_zeronet(fl4->saddr))
2195                         goto out;
2196
2197                 /* I removed the check for oif == dev_out->oif here.
2198                    It was wrong for two reasons:
2199                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2200                       is assigned to multiple interfaces.
2201                    2. Moreover, we are allowed to send packets with the saddr
2202                       of another iface. --ANK
2203                  */
2204
2205                 if (fl4->flowi4_oif == 0 &&
2206                     (ipv4_is_multicast(fl4->daddr) ||
2207                      ipv4_is_lbcast(fl4->daddr))) {
2208                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2209                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2210                         if (!dev_out)
2211                                 goto out;
2212
2213                         /* Special hack: the user can direct multicasts
2214                            and limited broadcasts via the necessary interface
2215                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2216                            This hack is not just for fun, it allows
2217                            vic, vat and friends to work.
2218                            They bind a socket to loopback, set ttl to zero
2219                            and expect that it will work.
2220                            From the viewpoint of the routing cache they are broken,
2221                            because we are not allowed to build a multicast path
2222                            with a loopback source addr (look, the routing cache
2223                            cannot know that ttl is zero, so the packet
2224                            will not leave this host and the route is valid).
2225                            Luckily, this hack is a good workaround.
2226                          */
2227
2228                         fl4->flowi4_oif = dev_out->ifindex;
2229                         goto make_route;
2230                 }
2231
2232                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2233                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2234                         if (!__ip_dev_find(net, fl4->saddr, false))
2235                                 goto out;
2236                 }
2237         }
2238
2239
2240         if (fl4->flowi4_oif) {
2241                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2242                 rth = ERR_PTR(-ENODEV);
2243                 if (!dev_out)
2244                         goto out;
2245
2246                 /* RACE: Check return value of inet_select_addr instead. */
2247                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2248                         rth = ERR_PTR(-ENETUNREACH);
2249                         goto out;
2250                 }
2251                 if (ipv4_is_local_multicast(fl4->daddr) ||
2252                     ipv4_is_lbcast(fl4->daddr) ||
2253                     fl4->flowi4_proto == IPPROTO_IGMP) {
2254                         if (!fl4->saddr)
2255                                 fl4->saddr = inet_select_addr(dev_out, 0,
2256                                                               RT_SCOPE_LINK);
2257                         goto make_route;
2258                 }
2259                 if (!fl4->saddr) {
2260                         if (ipv4_is_multicast(fl4->daddr))
2261                                 fl4->saddr = inet_select_addr(dev_out, 0,
2262                                                               fl4->flowi4_scope);
2263                         else if (!fl4->daddr)
2264                                 fl4->saddr = inet_select_addr(dev_out, 0,
2265                                                               RT_SCOPE_HOST);
2266                 }
2267         }
2268
2269         if (!fl4->daddr) {
2270                 fl4->daddr = fl4->saddr;
2271                 if (!fl4->daddr)
2272                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2273                 dev_out = net->loopback_dev;
2274                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2275                 res.type = RTN_LOCAL;
2276                 flags |= RTCF_LOCAL;
2277                 goto make_route;
2278         }
2279
2280         err = fib_lookup(net, fl4, &res, 0);
2281         if (err) {
2282                 res.fi = NULL;
2283                 res.table = NULL;
2284                 if (fl4->flowi4_oif &&
2285                     (ipv4_is_multicast(fl4->daddr) ||
2286                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2287                         /* Apparently, the routing tables are wrong. Assume
2288                            that the destination is on-link.
2289
2290                            WHY? DW.
2291                            Because we are allowed to send to an iface
2292                            even if it has NO routes and NO assigned
2293                            addresses. When oif is specified, routing
2294                            tables are looked up with only one purpose:
2295                            to catch whether the destination is gatewayed
2296                            rather than direct. Moreover, if MSG_DONTROUTE is
2297                            set, we send the packet, ignoring both routing
2298                            tables and ifaddr state. --ANK
2299
2300
2301                            We could do it even if oif is unknown,
2302                            likely IPv6, but we do not.
2303                          */
2304
2305                         if (fl4->saddr == 0)
2306                                 fl4->saddr = inet_select_addr(dev_out, 0,
2307                                                               RT_SCOPE_LINK);
2308                         res.type = RTN_UNICAST;
2309                         goto make_route;
2310                 }
2311                 rth = ERR_PTR(err);
2312                 goto out;
2313         }
2314
2315         if (res.type == RTN_LOCAL) {
2316                 if (!fl4->saddr) {
2317                         if (res.fi->fib_prefsrc)
2318                                 fl4->saddr = res.fi->fib_prefsrc;
2319                         else
2320                                 fl4->saddr = fl4->daddr;
2321                 }
2322
2323                 /* L3 master device is the loopback for that domain */
2324                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2325                 fl4->flowi4_oif = dev_out->ifindex;
2326                 flags |= RTCF_LOCAL;
2327                 goto make_route;
2328         }
2329
2330         fib_select_path(net, &res, fl4, mp_hash);
2331
2332         dev_out = FIB_RES_DEV(res);
2333         fl4->flowi4_oif = dev_out->ifindex;
2334
2335
2336 make_route:
2337         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2338
2339 out:
2340         rcu_read_unlock();
2341         return rth;
2342 }
2343 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
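
/* Most callers reach this through the __ip_route_output_key() wrapper,
 * which passes mp_hash = -1 so fib_select_path() computes the multipath
 * hash itself.  A minimal sketch (daddr is assumed):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	rt = __ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */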
2344
2345 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2346 {
2347         return NULL;
2348 }
2349
2350 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2351 {
2352         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2353
2354         return mtu ? : dst->dev->mtu;
2355 }
2356
2357 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2358                                           struct sk_buff *skb, u32 mtu)
2359 {
2360 }
2361
2362 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2363                                        struct sk_buff *skb)
2364 {
2365 }
2366
2367 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2368                                           unsigned long old)
2369 {
2370         return NULL;
2371 }
2372
2373 static struct dst_ops ipv4_dst_blackhole_ops = {
2374         .family                 =       AF_INET,
2375         .check                  =       ipv4_blackhole_dst_check,
2376         .mtu                    =       ipv4_blackhole_mtu,
2377         .default_advmss         =       ipv4_default_advmss,
2378         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2379         .redirect               =       ipv4_rt_blackhole_redirect,
2380         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2381         .neigh_lookup           =       ipv4_neigh_lookup,
2382 };
2383
2384 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2385 {
2386         struct rtable *ort = (struct rtable *) dst_orig;
2387         struct rtable *rt;
2388
2389         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2390         if (rt) {
2391                 struct dst_entry *new = &rt->dst;
2392
2393                 new->__use = 1;
2394                 new->input = dst_discard;
2395                 new->output = dst_discard_out;
2396
2397                 new->dev = ort->dst.dev;
2398                 if (new->dev)
2399                         dev_hold(new->dev);
2400
2401                 rt->rt_is_input = ort->rt_is_input;
2402                 rt->rt_iif = ort->rt_iif;
2403                 rt->rt_pmtu = ort->rt_pmtu;
2404
2405                 rt->rt_genid = rt_genid_ipv4(net);
2406                 rt->rt_flags = ort->rt_flags;
2407                 rt->rt_type = ort->rt_type;
2408                 rt->rt_gateway = ort->rt_gateway;
2409                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2410
2411                 INIT_LIST_HEAD(&rt->rt_uncached);
2412                 dst_free(new);
2413         }
2414
2415         dst_release(dst_orig);
2416
2417         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2418 }
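
/* A blackhole route is a detached copy whose input/output handlers
 * simply discard packets; xfrm uses it, for instance, when a lookup
 * must return a dst that is safe to hold but must never transmit.
 * Hedged sketch (dst is assumed held by the caller and is consumed):
 *
 *	dst = ipv4_blackhole_route(net, dst);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */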
2419
2420 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2421                                     const struct sock *sk)
2422 {
2423         struct rtable *rt = __ip_route_output_key(net, flp4);
2424
2425         if (IS_ERR(rt))
2426                 return rt;
2427
2428         if (flp4->flowi4_proto)
2429                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2430                                                         flowi4_to_flowi(flp4),
2431                                                         sk, 0);
2432
2433         return rt;
2434 }
2435 EXPORT_SYMBOL_GPL(ip_route_output_flow);
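
/* A hedged sketch of an output lookup for a connected socket (field
 * values are illustrative; real callers usually go through helpers
 * such as ip_route_connect()):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.saddr = saddr;
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * A non-zero flowi4_proto is what triggers the xfrm_lookup_route()
 * policy pass above.
 */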
2436
2437 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2438                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2439                         u32 seq, int event, int nowait, unsigned int flags)
2440 {
2441         struct rtable *rt = skb_rtable(skb);
2442         struct rtmsg *r;
2443         struct nlmsghdr *nlh;
2444         unsigned long expires = 0;
2445         u32 error;
2446         u32 metrics[RTAX_MAX];
2447
2448         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2449         if (!nlh)
2450                 return -EMSGSIZE;
2451
2452         r = nlmsg_data(nlh);
2453         r->rtm_family    = AF_INET;
2454         r->rtm_dst_len  = 32;
2455         r->rtm_src_len  = 0;
2456         r->rtm_tos      = fl4->flowi4_tos;
2457         r->rtm_table    = table_id;
2458         if (nla_put_u32(skb, RTA_TABLE, table_id))
2459                 goto nla_put_failure;
2460         r->rtm_type     = rt->rt_type;
2461         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2462         r->rtm_protocol = RTPROT_UNSPEC;
2463         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2464         if (rt->rt_flags & RTCF_NOTIFY)
2465                 r->rtm_flags |= RTM_F_NOTIFY;
2466         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2467                 r->rtm_flags |= RTCF_DOREDIRECT;
2468
2469         if (nla_put_in_addr(skb, RTA_DST, dst))
2470                 goto nla_put_failure;
2471         if (src) {
2472                 r->rtm_src_len = 32;
2473                 if (nla_put_in_addr(skb, RTA_SRC, src))
2474                         goto nla_put_failure;
2475         }
2476         if (rt->dst.dev &&
2477             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2478                 goto nla_put_failure;
2479 #ifdef CONFIG_IP_ROUTE_CLASSID
2480         if (rt->dst.tclassid &&
2481             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2482                 goto nla_put_failure;
2483 #endif
2484         if (!rt_is_input_route(rt) &&
2485             fl4->saddr != src) {
2486                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2487                         goto nla_put_failure;
2488         }
2489         if (rt->rt_uses_gateway &&
2490             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2491                 goto nla_put_failure;
2492
2493         expires = rt->dst.expires;
2494         if (expires) {
2495                 unsigned long now = jiffies;
2496
2497                 if (time_before(now, expires))
2498                         expires -= now;
2499                 else
2500                         expires = 0;
2501         }
2502
2503         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2504         if (rt->rt_pmtu && expires)
2505                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2506         if (rtnetlink_put_metrics(skb, metrics) < 0)
2507                 goto nla_put_failure;
2508
2509         if (fl4->flowi4_mark &&
2510             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2511                 goto nla_put_failure;
2512
2513         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2514             nla_put_u32(skb, RTA_UID,
2515                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2516                 goto nla_put_failure;
2517
2518         error = rt->dst.error;
2519
2520         if (rt_is_input_route(rt)) {
2521 #ifdef CONFIG_IP_MROUTE
2522                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2523                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2524                         int err = ipmr_get_route(net, skb,
2525                                                  fl4->saddr, fl4->daddr,
2526                                                  r, nowait, portid);
2527
2528                         if (err <= 0) {
2529                                 if (!nowait) {
2530                                         if (err == 0)
2531                                                 return 0;
2532                                         goto nla_put_failure;
2533                                 } else {
2534                                         if (err == -EMSGSIZE)
2535                                                 goto nla_put_failure;
2536                                         error = err;
2537                                 }
2538                         }
2539                 } else
2540 #endif
2541                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2542                                 goto nla_put_failure;
2543         }
2544
2545         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2546                 goto nla_put_failure;
2547
2548         nlmsg_end(skb, nlh);
2549         return 0;
2550
2551 nla_put_failure:
2552         nlmsg_cancel(skb, nlh);
2553         return -EMSGSIZE;
2554 }
2555
2556 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2557 {
2558         struct net *net = sock_net(in_skb->sk);
2559         struct rtmsg *rtm;
2560         struct nlattr *tb[RTA_MAX+1];
2561         struct rtable *rt = NULL;
2562         struct flowi4 fl4;
2563         __be32 dst = 0;
2564         __be32 src = 0;
2565         u32 iif;
2566         int err;
2567         int mark;
2568         struct sk_buff *skb;
2569         u32 table_id = RT_TABLE_MAIN;
2570         kuid_t uid;
2571
2572         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2573         if (err < 0)
2574                 goto errout;
2575
2576         rtm = nlmsg_data(nlh);
2577
2578         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2579         if (!skb) {
2580                 err = -ENOBUFS;
2581                 goto errout;
2582         }
2583
2584         /* Reserve room for dummy headers; this skb can pass
2585            through a good chunk of the routing engine.
2586          */
2587         skb_reset_mac_header(skb);
2588         skb_reset_network_header(skb);
2589
2590         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2591         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2592         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2593
2594         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2595         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2596         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2597         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2598         if (tb[RTA_UID])
2599                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2600         else
2601                 uid = (iif ? INVALID_UID : current_uid());
2602
2603         memset(&fl4, 0, sizeof(fl4));
2604         fl4.daddr = dst;
2605         fl4.saddr = src;
2606         fl4.flowi4_tos = rtm->rtm_tos;
2607         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2608         fl4.flowi4_mark = mark;
2609         fl4.flowi4_uid = uid;
2610
2611         if (iif) {
2612                 struct net_device *dev;
2613
2614                 dev = __dev_get_by_index(net, iif);
2615                 if (!dev) {
2616                         err = -ENODEV;
2617                         goto errout_free;
2618                 }
2619
2620                 skb->protocol   = htons(ETH_P_IP);
2621                 skb->dev        = dev;
2622                 skb->mark       = mark;
2623                 local_bh_disable();
2624                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2625                 local_bh_enable();
2626
2627                 rt = skb_rtable(skb);
2628                 if (err == 0 && rt->dst.error)
2629                         err = -rt->dst.error;
2630         } else {
2631                 rt = ip_route_output_key(net, &fl4);
2632
2633                 err = 0;
2634                 if (IS_ERR(rt))
2635                         err = PTR_ERR(rt);
2636         }
2637
2638         if (err)
2639                 goto errout_free;
2640
2641         skb_dst_set(skb, &rt->dst);
2642         if (rtm->rtm_flags & RTM_F_NOTIFY)
2643                 rt->rt_flags |= RTCF_NOTIFY;
2644
2645         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2646                 table_id = rt->rt_table_id;
2647
2648         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2649                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2650                            RTM_NEWROUTE, 0, 0);
2651         if (err < 0)
2652                 goto errout_free;
2653
2654         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2655 errout:
2656         return err;
2657
2658 errout_free:
2659         kfree_skb(skb);
2660         goto errout;
2661 }
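
/* This handler backs "ip route get"; e.g. (illustrative):
 *
 *	$ ip route get 198.51.100.2
 *
 * sends an RTM_GETROUTE request, which is answered with the
 * RTM_NEWROUTE message that rt_fill_info() builds above.  Supplying
 * RTA_IIF instead simulates the input path via ip_route_input().
 */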
2662
2663 void ip_rt_multicast_event(struct in_device *in_dev)
2664 {
2665         rt_cache_flush(dev_net(in_dev->dev));
2666 }
2667
2668 #ifdef CONFIG_SYSCTL
2669 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2670 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2671 static int ip_rt_gc_elasticity __read_mostly    = 8;
2672
2673 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2674                                         void __user *buffer,
2675                                         size_t *lenp, loff_t *ppos)
2676 {
2677         struct net *net = (struct net *)__ctl->extra1;
2678
2679         if (write) {
2680                 rt_cache_flush(net);
2681                 fnhe_genid_bump(net);
2682                 return 0;
2683         }
2684
2685         return -EINVAL;
2686 }
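
/* The flush control is write-only; e.g. (illustrative):
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * bumps the route and fnhe generation counters, so cached dsts are
 * invalidated lazily on their next genid check rather than being freed
 * synchronously.
 */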
2687
2688 static struct ctl_table ipv4_route_table[] = {
2689         {
2690                 .procname       = "gc_thresh",
2691                 .data           = &ipv4_dst_ops.gc_thresh,
2692                 .maxlen         = sizeof(int),
2693                 .mode           = 0644,
2694                 .proc_handler   = proc_dointvec,
2695         },
2696         {
2697                 .procname       = "max_size",
2698                 .data           = &ip_rt_max_size,
2699                 .maxlen         = sizeof(int),
2700                 .mode           = 0644,
2701                 .proc_handler   = proc_dointvec,
2702         },
2703         {
2704                 /* Deprecated. Use gc_min_interval_ms */
2705
2706                 .procname       = "gc_min_interval",
2707                 .data           = &ip_rt_gc_min_interval,
2708                 .maxlen         = sizeof(int),
2709                 .mode           = 0644,
2710                 .proc_handler   = proc_dointvec_jiffies,
2711         },
2712         {
2713                 .procname       = "gc_min_interval_ms",
2714                 .data           = &ip_rt_gc_min_interval,
2715                 .maxlen         = sizeof(int),
2716                 .mode           = 0644,
2717                 .proc_handler   = proc_dointvec_ms_jiffies,
2718         },
2719         {
2720                 .procname       = "gc_timeout",
2721                 .data           = &ip_rt_gc_timeout,
2722                 .maxlen         = sizeof(int),
2723                 .mode           = 0644,
2724                 .proc_handler   = proc_dointvec_jiffies,
2725         },
2726         {
2727                 .procname       = "gc_interval",
2728                 .data           = &ip_rt_gc_interval,
2729                 .maxlen         = sizeof(int),
2730                 .mode           = 0644,
2731                 .proc_handler   = proc_dointvec_jiffies,
2732         },
2733         {
2734                 .procname       = "redirect_load",
2735                 .data           = &ip_rt_redirect_load,
2736                 .maxlen         = sizeof(int),
2737                 .mode           = 0644,
2738                 .proc_handler   = proc_dointvec,
2739         },
2740         {
2741                 .procname       = "redirect_number",
2742                 .data           = &ip_rt_redirect_number,
2743                 .maxlen         = sizeof(int),
2744                 .mode           = 0644,
2745                 .proc_handler   = proc_dointvec,
2746         },
2747         {
2748                 .procname       = "redirect_silence",
2749                 .data           = &ip_rt_redirect_silence,
2750                 .maxlen         = sizeof(int),
2751                 .mode           = 0644,
2752                 .proc_handler   = proc_dointvec,
2753         },
2754         {
2755                 .procname       = "error_cost",
2756                 .data           = &ip_rt_error_cost,
2757                 .maxlen         = sizeof(int),
2758                 .mode           = 0644,
2759                 .proc_handler   = proc_dointvec,
2760         },
2761         {
2762                 .procname       = "error_burst",
2763                 .data           = &ip_rt_error_burst,
2764                 .maxlen         = sizeof(int),
2765                 .mode           = 0644,
2766                 .proc_handler   = proc_dointvec,
2767         },
2768         {
2769                 .procname       = "gc_elasticity",
2770                 .data           = &ip_rt_gc_elasticity,
2771                 .maxlen         = sizeof(int),
2772                 .mode           = 0644,
2773                 .proc_handler   = proc_dointvec,
2774         },
2775         {
2776                 .procname       = "mtu_expires",
2777                 .data           = &ip_rt_mtu_expires,
2778                 .maxlen         = sizeof(int),
2779                 .mode           = 0644,
2780                 .proc_handler   = proc_dointvec_jiffies,
2781         },
2782         {
2783                 .procname       = "min_pmtu",
2784                 .data           = &ip_rt_min_pmtu,
2785                 .maxlen         = sizeof(int),
2786                 .mode           = 0644,
2787                 .proc_handler   = proc_dointvec,
2788         },
2789         {
2790                 .procname       = "min_adv_mss",
2791                 .data           = &ip_rt_min_advmss,
2792                 .maxlen         = sizeof(int),
2793                 .mode           = 0644,
2794                 .proc_handler   = proc_dointvec,
2795         },
2796         { }
2797 };
2798
2799 static struct ctl_table ipv4_route_flush_table[] = {
2800         {
2801                 .procname       = "flush",
2802                 .maxlen         = sizeof(int),
2803                 .mode           = 0200,
2804                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2805         },
2806         { },
2807 };
2808
2809 static __net_init int sysctl_route_net_init(struct net *net)
2810 {
2811         struct ctl_table *tbl;
2812
2813         tbl = ipv4_route_flush_table;
2814         if (!net_eq(net, &init_net)) {
2815                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2816                 if (!tbl)
2817                         goto err_dup;
2818
2819                 /* Don't export sysctls to unprivileged users */
2820                 if (net->user_ns != &init_user_ns)
2821                         tbl[0].procname = NULL;
2822         }
2823         tbl[0].extra1 = net;
2824
2825         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2826         if (!net->ipv4.route_hdr)
2827                 goto err_reg;
2828         return 0;
2829
2830 err_reg:
2831         if (tbl != ipv4_route_flush_table)
2832                 kfree(tbl);
2833 err_dup:
2834         return -ENOMEM;
2835 }
2836
2837 static __net_exit void sysctl_route_net_exit(struct net *net)
2838 {
2839         struct ctl_table *tbl;
2840
2841         tbl = net->ipv4.route_hdr->ctl_table_arg;
2842         unregister_net_sysctl_table(net->ipv4.route_hdr);
2843         BUG_ON(tbl == ipv4_route_flush_table);
2844         kfree(tbl);
2845 }
2846
2847 static __net_initdata struct pernet_operations sysctl_route_ops = {
2848         .init = sysctl_route_net_init,
2849         .exit = sysctl_route_net_exit,
2850 };
2851 #endif
2852
2853 static __net_init int rt_genid_init(struct net *net)
2854 {
2855         atomic_set(&net->ipv4.rt_genid, 0);
2856         atomic_set(&net->fnhe_genid, 0);
2857         get_random_bytes(&net->ipv4.dev_addr_genid,
2858                          sizeof(net->ipv4.dev_addr_genid));
2859         return 0;
2860 }
2861
2862 static __net_initdata struct pernet_operations rt_genid_ops = {
2863         .init = rt_genid_init,
2864 };
2865
2866 static int __net_init ipv4_inetpeer_init(struct net *net)
2867 {
2868         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2869
2870         if (!bp)
2871                 return -ENOMEM;
2872         inet_peer_base_init(bp);
2873         net->ipv4.peers = bp;
2874         return 0;
2875 }
2876
2877 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2878 {
2879         struct inet_peer_base *bp = net->ipv4.peers;
2880
2881         net->ipv4.peers = NULL;
2882         inetpeer_invalidate_tree(bp);
2883         kfree(bp);
2884 }
2885
2886 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2887         .init   =       ipv4_inetpeer_init,
2888         .exit   =       ipv4_inetpeer_exit,
2889 };
2890
2891 #ifdef CONFIG_IP_ROUTE_CLASSID
2892 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2893 #endif /* CONFIG_IP_ROUTE_CLASSID */
2894
2895 int __init ip_rt_init(void)
2896 {
2897         int rc = 0;
2898         int cpu;
2899
2900         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2901         if (!ip_idents)
2902                 panic("IP: failed to allocate ip_idents\n");
2903
2904         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2905
2906         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2907         if (!ip_tstamps)
2908                 panic("IP: failed to allocate ip_tstamps\n");
2909
2910         for_each_possible_cpu(cpu) {
2911                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2912
2913                 INIT_LIST_HEAD(&ul->head);
2914                 spin_lock_init(&ul->lock);
2915         }
2916 #ifdef CONFIG_IP_ROUTE_CLASSID
2917         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2918         if (!ip_rt_acct)
2919                 panic("IP: failed to allocate ip_rt_acct\n");
2920 #endif
2921
2922         ipv4_dst_ops.kmem_cachep =
2923                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2924                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2925
2926         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2927
2928         if (dst_entries_init(&ipv4_dst_ops) < 0)
2929                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2930
2931         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2932                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2933
2934         ipv4_dst_ops.gc_thresh = ~0;
2935         ip_rt_max_size = INT_MAX;
2936
2937         devinet_init();
2938         ip_fib_init();
2939
2940         if (ip_rt_proc_init())
2941                 pr_err("Unable to create route proc files\n");
2942 #ifdef CONFIG_XFRM
2943         xfrm_init();
2944         xfrm4_init();
2945 #endif
2946         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2947
2948 #ifdef CONFIG_SYSCTL
2949         register_pernet_subsys(&sysctl_route_ops);
2950 #endif
2951         register_pernet_subsys(&rt_genid_ops);
2952         register_pernet_subsys(&ipv4_inetpeer_ops);
2953         return rc;
2954 }
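
/* ip_rt_init() runs early in inet initialization (from ip_init(), as
 * far as the usual init ordering goes).  Note that dst garbage
 * collection is effectively disabled here: gc_thresh is ~0 and
 * ip_rt_max_size is INT_MAX, since post-route-cache kernels free
 * routes via RCU instead of a GC pass.
 */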
2955
2956 #ifdef CONFIG_SYSCTL
2957 /*
2958  * We really need to sanitize the damn ipv4 init order, then all
2959  * this nonsense will go away.
2960  */
2961 void __init ip_static_sysctl_init(void)
2962 {
2963         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2964 }
2965 #endif