]> Git Repo - linux.git/blob - net/ipv4/route.c
bpf: allow for tailcalls in BPF subprograms for x64 JIT
[linux.git] / net / ipv4 / route.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              ROUTE - implementation of the IP router.
8  *
9  * Authors:     Ross Biro
10  *              Fred N. van Kempen, <[email protected]>
11  *              Alan Cox, <[email protected]>
12  *              Linus Torvalds, <[email protected]>
13  *              Alexey Kuznetsov, <[email protected]>
14  *
15  * Fixes:
16  *              Alan Cox        :       Verify area fixes.
17  *              Alan Cox        :       cli() protects routing changes
18  *              Rui Oliveira    :       ICMP routing table updates
19  *              ([email protected])      Routing table insertion and update
20  *              Linus Torvalds  :       Rewrote bits to be sensible
21  *              Alan Cox        :       Added BSD route gw semantics
22  *              Alan Cox        :       Super /proc >4K
23  *              Alan Cox        :       MTU in route table
24  *              Alan Cox        :       MSS actually. Also added the window
25  *                                      clamper.
26  *              Sam Lantinga    :       Fixed route matching in rt_del()
27  *              Alan Cox        :       Routing cache support.
28  *              Alan Cox        :       Removed compatibility cruft.
29  *              Alan Cox        :       RTF_REJECT support.
30  *              Alan Cox        :       TCP irtt support.
31  *              Jonathan Naylor :       Added Metric support.
32  *      Miquel van Smoorenburg  :       BSD API fixes.
33  *      Miquel van Smoorenburg  :       Metrics.
34  *              Alan Cox        :       Use __u32 properly
35  *              Alan Cox        :       Aligned routing errors more closely with BSD
36  *                                      our system is still very different.
37  *              Alan Cox        :       Faster /proc handling
38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
39  *                                      routing caches and better behaviour.
40  *
41  *              Olaf Erb        :       irtt wasn't being copied right.
42  *              Bjorn Ekwall    :       Kerneld route support.
43  *              Alan Cox        :       Multicast fixed (I hope)
44  *              Pavel Krauz     :       Limited broadcast fixed
45  *              Mike McLagan    :       Routing by source
46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
47  *                                      route.c and rewritten from scratch.
48  *              Andi Kleen      :       Load-limit warning messages.
49  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
53  *              Marc Boucher    :       routing by fwmark
54  *      Robert Olsson           :       Added rt_cache statistics
55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
59  */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/string.h>
70 #include <linux/socket.h>
71 #include <linux/sockios.h>
72 #include <linux/errno.h>
73 #include <linux/in.h>
74 #include <linux/inet.h>
75 #include <linux/netdevice.h>
76 #include <linux/proc_fs.h>
77 #include <linux/init.h>
78 #include <linux/skbuff.h>
79 #include <linux/inetdevice.h>
80 #include <linux/igmp.h>
81 #include <linux/pkt_sched.h>
82 #include <linux/mroute.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/random.h>
85 #include <linux/rcupdate.h>
86 #include <linux/times.h>
87 #include <linux/slab.h>
88 #include <linux/jhash.h>
89 #include <net/dst.h>
90 #include <net/dst_metadata.h>
91 #include <net/net_namespace.h>
92 #include <net/protocol.h>
93 #include <net/ip.h>
94 #include <net/route.h>
95 #include <net/inetpeer.h>
96 #include <net/sock.h>
97 #include <net/ip_fib.h>
98 #include <net/nexthop.h>
99 #include <net/arp.h>
100 #include <net/tcp.h>
101 #include <net/icmp.h>
102 #include <net/xfrm.h>
103 #include <net/lwtunnel.h>
104 #include <net/netevent.h>
105 #include <net/rtnetlink.h>
106 #ifdef CONFIG_SYSCTL
107 #include <linux/sysctl.h>
108 #endif
109 #include <net/secure_seq.h>
110 #include <net/ip_tunnels.h>
111 #include <net/l3mdev.h>
112
113 #include "fib_lookup.h"
114
/* Mask a flow's flowi4_tos down to the IPTOS_RT_MASK bits plus the RTO_ONLINK flag. */
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
/* 300 * HZ jiffies; used as the initial value of ip_rt_gc_timeout. */
118 #define RT_GC_TIMEOUT (300*HZ)
119
/*
 * Routing tunables and their built-in defaults.  Values written in terms
 * of HZ are time spans expressed in jiffies.
 * NOTE(review): these look like the backing variables of sysctl knobs;
 * the sysctl table is not visible in this part of the file - confirm.
 */
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly  = 9;
122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
/* Same arithmetic as the two defaults above: (HZ / 50) << (9 + 1). */
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly       = HZ;
125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
/* 10 minutes. */
126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
/* 552 bytes; presumably 512 of payload plus 20-byte IP and TCP headers - confirm. */
127 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly       = 256;
129
/* Added to jiffies as the expiry of redirect exceptions in __ip_do_redirect(). */
130 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
131
132 /*
133  *      Interface to generic destination cache.
134  */
135
/*
 * Forward declarations of the callbacks that are installed in the
 * ipv4_dst_ops table defined further down in this file.
 */
136 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
137 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
138 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
139 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
140 static void              ipv4_link_failure(struct sk_buff *skb);
141 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
142                                            struct sk_buff *skb, u32 mtu,
143                                            bool confirm_neigh);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
148 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
149 {
150         WARN_ON(1);
151         return NULL;
152 }
153
/* Forward declarations of the neighbour callbacks referenced by ipv4_dst_ops. */
154 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
155                                            struct sk_buff *skb,
156                                            const void *daddr);
157 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
158
159 static struct dst_ops ipv4_dst_ops = {
160         .family =               AF_INET,
161         .check =                ipv4_dst_check,
162         .default_advmss =       ipv4_default_advmss,
163         .mtu =                  ipv4_mtu,
164         .cow_metrics =          ipv4_cow_metrics,
165         .destroy =              ipv4_dst_destroy,
166         .negative_advice =      ipv4_negative_advice,
167         .link_failure =         ipv4_link_failure,
168         .update_pmtu =          ip_rt_update_pmtu,
169         .redirect =             ip_do_redirect,
170         .local_out =            __ip_local_out,
171         .neigh_lookup =         ipv4_neigh_lookup,
172         .confirm_neigh =        ipv4_confirm_neigh,
173 };
174
175 #define ECN_OR_COST(class)      TC_PRIO_##class
176
177 const __u8 ip_tos2prio[16] = {
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(BESTEFFORT),
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK)
194 };
195 EXPORT_SYMBOL(ip_tos2prio);
196
/* Per-CPU routing statistics; printed one CPU per line by rt_cpu_seq_show(). */
197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
/* Increment one counter of rt_cache_stat using raw_cpu_inc(). */
198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
199
200 #ifdef CONFIG_PROC_FS
201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
202 {
203         if (*pos)
204                 return NULL;
205         return SEQ_START_TOKEN;
206 }
207
208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
209 {
210         ++*pos;
211         return NULL;
212 }
213
/* seq_file ->stop() for the "rt_cache" file: intentionally a no-op. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* ->start() acquires nothing, so there is nothing to release. */
}
217
218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
219 {
220         if (v == SEQ_START_TOKEN)
221                 seq_printf(seq, "%-127s\n",
222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
224                            "HHUptod\tSpecDst");
225         return 0;
226 }
227
228 static const struct seq_operations rt_cache_seq_ops = {
229         .start  = rt_cache_seq_start,
230         .next   = rt_cache_seq_next,
231         .stop   = rt_cache_seq_stop,
232         .show   = rt_cache_seq_show,
233 };
234
235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
236 {
237         return seq_open(file, &rt_cache_seq_ops);
238 }
239
240 static const struct proc_ops rt_cache_proc_ops = {
241         .proc_open      = rt_cache_seq_open,
242         .proc_read      = seq_read,
243         .proc_lseek     = seq_lseek,
244         .proc_release   = seq_release,
245 };
246
247
248 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
249 {
250         int cpu;
251
252         if (*pos == 0)
253                 return SEQ_START_TOKEN;
254
255         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
256                 if (!cpu_possible(cpu))
257                         continue;
258                 *pos = cpu+1;
259                 return &per_cpu(rt_cache_stat, cpu);
260         }
261         return NULL;
262 }
263
264 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
265 {
266         int cpu;
267
268         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
269                 if (!cpu_possible(cpu))
270                         continue;
271                 *pos = cpu+1;
272                 return &per_cpu(rt_cache_stat, cpu);
273         }
274         (*pos)++;
275         return NULL;
276
277 }
278
/* seq_file ->stop() for the per-CPU statistics file: intentionally a no-op. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* ->start() takes no lock or reference, so nothing is undone here. */
}
283
284 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
285 {
286         struct rt_cache_stat *st = v;
287
288         if (v == SEQ_START_TOKEN) {
289                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
290                 return 0;
291         }
292
293         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
294                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
295                    dst_entries_get_slow(&ipv4_dst_ops),
296                    0, /* st->in_hit */
297                    st->in_slow_tot,
298                    st->in_slow_mc,
299                    st->in_no_route,
300                    st->in_brd,
301                    st->in_martian_dst,
302                    st->in_martian_src,
303
304                    0, /* st->out_hit */
305                    st->out_slow_tot,
306                    st->out_slow_mc,
307
308                    0, /* st->gc_total */
309                    0, /* st->gc_ignored */
310                    0, /* st->gc_goal_miss */
311                    0, /* st->gc_dst_overflow */
312                    0, /* st->in_hlist_search */
313                    0  /* st->out_hlist_search */
314                 );
315         return 0;
316 }
317
318 static const struct seq_operations rt_cpu_seq_ops = {
319         .start  = rt_cpu_seq_start,
320         .next   = rt_cpu_seq_next,
321         .stop   = rt_cpu_seq_stop,
322         .show   = rt_cpu_seq_show,
323 };
324
325
326 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
327 {
328         return seq_open(file, &rt_cpu_seq_ops);
329 }
330
331 static const struct proc_ops rt_cpu_proc_ops = {
332         .proc_open      = rt_cpu_seq_open,
333         .proc_read      = seq_read,
334         .proc_lseek     = seq_lseek,
335         .proc_release   = seq_release,
336 };
337
338 #ifdef CONFIG_IP_ROUTE_CLASSID
339 static int rt_acct_proc_show(struct seq_file *m, void *v)
340 {
341         struct ip_rt_acct *dst, *src;
342         unsigned int i, j;
343
344         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
345         if (!dst)
346                 return -ENOMEM;
347
348         for_each_possible_cpu(i) {
349                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
350                 for (j = 0; j < 256; j++) {
351                         dst[j].o_bytes   += src[j].o_bytes;
352                         dst[j].o_packets += src[j].o_packets;
353                         dst[j].i_bytes   += src[j].i_bytes;
354                         dst[j].i_packets += src[j].i_packets;
355                 }
356         }
357
358         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
359         kfree(dst);
360         return 0;
361 }
362 #endif
363
364 static int __net_init ip_rt_do_proc_init(struct net *net)
365 {
366         struct proc_dir_entry *pde;
367
368         pde = proc_create("rt_cache", 0444, net->proc_net,
369                           &rt_cache_proc_ops);
370         if (!pde)
371                 goto err1;
372
373         pde = proc_create("rt_cache", 0444,
374                           net->proc_net_stat, &rt_cpu_proc_ops);
375         if (!pde)
376                 goto err2;
377
378 #ifdef CONFIG_IP_ROUTE_CLASSID
379         pde = proc_create_single("rt_acct", 0, net->proc_net,
380                         rt_acct_proc_show);
381         if (!pde)
382                 goto err3;
383 #endif
384         return 0;
385
386 #ifdef CONFIG_IP_ROUTE_CLASSID
387 err3:
388         remove_proc_entry("rt_cache", net->proc_net_stat);
389 #endif
390 err2:
391         remove_proc_entry("rt_cache", net->proc_net);
392 err1:
393         return -ENOMEM;
394 }
395
396 static void __net_exit ip_rt_do_proc_exit(struct net *net)
397 {
398         remove_proc_entry("rt_cache", net->proc_net_stat);
399         remove_proc_entry("rt_cache", net->proc_net);
400 #ifdef CONFIG_IP_ROUTE_CLASSID
401         remove_proc_entry("rt_acct", net->proc_net);
402 #endif
403 }
404
405 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
406         .init = ip_rt_do_proc_init,
407         .exit = ip_rt_do_proc_exit,
408 };
409
410 static int __init ip_rt_proc_init(void)
411 {
412         return register_pernet_subsys(&ip_rt_proc_ops);
413 }
414
415 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
420 #endif /* CONFIG_PROC_FS */
421
422 static inline bool rt_is_expired(const struct rtable *rth)
423 {
424         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
425 }
426
/*
 * Invalidate the cached IPv4 routes of @net by bumping the namespace's
 * route generation id; rt_is_expired() then reports routes carrying the
 * old id as expired.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
431
432 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
433                                            struct sk_buff *skb,
434                                            const void *daddr)
435 {
436         const struct rtable *rt = container_of(dst, struct rtable, dst);
437         struct net_device *dev = dst->dev;
438         struct neighbour *n;
439
440         rcu_read_lock_bh();
441
442         if (likely(rt->rt_gw_family == AF_INET)) {
443                 n = ip_neigh_gw4(dev, rt->rt_gw4);
444         } else if (rt->rt_gw_family == AF_INET6) {
445                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
446         } else {
447                 __be32 pkey;
448
449                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
450                 n = ip_neigh_gw4(dev, pkey);
451         }
452
453         if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
454                 n = NULL;
455
456         rcu_read_unlock_bh();
457
458         return n;
459 }
460
461 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
462 {
463         const struct rtable *rt = container_of(dst, struct rtable, dst);
464         struct net_device *dev = dst->dev;
465         const __be32 *pkey = daddr;
466
467         if (rt->rt_gw_family == AF_INET) {
468                 pkey = (const __be32 *)&rt->rt_gw4;
469         } else if (rt->rt_gw_family == AF_INET6) {
470                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
471         } else if (!daddr ||
472                  (rt->rt_flags &
473                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
474                 return;
475         }
476         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
477 }
478
/* Modulus applied to the hash in ip_idents_reserve() to pick an array slot. */
479 #define IP_IDENTS_SZ 2048u
480
/*
 * IP ID generators and their last-use timestamps, both indexed by
 * hash % IP_IDENTS_SZ.  The arrays are allocated outside this part of the
 * file.
 */
481 static atomic_t *ip_idents __read_mostly;
482 static u32 *ip_tstamps __read_mostly;
483
484 /* In order to protect privacy, we add a perturbation to identifiers
485  * if one generator is seldom used. This makes hard for an attacker
486  * to infer how many packets were sent between two points in time.
487  */
488 u32 ip_idents_reserve(u32 hash, int segs)
489 {
490         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
491         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
492         u32 old = READ_ONCE(*p_tstamp);
493         u32 now = (u32)jiffies;
494         u32 delta = 0;
495
496         if (old != now && cmpxchg(p_tstamp, old, now) == old)
497                 delta = prandom_u32_max(now - old);
498
499         /* If UBSAN reports an error there, please make sure your compiler
500          * supports -fno-strict-overflow before reporting it that was a bug
501          * in UBSAN, and it has been fixed in GCC-8.
502          */
503         return atomic_add_return(segs + delta, p_id) - segs;
504 }
505 EXPORT_SYMBOL(ip_idents_reserve);
506
507 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
508 {
509         u32 hash, id;
510
511         /* Note the following code is not safe, but this is okay. */
512         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
513                 get_random_bytes(&net->ipv4.ip_id_key,
514                                  sizeof(net->ipv4.ip_id_key));
515
516         hash = siphash_3u32((__force u32)iph->daddr,
517                             (__force u32)iph->saddr,
518                             iph->protocol,
519                             &net->ipv4.ip_id_key);
520         id = ip_idents_reserve(hash, segs);
521         iph->id = htons(id);
522 }
523 EXPORT_SYMBOL(__ip_select_ident);
524
525 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
526                              const struct sock *sk,
527                              const struct iphdr *iph,
528                              int oif, u8 tos,
529                              u8 prot, u32 mark, int flow_flags)
530 {
531         if (sk) {
532                 const struct inet_sock *inet = inet_sk(sk);
533
534                 oif = sk->sk_bound_dev_if;
535                 mark = sk->sk_mark;
536                 tos = RT_CONN_FLAGS(sk);
537                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
538         }
539         flowi4_init_output(fl4, oif, mark, tos,
540                            RT_SCOPE_UNIVERSE, prot,
541                            flow_flags,
542                            iph->daddr, iph->saddr, 0, 0,
543                            sock_net_uid(net, sk));
544 }
545
546 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
547                                const struct sock *sk)
548 {
549         const struct net *net = dev_net(skb->dev);
550         const struct iphdr *iph = ip_hdr(skb);
551         int oif = skb->dev->ifindex;
552         u8 tos = RT_TOS(iph->tos);
553         u8 prot = iph->protocol;
554         u32 mark = skb->mark;
555
556         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
557 }
558
559 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
560 {
561         const struct inet_sock *inet = inet_sk(sk);
562         const struct ip_options_rcu *inet_opt;
563         __be32 daddr = inet->inet_daddr;
564
565         rcu_read_lock();
566         inet_opt = rcu_dereference(inet->inet_opt);
567         if (inet_opt && inet_opt->opt.srr)
568                 daddr = inet_opt->opt.faddr;
569         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
570                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
571                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
572                            inet_sk_flowi_flags(sk),
573                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
574         rcu_read_unlock();
575 }
576
/*
 * Build a flow key from the packet when one is supplied, otherwise from the
 * socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
585
/* Held (with bottom halves disabled) by update_or_create_fnhe() while it modifies nexthop exceptions. */
586 static DEFINE_SPINLOCK(fnhe_lock);
587
588 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
589 {
590         struct rtable *rt;
591
592         rt = rcu_dereference(fnhe->fnhe_rth_input);
593         if (rt) {
594                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
595                 dst_dev_put(&rt->dst);
596                 dst_release(&rt->dst);
597         }
598         rt = rcu_dereference(fnhe->fnhe_rth_output);
599         if (rt) {
600                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
601                 dst_dev_put(&rt->dst);
602                 dst_release(&rt->dst);
603         }
604 }
605
606 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
607 {
608         struct fib_nh_exception *fnhe, *oldest;
609
610         oldest = rcu_dereference(hash->chain);
611         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
612              fnhe = rcu_dereference(fnhe->fnhe_next)) {
613                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
614                         oldest = fnhe;
615         }
616         fnhe_flush_routes(oldest);
617         return oldest;
618 }
619
620 static inline u32 fnhe_hashfun(__be32 daddr)
621 {
622         static u32 fnhe_hashrnd __read_mostly;
623         u32 hval;
624
625         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
626         hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
627         return hash_32(hval, FNHE_HASH_SHIFT);
628 }
629
630 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
631 {
632         rt->rt_pmtu = fnhe->fnhe_pmtu;
633         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
634         rt->dst.expires = fnhe->fnhe_expires;
635
636         if (fnhe->fnhe_gw) {
637                 rt->rt_flags |= RTCF_REDIRECTED;
638                 rt->rt_uses_gateway = 1;
639                 rt->rt_gw_family = AF_INET;
640                 rt->rt_gw4 = fnhe->fnhe_gw;
641         }
642 }
643
644 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
645                                   __be32 gw, u32 pmtu, bool lock,
646                                   unsigned long expires)
647 {
648         struct fnhe_hash_bucket *hash;
649         struct fib_nh_exception *fnhe;
650         struct rtable *rt;
651         u32 genid, hval;
652         unsigned int i;
653         int depth;
654
655         genid = fnhe_genid(dev_net(nhc->nhc_dev));
656         hval = fnhe_hashfun(daddr);
657
658         spin_lock_bh(&fnhe_lock);
659
660         hash = rcu_dereference(nhc->nhc_exceptions);
661         if (!hash) {
662                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
663                 if (!hash)
664                         goto out_unlock;
665                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
666         }
667
668         hash += hval;
669
670         depth = 0;
671         for (fnhe = rcu_dereference(hash->chain); fnhe;
672              fnhe = rcu_dereference(fnhe->fnhe_next)) {
673                 if (fnhe->fnhe_daddr == daddr)
674                         break;
675                 depth++;
676         }
677
678         if (fnhe) {
679                 if (fnhe->fnhe_genid != genid)
680                         fnhe->fnhe_genid = genid;
681                 if (gw)
682                         fnhe->fnhe_gw = gw;
683                 if (pmtu) {
684                         fnhe->fnhe_pmtu = pmtu;
685                         fnhe->fnhe_mtu_locked = lock;
686                 }
687                 fnhe->fnhe_expires = max(1UL, expires);
688                 /* Update all cached dsts too */
689                 rt = rcu_dereference(fnhe->fnhe_rth_input);
690                 if (rt)
691                         fill_route_from_fnhe(rt, fnhe);
692                 rt = rcu_dereference(fnhe->fnhe_rth_output);
693                 if (rt)
694                         fill_route_from_fnhe(rt, fnhe);
695         } else {
696                 if (depth > FNHE_RECLAIM_DEPTH)
697                         fnhe = fnhe_oldest(hash);
698                 else {
699                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
700                         if (!fnhe)
701                                 goto out_unlock;
702
703                         fnhe->fnhe_next = hash->chain;
704                         rcu_assign_pointer(hash->chain, fnhe);
705                 }
706                 fnhe->fnhe_genid = genid;
707                 fnhe->fnhe_daddr = daddr;
708                 fnhe->fnhe_gw = gw;
709                 fnhe->fnhe_pmtu = pmtu;
710                 fnhe->fnhe_mtu_locked = lock;
711                 fnhe->fnhe_expires = max(1UL, expires);
712
713                 /* Exception created; mark the cached routes for the nexthop
714                  * stale, so anyone caching it rechecks if this exception
715                  * applies to them.
716                  */
717                 rt = rcu_dereference(nhc->nhc_rth_input);
718                 if (rt)
719                         rt->dst.obsolete = DST_OBSOLETE_KILL;
720
721                 for_each_possible_cpu(i) {
722                         struct rtable __rcu **prt;
723                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
724                         rt = rcu_dereference(*prt);
725                         if (rt)
726                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
727                 }
728         }
729
730         fnhe->fnhe_stamp = jiffies;
731
732 out_unlock:
733         spin_unlock_bh(&fnhe_lock);
734 }
735
/*
 * Process a received ICMP redirect for route @rt.
 *
 * @rt:         route the redirect is checked against
 * @skb:        the ICMP message; the new gateway is read from its ICMP
 *              header and the advising (old) gateway from the source
 *              address of its IP header
 * @fl4:        flow used for the FIB lookup; its daddr keys the exception
 * @kill_route: when true, @rt is marked DST_OBSOLETE_KILL once the new
 *              gateway's neighbour entry is found to be valid
 *
 * A redirect that passes all checks is recorded as a nexthop exception
 * that expires after ip_rt_gc_timeout.  A redirect rejected by the sanity
 * checks is only logged, and only when CONFIG_IP_ROUTE_VERBOSE is set and
 * the device logs martians.
 */
736 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
737                              bool kill_route)
738 {
739         __be32 new_gw = icmp_hdr(skb)->un.gateway;
740         __be32 old_gw = ip_hdr(skb)->saddr;
741         struct net_device *dev = skb->dev;
742         struct in_device *in_dev;
743         struct fib_result res;
744         struct neighbour *n;
745         struct net *net;
746
            /* Only the four known redirect codes are accepted. */
747         switch (icmp_hdr(skb)->code & 7) {
748         case ICMP_REDIR_NET:
749         case ICMP_REDIR_NETTOS:
750         case ICMP_REDIR_HOST:
751         case ICMP_REDIR_HOSTTOS:
752                 break;
753
754         default:
755                 return;
756         }
757
            /* The sender must be the IPv4 gateway this route currently uses. */
758         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
759                 return;
760
761         in_dev = __in_dev_get_rcu(dev);
762         if (!in_dev)
763                 return;
764
765         net = dev_net(dev);
            /*
             * Reject a redirect to the same gateway, on a device that does
             * not accept redirects, or to a multicast, limited-broadcast or
             * zeronet address.
             */
766         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
767             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
768             ipv4_is_zeronet(new_gw))
769                 goto reject_redirect;
770
            /*
             * Not shared media: inet_addr_onlink() must accept the new/old
             * gateway pair and, with secure redirects enabled,
             * ip_fib_check_default() must return zero for the new gateway.
             * Shared media: the new gateway must be a unicast address.
             */
771         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
772                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
773                         goto reject_redirect;
774                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
775                         goto reject_redirect;
776         } else {
777                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
778                         goto reject_redirect;
779         }
780
            /* Find the neighbour entry of the new gateway, creating it if needed. */
781         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
782         if (!n)
783                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
784         if (!IS_ERR(n)) {
785                 if (!(n->nud_state & NUD_VALID)) {
                            /* Neighbour not valid yet: nothing is recorded on this pass. */
786                         neigh_event_send(n, NULL);
787                 } else {
788                         if (fib_lookup(net, fl4, &res, 0) == 0) {
789                                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
790
                                    /* Gateway-only exception: pmtu 0, MTU not locked. */
791                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
792                                                 0, false,
793                                                 jiffies + ip_rt_gc_timeout);
794                         }
795                         if (kill_route)
796                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
797                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
798                 }
799                 neigh_release(n);
800         }
801         return;
802
803 reject_redirect:
804 #ifdef CONFIG_IP_ROUTE_VERBOSE
805         if (IN_DEV_LOG_MARTIANS(in_dev)) {
806                 const struct iphdr *iph = (const struct iphdr *) skb->data;
807                 __be32 daddr = iph->daddr;
808                 __be32 saddr = iph->saddr;
809
810                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
811                                      "  Advised path = %pI4 -> %pI4\n",
812                                      &old_gw, dev->name, &new_gw,
813                                      &saddr, &daddr);
814         }
815 #endif
            /* Empty statement: the label still needs one when the #ifdef block is compiled out. */
816         ;
817 }
818
819 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
820 {
821         struct rtable *rt;
822         struct flowi4 fl4;
823         const struct iphdr *iph = (const struct iphdr *) skb->data;
824         struct net *net = dev_net(skb->dev);
825         int oif = skb->dev->ifindex;
826         u8 tos = RT_TOS(iph->tos);
827         u8 prot = iph->protocol;
828         u32 mark = skb->mark;
829
830         rt = (struct rtable *) dst;
831
832         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
833         __ip_do_redirect(rt, skb, &fl4, true);
834 }
835
836 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
837 {
838         struct rtable *rt = (struct rtable *)dst;
839         struct dst_entry *ret = dst;
840
841         if (rt) {
842                 if (dst->obsolete > 0) {
843                         ip_rt_put(rt);
844                         ret = NULL;
845                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
846                            rt->dst.expires) {
847                         ip_rt_put(rt);
848                         ret = NULL;
849                 }
850         }
851         return ret;
852 }
853
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff; after that we stop sending them
 *         altogether, assuming that the host ignores our redirects.
 *      2. If we have not seen packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
869
/*
 * Send an ICMP host redirect to the sender of @skb, rate limited per peer.
 *
 * Nothing is sent when the route's device has no in_device or redirects
 * are disabled on it.  If no inet_peer entry can be obtained the redirect
 * is sent unconditionally.  Otherwise the peer's n_redirects/rate_last
 * fields drive an exponential backoff (ip_rt_redirect_load shifted left by
 * the number of redirects already sent), capped at ip_rt_redirect_number.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        /* in_dev is only used inside the RCU section; copy out what is
         * needed afterwards (log_martians, vif).
         */
        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        /* NOTE(review): the trailing 1 presumably requests creation of a
         * missing peer entry - confirm against inet_getpeer_v4().
         */
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                /* No per-peer state available: send without rate limiting. */
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything, but
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                /* Log once, at the moment the redirect budget is used up. */
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
936
/*
 * Handle a packet whose route carries an error (rt->dst.error).
 *
 * Depending on the error, MIB counters are bumped and an ICMP destination
 * unreachable is sent back, rate limited through the sender's inet_peer
 * token bucket.  The skb is always freed and 0 is always returned.
 */
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* For an L3 master device, use the device recorded in the
         * control block's iif instead.
         */
        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        /* Forwarding disabled on the device: only account the error,
         * never answer with ICMP.
         */
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        /* Map the route error to an ICMP unreachable code; any error not
         * listed here is dropped silently.
         */
        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        /* Token bucket: tokens accrue with elapsed jiffies, are capped at
         * ip_rt_error_burst, and each ICMP costs ip_rt_error_cost.
         * Without a peer entry the ICMP is sent unconditionally.
         */
        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
1012
1013 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014 {
1015         struct dst_entry *dst = &rt->dst;
1016         struct fib_result res;
1017         bool lock = false;
1018         u32 old_mtu;
1019
1020         if (ip_mtu_locked(dst))
1021                 return;
1022
1023         old_mtu = ipv4_mtu(dst);
1024         if (old_mtu < mtu)
1025                 return;
1026
1027         if (mtu < ip_rt_min_pmtu) {
1028                 lock = true;
1029                 mtu = min(old_mtu, ip_rt_min_pmtu);
1030         }
1031
1032         if (rt->rt_pmtu == mtu && !lock &&
1033             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1034                 return;
1035
1036         rcu_read_lock();
1037         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1038                 struct fib_nh_common *nhc = FIB_RES_NHC(res);
1039
1040                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1041                                       jiffies + ip_rt_mtu_expires);
1042         }
1043         rcu_read_unlock();
1044 }
1045
1046 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1047                               struct sk_buff *skb, u32 mtu,
1048                               bool confirm_neigh)
1049 {
1050         struct rtable *rt = (struct rtable *) dst;
1051         struct flowi4 fl4;
1052
1053         ip_rt_build_flow_key(&fl4, sk, skb);
1054
1055         /* Don't make lookup fail for bridged encapsulations */
1056         if (skb && netif_is_any_bridge_port(skb->dev))
1057                 fl4.flowi4_oif = 0;
1058
1059         __ip_rt_update_pmtu(rt, &fl4, mtu);
1060 }
1061
1062 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1063                       int oif, u8 protocol)
1064 {
1065         const struct iphdr *iph = (const struct iphdr *)skb->data;
1066         struct flowi4 fl4;
1067         struct rtable *rt;
1068         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1069
1070         __build_flow_key(net, &fl4, NULL, iph, oif,
1071                          RT_TOS(iph->tos), protocol, mark, 0);
1072         rt = __ip_route_output_key(net, &fl4);
1073         if (!IS_ERR(rt)) {
1074                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1075                 ip_rt_put(rt);
1076         }
1077 }
1078 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1079
1080 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1081 {
1082         const struct iphdr *iph = (const struct iphdr *)skb->data;
1083         struct flowi4 fl4;
1084         struct rtable *rt;
1085
1086         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1087
1088         if (!fl4.flowi4_mark)
1089                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1090
1091         rt = __ip_route_output_key(sock_net(sk), &fl4);
1092         if (!IS_ERR(rt)) {
1093                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1094                 ip_rt_put(rt);
1095         }
1096 }
1097
/*
 * Apply a PMTU update on behalf of socket @sk and, where needed, replace
 * the socket's cached route.
 *
 * Runs under bh_lock_sock().  When the socket is owned by user context or
 * has no cached dst, only the routing state is updated through
 * __ipv4_sk_update_pmtu() and the socket's dst is left alone.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        /* Reference on the socket's current dst; released at "out". */
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        /* Cached route failed validation: look up a replacement first. */
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

        /* The update itself may have invalidated the route; if so, look
         * it up again.  A route obtained above is released first so that
         * only one new reference is held.
         */
        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        /* Install the freshly looked-up route on the socket. */
        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1151
1152 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1153                    int oif, u8 protocol)
1154 {
1155         const struct iphdr *iph = (const struct iphdr *)skb->data;
1156         struct flowi4 fl4;
1157         struct rtable *rt;
1158
1159         __build_flow_key(net, &fl4, NULL, iph, oif,
1160                          RT_TOS(iph->tos), protocol, 0, 0);
1161         rt = __ip_route_output_key(net, &fl4);
1162         if (!IS_ERR(rt)) {
1163                 __ip_do_redirect(rt, skb, &fl4, false);
1164                 ip_rt_put(rt);
1165         }
1166 }
1167 EXPORT_SYMBOL_GPL(ipv4_redirect);
1168
1169 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1170 {
1171         const struct iphdr *iph = (const struct iphdr *)skb->data;
1172         struct flowi4 fl4;
1173         struct rtable *rt;
1174         struct net *net = sock_net(sk);
1175
1176         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1177         rt = __ip_route_output_key(net, &fl4);
1178         if (!IS_ERR(rt)) {
1179                 __ip_do_redirect(rt, skb, &fl4, false);
1180                 ip_rt_put(rt);
1181         }
1182 }
1183 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1184
1185 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1186 {
1187         struct rtable *rt = (struct rtable *) dst;
1188
1189         /* All IPV4 dsts are created with ->obsolete set to the value
1190          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1191          * into this function always.
1192          *
1193          * When a PMTU/redirect information update invalidates a route,
1194          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1195          * DST_OBSOLETE_DEAD.
1196          */
1197         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1198                 return NULL;
1199         return dst;
1200 }
1201
1202 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1203 {
1204         struct ip_options opt;
1205         int res;
1206
1207         /* Recompile ip options since IPCB may not be valid anymore.
1208          * Also check we have a reasonable ipv4 header.
1209          */
1210         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1211             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1212                 return;
1213
1214         memset(&opt, 0, sizeof(opt));
1215         if (ip_hdr(skb)->ihl > 5) {
1216                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1217                         return;
1218                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1219
1220                 rcu_read_lock();
1221                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1222                 rcu_read_unlock();
1223
1224                 if (res)
1225                         return;
1226         }
1227         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1228 }
1229
1230 static void ipv4_link_failure(struct sk_buff *skb)
1231 {
1232         struct rtable *rt;
1233
1234         ipv4_send_dest_unreach(skb);
1235
1236         rt = skb_rtable(skb);
1237         if (rt)
1238                 dst_set_expires(&rt->dst, 0);
1239 }
1240
1241 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1242 {
1243         pr_debug("%s: %pI4 -> %pI4, %s\n",
1244                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1245                  skb->dev ? skb->dev->name : "?");
1246         kfree_skb(skb);
1247         WARN_ON(1);
1248         return 0;
1249 }
1250
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1259
1260 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1261 {
1262         __be32 src;
1263
1264         if (rt_is_output_route(rt))
1265                 src = ip_hdr(skb)->saddr;
1266         else {
1267                 struct fib_result res;
1268                 struct iphdr *iph = ip_hdr(skb);
1269                 struct flowi4 fl4 = {
1270                         .daddr = iph->daddr,
1271                         .saddr = iph->saddr,
1272                         .flowi4_tos = RT_TOS(iph->tos),
1273                         .flowi4_oif = rt->dst.dev->ifindex,
1274                         .flowi4_iif = skb->dev->ifindex,
1275                         .flowi4_mark = skb->mark,
1276                 };
1277
1278                 rcu_read_lock();
1279                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1280                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1281                 else
1282                         src = inet_select_addr(rt->dst.dev,
1283                                                rt_nexthop(rt, iph->daddr),
1284                                                RT_SCOPE_UNIVERSE);
1285                 rcu_read_unlock();
1286         }
1287         memcpy(addr, &src, 4);
1288 }
1289
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid, treating the low and the high
 * 16 bits independently: a half is filled in from @tag only while it is
 * still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        static const u32 halves[] = { 0xFFFF, 0xFFFF0000 };
        unsigned int i;

        for (i = 0; i < 2; i++) {
                if (!(rt->dst.tclassid & halves[i]))
                        rt->dst.tclassid |= tag & halves[i];
        }
}
#endif
1299
1300 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1301 {
1302         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1303         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1304                                     ip_rt_min_advmss);
1305
1306         return min(advmss, IPV4_MAX_PMTU - header_size);
1307 }
1308
1309 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1310 {
1311         const struct rtable *rt = (const struct rtable *)dst;
1312         unsigned int mtu = rt->rt_pmtu;
1313
1314         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1315                 mtu = dst_metric_raw(dst, RTAX_MTU);
1316
1317         if (mtu)
1318                 return mtu;
1319
1320         mtu = READ_ONCE(dst->dev->mtu);
1321
1322         if (unlikely(ip_mtu_locked(dst))) {
1323                 if (rt->rt_uses_gateway && mtu > 576)
1324                         mtu = 576;
1325         }
1326
1327         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1328
1329         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1330 }
1331
/*
 * Remove the nexthop exception for @daddr from @nhc's exception hash,
 * if one exists.  The whole operation runs under fnhe_lock.
 *
 * NOTE(review): nhc->nhc_exceptions is indexed without a NULL check, so
 * callers are presumably expected to have seen a non-NULL table (as
 * find_exception() does) - confirm for any new caller.
 */
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        /* fnhe_p always points at the link that references fnhe, so the
         * entry can be unlinked by rewriting *fnhe_p.
         */
        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        /* Freed after a grace period: RCU readers may
                         * still be walking over this entry.
                         */
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}
1365
1366 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1367                                                __be32 daddr)
1368 {
1369         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1370         struct fib_nh_exception *fnhe;
1371         u32 hval;
1372
1373         if (!hash)
1374                 return NULL;
1375
1376         hval = fnhe_hashfun(daddr);
1377
1378         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1379              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1380                 if (fnhe->fnhe_daddr == daddr) {
1381                         if (fnhe->fnhe_expires &&
1382                             time_after(jiffies, fnhe->fnhe_expires)) {
1383                                 ip_del_fnhe(nhc, daddr);
1384                                 break;
1385                         }
1386                         return fnhe;
1387                 }
1388         }
1389         return NULL;
1390 }
1391
1392 /* MTU selection:
1393  * 1. mtu on route is locked - use it
1394  * 2. mtu from nexthop exception
1395  * 3. mtu from egress device
1396  */
1397
1398 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1399 {
1400         struct fib_nh_common *nhc = res->nhc;
1401         struct net_device *dev = nhc->nhc_dev;
1402         struct fib_info *fi = res->fi;
1403         u32 mtu = 0;
1404
1405         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1406             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1407                 mtu = fi->fib_mtu;
1408
1409         if (likely(!mtu)) {
1410                 struct fib_nh_exception *fnhe;
1411
1412                 fnhe = find_exception(nhc, daddr);
1413                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1414                         mtu = fnhe->fnhe_pmtu;
1415         }
1416
1417         if (likely(!mtu))
1418                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1419
1420         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1421 }
1422
/*
 * Bind route @rt to nexthop exception @fnhe, provided the exception
 * still describes @daddr.  Runs under fnhe_lock.
 *
 * The exception's gateway/PMTU data is applied to @rt through
 * fill_route_from_fnhe().  With @do_cache set, @rt also replaces the
 * route cached in the exception (input or output slot, depending on the
 * route's direction).
 *
 * Returns true only when @rt was stored in the exception.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        /* ip_del_fnhe() zeroes fnhe_daddr, so a deleted exception fails
         * this comparison.
         */
        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                /* Generation changed: wipe the learned data and the
                 * cached routes before reusing the exception.
                 */
                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                /* No gateway set on the route: use the destination. */
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        /* The exception holds its own reference on rt;
                         * the previously cached route, if any, is dropped.
                         */
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}
1472
1473 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1474 {
1475         struct rtable *orig, *prev, **p;
1476         bool ret = true;
1477
1478         if (rt_is_input_route(rt)) {
1479                 p = (struct rtable **)&nhc->nhc_rth_input;
1480         } else {
1481                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1482         }
1483         orig = *p;
1484
1485         /* hold dst before doing cmpxchg() to avoid race condition
1486          * on this dst
1487          */
1488         dst_hold(&rt->dst);
1489         prev = cmpxchg(p, orig, rt);
1490         if (prev == orig) {
1491                 if (orig) {
1492                         rt_add_uncached_list(orig);
1493                         dst_release(&orig->dst);
1494                 }
1495         } else {
1496                 dst_release(&rt->dst);
1497                 ret = false;
1498         }
1499
1500         return ret;
1501 }
1502
/*
 * List of routes that are not stored in a nexthop or exception cache.
 * One instance exists per CPU; rt_flush_dev() walks all of them.
 */
struct uncached_list {
        spinlock_t              lock;   /* protects head */
        struct list_head        head;   /* routes linked via rt->rt_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1509
1510 void rt_add_uncached_list(struct rtable *rt)
1511 {
1512         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1513
1514         rt->rt_uncached_list = ul;
1515
1516         spin_lock_bh(&ul->lock);
1517         list_add_tail(&rt->rt_uncached, &ul->head);
1518         spin_unlock_bh(&ul->lock);
1519 }
1520
1521 void rt_del_uncached_list(struct rtable *rt)
1522 {
1523         if (!list_empty(&rt->rt_uncached)) {
1524                 struct uncached_list *ul = rt->rt_uncached_list;
1525
1526                 spin_lock_bh(&ul->lock);
1527                 list_del(&rt->rt_uncached);
1528                 spin_unlock_bh(&ul->lock);
1529         }
1530 }
1531
/*
 * Destroy hook for IPv4 dsts: drop the metrics reference and take the
 * route off its uncached list, if it is on one.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        ip_dst_metrics_put(dst);
        rt_del_uncached_list((struct rtable *)dst);
}
1539
1540 void rt_flush_dev(struct net_device *dev)
1541 {
1542         struct rtable *rt;
1543         int cpu;
1544
1545         for_each_possible_cpu(cpu) {
1546                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1547
1548                 spin_lock_bh(&ul->lock);
1549                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1550                         if (rt->dst.dev != dev)
1551                                 continue;
1552                         rt->dst.dev = blackhole_netdev;
1553                         dev_hold(rt->dst.dev);
1554                         dev_put(dev);
1555                 }
1556                 spin_unlock_bh(&ul->lock);
1557         }
1558 }
1559
1560 static bool rt_cache_valid(const struct rtable *rt)
1561 {
1562         return  rt &&
1563                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1564                 !rt_is_expired(rt);
1565 }
1566
/*
 * Fill in the nexthop-derived parts of a freshly allocated route @rt and
 * decide where it lives: cached in nexthop exception @fnhe, cached in the
 * FIB nexthop, or on the uncached list.
 *
 * With @fi set, gateway, metrics, classid and lwtunnel state come from
 * the nexthop selected in @res.  Without @fi the route just goes onto
 * the uncached list.  @type is not used in this function.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                /* Only a link-scope nexthop with a gateway marks the
                 * route as going through a gateway.
                 */
                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
                        rt->rt_uses_gateway = 1;
                        rt->rt_gw_family = nhc->nhc_gw_family;
                        /* only INET and INET6 are supported */
                        if (likely(nhc->nhc_gw_family == AF_INET))
                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
                        else
                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
                }

                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
                if (nhc->nhc_family == AF_INET) {
                        struct fib_nh *nh;

                        nh = container_of(nhc, struct fib_nh, nh_common);
                        rt->dst.tclassid = nh->nh_tclassid;
                }
#endif
                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                /* An exception, when present, takes precedence over the
                 * nexthop cache.
                 */
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nhc, rt);
                if (unlikely(!cached)) {
                        /* The route could not be (or was not meant to be)
                         * stored in a nexthop exception or the FIB
                         * nexthop cache, so it is tracked on the uncached
                         * list instead.  A route without a gateway gets
                         * the destination as its gateway first.
                         */
                        if (!rt->rt_gw4) {
                                rt->rt_gw_family = AF_INET;
                                rt->rt_gw4 = daddr;
                        }
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}
1625
1626 struct rtable *rt_dst_alloc(struct net_device *dev,
1627                             unsigned int flags, u16 type,
1628                             bool nopolicy, bool noxfrm)
1629 {
1630         struct rtable *rt;
1631
1632         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1633                        (nopolicy ? DST_NOPOLICY : 0) |
1634                        (noxfrm ? DST_NOXFRM : 0));
1635
1636         if (rt) {
1637                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1638                 rt->rt_flags = flags;
1639                 rt->rt_type = type;
1640                 rt->rt_is_input = 0;
1641                 rt->rt_iif = 0;
1642                 rt->rt_pmtu = 0;
1643                 rt->rt_mtu_locked = 0;
1644                 rt->rt_uses_gateway = 0;
1645                 rt->rt_gw_family = 0;
1646                 rt->rt_gw4 = 0;
1647                 INIT_LIST_HEAD(&rt->rt_uncached);
1648
1649                 rt->dst.output = ip_output;
1650                 if (flags & RTCF_LOCAL)
1651                         rt->dst.input = ip_local_deliver;
1652         }
1653
1654         return rt;
1655 }
1656 EXPORT_SYMBOL(rt_dst_alloc);
1657
1658 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1659 {
1660         struct rtable *new_rt;
1661
1662         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1663                            rt->dst.flags);
1664
1665         if (new_rt) {
1666                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1667                 new_rt->rt_flags = rt->rt_flags;
1668                 new_rt->rt_type = rt->rt_type;
1669                 new_rt->rt_is_input = rt->rt_is_input;
1670                 new_rt->rt_iif = rt->rt_iif;
1671                 new_rt->rt_pmtu = rt->rt_pmtu;
1672                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1673                 new_rt->rt_gw_family = rt->rt_gw_family;
1674                 if (rt->rt_gw_family == AF_INET)
1675                         new_rt->rt_gw4 = rt->rt_gw4;
1676                 else if (rt->rt_gw_family == AF_INET6)
1677                         new_rt->rt_gw6 = rt->rt_gw6;
1678                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1679
1680                 new_rt->dst.input = rt->dst.input;
1681                 new_rt->dst.output = rt->dst.output;
1682                 new_rt->dst.error = rt->dst.error;
1683                 new_rt->dst.lastuse = jiffies;
1684                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685         }
1686         return new_rt;
1687 }
1688 EXPORT_SYMBOL(rt_dst_clone);
1689
1690 /* called in rcu_read_lock() section */
1691 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692                           u8 tos, struct net_device *dev,
1693                           struct in_device *in_dev, u32 *itag)
1694 {
1695         int err;
1696
1697         /* Primary sanity checks. */
1698         if (!in_dev)
1699                 return -EINVAL;
1700
1701         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702             skb->protocol != htons(ETH_P_IP))
1703                 return -EINVAL;
1704
1705         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706                 return -EINVAL;
1707
1708         if (ipv4_is_zeronet(saddr)) {
1709                 if (!ipv4_is_local_multicast(daddr) &&
1710                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711                         return -EINVAL;
1712         } else {
1713                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714                                           in_dev, itag);
1715                 if (err < 0)
1716                         return err;
1717         }
1718         return 0;
1719 }
1720
1721 /* called in rcu_read_lock() section */
1722 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723                              u8 tos, struct net_device *dev, int our)
1724 {
1725         struct in_device *in_dev = __in_dev_get_rcu(dev);
1726         unsigned int flags = RTCF_MULTICAST;
1727         struct rtable *rth;
1728         u32 itag = 0;
1729         int err;
1730
1731         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732         if (err)
1733                 return err;
1734
1735         if (our)
1736                 flags |= RTCF_LOCAL;
1737
1738         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1739                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1740         if (!rth)
1741                 return -ENOBUFS;
1742
1743 #ifdef CONFIG_IP_ROUTE_CLASSID
1744         rth->dst.tclassid = itag;
1745 #endif
1746         rth->dst.output = ip_rt_bug;
1747         rth->rt_is_input= 1;
1748
1749 #ifdef CONFIG_IP_MROUTE
1750         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1751                 rth->dst.input = ip_mr_input;
1752 #endif
1753         RT_CACHE_STAT_INC(in_slow_mc);
1754
1755         skb_dst_set(skb, &rth->dst);
1756         return 0;
1757 }
1758
1759
1760 static void ip_handle_martian_source(struct net_device *dev,
1761                                      struct in_device *in_dev,
1762                                      struct sk_buff *skb,
1763                                      __be32 daddr,
1764                                      __be32 saddr)
1765 {
1766         RT_CACHE_STAT_INC(in_martian_src);
1767 #ifdef CONFIG_IP_ROUTE_VERBOSE
1768         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1769                 /*
1770                  *      RFC1812 recommendation, if source is martian,
1771                  *      the only hint is MAC header.
1772                  */
1773                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1774                         &daddr, &saddr, dev->name);
1775                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1776                         print_hex_dump(KERN_WARNING, "ll header: ",
1777                                        DUMP_PREFIX_OFFSET, 16, 1,
1778                                        skb_mac_header(skb),
1779                                        dev->hard_header_len, false);
1780                 }
1781         }
1782 #endif
1783 }
1784
1785 /*
      * Attach a forwarding (input) route to @skb for the nexthop selected in
      * @res.
      *
      * The source address is checked with fib_validate_source(); a negative
      * result is reported through ip_handle_martian_source() and returned.
      * When caching is allowed (res->fi is set and no itag was produced), a
      * cached input route that is still valid - taken from the nexthop
      * exception for @daddr if there is one, otherwise from the nexthop
      * itself - is attached with skb_dst_set_noref().  Otherwise a new rtable
      * whose dst.input is ip_forward is allocated, given its nexthop and
      * attached with skb_dst_set().
      *
      * Returns 0 on success, -EINVAL when the output device has no in_device
      * or the non-IP (proxy ARP) check rejects the packet, -ENOBUFS when the
      * allocation fails, or the negative error from fib_validate_source().
      *
      * Called in rcu_read_lock() section.
      */
1786 static int __mkroute_input(struct sk_buff *skb,
1787                            const struct fib_result *res,
1788                            struct in_device *in_dev,
1789                            __be32 daddr, __be32 saddr, u32 tos)
1790 {
1791         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1792         struct net_device *dev = nhc->nhc_dev;
1793         struct fib_nh_exception *fnhe;
1794         struct rtable *rth;
1795         int err;
1796         struct in_device *out_dev;
1797         bool do_cache;
1798         u32 itag = 0;
1799
1800         /* get a working reference to the output device */
1801         out_dev = __in_dev_get_rcu(dev);
1802         if (!out_dev) {
1803                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1804                 return -EINVAL;
1805         }
1806
1807         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1808                                   in_dev->dev, in_dev, &itag);
1809         if (err < 0) {
1810                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1811                                          saddr);
1812
1813                 goto cleanup;
1814         }
1815
             /* Only results backed by a fib_info and without an itag are cached. */
1816         do_cache = res->fi && !itag;
             /* Packet would leave through the device it came in on and
              * fib_validate_source() returned non-zero: flag the skb with
              * IPSKB_DOREDIRECT if the media is shared or inet_addr_onlink()
              * accepts saddr together with the IPv4 gateway (0 if none).
              */
1817         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1818             skb->protocol == htons(ETH_P_IP)) {
1819                 __be32 gw;
1820
1821                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1822                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1823                     inet_addr_onlink(out_dev, saddr, gw))
1824                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1825         }
1826
1827         if (skb->protocol != htons(ETH_P_IP)) {
1828                 /* Not IP (i.e. ARP). Do not create route, if it is
1829                  * invalid for proxy arp. DNAT routes are always valid.
1830                  *
1831                  * Proxy arp feature have been extended to allow, ARP
1832                  * replies back to the same interface, to support
1833                  * Private VLAN switch technologies. See arp.c.
1834                  */
1835                 if (out_dev == in_dev &&
1836                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1837                         err = -EINVAL;
1838                         goto cleanup;
1839                 }
1840         }
1841
             /* The exception is looked up even when caching is off, because
              * rt_set_nexthop() below receives it as well.
              */
1842         fnhe = find_exception(nhc, daddr);
1843         if (do_cache) {
1844                 if (fnhe)
1845                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1846                 else
1847                         rth = rcu_dereference(nhc->nhc_rth_input);
1848                 if (rt_cache_valid(rth)) {
1849                         skb_dst_set_noref(skb, &rth->dst);
1850                         goto out;
1851                 }
1852         }
1853
             /* No usable cached route: build a fresh one. */
1854         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1855                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1856                            IN_DEV_CONF_GET(out_dev, NOXFRM));
1857         if (!rth) {
1858                 err = -ENOBUFS;
1859                 goto cleanup;
1860         }
1861
1862         rth->rt_is_input = 1;
1863         RT_CACHE_STAT_INC(in_slow_tot);
1864
1865         rth->dst.input = ip_forward;
1866
1867         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1868                        do_cache);
1869         lwtunnel_set_redirect(&rth->dst);
1870         skb_dst_set(skb, &rth->dst);
     /* Success: err may still hold a positive fib_validate_source() result. */
1871 out:
1872         err = 0;
1873  cleanup:
1874         return err;
1875 }
1876
1877 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1878 /* To make ICMP packets follow the right flow, the multipath hash is
1879  * calculated from the inner IP addresses.
1880  */
1881 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1882                                  struct flow_keys *hash_keys)
1883 {
1884         const struct iphdr *outer_iph = ip_hdr(skb);
1885         const struct iphdr *key_iph = outer_iph;
1886         const struct iphdr *inner_iph;
1887         const struct icmphdr *icmph;
1888         struct iphdr _inner_iph;
1889         struct icmphdr _icmph;
1890
1891         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1892                 goto out;
1893
1894         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1895                 goto out;
1896
1897         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1898                                    &_icmph);
1899         if (!icmph)
1900                 goto out;
1901
1902         if (!icmp_is_err(icmph->type))
1903                 goto out;
1904
1905         inner_iph = skb_header_pointer(skb,
1906                                        outer_iph->ihl * 4 + sizeof(_icmph),
1907                                        sizeof(_inner_iph), &_inner_iph);
1908         if (!inner_iph)
1909                 goto out;
1910
1911         key_iph = inner_iph;
1912 out:
1913         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1914         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1915 }
1916
1917 /*
      * Compute the multipath hash used to pick a nexthop.
      *
      * If skb is set it will be used and fl4 can be NULL; otherwise the keys
      * are taken from fl4.  @flkeys, when non-NULL, supplies already dissected
      * keys for policy 1 so the skb is not dissected again.
      *
      * The keys depend on net->ipv4.sysctl_fib_multipath_hash_policy:
      *   0 - IPv4 source/destination addresses (via ip_multipath_l3_keys()
      *       for an skb);
      *   1 - addresses, ports and IP protocol; for an skb that already has
      *       an L4 hash, that hash is returned directly;
      *   2 - addresses found by a full flow dissection of the skb (IPv4, or
      *       IPv6 plus flow label and protocol), falling back to the policy 0
      *       keys when neither is found or when there is no skb.
      *
      * A non-zero fl4->flowi4_multipath_hash is mixed into the result with
      * jhash_2words().  The returned value is the hash shifted right by one.
      *
      * NOTE(review): the switch has no default case, so hash_keys is used
      * uninitialized if the sysctl ever holds a value other than 0-2;
      * presumably the sysctl handler bounds it - confirm.
      */
1918 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1919                        const struct sk_buff *skb, struct flow_keys *flkeys)
1920 {
1921         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1922         struct flow_keys hash_keys;
1923         u32 mhash;
1924
1925         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1926         case 0:
1927                 memset(&hash_keys, 0, sizeof(hash_keys));
1928                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1929                 if (skb) {
1930                         ip_multipath_l3_keys(skb, &hash_keys);
1931                 } else {
1932                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1933                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1934                 }
1935                 break;
1936         case 1:
1937                 /* skb is currently provided only when forwarding */
1938                 if (skb) {
1939                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1940                         struct flow_keys keys;
1941
1942                         /* short-circuit if we already have L4 hash present */
1943                         if (skb->l4_hash)
1944                                 return skb_get_hash_raw(skb) >> 1;
1945
1946                         memset(&hash_keys, 0, sizeof(hash_keys));
1947
                             /* Dissect only when the caller did not pass keys. */
1948                         if (!flkeys) {
1949                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1950                                 flkeys = &keys;
1951                         }
1952
1953                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1954                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1955                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1956                         hash_keys.ports.src = flkeys->ports.src;
1957                         hash_keys.ports.dst = flkeys->ports.dst;
1958                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1959                 } else {
1960                         memset(&hash_keys, 0, sizeof(hash_keys));
1961                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1962                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1963                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1964                         hash_keys.ports.src = fl4->fl4_sport;
1965                         hash_keys.ports.dst = fl4->fl4_dport;
1966                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1967                 }
1968                 break;
1969         case 2:
1970                 memset(&hash_keys, 0, sizeof(hash_keys));
1971                 /* skb is currently provided only when forwarding */
1972                 if (skb) {
1973                         struct flow_keys keys;
1974
                             /* Flags are 0 here, unlike the STOP_AT_ENCAP used for policy 1. */
1975                         skb_flow_dissect_flow_keys(skb, &keys, 0);
1976                         /* Inner can be v4 or v6 */
1977                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1978                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1979                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1980                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1984                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1985                                 hash_keys.tags.flow_label = keys.tags.flow_label;
1986                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1987                         } else {
1988                                 /* Same as case 0 */
1989                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1990                                 ip_multipath_l3_keys(skb, &hash_keys);
1991                         }
1992                 } else {
1993                         /* Same as case 0 */
1994                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1995                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1996                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1997                 }
1998                 break;
1999         }
2000         mhash = flow_hash_from_keys(&hash_keys);
2001
             /* Fold in the extra per-flow hash carried in the flowi4, if any. */
2002         if (multipath_hash)
2003                 mhash = jhash_2words(mhash, multipath_hash, 0);
2004
2005         return mhash >> 1;
2006 }
2007 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2008
2009 static int ip_mkroute_input(struct sk_buff *skb,
2010                             struct fib_result *res,
2011                             struct in_device *in_dev,
2012                             __be32 daddr, __be32 saddr, u32 tos,
2013                             struct flow_keys *hkeys)
2014 {
2015 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2016         if (res->fi && fib_info_num_path(res->fi) > 1) {
2017                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2018
2019                 fib_select_multipath(res, h);
2020         }
2021 #endif
2022
2023         /* create a routing cache entry */
2024         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2025 }
2026
2027 /* Implements all the saddr-related checks as ip_route_input_slow(),
2028  * assuming daddr is valid and the destination is not a local broadcast one.
2029  * Uses the provided hint instead of performing a route lookup.
2030  */
2031 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2032                       u8 tos, struct net_device *dev,
2033                       const struct sk_buff *hint)
2034 {
2035         struct in_device *in_dev = __in_dev_get_rcu(dev);
2036         struct rtable *rt = skb_rtable(hint);
2037         struct net *net = dev_net(dev);
2038         int err = -EINVAL;
2039         u32 tag = 0;
2040
2041         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2042                 goto martian_source;
2043
2044         if (ipv4_is_zeronet(saddr))
2045                 goto martian_source;
2046
2047         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2048                 goto martian_source;
2049
2050         if (rt->rt_type != RTN_LOCAL)
2051                 goto skip_validate_source;
2052
2053         tos &= IPTOS_RT_MASK;
2054         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2055         if (err < 0)
2056                 goto martian_source;
2057
2058 skip_validate_source:
2059         skb_dst_copy(skb, hint);
2060         return 0;
2061
2062 martian_source:
2063         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2064         return err;
2065 }
2066
2067 /*
2068  *      NOTE. We drop all the packets that has local source
2069  *      addresses, because every properly looped back packet
2070  *      must have correct destination already attached by output routine.
2071  *      Changes in the enforced policies must be applied also to
2072  *      ip_route_use_hint().
2073  *
2074  *      Such approach solves two big problems:
2075  *      1. Not simplex devices are handled properly.
2076  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2077  *      called with rcu_read_lock()
2078  */
2079
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081                                u8 tos, struct net_device *dev,
2082                                struct fib_result *res)
2083 {
2084         struct in_device *in_dev = __in_dev_get_rcu(dev);
2085         struct flow_keys *flkeys = NULL, _flkeys;
2086         struct net    *net = dev_net(dev);
2087         struct ip_tunnel_info *tun_info;
2088         int             err = -EINVAL;
2089         unsigned int    flags = 0;
2090         u32             itag = 0;
2091         struct rtable   *rth;
2092         struct flowi4   fl4;
2093         bool do_cache = true;
2094
2095         /* IP on this device is disabled. */
2096
2097         if (!in_dev)
2098                 goto out;
2099
2100         /* Check for the most weird martians, which can be not detected
2101            by fib_lookup.
2102          */
2103
2104         tun_info = skb_tunnel_info(skb);
2105         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2106                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2107         else
2108                 fl4.flowi4_tun_key.tun_id = 0;
2109         skb_dst_drop(skb);
2110
2111         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2112                 goto martian_source;
2113
2114         res->fi = NULL;
2115         res->table = NULL;
2116         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2117                 goto brd_input;
2118
2119         /* Accept zero addresses only to limited broadcast;
2120          * I even do not know to fix it or not. Waiting for complains :-)
2121          */
2122         if (ipv4_is_zeronet(saddr))
2123                 goto martian_source;
2124
2125         if (ipv4_is_zeronet(daddr))
2126                 goto martian_destination;
2127
2128         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2129          * and call it once if daddr or/and saddr are loopback addresses
2130          */
2131         if (ipv4_is_loopback(daddr)) {
2132                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2133                         goto martian_destination;
2134         } else if (ipv4_is_loopback(saddr)) {
2135                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2136                         goto martian_source;
2137         }
2138
2139         /*
2140          *      Now we are ready to route packet.
2141          */
2142         fl4.flowi4_oif = 0;
2143         fl4.flowi4_iif = dev->ifindex;
2144         fl4.flowi4_mark = skb->mark;
2145         fl4.flowi4_tos = tos;
2146         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2147         fl4.flowi4_flags = 0;
2148         fl4.daddr = daddr;
2149         fl4.saddr = saddr;
2150         fl4.flowi4_uid = sock_net_uid(net, NULL);
2151
2152         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2153                 flkeys = &_flkeys;
2154         } else {
2155                 fl4.flowi4_proto = 0;
2156                 fl4.fl4_sport = 0;
2157                 fl4.fl4_dport = 0;
2158         }
2159
2160         err = fib_lookup(net, &fl4, res, 0);
2161         if (err != 0) {
2162                 if (!IN_DEV_FORWARD(in_dev))
2163                         err = -EHOSTUNREACH;
2164                 goto no_route;
2165         }
2166
2167         if (res->type == RTN_BROADCAST) {
2168                 if (IN_DEV_BFORWARD(in_dev))
2169                         goto make_route;
2170                 /* not do cache if bc_forwarding is enabled */
2171                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2172                         do_cache = false;
2173                 goto brd_input;
2174         }
2175
2176         if (res->type == RTN_LOCAL) {
2177                 err = fib_validate_source(skb, saddr, daddr, tos,
2178                                           0, dev, in_dev, &itag);
2179                 if (err < 0)
2180                         goto martian_source;
2181                 goto local_input;
2182         }
2183
2184         if (!IN_DEV_FORWARD(in_dev)) {
2185                 err = -EHOSTUNREACH;
2186                 goto no_route;
2187         }
2188         if (res->type != RTN_UNICAST)
2189                 goto martian_destination;
2190
2191 make_route:
2192         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2193 out:    return err;
2194
2195 brd_input:
2196         if (skb->protocol != htons(ETH_P_IP))
2197                 goto e_inval;
2198
2199         if (!ipv4_is_zeronet(saddr)) {
2200                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2201                                           in_dev, &itag);
2202                 if (err < 0)
2203                         goto martian_source;
2204         }
2205         flags |= RTCF_BROADCAST;
2206         res->type = RTN_BROADCAST;
2207         RT_CACHE_STAT_INC(in_brd);
2208
2209 local_input:
2210         do_cache &= res->fi && !itag;
2211         if (do_cache) {
2212                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2213
2214                 rth = rcu_dereference(nhc->nhc_rth_input);
2215                 if (rt_cache_valid(rth)) {
2216                         skb_dst_set_noref(skb, &rth->dst);
2217                         err = 0;
2218                         goto out;
2219                 }
2220         }
2221
2222         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2223                            flags | RTCF_LOCAL, res->type,
2224                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2225         if (!rth)
2226                 goto e_nobufs;
2227
2228         rth->dst.output= ip_rt_bug;
2229 #ifdef CONFIG_IP_ROUTE_CLASSID
2230         rth->dst.tclassid = itag;
2231 #endif
2232         rth->rt_is_input = 1;
2233
2234         RT_CACHE_STAT_INC(in_slow_tot);
2235         if (res->type == RTN_UNREACHABLE) {
2236                 rth->dst.input= ip_error;
2237                 rth->dst.error= -err;
2238                 rth->rt_flags   &= ~RTCF_LOCAL;
2239         }
2240
2241         if (do_cache) {
2242                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2243
2244                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2245                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2246                         WARN_ON(rth->dst.input == lwtunnel_input);
2247                         rth->dst.lwtstate->orig_input = rth->dst.input;
2248                         rth->dst.input = lwtunnel_input;
2249                 }
2250
2251                 if (unlikely(!rt_cache_route(nhc, rth)))
2252                         rt_add_uncached_list(rth);
2253         }
2254         skb_dst_set(skb, &rth->dst);
2255         err = 0;
2256         goto out;
2257
2258 no_route:
2259         RT_CACHE_STAT_INC(in_no_route);
2260         res->type = RTN_UNREACHABLE;
2261         res->fi = NULL;
2262         res->table = NULL;
2263         goto local_input;
2264
2265         /*
2266          *      Do not cache martian addresses: they should be logged (RFC1812)
2267          */
2268 martian_destination:
2269         RT_CACHE_STAT_INC(in_martian_dst);
2270 #ifdef CONFIG_IP_ROUTE_VERBOSE
2271         if (IN_DEV_LOG_MARTIANS(in_dev))
2272                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2273                                      &daddr, &saddr, dev->name);
2274 #endif
2275
2276 e_inval:
2277         err = -EINVAL;
2278         goto out;
2279
2280 e_nobufs:
2281         err = -ENOBUFS;
2282         goto out;
2283
2284 martian_source:
2285         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2286         goto out;
2287 }
2288
2289 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2290                          u8 tos, struct net_device *dev)
2291 {
2292         struct fib_result res;
2293         int err;
2294
2295         tos &= IPTOS_RT_MASK;
2296         rcu_read_lock();
2297         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2298         rcu_read_unlock();
2299
2300         return err;
2301 }
2302 EXPORT_SYMBOL(ip_route_input_noref);
2303
2304 /* called with rcu_read_lock held */
2305 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2306                        u8 tos, struct net_device *dev, struct fib_result *res)
2307 {
2308         /* Multicast recognition logic is moved from route cache to here.
2309            The problem was that too many Ethernet cards have broken/missing
2310            hardware multicast filters :-( As result the host on multicasting
2311            network acquires a lot of useless route cache entries, sort of
2312            SDR messages from all the world. Now we try to get rid of them.
2313            Really, provided software IP multicast filter is organized
2314            reasonably (at least, hashed), it does not result in a slowdown
2315            comparing with route cache reject entries.
2316            Note, that multicast routers are not affected, because
2317            route cache entry is created eventually.
2318          */
2319         if (ipv4_is_multicast(daddr)) {
2320                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2321                 int our = 0;
2322                 int err = -EINVAL;
2323
2324                 if (!in_dev)
2325                         return err;
2326                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2327                                       ip_hdr(skb)->protocol);
2328
2329                 /* check l3 master if no match yet */
2330                 if (!our && netif_is_l3_slave(dev)) {
2331                         struct in_device *l3_in_dev;
2332
2333                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2334                         if (l3_in_dev)
2335                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2336                                                       ip_hdr(skb)->protocol);
2337                 }
2338
2339                 if (our
2340 #ifdef CONFIG_IP_MROUTE
2341                         ||
2342                     (!ipv4_is_local_multicast(daddr) &&
2343                      IN_DEV_MFORWARD(in_dev))
2344 #endif
2345                    ) {
2346                         err = ip_route_input_mc(skb, daddr, saddr,
2347                                                 tos, dev, our);
2348                 }
2349                 return err;
2350         }
2351
2352         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2353 }
2354
2355 /*
      * Return the output route for flow @fl4 leaving through @dev_out.
      *
      * @res:      FIB lookup result; res->fi and res->type seed the route
      * @fl4:      flow being routed (daddr/saddr/proto/flags are read)
      * @orig_oif: output interface index originally requested; stored in
      *            rt_iif of a newly built route
      * @dev_out:  device the route goes out of
      * @flags:    initial RTCF_* flags, extended here depending on the
      *            destination type
      *
      * A cached route (from the nexthop exception for fl4->daddr, or the
      * per-cpu output cache of the nexthop) is returned when caching is
      * allowed, the entry is still valid and dst_hold_safe() succeeds on it.
      * Otherwise a new rtable is allocated and set up.
      *
      * Returns the route, ERR_PTR(-EINVAL) if @dev_out has no in_device, the
      * loopback-source check fails or the destination is a zeronet address,
      * or ERR_PTR(-ENOBUFS) if the allocation fails.
      *
      * called with rcu_read_lock()
      */
2356 static struct rtable *__mkroute_output(const struct fib_result *res,
2357                                        const struct flowi4 *fl4, int orig_oif,
2358                                        struct net_device *dev_out,
2359                                        unsigned int flags)
2360 {
2361         struct fib_info *fi = res->fi;
2362         struct fib_nh_exception *fnhe;
2363         struct in_device *in_dev;
2364         u16 type = res->type;
2365         struct rtable *rth;
2366         bool do_cache;
2367
2368         in_dev = __in_dev_get_rcu(dev_out);
2369         if (!in_dev)
2370                 return ERR_PTR(-EINVAL);
2371
             /* A loopback source is refused on a device that is neither
              * loopback nor an l3 master, unless IN_DEV_ROUTE_LOCALNET() is set.
              */
2372         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2373                 if (ipv4_is_loopback(fl4->saddr) &&
2374                     !(dev_out->flags & IFF_LOOPBACK) &&
2375                     !netif_is_l3_master(dev_out))
2376                         return ERR_PTR(-EINVAL);
2377
             /* The destination address can override the type from the lookup. */
2378         if (ipv4_is_lbcast(fl4->daddr))
2379                 type = RTN_BROADCAST;
2380         else if (ipv4_is_multicast(fl4->daddr))
2381                 type = RTN_MULTICAST;
2382         else if (ipv4_is_zeronet(fl4->daddr))
2383                 return ERR_PTR(-EINVAL);
2384
2385         if (dev_out->flags & IFF_LOOPBACK)
2386                 flags |= RTCF_LOCAL;
2387
2388         do_cache = true;
2389         if (type == RTN_BROADCAST) {
2390                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2391                 fi = NULL;
2392         } else if (type == RTN_MULTICAST) {
2393                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only if ip_check_mc_rcu() matches on
                      * this device; such routes are not cached.
                      */
2394                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2395                                      fl4->flowi4_proto))
2396                         flags &= ~RTCF_LOCAL;
2397                 else
2398                         do_cache = false;
2399                 /* If multicast route do not exist use
2400                  * default one, but do not gateway in this case.
2401                  * Yes, it is hack.
2402                  */
2403                 if (fi && res->prefixlen < 4)
2404                         fi = NULL;
2405         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2406                    (orig_oif != dev_out->ifindex)) {
2407                 /* For local routes that require a particular output interface
2408                  * we do not want to cache the result.  Caching the result
2409                  * causes incorrect behaviour when there are multiple source
2410                  * addresses on the interface, the end result being that if the
2411                  * intended recipient is waiting on that interface for the
2412                  * packet he won't receive it because it will be delivered on
2413                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2414                  * be set to the loopback interface as well.
2415                  */
2416                 do_cache = false;
2417         }
2418
2419         fnhe = NULL;
             /* Without a fib_info there is no nexthop to cache the route on. */
2420         do_cache &= fi != NULL;
2421         if (fi) {
2422                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2423                 struct rtable __rcu **prth;
2424
                     /* Looked up even when not caching: rt_set_nexthop() below
                      * is given the exception too.
                      */
2425                 fnhe = find_exception(nhc, fl4->daddr);
2426                 if (!do_cache)
2427                         goto add;
2428                 if (fnhe) {
2429                         prth = &fnhe->fnhe_rth_output;
2430                 } else {
                             /* With FLOWI_FLAG_KNOWN_NH, skip the cache unless
                              * the nexthop has a gateway and link scope.
                              */
2431                         if (unlikely(fl4->flowi4_flags &
2432                                      FLOWI_FLAG_KNOWN_NH &&
2433                                      !(nhc->nhc_gw_family &&
2434                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2435                                 do_cache = false;
2436                                 goto add;
2437                         }
2438                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2439                 }
2440                 rth = rcu_dereference(*prth);
2441                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2442                         return rth;
2443         }
2444
2445 add:
2446         rth = rt_dst_alloc(dev_out, flags, type,
2447                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2448                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2449         if (!rth)
2450                 return ERR_PTR(-ENOBUFS);
2451
2452         rth->rt_iif = orig_oif;
2453
2454         RT_CACHE_STAT_INC(out_slow_tot);
2455
2456         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Locally delivered broadcast/multicast on a real device
                      * is sent through ip_mc_output.
                      */
2457                 if (flags & RTCF_LOCAL &&
2458                     !(dev_out->flags & IFF_LOOPBACK)) {
2459                         rth->dst.output = ip_mc_output;
2460                         RT_CACHE_STAT_INC(out_slow_mc);
2461                 }
2462 #ifdef CONFIG_IP_MROUTE
2463                 if (type == RTN_MULTICAST) {
2464                         if (IN_DEV_MFORWARD(in_dev) &&
2465                             !ipv4_is_local_multicast(fl4->daddr)) {
2466                                 rth->dst.input = ip_mr_input;
2467                                 rth->dst.output = ip_mc_output;
2468                         }
2469                 }
2470 #endif
2471         }
2472
2473         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2474         lwtunnel_set_redirect(&rth->dst);
2475
2476         return rth;
2477 }
2478
2479 /*
2480  * Major route resolver routine.
2481  */
2482
2483 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2484                                         const struct sk_buff *skb)
2485 {
2486         __u8 tos = RT_FL_TOS(fl4);
2487         struct fib_result res = {
2488                 .type           = RTN_UNSPEC,
2489                 .fi             = NULL,
2490                 .table          = NULL,
2491                 .tclassid       = 0,
2492         };
2493         struct rtable *rth;
2494
2495         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2496         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2497         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2498                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2499
2500         rcu_read_lock();
2501         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2502         rcu_read_unlock();
2503
2504         return rth;
2505 }
2506 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2507
/*
 * Resolve an output route for the flow described by @fl4.
 *
 * @net: network namespace used for device and FIB lookups
 * @fl4: flow key; saddr, daddr and flowi4_oif may be filled in or
 *       overwritten here as the lookup progresses
 * @res: FIB lookup result, written by this function
 * @skb: passed through to fib_select_path(); not otherwise used here
 *
 * Both callers visible in this file invoke this function between
 * rcu_read_lock() and rcu_read_unlock().
 *
 * Returns the value of __mkroute_output() on the success paths, or an
 * ERR_PTR(): -EINVAL for a multicast/limited-broadcast/zeronet source,
 * -ENETUNREACH when the source address or output device is unusable,
 * -ENODEV when flowi4_oif names no device, or the fib_lookup() error.
 */
2508 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2509                                             struct fib_result *res,
2510                                             const struct sk_buff *skb)
2511 {
2512         struct net_device *dev_out = NULL;
2513         int orig_oif = fl4->flowi4_oif;
2514         unsigned int flags = 0;
2515         struct rtable *rth;
2516         int err;
2517
             /* Step 1: a caller-supplied source address is validated first. */
2518         if (fl4->saddr) {
2519                 if (ipv4_is_multicast(fl4->saddr) ||
2520                     ipv4_is_lbcast(fl4->saddr) ||
2521                     ipv4_is_zeronet(fl4->saddr)) {
2522                         rth = ERR_PTR(-EINVAL);
2523                         goto out;
2524                 }
2525
                     /* Default error for every "goto out" in the rest of this branch. */
2526                 rth = ERR_PTR(-ENETUNREACH);
2527
2528                 /* I removed check for oif == dev_out->oif here.
2529                    It was wrong for two reasons:
2530                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2531                       is assigned to multiple interfaces.
2532                    2. Moreover, we are allowed to send packets with saddr
2533                       of another iface. --ANK
2534                  */
2535
2536                 if (fl4->flowi4_oif == 0 &&
2537                     (ipv4_is_multicast(fl4->daddr) ||
2538                      ipv4_is_lbcast(fl4->daddr))) {
2539                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2540                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2541                         if (!dev_out)
2542                                 goto out;
2543
2544                         /* Special hack: user can direct multicasts
2545                            and limited broadcast via necessary interface
2546                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2547                            This hack is not just for fun, it allows
2548                            vic,vat and friends to work.
2549                            They bind socket to loopback, set ttl to zero
2550                            and expect that it will work.
2551                            From the viewpoint of routing cache they are broken,
2552                            because we are not allowed to build multicast path
2553                            with loopback source addr (look, routing cache
2554                            cannot know, that ttl is zero, so that packet
2555                            will not leave this host and route is valid).
2556                            Luckily, this hack is good workaround.
2557                          */
2558
2559                         fl4->flowi4_oif = dev_out->ifindex;
2560                         goto make_route;
2561                 }
2562
2563                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2564                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2565                         if (!__ip_dev_find(net, fl4->saddr, false))
2566                                 goto out;
2567                 }
2568         }
2569
2570
             /* Step 2: an explicit output interface must exist, be up and have
              * an in_device before it is used.
              */
2571         if (fl4->flowi4_oif) {
2572                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2573                 rth = ERR_PTR(-ENODEV);
2574                 if (!dev_out)
2575                         goto out;
2576
2577                 /* RACE: Check return value of inet_select_addr instead. */
2578                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2579                         rth = ERR_PTR(-ENETUNREACH);
2580                         goto out;
2581                 }
                     /* Local multicast, limited broadcast and IGMP skip the FIB
                      * lookup and go straight to route construction.
                      */
2582                 if (ipv4_is_local_multicast(fl4->daddr) ||
2583                     ipv4_is_lbcast(fl4->daddr) ||
2584                     fl4->flowi4_proto == IPPROTO_IGMP) {
2585                         if (!fl4->saddr)
2586                                 fl4->saddr = inet_select_addr(dev_out, 0,
2587                                                               RT_SCOPE_LINK);
2588                         goto make_route;
2589                 }
2590                 if (!fl4->saddr) {
2591                         if (ipv4_is_multicast(fl4->daddr))
2592                                 fl4->saddr = inet_select_addr(dev_out, 0,
2593                                                               fl4->flowi4_scope);
2594                         else if (!fl4->daddr)
2595                                 fl4->saddr = inet_select_addr(dev_out, 0,
2596                                                               RT_SCOPE_HOST);
2597                 }
2598         }
2599
             /* Step 3: no destination given. Use the source address (or
              * INADDR_LOOPBACK when that is unset too) as destination and build
              * a local route over the namespace's loopback device.
              */
2600         if (!fl4->daddr) {
2601                 fl4->daddr = fl4->saddr;
2602                 if (!fl4->daddr)
2603                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2604                 dev_out = net->loopback_dev;
2605                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2606                 res->type = RTN_LOCAL;
2607                 flags |= RTCF_LOCAL;
2608                 goto make_route;
2609         }
2610
             /* Step 4: consult the FIB. On failure, fall back to an on-link
              * unicast route only when an output interface was specified and
              * the destination is multicast or the interface is not an L3
              * master; otherwise return the lookup error.
              */
2611         err = fib_lookup(net, fl4, res, 0);
2612         if (err) {
2613                 res->fi = NULL;
2614                 res->table = NULL;
2615                 if (fl4->flowi4_oif &&
2616                     (ipv4_is_multicast(fl4->daddr) ||
2617                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2618                         /* Apparently, routing tables are wrong. Assume,
2619                            that the destination is on link.
2620
2621                            WHY? DW.
2622                            Because we are allowed to send to iface
2623                            even if it has NO routes and NO assigned
2624                            addresses. When oif is specified, routing
2625                            tables are looked up with only one purpose:
2626                            to catch if destination is gatewayed, rather than
2627                            direct. Moreover, if MSG_DONTROUTE is set,
2628                            we send packet, ignoring both routing tables
2629                            and ifaddr state. --ANK
2630
2631
2632                            We could make it even if oif is unknown,
2633                            likely IPv6, but we do not.
2634                          */
2635
2636                         if (fl4->saddr == 0)
2637                                 fl4->saddr = inet_select_addr(dev_out, 0,
2638                                                               RT_SCOPE_LINK);
2639                         res->type = RTN_UNICAST;
2640                         goto make_route;
2641                 }
2642                 rth = ERR_PTR(err);
2643                 goto out;
2644         }
2645
             /* Step 5: the FIB says the destination is local. Pick a source
              * address if none was given and send via the L3 master device or,
              * failing that, the loopback device.
              */
2646         if (res->type == RTN_LOCAL) {
2647                 if (!fl4->saddr) {
2648                         if (res->fi->fib_prefsrc)
2649                                 fl4->saddr = res->fi->fib_prefsrc;
2650                         else
2651                                 fl4->saddr = fl4->daddr;
2652                 }
2653
2654                 /* L3 master device is the loopback for that domain */
2655                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2656                         net->loopback_dev;
2657
2658                 /* make sure orig_oif points to fib result device even
2659                  * though packet rx/tx happens over loopback or l3mdev
2660                  */
2661                 orig_oif = FIB_RES_OIF(*res);
2662
2663                 fl4->flowi4_oif = dev_out->ifindex;
2664                 flags |= RTCF_LOCAL;
2665                 goto make_route;
2666         }
2667
             /* Step 6: ordinary forwarded destination. NOTE(review):
              * fib_select_path() presumably chooses among multiple next hops
              * and may fill in the source address; confirm at its definition.
              */
2668         fib_select_path(net, res, fl4, skb);
2669
2670         dev_out = FIB_RES_DEV(*res);
2671         fl4->flowi4_oif = dev_out->ifindex;
2672
2673
2674 make_route:
2675         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2676
2677 out:
2678         return rth;
2679 }
2680
2681 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2682 {
2683         return NULL;
2684 }
2685
2686 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2687 {
2688         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2689
2690         return mtu ? : dst->dev->mtu;
2691 }
2692
2693 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2694                                           struct sk_buff *skb, u32 mtu,
2695                                           bool confirm_neigh)
2696 {
2697 }
2698
/* .redirect callback for blackhole routes: intentionally a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                                       struct sk_buff *skb)
{
        /* Nothing to do: all arguments are ignored. */
}
2703
2704 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2705                                           unsigned long old)
2706 {
2707         return NULL;
2708 }
2709
2710 static struct dst_ops ipv4_dst_blackhole_ops = {
2711         .family                 =       AF_INET,
2712         .check                  =       ipv4_blackhole_dst_check,
2713         .mtu                    =       ipv4_blackhole_mtu,
2714         .default_advmss         =       ipv4_default_advmss,
2715         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2716         .redirect               =       ipv4_rt_blackhole_redirect,
2717         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2718         .neigh_lookup           =       ipv4_neigh_lookup,
2719 };
2720
2721 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2722 {
2723         struct rtable *ort = (struct rtable *) dst_orig;
2724         struct rtable *rt;
2725
2726         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2727         if (rt) {
2728                 struct dst_entry *new = &rt->dst;
2729
2730                 new->__use = 1;
2731                 new->input = dst_discard;
2732                 new->output = dst_discard_out;
2733
2734                 new->dev = net->loopback_dev;
2735                 if (new->dev)
2736                         dev_hold(new->dev);
2737
2738                 rt->rt_is_input = ort->rt_is_input;
2739                 rt->rt_iif = ort->rt_iif;
2740                 rt->rt_pmtu = ort->rt_pmtu;
2741                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2742
2743                 rt->rt_genid = rt_genid_ipv4(net);
2744                 rt->rt_flags = ort->rt_flags;
2745                 rt->rt_type = ort->rt_type;
2746                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2747                 rt->rt_gw_family = ort->rt_gw_family;
2748                 if (rt->rt_gw_family == AF_INET)
2749                         rt->rt_gw4 = ort->rt_gw4;
2750                 else if (rt->rt_gw_family == AF_INET6)
2751                         rt->rt_gw6 = ort->rt_gw6;
2752
2753                 INIT_LIST_HEAD(&rt->rt_uncached);
2754         }
2755
2756         dst_release(dst_orig);
2757
2758         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2759 }
2760
2761 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2762                                     const struct sock *sk)
2763 {
2764         struct rtable *rt = __ip_route_output_key(net, flp4);
2765
2766         if (IS_ERR(rt))
2767                 return rt;
2768
2769         if (flp4->flowi4_proto)
2770                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2771                                                         flowi4_to_flowi(flp4),
2772                                                         sk, 0);
2773
2774         return rt;
2775 }
2776 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2777
2778 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2779                                       struct net_device *dev,
2780                                       struct net *net, __be32 *saddr,
2781                                       const struct ip_tunnel_info *info,
2782                                       u8 protocol, bool use_cache)
2783 {
2784 #ifdef CONFIG_DST_CACHE
2785         struct dst_cache *dst_cache;
2786 #endif
2787         struct rtable *rt = NULL;
2788         struct flowi4 fl4;
2789         __u8 tos;
2790
2791 #ifdef CONFIG_DST_CACHE
2792         dst_cache = (struct dst_cache *)&info->dst_cache;
2793         if (use_cache) {
2794                 rt = dst_cache_get_ip4(dst_cache, saddr);
2795                 if (rt)
2796                         return rt;
2797         }
2798 #endif
2799         memset(&fl4, 0, sizeof(fl4));
2800         fl4.flowi4_mark = skb->mark;
2801         fl4.flowi4_proto = protocol;
2802         fl4.daddr = info->key.u.ipv4.dst;
2803         fl4.saddr = info->key.u.ipv4.src;
2804         tos = info->key.tos;
2805         fl4.flowi4_tos = RT_TOS(tos);
2806
2807         rt = ip_route_output_key(net, &fl4);
2808         if (IS_ERR(rt)) {
2809                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2810                 return ERR_PTR(-ENETUNREACH);
2811         }
2812         if (rt->dst.dev == dev) { /* is this necessary? */
2813                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2814                 ip_rt_put(rt);
2815                 return ERR_PTR(-ELOOP);
2816         }
2817 #ifdef CONFIG_DST_CACHE
2818         if (use_cache)
2819                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2820 #endif
2821         *saddr = fl4.saddr;
2822         return rt;
2823 }
2824 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2825
/*
 * Append one RTM_NEWROUTE netlink message describing @rt to @skb.
 *
 * @net:      namespace; read only for the multicast-forwarding check
 * @dst:      destination address, emitted as RTA_DST
 * @src:      source address; emitted as RTA_SRC only when non-zero
 * @rt:       route being described
 * @table_id: emitted in full as RTA_TABLE
 * @fl4:      optional flow key (may be NULL); supplies TOS, preferred
 *            source, mark, uid and input interface
 * @skb:      message buffer being filled
 * @portid, @seq, @flags: passed to nlmsg_put() for the netlink header
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header or any attribute
 * does not fit; in the attribute case the partial message is cancelled
 * first.
 */
2826 /* called with rcu_read_lock held */
2827 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2828                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2829                         struct sk_buff *skb, u32 portid, u32 seq,
2830                         unsigned int flags)
2831 {
2832         struct rtmsg *r;
2833         struct nlmsghdr *nlh;
2834         unsigned long expires = 0;
2835         u32 error;
2836         u32 metrics[RTAX_MAX];
2837
2838         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2839         if (!nlh)
2840                 return -EMSGSIZE;
2841
2842         r = nlmsg_data(nlh);
2843         r->rtm_family    = AF_INET;
2844         r->rtm_dst_len  = 32;
2845         r->rtm_src_len  = 0;
2846         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
             /* Table ids of 256 and above are reported as RT_TABLE_COMPAT in
              * the header; the real id always travels in RTA_TABLE.
              */
2847         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2848         if (nla_put_u32(skb, RTA_TABLE, table_id))
2849                 goto nla_put_failure;
2850         r->rtm_type     = rt->rt_type;
2851         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2852         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits of rt_flags above the low 16 are exported. */
2853         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2854         if (rt->rt_flags & RTCF_NOTIFY)
2855                 r->rtm_flags |= RTM_F_NOTIFY;
2856         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2857                 r->rtm_flags |= RTCF_DOREDIRECT;
2858
2859         if (nla_put_in_addr(skb, RTA_DST, dst))
2860                 goto nla_put_failure;
2861         if (src) {
2862                 r->rtm_src_len = 32;
2863                 if (nla_put_in_addr(skb, RTA_SRC, src))
2864                         goto nla_put_failure;
2865         }
2866         if (rt->dst.dev &&
2867             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2868                 goto nla_put_failure;
2869 #ifdef CONFIG_IP_ROUTE_CLASSID
2870         if (rt->dst.tclassid &&
2871             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2872                 goto nla_put_failure;
2873 #endif
             /* For output routes, report the flow's source address as the
              * preferred source when it differs from @src.
              */
2874         if (fl4 && !rt_is_input_route(rt) &&
2875             fl4->saddr != src) {
2876                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2877                         goto nla_put_failure;
2878         }
             /* An IPv4 gateway goes out as RTA_GATEWAY; an IPv6 gateway is
              * encoded as an RTA_VIA attribute holding family plus address.
              */
2879         if (rt->rt_uses_gateway) {
2880                 if (rt->rt_gw_family == AF_INET &&
2881                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2882                         goto nla_put_failure;
2883                 } else if (rt->rt_gw_family == AF_INET6) {
2884                         int alen = sizeof(struct in6_addr);
2885                         struct nlattr *nla;
2886                         struct rtvia *via;
2887
2888                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2889                         if (!nla)
2890                                 goto nla_put_failure;
2891
2892                         via = nla_data(nla);
2893                         via->rtvia_family = AF_INET6;
2894                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2895                 }
2896         }
2897
             /* Turn the absolute expiry time into the time remaining from now;
              * zero means no expiry is set or it has already passed.
              */
2898         expires = rt->dst.expires;
2899         if (expires) {
2900                 unsigned long now = jiffies;
2901
2902                 if (time_before(now, expires))
2903                         expires -= now;
2904                 else
2905                         expires = 0;
2906         }
2907
             /* Work on a local copy of the metrics so the PMTU and lock
              * overrides below affect only this message.
              */
2908         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2909         if (rt->rt_pmtu && expires)
2910                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2911         if (rt->rt_mtu_locked && expires)
2912                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2913         if (rtnetlink_put_metrics(skb, metrics) < 0)
2914                 goto nla_put_failure;
2915
2916         if (fl4) {
2917                 if (fl4->flowi4_mark &&
2918                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2919                         goto nla_put_failure;
2920
2921                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2922                     nla_put_u32(skb, RTA_UID,
2923                                 from_kuid_munged(current_user_ns(),
2924                                                  fl4->flowi4_uid)))
2925                         goto nla_put_failure;
2926
2927                 if (rt_is_input_route(rt)) {
2928 #ifdef CONFIG_IP_MROUTE
                             /* Forwardable multicast destinations are handed to
                              * ipmr_get_route(). A result of 0 returns straight
                              * away, skipping RTA_IIF, cacheinfo and
                              * nlmsg_end() below.
                              * NOTE(review): presumably ipmr_get_route() has
                              * taken over completing the reply in that case;
                              * confirm at its definition.
                              */
2929                         if (ipv4_is_multicast(dst) &&
2930                             !ipv4_is_local_multicast(dst) &&
2931                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2932                                 int err = ipmr_get_route(net, skb,
2933                                                          fl4->saddr, fl4->daddr,
2934                                                          r, portid);
2935
2936                                 if (err <= 0) {
2937                                         if (err == 0)
2938                                                 return 0;
2939                                         goto nla_put_failure;
2940                                 }
2941                         } else
2942 #endif
                                     /* This statement is the body of the "else"
                                      * above when CONFIG_IP_MROUTE is set, and
                                      * unconditional otherwise.
                                      */
2943                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2944                                         goto nla_put_failure;
2945                 }
2946         }
2947
2948         error = rt->dst.error;
2949
2950         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2951                 goto nla_put_failure;
2952
2953         nlmsg_end(skb, nlh);
2954         return 0;
2955
2956 nla_put_failure:
2957         nlmsg_cancel(skb, nlh);
2958         return -EMSGSIZE;
2959 }
2960
2961 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2962                             struct netlink_callback *cb, u32 table_id,
2963                             struct fnhe_hash_bucket *bucket, int genid,
2964                             int *fa_index, int fa_start, unsigned int flags)
2965 {
2966         int i;
2967
2968         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2969                 struct fib_nh_exception *fnhe;
2970
2971                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2972                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2973                         struct rtable *rt;
2974                         int err;
2975
2976                         if (*fa_index < fa_start)
2977                                 goto next;
2978
2979                         if (fnhe->fnhe_genid != genid)
2980                                 goto next;
2981
2982                         if (fnhe->fnhe_expires &&
2983                             time_after(jiffies, fnhe->fnhe_expires))
2984                                 goto next;
2985
2986                         rt = rcu_dereference(fnhe->fnhe_rth_input);
2987                         if (!rt)
2988                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
2989                         if (!rt)
2990                                 goto next;
2991
2992                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2993                                            table_id, NULL, skb,
2994                                            NETLINK_CB(cb->skb).portid,
2995                                            cb->nlh->nlmsg_seq, flags);
2996                         if (err)
2997                                 return err;
2998 next:
2999                         (*fa_index)++;
3000                 }
3001         }
3002
3003         return 0;
3004 }
3005
3006 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3007                        u32 table_id, struct fib_info *fi,
3008                        int *fa_index, int fa_start, unsigned int flags)
3009 {
3010         struct net *net = sock_net(cb->skb->sk);
3011         int nhsel, genid = fnhe_genid(net);
3012
3013         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3014                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3015                 struct fnhe_hash_bucket *bucket;
3016                 int err;
3017
3018                 if (nhc->nhc_flags & RTNH_F_DEAD)
3019                         continue;
3020
3021                 rcu_read_lock();
3022                 bucket = rcu_dereference(nhc->nhc_exceptions);
3023                 err = 0;
3024                 if (bucket)
3025                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3026                                                genid, fa_index, fa_start,
3027                                                flags);
3028                 rcu_read_unlock();
3029                 if (err)
3030                         return err;
3031         }
3032
3033         return 0;
3034 }
3035
3036 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3037                                                    u8 ip_proto, __be16 sport,
3038                                                    __be16 dport)
3039 {
3040         struct sk_buff *skb;
3041         struct iphdr *iph;
3042
3043         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3044         if (!skb)
3045                 return NULL;
3046
3047         /* Reserve room for dummy headers, this skb can pass
3048          * through good chunk of routing engine.
3049          */
3050         skb_reset_mac_header(skb);
3051         skb_reset_network_header(skb);
3052         skb->protocol = htons(ETH_P_IP);
3053         iph = skb_put(skb, sizeof(struct iphdr));
3054         iph->protocol = ip_proto;
3055         iph->saddr = src;
3056         iph->daddr = dst;
3057         iph->version = 0x4;
3058         iph->frag_off = 0;
3059         iph->ihl = 0x5;
3060         skb_set_transport_header(skb, skb->len);
3061
3062         switch (iph->protocol) {
3063         case IPPROTO_UDP: {
3064                 struct udphdr *udph;
3065
3066                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3067                 udph->source = sport;
3068                 udph->dest = dport;
3069                 udph->len = sizeof(struct udphdr);
3070                 udph->check = 0;
3071                 break;
3072         }
3073         case IPPROTO_TCP: {
3074                 struct tcphdr *tcph;
3075
3076                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3077                 tcph->source    = sport;
3078                 tcph->dest      = dport;
3079                 tcph->doff      = sizeof(struct tcphdr) / 4;
3080                 tcph->rst = 1;
3081                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3082                                             src, dst, 0);
3083                 break;
3084         }
3085         case IPPROTO_ICMP: {
3086                 struct icmphdr *icmph;
3087
3088                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3089                 icmph->type = ICMP_ECHO;
3090                 icmph->code = 0;
3091         }
3092         }
3093
3094         return skb;
3095 }
3096
3097 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3098                                        const struct nlmsghdr *nlh,
3099                                        struct nlattr **tb,
3100                                        struct netlink_ext_ack *extack)
3101 {
3102         struct rtmsg *rtm;
3103         int i, err;
3104
3105         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3106                 NL_SET_ERR_MSG(extack,
3107                                "ipv4: Invalid header for route get request");
3108                 return -EINVAL;
3109         }
3110
3111         if (!netlink_strict_get_check(skb))
3112                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3113                                               rtm_ipv4_policy, extack);
3114
3115         rtm = nlmsg_data(nlh);
3116         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3117             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3118             rtm->rtm_table || rtm->rtm_protocol ||
3119             rtm->rtm_scope || rtm->rtm_type) {
3120                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3121                 return -EINVAL;
3122         }
3123
3124         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3125                                RTM_F_LOOKUP_TABLE |
3126                                RTM_F_FIB_MATCH)) {
3127                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3128                 return -EINVAL;
3129         }
3130
3131         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3132                                             rtm_ipv4_policy, extack);
3133         if (err)
3134                 return err;
3135
3136         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3137             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3138                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3139                 return -EINVAL;
3140         }
3141
3142         for (i = 0; i <= RTA_MAX; i++) {
3143                 if (!tb[i])
3144                         continue;
3145
3146                 switch (i) {
3147                 case RTA_IIF:
3148                 case RTA_OIF:
3149                 case RTA_SRC:
3150                 case RTA_DST:
3151                 case RTA_IP_PROTO:
3152                 case RTA_SPORT:
3153                 case RTA_DPORT:
3154                 case RTA_MARK:
3155                 case RTA_UID:
3156                         break;
3157                 default:
3158                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3159                         return -EINVAL;
3160                 }
3161         }
3162
3163         return 0;
3164 }
3165
3166 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3167                              struct netlink_ext_ack *extack)
3168 {
3169         struct net *net = sock_net(in_skb->sk);
3170         struct nlattr *tb[RTA_MAX+1];
3171         u32 table_id = RT_TABLE_MAIN;
3172         __be16 sport = 0, dport = 0;
3173         struct fib_result res = {};
3174         u8 ip_proto = IPPROTO_UDP;
3175         struct rtable *rt = NULL;
3176         struct sk_buff *skb;
3177         struct rtmsg *rtm;
3178         struct flowi4 fl4 = {};
3179         __be32 dst = 0;
3180         __be32 src = 0;
3181         kuid_t uid;
3182         u32 iif;
3183         int err;
3184         int mark;
3185
3186         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3187         if (err < 0)
3188                 return err;
3189
3190         rtm = nlmsg_data(nlh);
3191         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3192         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3193         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3194         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3195         if (tb[RTA_UID])
3196                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3197         else
3198                 uid = (iif ? INVALID_UID : current_uid());
3199
3200         if (tb[RTA_IP_PROTO]) {
3201                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3202                                                   &ip_proto, AF_INET, extack);
3203                 if (err)
3204                         return err;
3205         }
3206
3207         if (tb[RTA_SPORT])
3208                 sport = nla_get_be16(tb[RTA_SPORT]);
3209
3210         if (tb[RTA_DPORT])
3211                 dport = nla_get_be16(tb[RTA_DPORT]);
3212
3213         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3214         if (!skb)
3215                 return -ENOBUFS;
3216
3217         fl4.daddr = dst;
3218         fl4.saddr = src;
3219         fl4.flowi4_tos = rtm->rtm_tos;
3220         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3221         fl4.flowi4_mark = mark;
3222         fl4.flowi4_uid = uid;
3223         if (sport)
3224                 fl4.fl4_sport = sport;
3225         if (dport)
3226                 fl4.fl4_dport = dport;
3227         fl4.flowi4_proto = ip_proto;
3228
3229         rcu_read_lock();
3230
3231         if (iif) {
3232                 struct net_device *dev;
3233
3234                 dev = dev_get_by_index_rcu(net, iif);
3235                 if (!dev) {
3236                         err = -ENODEV;
3237                         goto errout_rcu;
3238                 }
3239
3240                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3241                 skb->dev        = dev;
3242                 skb->mark       = mark;
3243                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3244                                          dev, &res);
3245
3246                 rt = skb_rtable(skb);
3247                 if (err == 0 && rt->dst.error)
3248                         err = -rt->dst.error;
3249         } else {
3250                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3251                 skb->dev = net->loopback_dev;
3252                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3253                 err = 0;
3254                 if (IS_ERR(rt))
3255                         err = PTR_ERR(rt);
3256                 else
3257                         skb_dst_set(skb, &rt->dst);
3258         }
3259
3260         if (err)
3261                 goto errout_rcu;
3262
3263         if (rtm->rtm_flags & RTM_F_NOTIFY)
3264                 rt->rt_flags |= RTCF_NOTIFY;
3265
3266         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3267                 table_id = res.table ? res.table->tb_id : 0;
3268
3269         /* reset skb for netlink reply msg */
3270         skb_trim(skb, 0);
3271         skb_reset_network_header(skb);
3272         skb_reset_transport_header(skb);
3273         skb_reset_mac_header(skb);
3274
3275         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3276                 struct fib_rt_info fri;
3277
3278                 if (!res.fi) {
3279                         err = fib_props[res.type].error;
3280                         if (!err)
3281                                 err = -EHOSTUNREACH;
3282                         goto errout_rcu;
3283                 }
3284                 fri.fi = res.fi;
3285                 fri.tb_id = table_id;
3286                 fri.dst = res.prefix;
3287                 fri.dst_len = res.prefixlen;
3288                 fri.tos = fl4.flowi4_tos;
3289                 fri.type = rt->rt_type;
3290                 fri.offload = 0;
3291                 fri.trap = 0;
3292                 if (res.fa_head) {
3293                         struct fib_alias *fa;
3294
3295                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3296                                 u8 slen = 32 - fri.dst_len;
3297
3298                                 if (fa->fa_slen == slen &&
3299                                     fa->tb_id == fri.tb_id &&
3300                                     fa->fa_tos == fri.tos &&
3301                                     fa->fa_info == res.fi &&
3302                                     fa->fa_type == fri.type) {
3303                                         fri.offload = fa->offload;
3304                                         fri.trap = fa->trap;
3305                                         break;
3306                                 }
3307                         }
3308                 }
3309                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3310                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3311         } else {
3312                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3313                                    NETLINK_CB(in_skb).portid,
3314                                    nlh->nlmsg_seq, 0);
3315         }
3316         if (err < 0)
3317                 goto errout_rcu;
3318
3319         rcu_read_unlock();
3320
3321         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3322
3323 errout_free:
3324         return err;
3325 errout_rcu:
3326         rcu_read_unlock();
3327         kfree_skb(skb);
3328         goto errout_free;
3329 }
3330
3331 void ip_rt_multicast_event(struct in_device *in_dev)
3332 {
3333         rt_cache_flush(dev_net(in_dev->dev));
3334 }
3335
3336 #ifdef CONFIG_SYSCTL
/* Backing store for the "gc_interval" sysctl; default is 60 * HZ jiffies. */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
/*
 * Backing store shared by the "gc_min_interval" and "gc_min_interval_ms"
 * sysctls; default is HZ / 2 jiffies.
 */
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
/* Backing store for the "gc_elasticity" sysctl. */
static int ip_rt_gc_elasticity __read_mostly    = 8;
/*
 * Passed as .extra1 of the "min_pmtu" entry, i.e. the lower bound that
 * proc_dointvec_minmax enforces on writes to that sysctl.
 */
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3341
3342 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3343                 void *buffer, size_t *lenp, loff_t *ppos)
3344 {
3345         struct net *net = (struct net *)__ctl->extra1;
3346
3347         if (write) {
3348                 rt_cache_flush(net);
3349                 fnhe_genid_bump(net);
3350                 return 0;
3351         }
3352
3353         return -EINVAL;
3354 }
3355
/*
 * Sysctl entries registered under "net/ipv4/route" for init_net by
 * ip_static_sysctl_init().
 *
 * Every entry is an int (maxlen == sizeof(int)) with mode 0644. The
 * handler selects the unit conversion applied to the stored value:
 * proc_dointvec (raw int), proc_dointvec_jiffies, proc_dointvec_ms_jiffies,
 * or proc_dointvec_minmax (raw int with a bound taken from .extra1).
 *
 * The table is terminated by an empty entry.
 */
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                /* Same variable as "gc_min_interval", different handler. */
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                /* Writes below ip_min_valid_pmtu are rejected via .extra1. */
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};
3467
/*
 * The name lives in its own array so that sysctl_route_net_init() can
 * compare an entry's ->procname pointer against it.
 */
static const char ipv4_route_flush_procname[] = "flush";

/*
 * Template for the per-namespace "flush" sysctl (mode 0200, handled by
 * ipv4_sysctl_rtcache_flush). init_net uses this table directly; other
 * namespaces get a kmemdup() copy. In both cases sysctl_route_net_init()
 * stores the owning struct net in entry 0's ->extra1.
 */
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = ipv4_route_flush_procname,
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};
3479
3480 static __net_init int sysctl_route_net_init(struct net *net)
3481 {
3482         struct ctl_table *tbl;
3483
3484         tbl = ipv4_route_flush_table;
3485         if (!net_eq(net, &init_net)) {
3486                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3487                 if (!tbl)
3488                         goto err_dup;
3489
3490                 /* Don't export non-whitelisted sysctls to unprivileged users */
3491                 if (net->user_ns != &init_user_ns) {
3492                         if (tbl[0].procname != ipv4_route_flush_procname)
3493                                 tbl[0].procname = NULL;
3494                 }
3495         }
3496         tbl[0].extra1 = net;
3497
3498         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3499         if (!net->ipv4.route_hdr)
3500                 goto err_reg;
3501         return 0;
3502
3503 err_reg:
3504         if (tbl != ipv4_route_flush_table)
3505                 kfree(tbl);
3506 err_dup:
3507         return -ENOMEM;
3508 }
3509
3510 static __net_exit void sysctl_route_net_exit(struct net *net)
3511 {
3512         struct ctl_table *tbl;
3513
3514         tbl = net->ipv4.route_hdr->ctl_table_arg;
3515         unregister_net_sysctl_table(net->ipv4.route_hdr);
3516         BUG_ON(tbl == ipv4_route_flush_table);
3517         kfree(tbl);
3518 }
3519
/*
 * Per-namespace hooks for the "net/ipv4/route" flush sysctl; registered
 * from ip_rt_init() when CONFIG_SYSCTL is enabled.
 */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
3524 #endif
3525
3526 static __net_init int rt_genid_init(struct net *net)
3527 {
3528         atomic_set(&net->ipv4.rt_genid, 0);
3529         atomic_set(&net->fnhe_genid, 0);
3530         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3531         return 0;
3532 }
3533
/*
 * Per-namespace hook that initialises the generation counters. Only .init
 * is set; no .exit handler is provided. Registered from ip_rt_init().
 */
static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};
3537
3538 static int __net_init ipv4_inetpeer_init(struct net *net)
3539 {
3540         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3541
3542         if (!bp)
3543                 return -ENOMEM;
3544         inet_peer_base_init(bp);
3545         net->ipv4.peers = bp;
3546         return 0;
3547 }
3548
3549 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3550 {
3551         struct inet_peer_base *bp = net->ipv4.peers;
3552
3553         net->ipv4.peers = NULL;
3554         inetpeer_invalidate_tree(bp);
3555         kfree(bp);
3556 }
3557
/*
 * Per-namespace hooks that create and destroy net->ipv4.peers; registered
 * from ip_rt_init().
 */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};
3562
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU accounting area, allocated in ip_rt_init() with room for
 * 256 struct ip_rt_acct entries.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3566
/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing code.
 *
 * Allocates the ip_idents and ip_tstamps arrays, initialises the per-CPU
 * uncached route lists, creates the dst slab cache, initialises the dst
 * entry counters, then brings up devinet, the FIB, the proc files, XFRM
 * (when configured), the RTM_GETROUTE netlink handler and the per-namespace
 * subsystems defined in this file.
 *
 * Failure of the allocations and of dst_entries_init() ends in panic().
 * A proc file failure is only logged, and the return values of the
 * registration calls are not checked. Always returns 0.
 */
int __init ip_rt_init(void)
{
        int cpu;

        ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
                                  GFP_KERNEL);
        if (!ip_idents)
                panic("IP: failed to allocate ip_idents\n");

        /* kmalloc_array() does not zero: fill every slot with pseudo-random bytes. */
        prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

        /* kcalloc() returns zeroed memory, so all timestamps start at 0. */
        ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
        if (!ip_tstamps)
                panic("IP: failed to allocate ip_tstamps\n");

        /* Set up the list head and lock of each possible CPU's uncached list. */
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        /* Per-CPU area holding 256 struct ip_rt_acct entries. */
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        /* SLAB_PANIC: creation failure panics, hence no NULL check here. */
        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        /* Blackhole dsts are allocated from the same slab cache. */
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* Defaults for the "gc_thresh" and "max_size" sysctls: all bits set / INT_MAX. */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        /* Missing proc files are not fatal; just report the failure. */
        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        /* RTM_GETROUTE requests are served by inet_rtm_getroute (doit only, no dumpit). */
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

        /* Per-namespace setup; the return values are not checked. */
#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}
3628
3629 #ifdef CONFIG_SYSCTL
/*
 * ip_static_sysctl_init - register ipv4_route_table under "net/ipv4/route"
 * for init_net. The returned header is not kept.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
3638 #endif
This page took 0.234462 seconds and 4 git commands to generate.