/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <[email protected]>
 *		Alan Cox, <[email protected]>
 *		Linus Torvalds, <[email protected]>
 *		Alexey Kuznetsov, <[email protected]>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		([email protected])	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

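/*
 * The cow_metrics hook below relies on a copy-on-write pattern: metrics
 * start out pointing at read-only storage, and the first writer clones
 * them into the inet_peer and swings dst->_metrics over with cmpxchg().
 * A minimal sketch of the idiom (illustrative only, not a helper
 * defined anywhere in this file):
 *
 *	new = (unsigned long) writable_copy;
 *	prev = cmpxchg(&dst->_metrics, old, new);
 *	if (prev != old)
 *		p = __DST_METRICS_PTR(prev);	// lost the race; adopt winner's copy
 *
 * Losing the race is harmless: the loser adopts the winner's pointer,
 * or backs off entirely if that pointer is still marked read-only.
 */
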
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

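/*
 * A minimal sketch of the two sides of that scheme, assuming a reader
 * that only inspects entries and a writer that unlinks one (the shapes
 * below mirror rt_cache_get_first() and rt_garbage_collect() later in
 * this file; they are illustrative, not extra helpers):
 *
 *	// reader: no bucket lock, RCU only
 *	rcu_read_lock_bh();
 *	for (r = rcu_dereference_bh(rt_hash_table[h].chain); r;
 *	     r = rcu_dereference_bh(r->dst.rt_next))
 *		...			// may take a reference with atomics
 *	rcu_read_unlock_bh();
 *
 *	// writer: the bucket lock serializes chain surgery
 *	spin_lock_bh(rt_hash_lock_addr(h));
 *	*rthp = rth->dst.rt_next;	// unlink
 *	rt_free(rth);			// actual free deferred to RCU
 *	spin_unlock_bh(rt_hash_lock_addr(h));
 */
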
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

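/*
 * Worked example of the striping above: with RT_HASH_LOCK_SZ == 256,
 * rt_hash_lock_addr() masks the bucket index with 255, so buckets 5,
 * 261, 517, ... all share rt_hash_locks[5].  Many buckets per lock
 * bounds memory use while keeping contention low on average.
 */
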
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len;

		n = dst_get_neighbour(&r->dst);
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			   "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			   r->dst.dev ? r->dst.dev->name : "*",
			   (__force u32)r->rt_dst,
			   (__force u32)r->rt_gateway,
			   r->rt_flags, atomic_read(&r->dst.__refcnt),
			   r->dst.__use, 0, (__force u32)r->rt_src,
			   dst_metric_advmss(&r->dst) + 40,
			   dst_metric(&r->dst, RTAX_WINDOW),
			   (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
				 dst_metric(&r->dst, RTAX_RTTVAR)),
			   r->rt_key_tos,
			   -1,
			   (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
			   r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

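/*
 * Worked example of the score layout above: an output route that is
 * also "valuable" gets both top bits set, so it always outranks a
 * broadcast/multicast input entry whatever their relative ages; among
 * entries with equal flag bits, ~(jiffies - lastuse) makes the least
 * recently used entry the smallest score, i.e. the eviction candidate
 * chosen by rt_intern_hash() below.
 */
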
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

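/*
 * Both helpers above use the same branch-free idiom: XOR each pair of
 * fields (zero iff equal), OR the partial results together, and test
 * once against zero.  For two keys differing only in rt_mark, every
 * XOR term but (rt1->rt_mark ^ rt2->rt_mark) is 0, the OR is non-zero,
 * and the whole comparison fails without any conditional branches.
 */
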
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate for rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

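/*
 * Worked example of the fixed-point convention above: with
 * FRACT_BITS == 3 the unit ONE is 8, so a quantity x is stored as
 * x << 3 and recovered with >> 3.  slow_chain_length() below counts
 * ONE per distinct entry and shifts the sum back down, and averages
 * such as AVG + 4*SD can keep 1/8 granularity in the same format.
 */
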
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

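/*
 * Invalidation by generation: every cached rtable carries the rt_genid
 * it was created under, and rt_is_expired() compares it against the
 * current per-netns value.  After the atomic_add() above, all existing
 * entries compare unequal, are skipped by lookups, and are reaped
 * lazily; nothing walks the hash table at invalidation time.
 */
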
/*
 * delay < 0	: invalidate cache (fast: entries will be deleted later)
 * delay >= 0	: invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire stays large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

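/*
 * A rough feel for the numbers, assuming the default
 * ip_rt_gc_elasticity of 8: the first goal computed below is
 * entries - (8 << rt_hash_log), i.e. aggressive reclaim only starts
 * once the table averages more than 8 entries per bucket; below that
 * it falls back to steering "entries" toward the gc_thresh-based
 * equilibrium.
 */
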
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire is reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);

	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

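/*
 * The cmpxchg() above makes rt_bind_peer() safe against two CPUs
 * binding the same rtable concurrently: only the CPU that swaps the
 * NULL in keeps its reference, while the loser drops the duplicate
 * peer with inet_putpeer().  No lock is taken on this path.
 */
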
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique in a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	peer = inet_getpeer_v4(daddr, 1);
	if (peer) {
		peer->redirect_learned.a4 = new_gw;

		inet_putpeer(peer);

		atomic_inc(&__rt_peer_genid);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

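/*
 * Both helpers above use cmpxchg() as a "claim" on the expiry word: of
 * all CPUs seeing the same non-zero pmtu_expires, exactly one swaps it
 * to 0 and returns true, so the MTU metric is restored to pmtu_orig
 * once rather than by every caller that races past the time check.
 */
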
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

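/*
 * Worked timing under the defaults, assuming HZ == 1000 so that
 * ip_rt_redirect_load is 20ms: the first redirect goes out at once
 * (rate_tokens == 0), and each following one only after a gap of
 * 20ms << rate_tokens, i.e. 40ms, 80ms, ... up to ~5.1s.  After
 * ip_rt_redirect_number (9) redirects are ignored we go silent until
 * ip_rt_redirect_silence (20ms << 10, about 20.5s) has passed and
 * rate_tokens is reset in ip_rt_send_redirect() below.
 */
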
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

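/*
 * The peer block above is a classic token bucket with one token per
 * jiffy: tokens accrue as (now - rate_last) up to ip_rt_error_burst
 * (5 * HZ), and each ICMP costs ip_rt_error_cost (HZ).  Steady state
 * is therefore one ICMP error per second per peer, with bursts of up
 * to five after a quiet period.
 */
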
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
		}

		inet_putpeer(peer);

		atomic_inc(&__rt_peer_genid);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return dst;
}

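/*
 * ipv4_dst_check() above is where learned redirects and PMTU updates
 * actually reach a cached route: writers only bump __rt_peer_genid,
 * and any rtable whose rt_peer_genid is stale re-syncs with its peer
 * (check_peer_pmtu()/check_peer_redir()) the next time the dst is
 * validated, returning NULL to force a relookup if rebinding fails.
 */
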
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

c7066f70 1762#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1763static void set_class_tag(struct rtable *rt, u32 tag)
1764{
d8d1f30b
CG
1765 if (!(rt->dst.tclassid & 0xFFFF))
1766 rt->dst.tclassid |= tag & 0xFFFF;
1767 if (!(rt->dst.tclassid & 0xFFFF0000))
1768 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1769}
1770#endif
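/* A standalone sketch of the rule set_class_tag() implements above: each
 * 16-bit half of tclassid is taken from "tag" only while it is still zero,
 * so a more specific classid assigned earlier is never overwritten.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t merge_class_tag(uint32_t tclassid, uint32_t tag)
{
	if (!(tclassid & 0xFFFF))
		tclassid |= tag & 0xFFFF;
	if (!(tclassid & 0xFFFF0000))
		tclassid |= tag & 0xFFFF0000;
	return tclassid;
}

int main(void)
{
	/* Low half already set, so only the high half comes from the tag. */
	assert(merge_class_tag(0x00001234, 0xABCD5678) == 0xABCD1234);
	return 0;
}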
1771
0dbaee3b
DM
1772static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1773{
1774 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1775
1776 if (advmss == 0) {
1777 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1778 ip_rt_min_advmss);
1779 if (advmss > 65535 - 40)
1780 advmss = 65535 - 40;
1781 }
1782 return advmss;
1783}
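/* A userspace sketch of the default advertised-MSS rule above, assuming
 * the usual 40 bytes of IPv4 + TCP header overhead; "min_advmss" stands
 * in for the ip_rt_min_advmss sysctl.
 */
#include <stdio.h>

static unsigned int default_advmss(unsigned int mtu, unsigned int min_advmss)
{
	unsigned int advmss = mtu - 40;		/* strip IP + TCP headers */

	if (advmss < min_advmss)
		advmss = min_advmss;
	if (advmss > 65535 - 40)		/* cap at the IPv4 maximum */
		advmss = 65535 - 40;
	return advmss;
}

int main(void)
{
	printf("%u\n", default_advmss(1500, 256));	/* prints 1460 */
	return 0;
}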
1784
d33e4553
DM
1785static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1786{
1787 unsigned int mtu = dst->dev->mtu;
1788
1789 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1790 const struct rtable *rt = (const struct rtable *) dst;
1791
1792 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1793 mtu = 576;
1794 }
1795
1796 if (mtu > IP_MAX_MTU)
1797 mtu = IP_MAX_MTU;
1798
1799 return mtu;
1800}
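/* A sketch of the default-MTU choice above: when the route's MTU metric
 * is administratively locked and the destination sits behind a gateway,
 * fall back to the historically safe 576 bytes; in every case clamp to
 * the IPv4 maximum (65535 here standing in for IP_MAX_MTU).
 */
static unsigned int default_mtu(unsigned int dev_mtu, int mtu_locked,
				int gatewayed)
{
	unsigned int mtu = dev_mtu;

	if (mtu_locked && gatewayed && mtu > 576)
		mtu = 576;
	if (mtu > 65535)
		mtu = 65535;
	return mtu;
}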
1801
813b3b5d 1802static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1803 struct fib_info *fi)
a4daad6b 1804{
0131ba45
DM
1805 struct inet_peer *peer;
1806 int create = 0;
a4daad6b 1807
0131ba45
DM
1808 /* If a peer entry exists for this destination, we must hook
1809 * it up in order to get at cached metrics.
1810 */
813b3b5d 1811 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
0131ba45
DM
1812 create = 1;
1813
3c0afdca 1814 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
0131ba45 1815 if (peer) {
3c0afdca 1816 rt->rt_peer_genid = rt_peer_genid();
a4daad6b
DM
1817 if (inet_metrics_new(peer))
1818 memcpy(peer->metrics, fi->fib_metrics,
1819 sizeof(u32) * RTAX_MAX);
1820 dst_init_metrics(&rt->dst, peer->metrics, false);
2c8cec5c 1821
fe6fe792 1822 check_peer_pmtu(&rt->dst, peer);
f39925db
DM
1823 if (peer->redirect_learned.a4 &&
1824 peer->redirect_learned.a4 != rt->rt_gateway) {
1825 rt->rt_gateway = peer->redirect_learned.a4;
1826 rt->rt_flags |= RTCF_REDIRECTED;
1827 }
0131ba45
DM
1828 } else {
1829 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1830 rt->fi = fi;
1831 atomic_inc(&fi->fib_clntref);
1832 }
1833 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
a4daad6b
DM
1834 }
1835}
1836
813b3b5d 1837static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
5e2b61f7 1838 const struct fib_result *res,
982721f3 1839 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1840{
defb3519 1841 struct dst_entry *dst = &rt->dst;
1da177e4
LT
1842
1843 if (fi) {
1844 if (FIB_RES_GW(*res) &&
1845 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1846 rt->rt_gateway = FIB_RES_GW(*res);
813b3b5d 1847 rt_init_metrics(rt, fl4, fi);
c7066f70 1848#ifdef CONFIG_IP_ROUTE_CLASSID
defb3519 1849 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1da177e4 1850#endif
d33e4553 1851 }
defb3519 1852
defb3519
DM
1853 if (dst_mtu(dst) > IP_MAX_MTU)
1854 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
0dbaee3b 1855 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
defb3519 1856 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1da177e4 1857
c7066f70 1858#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1859#ifdef CONFIG_IP_MULTIPLE_TABLES
1860 set_class_tag(rt, fib_rules_tclass(res));
1861#endif
1862 set_class_tag(rt, itag);
1863#endif
1da177e4
LT
1864}
1865
5c1e6aa3
DM
1866static struct rtable *rt_dst_alloc(struct net_device *dev,
1867 bool nopolicy, bool noxfrm)
0c4dcd58 1868{
5c1e6aa3
DM
1869 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1870 DST_HOST |
1871 (nopolicy ? DST_NOPOLICY : 0) |
1872 (noxfrm ? DST_NOXFRM : 0));
0c4dcd58
DM
1873}
1874
96d36220 1875/* called in rcu_read_lock() section */
9e12bb22 1876static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1877 u8 tos, struct net_device *dev, int our)
1878{
96d36220 1879 unsigned int hash;
1da177e4 1880 struct rtable *rth;
a61ced5d 1881 __be32 spec_dst;
96d36220 1882 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 1883 u32 itag = 0;
b5f7e755 1884 int err;
1da177e4
LT
1885
1886 /* Primary sanity checks. */
1887
1888 if (in_dev == NULL)
1889 return -EINVAL;
1890
1e637c74 1891 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 1892 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1893 goto e_inval;
1894
f97c1e0c
JP
1895 if (ipv4_is_zeronet(saddr)) {
1896 if (!ipv4_is_local_multicast(daddr))
1da177e4
LT
1897 goto e_inval;
1898 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
b5f7e755 1899 } else {
5c04c819
MS
1900 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1901 &itag);
b5f7e755
ED
1902 if (err < 0)
1903 goto e_err;
1904 }
5c1e6aa3
DM
1905 rth = rt_dst_alloc(init_net.loopback_dev,
1906 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
1907 if (!rth)
1908 goto e_nobufs;
1909
cf911662
DM
1910#ifdef CONFIG_IP_ROUTE_CLASSID
1911 rth->dst.tclassid = itag;
1912#endif
d8d1f30b 1913 rth->dst.output = ip_rt_bug;
1da177e4 1914
5e2b61f7 1915 rth->rt_key_dst = daddr;
5e2b61f7 1916 rth->rt_key_src = saddr;
cf911662
DM
1917 rth->rt_genid = rt_genid(dev_net(dev));
1918 rth->rt_flags = RTCF_MULTICAST;
1919 rth->rt_type = RTN_MULTICAST;
475949d8 1920 rth->rt_key_tos = tos;
cf911662 1921 rth->rt_dst = daddr;
1da177e4 1922 rth->rt_src = saddr;
1b86a58f 1923 rth->rt_route_iif = dev->ifindex;
5e2b61f7 1924 rth->rt_iif = dev->ifindex;
5e2b61f7 1925 rth->rt_oif = 0;
cf911662 1926 rth->rt_mark = skb->mark;
1da177e4
LT
1927 rth->rt_gateway = daddr;
1928 rth->rt_spec_dst= spec_dst;
cf911662
DM
1929 rth->rt_peer_genid = 0;
1930 rth->peer = NULL;
1931 rth->fi = NULL;
1da177e4 1932 if (our) {
d8d1f30b 1933 rth->dst.input= ip_local_deliver;
1da177e4
LT
1934 rth->rt_flags |= RTCF_LOCAL;
1935 }
1936
1937#ifdef CONFIG_IP_MROUTE
f97c1e0c 1938 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1939 rth->dst.input = ip_mr_input;
1da177e4
LT
1940#endif
1941 RT_CACHE_STAT_INC(in_slow_mc);
1942
e84f84f2 1943 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
b23dd4fe 1944 rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
9aa3c94c 1945 return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1da177e4
LT
1946
1947e_nobufs:
1da177e4 1948 return -ENOBUFS;
1da177e4 1949e_inval:
96d36220 1950 return -EINVAL;
b5f7e755 1951e_err:
b5f7e755 1952 return err;
1da177e4
LT
1953}
1954
1955
1956static void ip_handle_martian_source(struct net_device *dev,
1957 struct in_device *in_dev,
1958 struct sk_buff *skb,
9e12bb22
AV
1959 __be32 daddr,
1960 __be32 saddr)
1da177e4
LT
1961{
1962 RT_CACHE_STAT_INC(in_martian_src);
1963#ifdef CONFIG_IP_ROUTE_VERBOSE
1964 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1965 /*
1966 * RFC1812 recommendation: if the source is martian,
1967 * the only hint is the MAC header.
1968 */
673d57e7
HH
1969 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1970 &daddr, &saddr, dev->name);
98e399f8 1971 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1da177e4 1972 int i;
98e399f8 1973 const unsigned char *p = skb_mac_header(skb);
1da177e4
LT
1974 printk(KERN_WARNING "ll header: ");
1975 for (i = 0; i < dev->hard_header_len; i++, p++) {
1976 printk("%02x", *p);
1977 if (i < (dev->hard_header_len - 1))
1978 printk(":");
1979 }
1980 printk("\n");
1981 }
1982 }
1983#endif
1984}
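/* A standalone sketch of the link-layer dump in the handler above: each
 * header byte is printed as two hex digits, with ':' between bytes but
 * not after the last one.
 */
#include <stdio.h>

static void print_ll_header(const unsigned char *p, int len)
{
	int i;

	for (i = 0; i < len; i++)
		printf("%02x%s", p[i], i < len - 1 ? ":" : "\n");
}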
1985
47360228 1986/* called in rcu_read_lock() section */
5969f71d 1987static int __mkroute_input(struct sk_buff *skb,
982721f3 1988 const struct fib_result *res,
5969f71d
SH
1989 struct in_device *in_dev,
1990 __be32 daddr, __be32 saddr, u32 tos,
1991 struct rtable **result)
1da177e4 1992{
1da177e4
LT
1993 struct rtable *rth;
1994 int err;
1995 struct in_device *out_dev;
47360228 1996 unsigned int flags = 0;
d9c9df8c
AV
1997 __be32 spec_dst;
1998 u32 itag;
1da177e4
LT
1999
2000 /* get a working reference to the output device */
47360228 2001 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1da177e4
LT
2002 if (out_dev == NULL) {
2003 if (net_ratelimit())
2004 printk(KERN_CRIT "Bug in ip_route_input" \
2005 "_slow(). Please, report\n");
2006 return -EINVAL;
2007 }
2008
2009
5c04c819
MS
2010 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2011 in_dev->dev, &spec_dst, &itag);
1da177e4 2012 if (err < 0) {
e905a9ed 2013 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 2014 saddr);
e905a9ed 2015
1da177e4
LT
2016 goto cleanup;
2017 }
2018
2019 if (err)
2020 flags |= RTCF_DIRECTSRC;
2021
51b77cae 2022 if (out_dev == in_dev && err &&
1da177e4
LT
2023 (IN_DEV_SHARED_MEDIA(out_dev) ||
2024 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2025 flags |= RTCF_DOREDIRECT;
2026
2027 if (skb->protocol != htons(ETH_P_IP)) {
2028 /* Not IP (i.e. ARP). Do not create a route if it is
2029 * invalid for proxy ARP. DNAT routes are always valid.
65324144
JDB
2030 *
2031 * The proxy ARP feature has been extended to allow ARP
2032 * replies back on the same interface, to support
2033 * Private VLAN switch technologies. See arp.c.
1da177e4 2034 */
65324144
JDB
2035 if (out_dev == in_dev &&
2036 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
2037 err = -EINVAL;
2038 goto cleanup;
2039 }
2040 }
2041
5c1e6aa3
DM
2042 rth = rt_dst_alloc(out_dev->dev,
2043 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2044 IN_DEV_CONF_GET(out_dev, NOXFRM));
1da177e4
LT
2045 if (!rth) {
2046 err = -ENOBUFS;
2047 goto cleanup;
2048 }
2049
5e2b61f7 2050 rth->rt_key_dst = daddr;
5e2b61f7 2051 rth->rt_key_src = saddr;
cf911662
DM
2052 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2053 rth->rt_flags = flags;
2054 rth->rt_type = res->type;
475949d8 2055 rth->rt_key_tos = tos;
cf911662 2056 rth->rt_dst = daddr;
1da177e4 2057 rth->rt_src = saddr;
1b86a58f 2058 rth->rt_route_iif = in_dev->dev->ifindex;
5e2b61f7 2059 rth->rt_iif = in_dev->dev->ifindex;
5e2b61f7 2060 rth->rt_oif = 0;
cf911662
DM
2061 rth->rt_mark = skb->mark;
2062 rth->rt_gateway = daddr;
1da177e4 2063 rth->rt_spec_dst= spec_dst;
cf911662
DM
2064 rth->rt_peer_genid = 0;
2065 rth->peer = NULL;
2066 rth->fi = NULL;
1da177e4 2067
d8d1f30b
CG
2068 rth->dst.input = ip_forward;
2069 rth->dst.output = ip_output;
1da177e4 2070
5e2b61f7 2071 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
1da177e4 2072
1da177e4
LT
2073 *result = rth;
2074 err = 0;
2075 cleanup:
1da177e4 2076 return err;
e905a9ed 2077}
1da177e4 2078
5969f71d
SH
2079static int ip_mkroute_input(struct sk_buff *skb,
2080 struct fib_result *res,
68a5e3dd 2081 const struct flowi4 *fl4,
5969f71d
SH
2082 struct in_device *in_dev,
2083 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 2084{
7abaa27c 2085 struct rtable* rth = NULL;
1da177e4
LT
2086 int err;
2087 unsigned hash;
2088
2089#ifdef CONFIG_IP_ROUTE_MULTIPATH
ff3fccb3 2090 if (res->fi && res->fi->fib_nhs > 1)
1b7fe593 2091 fib_select_multipath(res);
1da177e4
LT
2092#endif
2093
2094 /* create a routing cache entry */
2095 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2096 if (err)
2097 return err;
1da177e4
LT
2098
2099 /* put it into the cache */
68a5e3dd 2100 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
d8d1f30b 2101 rt_genid(dev_net(rth->dst.dev)));
68a5e3dd 2102 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
b23dd4fe
DM
2103 if (IS_ERR(rth))
2104 return PTR_ERR(rth);
2105 return 0;
1da177e4
LT
2106}
2107
1da177e4
LT
2108/*
2109 * NOTE. We drop all packets that have a local source
2110 * address, because every properly looped-back packet
2111 * must already have the correct destination attached by the output routine.
2112 *
2113 * Such an approach solves two big problems:
2114 * 1. Non-simplex devices are handled properly.
2115 * 2. IP spoofing attempts are filtered with a 100% guarantee.
ebc0ffae 2116 * called with rcu_read_lock()
1da177e4
LT
2117 */
2118
9e12bb22 2119static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
2120 u8 tos, struct net_device *dev)
2121{
2122 struct fib_result res;
96d36220 2123 struct in_device *in_dev = __in_dev_get_rcu(dev);
68a5e3dd 2124 struct flowi4 fl4;
1da177e4
LT
2125 unsigned flags = 0;
2126 u32 itag = 0;
2127 struct rtable * rth;
2128 unsigned hash;
9e12bb22 2129 __be32 spec_dst;
1da177e4 2130 int err = -EINVAL;
c346dca1 2131 struct net * net = dev_net(dev);
1da177e4
LT
2132
2133 /* IP on this device is disabled. */
2134
2135 if (!in_dev)
2136 goto out;
2137
2138 /* Check for the most weird martians, which cannot be detected
2139 by fib_lookup.
2140 */
2141
1e637c74 2142 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
f97c1e0c 2143 ipv4_is_loopback(saddr))
1da177e4
LT
2144 goto martian_source;
2145
27a954bd 2146 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
2147 goto brd_input;
2148
2149 /* Accept zero addresses only to limited broadcast;
2150 * I do not even know whether to fix it or not. Waiting for complaints :-)
2151 */
f97c1e0c 2152 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2153 goto martian_source;
2154
27a954bd 2155 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
1da177e4
LT
2156 goto martian_destination;
2157
2158 /*
2159 * Now we are ready to route the packet.
2160 */
68a5e3dd
DM
2161 fl4.flowi4_oif = 0;
2162 fl4.flowi4_iif = dev->ifindex;
2163 fl4.flowi4_mark = skb->mark;
2164 fl4.flowi4_tos = tos;
2165 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2166 fl4.daddr = daddr;
2167 fl4.saddr = saddr;
2168 err = fib_lookup(net, &fl4, &res);
ebc0ffae 2169 if (err != 0) {
1da177e4 2170 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2171 goto e_hostunreach;
1da177e4
LT
2172 goto no_route;
2173 }
1da177e4
LT
2174
2175 RT_CACHE_STAT_INC(in_slow_tot);
2176
2177 if (res.type == RTN_BROADCAST)
2178 goto brd_input;
2179
2180 if (res.type == RTN_LOCAL) {
5c04c819 2181 err = fib_validate_source(skb, saddr, daddr, tos,
ebc0ffae 2182 net->loopback_dev->ifindex,
5c04c819 2183 dev, &spec_dst, &itag);
b5f7e755
ED
2184 if (err < 0)
2185 goto martian_source_keep_err;
2186 if (err)
1da177e4
LT
2187 flags |= RTCF_DIRECTSRC;
2188 spec_dst = daddr;
2189 goto local_input;
2190 }
2191
2192 if (!IN_DEV_FORWARD(in_dev))
2c2910a4 2193 goto e_hostunreach;
1da177e4
LT
2194 if (res.type != RTN_UNICAST)
2195 goto martian_destination;
2196
68a5e3dd 2197 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1da177e4
LT
2198out: return err;
2199
2200brd_input:
2201 if (skb->protocol != htons(ETH_P_IP))
2202 goto e_inval;
2203
f97c1e0c 2204 if (ipv4_is_zeronet(saddr))
1da177e4
LT
2205 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2206 else {
5c04c819
MS
2207 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2208 &itag);
1da177e4 2209 if (err < 0)
b5f7e755 2210 goto martian_source_keep_err;
1da177e4
LT
2211 if (err)
2212 flags |= RTCF_DIRECTSRC;
2213 }
2214 flags |= RTCF_BROADCAST;
2215 res.type = RTN_BROADCAST;
2216 RT_CACHE_STAT_INC(in_brd);
2217
2218local_input:
5c1e6aa3
DM
2219 rth = rt_dst_alloc(net->loopback_dev,
2220 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4
LT
2221 if (!rth)
2222 goto e_nobufs;
2223
cf911662 2224 rth->dst.input= ip_local_deliver;
d8d1f30b 2225 rth->dst.output= ip_rt_bug;
cf911662
DM
2226#ifdef CONFIG_IP_ROUTE_CLASSID
2227 rth->dst.tclassid = itag;
2228#endif
1da177e4 2229
5e2b61f7 2230 rth->rt_key_dst = daddr;
5e2b61f7 2231 rth->rt_key_src = saddr;
cf911662
DM
2232 rth->rt_genid = rt_genid(net);
2233 rth->rt_flags = flags|RTCF_LOCAL;
2234 rth->rt_type = res.type;
475949d8 2235 rth->rt_key_tos = tos;
cf911662 2236 rth->rt_dst = daddr;
1da177e4 2237 rth->rt_src = saddr;
c7066f70 2238#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b 2239 rth->dst.tclassid = itag;
1da177e4 2240#endif
1b86a58f 2241 rth->rt_route_iif = dev->ifindex;
5e2b61f7 2242 rth->rt_iif = dev->ifindex;
cf911662
DM
2243 rth->rt_oif = 0;
2244 rth->rt_mark = skb->mark;
1da177e4
LT
2245 rth->rt_gateway = daddr;
2246 rth->rt_spec_dst= spec_dst;
cf911662
DM
2247 rth->rt_peer_genid = 0;
2248 rth->peer = NULL;
2249 rth->fi = NULL;
1da177e4 2250 if (res.type == RTN_UNREACHABLE) {
d8d1f30b
CG
2251 rth->dst.input= ip_error;
2252 rth->dst.error= -err;
1da177e4
LT
2253 rth->rt_flags &= ~RTCF_LOCAL;
2254 }
68a5e3dd
DM
2255 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2256 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
b23dd4fe
DM
2257 err = 0;
2258 if (IS_ERR(rth))
2259 err = PTR_ERR(rth);
ebc0ffae 2260 goto out;
1da177e4
LT
2261
2262no_route:
2263 RT_CACHE_STAT_INC(in_no_route);
2264 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2265 res.type = RTN_UNREACHABLE;
7f53878d
MC
2266 if (err == -ESRCH)
2267 err = -ENETUNREACH;
1da177e4
LT
2268 goto local_input;
2269
2270 /*
2271 * Do not cache martian addresses: they should be logged (RFC1812)
2272 */
2273martian_destination:
2274 RT_CACHE_STAT_INC(in_martian_dst);
2275#ifdef CONFIG_IP_ROUTE_VERBOSE
2276 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
673d57e7
HH
2277 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2278 &daddr, &saddr, dev->name);
1da177e4 2279#endif
2c2910a4
DE
2280
2281e_hostunreach:
e905a9ed 2282 err = -EHOSTUNREACH;
ebc0ffae 2283 goto out;
2c2910a4 2284
1da177e4
LT
2285e_inval:
2286 err = -EINVAL;
ebc0ffae 2287 goto out;
1da177e4
LT
2288
2289e_nobufs:
2290 err = -ENOBUFS;
ebc0ffae 2291 goto out;
1da177e4
LT
2292
2293martian_source:
b5f7e755
ED
2294 err = -EINVAL;
2295martian_source_keep_err:
1da177e4 2296 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2297 goto out;
1da177e4
LT
2298}
2299
407eadd9
ED
2300int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301 u8 tos, struct net_device *dev, bool noref)
1da177e4
LT
2302{
2303 struct rtable * rth;
2304 unsigned hash;
2305 int iif = dev->ifindex;
b5921910 2306 struct net *net;
96d36220 2307 int res;
1da177e4 2308
c346dca1 2309 net = dev_net(dev);
1080d709 2310
96d36220
ED
2311 rcu_read_lock();
2312
1080d709
NH
2313 if (!rt_caching(net))
2314 goto skip_cache;
2315
1da177e4 2316 tos &= IPTOS_RT_MASK;
e84f84f2 2317 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
1da177e4 2318
1da177e4 2319 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
d8d1f30b 2320 rth = rcu_dereference(rth->dst.rt_next)) {
5e2b61f7
DM
2321 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2322 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
97a80410 2323 (rth->rt_route_iif ^ iif) |
475949d8 2324 (rth->rt_key_tos ^ tos)) == 0 &&
5e2b61f7 2325 rth->rt_mark == skb->mark &&
d8d1f30b 2326 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2327 !rt_is_expired(rth)) {
407eadd9 2328 if (noref) {
d8d1f30b
CG
2329 dst_use_noref(&rth->dst, jiffies);
2330 skb_dst_set_noref(skb, &rth->dst);
407eadd9 2331 } else {
d8d1f30b
CG
2332 dst_use(&rth->dst, jiffies);
2333 skb_dst_set(skb, &rth->dst);
407eadd9 2334 }
1da177e4
LT
2335 RT_CACHE_STAT_INC(in_hit);
2336 rcu_read_unlock();
1da177e4
LT
2337 return 0;
2338 }
2339 RT_CACHE_STAT_INC(in_hlist_search);
2340 }
1da177e4 2341
1080d709 2342skip_cache:
1da177e4
LT
2343 /* Multicast recognition logic is moved from the route cache to here.
2344 The problem was that too many Ethernet cards have broken/missing
2345 hardware multicast filters :-( As a result, a host on a multicast
2346 network acquires a lot of useless route cache entries, e.g. for
2347 SDR messages from all over the world. Now we try to get rid of them.
2348 Really, provided the software IP multicast filter is organized
2349 reasonably (at least, hashed), it does not result in a slowdown
2350 compared with route cache reject entries.
2351 Note that multicast routers are not affected, because a
2352 route cache entry is created eventually.
2353 */
f97c1e0c 2354 if (ipv4_is_multicast(daddr)) {
96d36220 2355 struct in_device *in_dev = __in_dev_get_rcu(dev);
1da177e4 2356
96d36220 2357 if (in_dev) {
dbdd9a52
DM
2358 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2359 ip_hdr(skb)->protocol);
1da177e4
LT
2360 if (our
2361#ifdef CONFIG_IP_MROUTE
9d4fb27d
JP
2362 ||
2363 (!ipv4_is_local_multicast(daddr) &&
2364 IN_DEV_MFORWARD(in_dev))
1da177e4 2365#endif
9d4fb27d 2366 ) {
96d36220
ED
2367 int res = ip_route_input_mc(skb, daddr, saddr,
2368 tos, dev, our);
1da177e4 2369 rcu_read_unlock();
96d36220 2370 return res;
1da177e4
LT
2371 }
2372 }
2373 rcu_read_unlock();
2374 return -EINVAL;
2375 }
96d36220
ED
2376 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2377 rcu_read_unlock();
2378 return res;
1da177e4 2379}
407eadd9 2380EXPORT_SYMBOL(ip_route_input_common);
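/* A sketch of the comparison trick in the hash-chain walk above: the key
 * fields are checked by OR-ing their XOR differences and testing the
 * result once, so the compiler emits a single branch instead of one per
 * field.
 */
#include <stdint.h>

struct rt_key {
	uint32_t dst;
	uint32_t src;
	int	 iif;
	uint32_t tos;
};

static int rt_key_match(const struct rt_key *a, const struct rt_key *b)
{
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(uint32_t)(a->iif ^ b->iif) |
		(a->tos ^ b->tos)) == 0;
}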
1da177e4 2381
ebc0ffae 2382/* called with rcu_read_lock() */
982721f3 2383static struct rtable *__mkroute_output(const struct fib_result *res,
68a5e3dd 2384 const struct flowi4 *fl4,
813b3b5d
DM
2385 __be32 orig_daddr, __be32 orig_saddr,
2386 int orig_oif, struct net_device *dev_out,
5ada5527 2387 unsigned int flags)
1da177e4 2388{
982721f3 2389 struct fib_info *fi = res->fi;
813b3b5d 2390 u32 tos = RT_FL_TOS(fl4);
5ada5527 2391 struct in_device *in_dev;
982721f3 2392 u16 type = res->type;
5ada5527 2393 struct rtable *rth;
1da177e4 2394
68a5e3dd 2395 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
5ada5527 2396 return ERR_PTR(-EINVAL);
1da177e4 2397
68a5e3dd 2398 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2399 type = RTN_BROADCAST;
68a5e3dd 2400 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2401 type = RTN_MULTICAST;
68a5e3dd 2402 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2403 return ERR_PTR(-EINVAL);
1da177e4
LT
2404
2405 if (dev_out->flags & IFF_LOOPBACK)
2406 flags |= RTCF_LOCAL;
2407
dd28d1a0 2408 in_dev = __in_dev_get_rcu(dev_out);
ebc0ffae 2409 if (!in_dev)
5ada5527 2410 return ERR_PTR(-EINVAL);
ebc0ffae 2411
982721f3 2412 if (type == RTN_BROADCAST) {
1da177e4 2413 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2414 fi = NULL;
2415 } else if (type == RTN_MULTICAST) {
dd28d1a0 2416 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2417 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2418 fl4->flowi4_proto))
1da177e4
LT
2419 flags &= ~RTCF_LOCAL;
2420 /* If the multicast route does not exist, use the
dd28d1a0
ED
2421 * default one, but do not gateway in this case.
2422 * Yes, it is a hack.
1da177e4 2423 */
982721f3
DM
2424 if (fi && res->prefixlen < 4)
2425 fi = NULL;
1da177e4
LT
2426 }
2427
5c1e6aa3
DM
2428 rth = rt_dst_alloc(dev_out,
2429 IN_DEV_CONF_GET(in_dev, NOPOLICY),
0c4dcd58 2430 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2431 if (!rth)
5ada5527 2432 return ERR_PTR(-ENOBUFS);
8391d07b 2433
cf911662
DM
2434 rth->dst.output = ip_output;
2435
813b3b5d
DM
2436 rth->rt_key_dst = orig_daddr;
2437 rth->rt_key_src = orig_saddr;
cf911662
DM
2438 rth->rt_genid = rt_genid(dev_net(dev_out));
2439 rth->rt_flags = flags;
2440 rth->rt_type = type;
475949d8 2441 rth->rt_key_tos = tos;
68a5e3dd
DM
2442 rth->rt_dst = fl4->daddr;
2443 rth->rt_src = fl4->saddr;
1b86a58f 2444 rth->rt_route_iif = 0;
813b3b5d
DM
2445 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2446 rth->rt_oif = orig_oif;
2447 rth->rt_mark = fl4->flowi4_mark;
68a5e3dd
DM
2448 rth->rt_gateway = fl4->daddr;
2449 rth->rt_spec_dst= fl4->saddr;
cf911662
DM
2450 rth->rt_peer_genid = 0;
2451 rth->peer = NULL;
2452 rth->fi = NULL;
1da177e4
LT
2453
2454 RT_CACHE_STAT_INC(out_slow_tot);
2455
2456 if (flags & RTCF_LOCAL) {
d8d1f30b 2457 rth->dst.input = ip_local_deliver;
68a5e3dd 2458 rth->rt_spec_dst = fl4->daddr;
1da177e4
LT
2459 }
2460 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
68a5e3dd 2461 rth->rt_spec_dst = fl4->saddr;
e905a9ed 2462 if (flags & RTCF_LOCAL &&
1da177e4 2463 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2464 rth->dst.output = ip_mc_output;
1da177e4
LT
2465 RT_CACHE_STAT_INC(out_slow_mc);
2466 }
2467#ifdef CONFIG_IP_MROUTE
982721f3 2468 if (type == RTN_MULTICAST) {
1da177e4 2469 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2470 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2471 rth->dst.input = ip_mr_input;
2472 rth->dst.output = ip_mc_output;
1da177e4
LT
2473 }
2474 }
2475#endif
2476 }
2477
813b3b5d 2478 rt_set_nexthop(rth, fl4, res, fi, type, 0);
1da177e4 2479
5ada5527 2480 return rth;
1da177e4
LT
2481}
2482
1da177e4
LT
2483/*
2484 * Major route resolver routine.
0197aa38 2485 * called with rcu_read_lock();
1da177e4
LT
2486 */
2487
813b3b5d 2488static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
1da177e4 2489{
1da177e4 2490 struct net_device *dev_out = NULL;
813b3b5d
DM
2491 u32 tos = RT_FL_TOS(fl4);
2492 unsigned int flags = 0;
2493 struct fib_result res;
5ada5527 2494 struct rtable *rth;
813b3b5d
DM
2495 __be32 orig_daddr;
2496 __be32 orig_saddr;
2497 int orig_oif;
1da177e4
LT
2498
2499 res.fi = NULL;
2500#ifdef CONFIG_IP_MULTIPLE_TABLES
2501 res.r = NULL;
2502#endif
2503
813b3b5d
DM
2504 orig_daddr = fl4->daddr;
2505 orig_saddr = fl4->saddr;
2506 orig_oif = fl4->flowi4_oif;
2507
2508 fl4->flowi4_iif = net->loopback_dev->ifindex;
2509 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2510 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2511 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2512
010c2708 2513 rcu_read_lock();
813b3b5d 2514 if (fl4->saddr) {
b23dd4fe 2515 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2516 if (ipv4_is_multicast(fl4->saddr) ||
2517 ipv4_is_lbcast(fl4->saddr) ||
2518 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2519 goto out;
2520
1da177e4
LT
2521 /* I removed check for oif == dev_out->oif here.
2522 It was wrong for two reasons:
1ab35276
DL
2523 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2524 is assigned to multiple interfaces.
1da177e4
LT
2525 2. Moreover, we are allowed to send packets with saddr
2526 of another iface. --ANK
2527 */
2528
813b3b5d
DM
2529 if (fl4->flowi4_oif == 0 &&
2530 (ipv4_is_multicast(fl4->daddr) ||
2531 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2532 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2533 dev_out = __ip_dev_find(net, fl4->saddr, false);
a210d01a
JA
2534 if (dev_out == NULL)
2535 goto out;
2536
1da177e4
LT
2537 /* Special hack: the user can direct multicasts
2538 and limited broadcast via the necessary interface
2539 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2540 This hack is not just for fun, it allows
2541 vic, vat and friends to work.
2542 They bind a socket to loopback, set ttl to zero
2543 and expect that it will work.
2544 From the viewpoint of the routing cache they are broken,
2545 because we are not allowed to build a multicast path
2546 with a loopback source addr (look, the routing cache
2547 cannot know that ttl is zero, so the packet
2548 will not leave this host and the route is valid).
2549 Luckily, this hack is a good workaround.
2550 */
2551
813b3b5d 2552 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2553 goto make_route;
2554 }
a210d01a 2555
813b3b5d 2556 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2557 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2558 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2559 goto out;
a210d01a 2560 }
1da177e4
LT
2561 }
2562
2563
813b3b5d
DM
2564 if (fl4->flowi4_oif) {
2565 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2566 rth = ERR_PTR(-ENODEV);
1da177e4
LT
2567 if (dev_out == NULL)
2568 goto out;
e5ed6399
HX
2569
2570 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2571 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2572 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2573 goto out;
2574 }
813b3b5d
DM
2575 if (ipv4_is_local_multicast(fl4->daddr) ||
2576 ipv4_is_lbcast(fl4->daddr)) {
2577 if (!fl4->saddr)
2578 fl4->saddr = inet_select_addr(dev_out, 0,
2579 RT_SCOPE_LINK);
1da177e4
LT
2580 goto make_route;
2581 }
813b3b5d
DM
2582 if (fl4->saddr) {
2583 if (ipv4_is_multicast(fl4->daddr))
2584 fl4->saddr = inet_select_addr(dev_out, 0,
2585 fl4->flowi4_scope);
2586 else if (!fl4->daddr)
2587 fl4->saddr = inet_select_addr(dev_out, 0,
2588 RT_SCOPE_HOST);
1da177e4
LT
2589 }
2590 }
2591
813b3b5d
DM
2592 if (!fl4->daddr) {
2593 fl4->daddr = fl4->saddr;
2594 if (!fl4->daddr)
2595 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2596 dev_out = net->loopback_dev;
813b3b5d 2597 fl4->flowi4_oif = net->loopback_dev->ifindex;
1da177e4
LT
2598 res.type = RTN_LOCAL;
2599 flags |= RTCF_LOCAL;
2600 goto make_route;
2601 }
2602
813b3b5d 2603 if (fib_lookup(net, fl4, &res)) {
1da177e4 2604 res.fi = NULL;
813b3b5d 2605 if (fl4->flowi4_oif) {
1da177e4
LT
2606 /* Apparently, the routing tables are wrong. Assume
2607 that the destination is on link.
2608
2609 WHY? DW.
2610 Because we are allowed to send to an iface
2611 even if it has NO routes and NO assigned
2612 addresses. When oif is specified, routing
2613 tables are looked up with only one purpose:
2614 to catch if the destination is gatewayed, rather than
2615 direct. Moreover, if MSG_DONTROUTE is set,
2616 we send the packet, ignoring both routing tables
2617 and ifaddr state. --ANK
2618
2619
2620 We could make it even if oif is unknown,
2621 likely IPv6, but we do not.
2622 */
2623
813b3b5d
DM
2624 if (fl4->saddr == 0)
2625 fl4->saddr = inet_select_addr(dev_out, 0,
2626 RT_SCOPE_LINK);
1da177e4
LT
2627 res.type = RTN_UNICAST;
2628 goto make_route;
2629 }
b23dd4fe 2630 rth = ERR_PTR(-ENETUNREACH);
1da177e4
LT
2631 goto out;
2632 }
1da177e4
LT
2633
2634 if (res.type == RTN_LOCAL) {
813b3b5d 2635 if (!fl4->saddr) {
9fc3bbb4 2636 if (res.fi->fib_prefsrc)
813b3b5d 2637 fl4->saddr = res.fi->fib_prefsrc;
9fc3bbb4 2638 else
813b3b5d 2639 fl4->saddr = fl4->daddr;
9fc3bbb4 2640 }
b40afd0e 2641 dev_out = net->loopback_dev;
813b3b5d 2642 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2643 res.fi = NULL;
2644 flags |= RTCF_LOCAL;
2645 goto make_route;
2646 }
2647
2648#ifdef CONFIG_IP_ROUTE_MULTIPATH
813b3b5d 2649 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
1b7fe593 2650 fib_select_multipath(&res);
1da177e4
LT
2651 else
2652#endif
21d8c49e
DM
2653 if (!res.prefixlen &&
2654 res.table->tb_num_default > 1 &&
813b3b5d 2655 res.type == RTN_UNICAST && !fl4->flowi4_oif)
0c838ff1 2656 fib_select_default(&res);
1da177e4 2657
813b3b5d
DM
2658 if (!fl4->saddr)
2659 fl4->saddr = FIB_RES_PREFSRC(net, res);
1da177e4 2660
1da177e4 2661 dev_out = FIB_RES_DEV(res);
813b3b5d 2662 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2663
2664
2665make_route:
813b3b5d
DM
2666 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2667 dev_out, flags);
b23dd4fe 2668 if (!IS_ERR(rth)) {
5ada5527
DM
2669 unsigned int hash;
2670
813b3b5d 2671 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
5ada5527 2672 rt_genid(dev_net(dev_out)));
813b3b5d 2673 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
5ada5527 2674 }
1da177e4 2675
010c2708
DM
2676out:
2677 rcu_read_unlock();
b23dd4fe 2678 return rth;
1da177e4
LT
2679}
2680
813b3b5d 2681struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
1da177e4 2682{
1da177e4 2683 struct rtable *rth;
010c2708 2684 unsigned int hash;
1da177e4 2685
1080d709
NH
2686 if (!rt_caching(net))
2687 goto slow_output;
2688
9d6ec938 2689 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
1da177e4
LT
2690
2691 rcu_read_lock_bh();
a898def2 2692 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
d8d1f30b 2693 rth = rcu_dereference_bh(rth->dst.rt_next)) {
9d6ec938
DM
2694 if (rth->rt_key_dst == flp4->daddr &&
2695 rth->rt_key_src == flp4->saddr &&
c7537967 2696 rt_is_output_route(rth) &&
9d6ec938
DM
2697 rth->rt_oif == flp4->flowi4_oif &&
2698 rth->rt_mark == flp4->flowi4_mark &&
475949d8 2699 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
b5921910 2700 (IPTOS_RT_MASK | RTO_ONLINK)) &&
d8d1f30b 2701 net_eq(dev_net(rth->dst.dev), net) &&
e84f84f2 2702 !rt_is_expired(rth)) {
d8d1f30b 2703 dst_use(&rth->dst, jiffies);
1da177e4
LT
2704 RT_CACHE_STAT_INC(out_hit);
2705 rcu_read_unlock_bh();
56157872
DM
2706 if (!flp4->saddr)
2707 flp4->saddr = rth->rt_src;
2708 if (!flp4->daddr)
2709 flp4->daddr = rth->rt_dst;
b23dd4fe 2710 return rth;
1da177e4
LT
2711 }
2712 RT_CACHE_STAT_INC(out_hlist_search);
2713 }
2714 rcu_read_unlock_bh();
2715
1080d709 2716slow_output:
9d6ec938 2717 return ip_route_output_slow(net, flp4);
1da177e4 2718}
d8c97a94
ACM
2719EXPORT_SYMBOL_GPL(__ip_route_output_key);
2720
ae2688d5
JW
2721static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2722{
2723 return NULL;
2724}
2725
ec831ea7
RD
2726static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2727{
2728 return 0;
2729}
2730
14e50e57
DM
2731static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2732{
2733}
2734
0972ddb2
HB
2735static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2736 unsigned long old)
2737{
2738 return NULL;
2739}
2740
14e50e57
DM
2741static struct dst_ops ipv4_dst_blackhole_ops = {
2742 .family = AF_INET,
09640e63 2743 .protocol = cpu_to_be16(ETH_P_IP),
14e50e57 2744 .destroy = ipv4_dst_destroy,
ae2688d5 2745 .check = ipv4_blackhole_dst_check,
ec831ea7 2746 .default_mtu = ipv4_blackhole_default_mtu,
214f45c9 2747 .default_advmss = ipv4_default_advmss,
14e50e57 2748 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
0972ddb2 2749 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2750 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2751};
2752
2774c131 2753struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2754{
5c1e6aa3 2755 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2774c131 2756 struct rtable *ort = (struct rtable *) dst_orig;
14e50e57
DM
2757
2758 if (rt) {
d8d1f30b 2759 struct dst_entry *new = &rt->dst;
14e50e57 2760
14e50e57 2761 new->__use = 1;
352e512c
HX
2762 new->input = dst_discard;
2763 new->output = dst_discard;
defb3519 2764 dst_copy_metrics(new, &ort->dst);
14e50e57 2765
d8d1f30b 2766 new->dev = ort->dst.dev;
14e50e57
DM
2767 if (new->dev)
2768 dev_hold(new->dev);
2769
5e2b61f7
DM
2770 rt->rt_key_dst = ort->rt_key_dst;
2771 rt->rt_key_src = ort->rt_key_src;
475949d8 2772 rt->rt_key_tos = ort->rt_key_tos;
1b86a58f 2773 rt->rt_route_iif = ort->rt_route_iif;
5e2b61f7
DM
2774 rt->rt_iif = ort->rt_iif;
2775 rt->rt_oif = ort->rt_oif;
2776 rt->rt_mark = ort->rt_mark;
14e50e57 2777
e84f84f2 2778 rt->rt_genid = rt_genid(net);
14e50e57
DM
2779 rt->rt_flags = ort->rt_flags;
2780 rt->rt_type = ort->rt_type;
2781 rt->rt_dst = ort->rt_dst;
2782 rt->rt_src = ort->rt_src;
14e50e57
DM
2783 rt->rt_gateway = ort->rt_gateway;
2784 rt->rt_spec_dst = ort->rt_spec_dst;
2785 rt->peer = ort->peer;
2786 if (rt->peer)
2787 atomic_inc(&rt->peer->refcnt);
62fa8a84
DM
2788 rt->fi = ort->fi;
2789 if (rt->fi)
2790 atomic_inc(&rt->fi->fib_clntref);
14e50e57
DM
2791
2792 dst_free(new);
2793 }
2794
2774c131
DM
2795 dst_release(dst_orig);
2796
2797 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2798}
2799
9d6ec938 2800struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
b23dd4fe 2801 struct sock *sk)
1da177e4 2802{
9d6ec938 2803 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2804
b23dd4fe
DM
2805 if (IS_ERR(rt))
2806 return rt;
1da177e4 2807
56157872 2808 if (flp4->flowi4_proto)
9d6ec938
DM
2809 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2810 flowi4_to_flowi(flp4),
2811 sk, 0);
1da177e4 2812
b23dd4fe 2813 return rt;
1da177e4 2814}
d8c97a94
ACM
2815EXPORT_SYMBOL_GPL(ip_route_output_flow);
2816
4feb88e5
BT
2817static int rt_fill_info(struct net *net,
2818 struct sk_buff *skb, u32 pid, u32 seq, int event,
b6544c0b 2819 int nowait, unsigned int flags)
1da177e4 2820{
511c3f92 2821 struct rtable *rt = skb_rtable(skb);
1da177e4 2822 struct rtmsg *r;
be403ea1 2823 struct nlmsghdr *nlh;
fe6fe792
ED
2824 long expires = 0;
2825 const struct inet_peer *peer = rt->peer;
e3703b3d 2826 u32 id = 0, ts = 0, tsage = 0, error;
be403ea1
TG
2827
2828 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2829 if (nlh == NULL)
26932566 2830 return -EMSGSIZE;
be403ea1
TG
2831
2832 r = nlmsg_data(nlh);
1da177e4
LT
2833 r->rtm_family = AF_INET;
2834 r->rtm_dst_len = 32;
2835 r->rtm_src_len = 0;
475949d8 2836 r->rtm_tos = rt->rt_key_tos;
1da177e4 2837 r->rtm_table = RT_TABLE_MAIN;
be403ea1 2838 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
1da177e4
LT
2839 r->rtm_type = rt->rt_type;
2840 r->rtm_scope = RT_SCOPE_UNIVERSE;
2841 r->rtm_protocol = RTPROT_UNSPEC;
2842 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2843 if (rt->rt_flags & RTCF_NOTIFY)
2844 r->rtm_flags |= RTM_F_NOTIFY;
be403ea1 2845
17fb2c64 2846 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
be403ea1 2847
5e2b61f7 2848 if (rt->rt_key_src) {
1da177e4 2849 r->rtm_src_len = 32;
5e2b61f7 2850 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
1da177e4 2851 }
d8d1f30b
CG
2852 if (rt->dst.dev)
2853 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
c7066f70 2854#ifdef CONFIG_IP_ROUTE_CLASSID
d8d1f30b
CG
2855 if (rt->dst.tclassid)
2856 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
1da177e4 2857#endif
c7537967 2858 if (rt_is_input_route(rt))
17fb2c64 2859 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
5e2b61f7 2860 else if (rt->rt_src != rt->rt_key_src)
17fb2c64 2861 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
be403ea1 2862
1da177e4 2863 if (rt->rt_dst != rt->rt_gateway)
17fb2c64 2864 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
be403ea1 2865
defb3519 2866 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
be403ea1
TG
2867 goto nla_put_failure;
2868
5e2b61f7
DM
2869 if (rt->rt_mark)
2870 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
963bfeee 2871
d8d1f30b 2872 error = rt->dst.error;
fe6fe792 2873 if (peer) {
317fe0e6 2874 inet_peer_refcheck(rt->peer);
fe6fe792
ED
2875 id = atomic_read(&peer->ip_id_count) & 0xffff;
2876 if (peer->tcp_ts_stamp) {
2877 ts = peer->tcp_ts;
2878 tsage = get_seconds() - peer->tcp_ts_stamp;
1da177e4 2879 }
fe6fe792
ED
2880 expires = ACCESS_ONCE(peer->pmtu_expires);
2881 if (expires)
2882 expires -= jiffies;
1da177e4 2883 }
be403ea1 2884
c7537967 2885 if (rt_is_input_route(rt)) {
1da177e4 2886#ifdef CONFIG_IP_MROUTE
e448515c 2887 __be32 dst = rt->rt_dst;
1da177e4 2888
f97c1e0c 2889 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
4feb88e5 2890 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
9a1b9496
DM
2891 int err = ipmr_get_route(net, skb,
2892 rt->rt_src, rt->rt_dst,
2893 r, nowait);
1da177e4
LT
2894 if (err <= 0) {
2895 if (!nowait) {
2896 if (err == 0)
2897 return 0;
be403ea1 2898 goto nla_put_failure;
1da177e4
LT
2899 } else {
2900 if (err == -EMSGSIZE)
be403ea1 2901 goto nla_put_failure;
e3703b3d 2902 error = err;
1da177e4
LT
2903 }
2904 }
2905 } else
2906#endif
5e2b61f7 2907 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
1da177e4
LT
2908 }
2909
d8d1f30b 2910 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
e3703b3d
TG
2911 expires, error) < 0)
2912 goto nla_put_failure;
be403ea1
TG
2913
2914 return nlmsg_end(skb, nlh);
1da177e4 2915
be403ea1 2916nla_put_failure:
26932566
PM
2917 nlmsg_cancel(skb, nlh);
2918 return -EMSGSIZE;
1da177e4
LT
2919}
2920
63f3444f 2921static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1da177e4 2922{
3b1e0a65 2923 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2924 struct rtmsg *rtm;
2925 struct nlattr *tb[RTA_MAX+1];
1da177e4 2926 struct rtable *rt = NULL;
9e12bb22
AV
2927 __be32 dst = 0;
2928 __be32 src = 0;
2929 u32 iif;
d889ce3b 2930 int err;
963bfeee 2931 int mark;
1da177e4
LT
2932 struct sk_buff *skb;
2933
d889ce3b
TG
2934 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2935 if (err < 0)
2936 goto errout;
2937
2938 rtm = nlmsg_data(nlh);
2939
1da177e4 2940 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
d889ce3b
TG
2941 if (skb == NULL) {
2942 err = -ENOBUFS;
2943 goto errout;
2944 }
1da177e4
LT
2945
2946 /* Reserve room for dummy headers; this skb can pass
2947 through a good chunk of the routing engine.
2948 */
459a98ed 2949 skb_reset_mac_header(skb);
c1d2bbe1 2950 skb_reset_network_header(skb);
d2c962b8
SH
2951
2952 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
eddc9ec5 2953 ip_hdr(skb)->protocol = IPPROTO_ICMP;
1da177e4
LT
2954 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2955
17fb2c64
AV
2956 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2957 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
d889ce3b 2958 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2959 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
1da177e4
LT
2960
2961 if (iif) {
d889ce3b
TG
2962 struct net_device *dev;
2963
1937504d 2964 dev = __dev_get_by_index(net, iif);
d889ce3b
TG
2965 if (dev == NULL) {
2966 err = -ENODEV;
2967 goto errout_free;
2968 }
2969
1da177e4
LT
2970 skb->protocol = htons(ETH_P_IP);
2971 skb->dev = dev;
963bfeee 2972 skb->mark = mark;
1da177e4
LT
2973 local_bh_disable();
2974 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2975 local_bh_enable();
d889ce3b 2976
511c3f92 2977 rt = skb_rtable(skb);
d8d1f30b
CG
2978 if (err == 0 && rt->dst.error)
2979 err = -rt->dst.error;
1da177e4 2980 } else {
68a5e3dd
DM
2981 struct flowi4 fl4 = {
2982 .daddr = dst,
2983 .saddr = src,
2984 .flowi4_tos = rtm->rtm_tos,
2985 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2986 .flowi4_mark = mark,
d889ce3b 2987 };
9d6ec938 2988 rt = ip_route_output_key(net, &fl4);
b23dd4fe
DM
2989
2990 err = 0;
2991 if (IS_ERR(rt))
2992 err = PTR_ERR(rt);
1da177e4 2993 }
d889ce3b 2994
1da177e4 2995 if (err)
d889ce3b 2996 goto errout_free;
1da177e4 2997
d8d1f30b 2998 skb_dst_set(skb, &rt->dst);
1da177e4
LT
2999 if (rtm->rtm_flags & RTM_F_NOTIFY)
3000 rt->rt_flags |= RTCF_NOTIFY;
3001
4feb88e5 3002 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
1937504d 3003 RTM_NEWROUTE, 0, 0);
d889ce3b
TG
3004 if (err <= 0)
3005 goto errout_free;
1da177e4 3006
1937504d 3007 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
d889ce3b 3008errout:
2942e900 3009 return err;
1da177e4 3010
d889ce3b 3011errout_free:
1da177e4 3012 kfree_skb(skb);
d889ce3b 3013 goto errout;
1da177e4
LT
3014}
3015
3016int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3017{
3018 struct rtable *rt;
3019 int h, s_h;
3020 int idx, s_idx;
1937504d
DL
3021 struct net *net;
3022
3b1e0a65 3023 net = sock_net(skb->sk);
1da177e4
LT
3024
3025 s_h = cb->args[0];
d8c92830
ED
3026 if (s_h < 0)
3027 s_h = 0;
1da177e4 3028 s_idx = idx = cb->args[1];
a6272665
ED
3029 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3030 if (!rt_hash_table[h].chain)
3031 continue;
1da177e4 3032 rcu_read_lock_bh();
a898def2 3033 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
d8d1f30b
CG
3034 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3035 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
1da177e4 3036 continue;
e84f84f2 3037 if (rt_is_expired(rt))
29e75252 3038 continue;
d8d1f30b 3039 skb_dst_set_noref(skb, &rt->dst);
4feb88e5 3040 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
e905a9ed 3041 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
b6544c0b 3042 1, NLM_F_MULTI) <= 0) {
adf30907 3043 skb_dst_drop(skb);
1da177e4
LT
3044 rcu_read_unlock_bh();
3045 goto done;
3046 }
adf30907 3047 skb_dst_drop(skb);
1da177e4
LT
3048 }
3049 rcu_read_unlock_bh();
3050 }
3051
3052done:
3053 cb->args[0] = h;
3054 cb->args[1] = idx;
3055 return skb->len;
3056}
3057
3058void ip_rt_multicast_event(struct in_device *in_dev)
3059{
76e6ebfb 3060 rt_cache_flush(dev_net(in_dev->dev), 0);
1da177e4
LT
3061}
3062
3063#ifdef CONFIG_SYSCTL
81c684d1 3064static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
8d65af78 3065 void __user *buffer,
1da177e4
LT
3066 size_t *lenp, loff_t *ppos)
3067{
3068 if (write) {
639e104f 3069 int flush_delay;
81c684d1 3070 ctl_table ctl;
39a23e75 3071 struct net *net;
639e104f 3072
81c684d1
DL
3073 memcpy(&ctl, __ctl, sizeof(ctl));
3074 ctl.data = &flush_delay;
8d65af78 3075 proc_dointvec(&ctl, write, buffer, lenp, ppos);
639e104f 3076
81c684d1 3077 net = (struct net *)__ctl->extra1;
39a23e75 3078 rt_cache_flush(net, flush_delay);
1da177e4 3079 return 0;
e905a9ed 3080 }
1da177e4
LT
3081
3082 return -EINVAL;
3083}
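/* The handler above is write-only (reads return -EINVAL). A minimal
 * userspace sketch of triggering a flush; the integer written becomes
 * the flush delay passed to rt_cache_flush().
 */
#include <fcntl.h>
#include <unistd.h>

static int flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "0\n", 2) != 2) {		/* delay 0: flush now */
		close(fd);
		return -1;
	}
	return close(fd);
}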
3084
eeb61f71 3085static ctl_table ipv4_route_table[] = {
1da177e4 3086 {
1da177e4
LT
3087 .procname = "gc_thresh",
3088 .data = &ipv4_dst_ops.gc_thresh,
3089 .maxlen = sizeof(int),
3090 .mode = 0644,
6d9f239a 3091 .proc_handler = proc_dointvec,
1da177e4
LT
3092 },
3093 {
1da177e4
LT
3094 .procname = "max_size",
3095 .data = &ip_rt_max_size,
3096 .maxlen = sizeof(int),
3097 .mode = 0644,
6d9f239a 3098 .proc_handler = proc_dointvec,
1da177e4
LT
3099 },
3100 {
3101 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3102
1da177e4
LT
3103 .procname = "gc_min_interval",
3104 .data = &ip_rt_gc_min_interval,
3105 .maxlen = sizeof(int),
3106 .mode = 0644,
6d9f239a 3107 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3108 },
3109 {
1da177e4
LT
3110 .procname = "gc_min_interval_ms",
3111 .data = &ip_rt_gc_min_interval,
3112 .maxlen = sizeof(int),
3113 .mode = 0644,
6d9f239a 3114 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3115 },
3116 {
1da177e4
LT
3117 .procname = "gc_timeout",
3118 .data = &ip_rt_gc_timeout,
3119 .maxlen = sizeof(int),
3120 .mode = 0644,
6d9f239a 3121 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3122 },
3123 {
1da177e4
LT
3124 .procname = "gc_interval",
3125 .data = &ip_rt_gc_interval,
3126 .maxlen = sizeof(int),
3127 .mode = 0644,
6d9f239a 3128 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3129 },
3130 {
1da177e4
LT
3131 .procname = "redirect_load",
3132 .data = &ip_rt_redirect_load,
3133 .maxlen = sizeof(int),
3134 .mode = 0644,
6d9f239a 3135 .proc_handler = proc_dointvec,
1da177e4
LT
3136 },
3137 {
1da177e4
LT
3138 .procname = "redirect_number",
3139 .data = &ip_rt_redirect_number,
3140 .maxlen = sizeof(int),
3141 .mode = 0644,
6d9f239a 3142 .proc_handler = proc_dointvec,
1da177e4
LT
3143 },
3144 {
1da177e4
LT
3145 .procname = "redirect_silence",
3146 .data = &ip_rt_redirect_silence,
3147 .maxlen = sizeof(int),
3148 .mode = 0644,
6d9f239a 3149 .proc_handler = proc_dointvec,
1da177e4
LT
3150 },
3151 {
1da177e4
LT
3152 .procname = "error_cost",
3153 .data = &ip_rt_error_cost,
3154 .maxlen = sizeof(int),
3155 .mode = 0644,
6d9f239a 3156 .proc_handler = proc_dointvec,
1da177e4
LT
3157 },
3158 {
1da177e4
LT
3159 .procname = "error_burst",
3160 .data = &ip_rt_error_burst,
3161 .maxlen = sizeof(int),
3162 .mode = 0644,
6d9f239a 3163 .proc_handler = proc_dointvec,
1da177e4
LT
3164 },
3165 {
1da177e4
LT
3166 .procname = "gc_elasticity",
3167 .data = &ip_rt_gc_elasticity,
3168 .maxlen = sizeof(int),
3169 .mode = 0644,
6d9f239a 3170 .proc_handler = proc_dointvec,
1da177e4
LT
3171 },
3172 {
1da177e4
LT
3173 .procname = "mtu_expires",
3174 .data = &ip_rt_mtu_expires,
3175 .maxlen = sizeof(int),
3176 .mode = 0644,
6d9f239a 3177 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3178 },
3179 {
1da177e4
LT
3180 .procname = "min_pmtu",
3181 .data = &ip_rt_min_pmtu,
3182 .maxlen = sizeof(int),
3183 .mode = 0644,
6d9f239a 3184 .proc_handler = proc_dointvec,
1da177e4
LT
3185 },
3186 {
1da177e4
LT
3187 .procname = "min_adv_mss",
3188 .data = &ip_rt_min_advmss,
3189 .maxlen = sizeof(int),
3190 .mode = 0644,
6d9f239a 3191 .proc_handler = proc_dointvec,
1da177e4 3192 },
f8572d8f 3193 { }
1da177e4 3194};
39a23e75 3195
2f4520d3
AV
3196static struct ctl_table empty[1];
3197
3198static struct ctl_table ipv4_skeleton[] =
3199{
f8572d8f 3200 { .procname = "route",
d994af0d 3201 .mode = 0555, .child = ipv4_route_table},
f8572d8f 3202 { .procname = "neigh",
d994af0d 3203 .mode = 0555, .child = empty},
2f4520d3
AV
3204 { }
3205};
3206
3207static __net_initdata struct ctl_path ipv4_path[] = {
f8572d8f
EB
3208 { .procname = "net", },
3209 { .procname = "ipv4", },
39a23e75
DL
3210 { },
3211};
3212
39a23e75
DL
3213static struct ctl_table ipv4_route_flush_table[] = {
3214 {
39a23e75
DL
3215 .procname = "flush",
3216 .maxlen = sizeof(int),
3217 .mode = 0200,
6d9f239a 3218 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3219 },
f8572d8f 3220 { },
39a23e75
DL
3221};
3222
2f4520d3 3223static __net_initdata struct ctl_path ipv4_route_path[] = {
f8572d8f
EB
3224 { .procname = "net", },
3225 { .procname = "ipv4", },
3226 { .procname = "route", },
2f4520d3
AV
3227 { },
3228};
3229
39a23e75
DL
3230static __net_init int sysctl_route_net_init(struct net *net)
3231{
3232 struct ctl_table *tbl;
3233
3234 tbl = ipv4_route_flush_table;
09ad9bc7 3235 if (!net_eq(net, &init_net)) {
39a23e75
DL
3236 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3237 if (tbl == NULL)
3238 goto err_dup;
3239 }
3240 tbl[0].extra1 = net;
3241
3242 net->ipv4.route_hdr =
3243 register_net_sysctl_table(net, ipv4_route_path, tbl);
3244 if (net->ipv4.route_hdr == NULL)
3245 goto err_reg;
3246 return 0;
3247
3248err_reg:
3249 if (tbl != ipv4_route_flush_table)
3250 kfree(tbl);
3251err_dup:
3252 return -ENOMEM;
3253}
3254
3255static __net_exit void sysctl_route_net_exit(struct net *net)
3256{
3257 struct ctl_table *tbl;
3258
3259 tbl = net->ipv4.route_hdr->ctl_table_arg;
3260 unregister_net_sysctl_table(net->ipv4.route_hdr);
3261 BUG_ON(tbl == ipv4_route_flush_table);
3262 kfree(tbl);
3263}
3264
3265static __net_initdata struct pernet_operations sysctl_route_ops = {
3266 .init = sysctl_route_net_init,
3267 .exit = sysctl_route_net_exit,
3268};
1da177e4
LT
3269#endif
3270
3ee94372 3271static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3272{
3ee94372
NH
3273 get_random_bytes(&net->ipv4.rt_genid,
3274 sizeof(net->ipv4.rt_genid));
436c3b66
DM
3275 get_random_bytes(&net->ipv4.dev_addr_genid,
3276 sizeof(net->ipv4.dev_addr_genid));
9f5e97e5
DL
3277 return 0;
3278}
3279
3ee94372
NH
3280static __net_initdata struct pernet_operations rt_genid_ops = {
3281 .init = rt_genid_init,
9f5e97e5
DL
3282};
3283
3284
c7066f70 3285#ifdef CONFIG_IP_ROUTE_CLASSID
7d720c3e 3286struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
c7066f70 3287#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4
LT
3288
3289static __initdata unsigned long rhash_entries;
3290static int __init set_rhash_entries(char *str)
3291{
3292 if (!str)
3293 return 0;
3294 rhash_entries = simple_strtoul(str, &str, 0);
3295 return 1;
3296}
3297__setup("rhash_entries=", set_rhash_entries);
3298
3299int __init ip_rt_init(void)
3300{
424c4b70 3301 int rc = 0;
1da177e4 3302
c7066f70 3303#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3304 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3305 if (!ip_rt_acct)
3306 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3307#endif
3308
e5d679f3
AD
3309 ipv4_dst_ops.kmem_cachep =
3310 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3311 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3312
14e50e57
DM
3313 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3314
fc66f95c
ED
3315 if (dst_entries_init(&ipv4_dst_ops) < 0)
3316 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3317
3318 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3319 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3320
424c4b70
ED
3321 rt_hash_table = (struct rt_hash_bucket *)
3322 alloc_large_system_hash("IP route cache",
3323 sizeof(struct rt_hash_bucket),
3324 rhash_entries,
4481374c 3325 (totalram_pages >= 128 * 1024) ?
18955cfc 3326 15 : 17,
8d1502de 3327 0,
424c4b70
ED
3328 &rt_hash_log,
3329 &rt_hash_mask,
c9503e0f 3330 rhash_entries ? 0 : 512 * 1024);
22c047cc
ED
3331 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3332 rt_hash_lock_init();
1da177e4
LT
3333
3334 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3335 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3336
1da177e4
LT
3337 devinet_init();
3338 ip_fib_init();
3339
73b38711 3340 if (ip_rt_proc_init())
107f1634 3341 printk(KERN_ERR "Unable to create route proc files\n");
1da177e4
LT
3342#ifdef CONFIG_XFRM
3343 xfrm_init();
a33bc5c1 3344 xfrm4_init(ip_rt_max_size);
1da177e4 3345#endif
c7ac8679 3346 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3347
39a23e75
DL
3348#ifdef CONFIG_SYSCTL
3349 register_pernet_subsys(&sysctl_route_ops);
3350#endif
3ee94372 3351 register_pernet_subsys(&rt_genid_ops);
1da177e4
LT
3352 return rc;
3353}
3354
a1bc6eb4 3355#ifdef CONFIG_SYSCTL
eeb61f71
AV
3356/*
3357 * We really need to sanitize the damn ipv4 init order, then all
3358 * this nonsense will go away.
3359 */
3360void __init ip_static_sysctl_init(void)
3361{
2f4520d3 3362 register_sysctl_paths(ipv4_path, ipv4_skeleton);
eeb61f71 3363}
a1bc6eb4 3364#endif